# First step in pipeline
**Load the fitted HDBSCAN and t-SNE models and compute soft-cluster membership probabilities for every sentence in a document. The output of this notebook (`probabilities.npy`) is used in the segmentation step.**

In [1]:
import pickle
import hdbscan
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [2]:
# Read the table for a single transcript (each row is one sentence) and
# extract the sentence texts as a plain Python list.
sentences = pd.read_csv('prediction_data.csv')
documents = sentences['transcript_subset'].to_list()

In [3]:
# Restore the pickled dimensionality-reduction object; a later cell calls its
# .transform() on new data.
# Pickle can execute arbitrary code on load: only open trusted files.
with open('sentence_tsne_data_.pkl', 'rb') as tsne_file:
    tsne_data = pickle.load(tsne_file)

In [213]:
# Restore the pickled clustering model used for soft clustering further down.
# Pickle can execute arbitrary code on load: only open trusted files.
with open('sentence_hdbscan_model.pkl', 'rb') as hdbscan_file:
    hdbscan_model = pickle.load(hdbscan_file)

In [5]:
# Embed the sentences: load the pretrained "all-MiniLM-L6-v2"
# sentence-transformers model and encode every entry of `documents`.
# `embedded_sentences` is the input to the PCA step below.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embedded_sentences = sentence_model.encode(documents)

In [135]:
# Reduce dimensions: PCA down to 50 components, then project the result into
# the previously fitted t-SNE embedding loaded above.
# random_state pins PCA's randomized SVD solver (scikit-learn may select it
# automatically for larger inputs), so repeated runs yield the same projection.
# NOTE(review): this PCA is fitted on the current transcript only. If the
# loaded t-SNE embedding was built on PCA-reduced training data (TODO confirm),
# the PCA fitted at training time should be reused here instead, so that the
# new points share a coordinate system with the reference embedding.
# NOTE(review): n_components=50 needs at least 50 sentences in the transcript.
PCA_model = PCA(n_components=50, random_state=42)
PCA_data = PCA_model.fit_transform(embedded_sentences)
print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA")

tsne_test = tsne_data.transform(PCA_data)

35.88% of the variance has been removed by PCA
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.66 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.02 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 7340.2406, 50 iterations in 31.4019 sec
Iteration  100, KL divergence 7178.4666, 50 iterations in 32.1260 sec
Iteration  150, KL divergence 7119.3947, 50 iterations in 31.5695 sec
Iteration  200, KL divergence 7078.9133, 50 iterations in 29.5806 sec
Iteration  250, KL divergence 7058.9904, 50 iterations in 30.2641 sec
   --> Time elapsed: 154.94 seconds


In [214]:
# Soft clustering: ask HDBSCAN for one membership vector per projected
# sentence, then report the dimensions of the result.
probabilities = hdbscan.membership_vector(
    clusterer=hdbscan_model,
    points_to_predict=tsne_test,
)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')

  outlier_vec = outlier_membership_vector(


nrows: 468
ncols: 156


In [215]:
# Temporary workaround for an HDBSCAN problem: the membership vectors can
# contain NaN. Zero those entries in place and report how many distinct rows
# were affected (counted before the overwrite).
nan_positions = np.where(np.isnan(probabilities))
nan_row_ids = set(nan_positions[0])
probabilities[nan_positions] = 0
print(f"Number of nan rows: {len(nan_row_ids)}")

Number of nan rows: 1


In [216]:
# Persist the membership probabilities to disk for use by the segmentation step.
np.save(file='probabilities.npy', arr=probabilities)

### Deprecated code 

In [3]:
# Load the saved BERTopic model from 'pipeline_BERT'.
# This cell sits under the "Deprecated code" heading and is not part of the
# active HDBSCAN/t-SNE pipeline above.
topic_model = BERTopic.load('pipeline_BERT')

In [None]:
# Get probabilities for each sentence of each topic (deprecated BERTopic path).
# NOTE(review): this rebinds `probabilities`, the same name the active pipeline
# saves to 'probabilities.npy'; running this cell would replace those values.
# NOTE(review): `tsne_data` (the loaded t-SNE object) is passed as the second
# argument to transform(); presumably that slot expects embeddings of
# `documents` — TODO confirm before reviving this cell.
_, probabilities = topic_model.transform(documents, tsne_data)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')