# First step in pipeline
**Load the HDBSCAN and t-SNE models and compute cluster-membership probabilities for every sentence in a document. The output of this notebook is used in the segmentation step.**

In [15]:
import pickle
import hdbscan
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [9]:
# Read the CSV for one transcript (one sentence per row) and extract the
# `transcript_subset` column as a plain Python list.
sentences = pd.read_csv('prediction_data.csv')
transcript_column = sentences.transcript_subset
documents = transcript_column.tolist()

In [10]:
# Load the pickled dimensionality-reduction object.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files from a trusted source.
with open('tsne_data.pkl', 'rb') as tsne_file:
    tsne_data = pickle.load(tsne_file)

In [24]:
# Load the pickled clustering model.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files from a trusted source.
with open('hdbscan_model.pkl', 'rb') as model_file:
    hdbscan_model = pickle.load(model_file)

In [12]:
# Embed every sentence with the "all-MiniLM-L6-v2" sentence-transformer model
encoder_name = "all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(encoder_name)
embedded_sentences = sentence_model.encode(documents)

In [18]:
# Reduce the sentence embeddings to 50 principal components, then map them
# into the existing t-SNE embedding held by `tsne_data`.
# NOTE(review): fit_transform fits a *new* PCA on the prediction data. If the
# t-SNE object was fitted on the output of a different PCA, the two bases will
# not match and the mapped positions may be unreliable -- confirm, and consider
# reusing the originally fitted PCA instead.
# NOTE(review): n_components=50 presumably needs at least 50 sentences in the
# document -- verify against the scikit-learn PCA requirements.
pca_reducer = PCA(n_components=50)
PCA_data = pca_reducer.fit_transform(embedded_sentences)

tsne_test = tsne_data.transform(PCA_data)

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.14 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 7073.1264, 50 iterations in 19.4517 sec
Iteration  100, KL divergence 6892.8614, 50 iterations in 16.9902 sec
Iteration  150, KL divergence 6838.9759, 50 iterations in 16.8900 sec
Iteration  200, KL divergence 6805.9688, 50 iterations in 16.5368 sec
Iteration  250, KL divergence 6789.3724, 50 iterations in 16.3548 sec
   --> Time elapsed: 86.22 seconds


In [25]:
# Soft clustering: one membership vector per sentence, with one entry per
# cluster of the loaded HDBSCAN model. The shape is printed as a sanity check.
probabilities = hdbscan.membership_vector(
    clusterer=hdbscan_model,
    points_to_predict=tsne_test,
)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')

nrows: 468
ncols: 256


In [26]:
# Persist the membership vectors to disk for the segmentation step
np.save(file='probabilities.npy', arr=probabilities)

### Deprecated code 

In [3]:
# Deprecated: load the saved BERTopic model
bertopic_path = 'pipeline_BERT'
topic_model = BERTopic.load(bertopic_path)

In [None]:
# Deprecated: get probabilities for each sentence of each topic from BERTopic.
# NOTE(review): this rebinds `probabilities`, the same name used for the
# HDBSCAN membership vectors earlier in the notebook, so running this cell
# replaces those values in the kernel.
# NOTE(review): the second argument is the loaded t-SNE object rather than
# embeddings computed from `documents` -- confirm this is what
# BERTopic.transform expects before reviving this cell.
_, probabilities = topic_model.transform(documents, tsne_data)
# Print the number of rows and the length of the first row as a sanity check
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')