# First step in pipeline
**Load the trained topic model and compute the topic probabilities for every sentence in a document. The output of this notebook is used as input to the segmentation step.**

In [11]:
import pickle
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [3]:
# Restore the previously saved BERTopic model from disk.
model_path = 'pipeline_BERT'
topic_model = BERTopic.load(model_path)

In [4]:
# Read the transcript to predict on (one sentence per row) and pull the
# sentence text out of the `transcript_subset` column as a plain Python list.
sentences = pd.read_csv('prediction_data.csv')
documents = sentences['transcript_subset'].tolist()

In [8]:
# Restore the pickled dimensionality-reduction object; its `.transform` is
# used further down to project new sentences.
# Caution: unpickling executes arbitrary code, so only load a file that this
# project produced itself.
with open('tsne_data.pkl', mode='rb') as pickled_tsne:
    tsne_data = pickle.load(pickled_tsne)

In [5]:
# Embed every sentence with the "all-MiniLM-L6-v2" sentence-transformer.
encoder_name = "all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(encoder_name)
embedded_sentences = sentence_model.encode(documents)

In [12]:
# Reduce dimensions with PCA, then project the result into the existing
# t-SNE embedding.
#
# The estimator is bound to the lowercase name `pca` so that the imported
# `PCA` class is not overwritten. Rebinding `PCA` to an instance made this
# cell fail on a second run ('PCA' object is not callable).
#
# NOTE(review): this fits a *new* PCA on the prediction sentences. Presumably
# `tsne_data` was fitted on PCA output of the training data; if so, that same
# fitted PCA should be loaded and only `.transform` applied here, otherwise
# the 50 components do not describe the same space. Confirm against the
# training notebook.
pca = PCA(n_components=50)
PCA_data = pca.fit_transform(embedded_sentences)

tsne_test = tsne_data.transform(PCA_data)

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.08 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 3528.0934, 50 iterations in 0.4852 sec
Iteration  100, KL divergence 3361.2787, 50 iterations in 0.4760 sec
Iteration  150, KL divergence 3297.8883, 50 iterations in 0.4635 sec
Iteration  200, KL divergence 3269.3613, 50 iterations in 0.4557 sec
Iteration  250, KL divergence 3255.4906, 50 iterations in 0.4604 sec
   --> Time elapsed: 2.34 seconds


In [20]:
# Get the probability of each topic for each sentence.
#
# The embeddings handed to `transform` must belong to `documents`, so the
# projected embeddings of these sentences (`tsne_test`) are passed. The
# original call passed `tsne_data`, the embedding object loaded from disk,
# and left `tsne_test` unused.
_, probabilities = topic_model.transform(documents, tsne_test)

# One row of probabilities is expected per input sentence.
assert len(probabilities) == len(documents), (
    'number of probability rows does not match number of sentences'
)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')

nrows: 329
ncols: 2


In [22]:
# Persist the probabilities as a .npy file for use in the segmentation step.
np.save(file='probabilities_test.npy', arr=probabilities)