# First step in pipeline
**Load the topic model and compute the topic probabilities for every sentence in a document. The output of this file is used in the segmentation step.**

In [6]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

In [3]:
# Load the saved BERTopic model stored under 'pipeline_BERT'.
topic_model = BERTopic.load(path='pipeline_BERT')

In [8]:
# Read the single-transcript dataframe (one sentence per row) and pull the
# sentence column out as a plain Python list.
sentences = pd.read_csv('prediction_data.csv')
documents = sentences['transcript_subset'].tolist()

In [9]:
# Embed every sentence with the "all-MiniLM-L6-v2" sentence-transformers model.
sentence_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
embedded_sentences = sentence_model.encode(sentences=documents)

In [10]:
# Reduce dimensions with PCA (to 50 components) and then t-SNE (to 3).
# The fitted estimator is bound to `pca` rather than `PCA`: rebinding the
# imported class name made this cell fail with "'PCA' object is not callable"
# when it was executed a second time.
pca = PCA(n_components=50)
PCA_data = pca.fit_transform(embedded_sentences)

# NOTE(review): TSNE is created without a random_state, so the 3-D output can
# differ between runs. t-SNE is also fitted on this transcript only — confirm
# that the loaded topic model expects embeddings produced this way.
t_SNE = TSNE(n_components=3)
tsne_data = t_SNE.fit_transform(PCA_data)

In [20]:
# Run the topic model on the sentences using the reduced embeddings; only the
# second element of the result (the probabilities) is kept.
probabilities = topic_model.transform(documents, embeddings=tsne_data)[1]

# Report the size of the result: one row per sentence, one column per topic.
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')

nrows: 329
ncols: 2


In [22]:
# Persist the probabilities to disk so the segmentation step can load them.
np.save(file='probabilities_test.npy', arr=probabilities)