# First step in pipeline
**Load HDBSCAN and t-SNE models and find probabilities for all sentences in a document. The output of this file will be used in the segmentation**

In [1]:
import os
import pickle
import hdbscan
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [6]:
# load PCA class object
with open('pca_model_50.pkl', 'rb') as inp:
    PCA_model = pickle.load(inp)

In [3]:
# load t-SNE class object
with open('sentence_tsne_data_.pkl', 'rb') as inp:
    tsne_data = pickle.load(inp)

In [4]:
# load cluster model
with open('sentence_hdbscan_model.pkl', 'rb') as inp:
    hdbscan_model = pickle.load(inp)

In [7]:
# loop through all of the annotated transcripts and save their topic probability distribution
directory = '../Thesis/annotated_transcripts_input'

for dirpath, _, files in os.walk(directory):
    for file in files:
        episode_id = file.split('_')[0]
        path = os.path.join(dirpath, file)
        with open(path, errors='replace') as f: 
            prediction_documents = pd.read_csv(f)
            docs = prediction_documents.transcript_subset

            # embedd
            sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
            embedded_sentences = sentence_model.encode(docs)

            # reduce dimensions with PCA and t-SNE
            PCA_data = PCA_model.transform(embedded_sentences)
            print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA for transcript {episode_id}")
            tsne_prediction = tsne_data.transform(PCA_data)

            # get probability vectors for each cluster via soft clustering
            probabilities = hdbscan.membership_vector(hdbscan_model, tsne_prediction)
            print(f'nrows: {len(probabilities)}\nncols: {len(probabilities[0])}')
            # remedy HDBSCAN problem
            inds = np.where(np.isnan(probabilities))
            probabilities[inds] = 0
            print(f"Number of nan rows: {len(set(list(inds[:][0])))} for {episode_id}\n\n")

            # save probabilities to use in downstream segmentation
            name_of_file = f'../Thesis/annotated_probabilities/topic_probability_density_vector_{episode_id}.npy'
            np.save(name_of_file, probabilities)

51.06% of the variance has been removed by PCA for transcript 7A7swZJL0AtFghauiGLadV
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.36 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.04 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 1294.9848, 50 iterations in 31.2757 sec
Iteration  100, KL divergence 1265.0461, 50 iterations in 31.1954 sec
Iteration  150, KL divergence 1250.5674, 50 iterations in 33.2270 sec
Iteration  200, KL divergence 1244.0025, 50 iterations in 32.7587 sec
Iteration  250, KL divergence 1243.6578, 50 iterations in 40.4830 sec
   --> Time elapsed: 168.94 seconds
nrows: 98
ncols: 470
Number of nan rows: 0 for 7A7swZJL0AtFghauiGLadV


51.06% of the variance has been removed by PCA for transcript 5Sg6efUjy

  outlier_vec = outlier_membership_vector(


nrows: 223
ncols: 470
Number of nan rows: 6 for 3DR5Qa40Mc17AiBYfmC29U


51.06% of the variance has been removed by PCA for transcript 0bXWB28GwN8OiqC1ykRrRX
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 2.46 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 8453.1513, 50 iterations in 36.0985 sec
Iteration  100, KL divergence 8281.5766, 50 iterations in 31.4605 sec
Iteration  150, KL divergence 8233.4261, 50 iterations in 32.9959 sec
Iteration  200, KL divergence 8212.3571, 50 iterations in 31.2729 sec
Iteration  250, KL divergence 8196.5726, 50 iterations in 30.1004 sec
   --> Time elapsed: 161.93 seconds


  outlier_vec = outlier_membership_vector(


nrows: 563
ncols: 470
Number of nan rows: 4 for 0bXWB28GwN8OiqC1ykRrRX


51.06% of the variance has been removed by PCA for transcript 13NDTKL5ZGs8cb8dojW3bz
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.67 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 3107.0315, 50 iterations in 30.3416 sec
Iteration  100, KL divergence 3048.2820, 50 iterations in 30.1698 sec
Iteration  150, KL divergence 3032.4029, 50 iterations in 29.5258 sec
Iteration  200, KL divergence 3027.5012, 50 iterations in 29.4216 sec
Iteration  250, KL divergence 3026.7413, 50 iterations in 29.5344 sec
   --> Time elapsed: 148.99 seconds


  outlier_vec = outlier_membership_vector(


nrows: 224
ncols: 470
Number of nan rows: 1 for 13NDTKL5ZGs8cb8dojW3bz


51.06% of the variance has been removed by PCA for transcript 4DUIcbw3EZpeYUC2mcxV0D
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 1.34 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 3152.0477, 50 iterations in 29.3780 sec
Iteration  100, KL divergence 3091.1465, 50 iterations in 29.3958 sec
Iteration  150, KL divergence 3078.9011, 50 iterations in 29.4653 sec
Iteration  200, KL divergence 3070.9519, 50 iterations in 29.6202 sec
Iteration  250, KL divergence 3070.9063, 50 iterations in 29.2755 sec
   --> Time elapsed: 147.14 seconds


  outlier_vec = outlier_membership_vector(


nrows: 229
ncols: 470
Number of nan rows: 2 for 4DUIcbw3EZpeYUC2mcxV0D


51.06% of the variance has been removed by PCA for transcript 5ts4p0QlyePWCgIB2W1wLf
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.59 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 3141.9900, 50 iterations in 29.7954 sec
Iteration  100, KL divergence 3082.3725, 50 iterations in 30.2343 sec
Iteration  150, KL divergence 3066.4020, 50 iterations in 29.4734 sec
Iteration  200, KL divergence 3061.5828, 50 iterations in 30.2332 sec
Iteration  250, KL divergence 3058.7763, 50 iterations in 29.5353 sec
   --> Time elapsed: 149.27 seconds


  outlier_vec = outlier_membership_vector(


nrows: 234
ncols: 470
Number of nan rows: 12 for 5ts4p0QlyePWCgIB2W1wLf


51.06% of the variance has been removed by PCA for transcript 19W5dgUcFseQZBmcVF4coc
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.50 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 2525.4591, 50 iterations in 29.3676 sec
Iteration  100, KL divergence 2472.5841, 50 iterations in 29.5732 sec
Iteration  150, KL divergence 2464.7899, 50 iterations in 29.3126 sec
Iteration  200, KL divergence 2460.4571, 50 iterations in 30.2591 sec
Iteration  250, KL divergence 2460.4342, 50 iterations in 29.4378 sec
   --> Time elapsed: 147.95 seconds


  outlier_vec = outlier_membership_vector(


nrows: 191
ncols: 470
Number of nan rows: 1 for 19W5dgUcFseQZBmcVF4coc


51.06% of the variance has been removed by PCA for transcript 3iydyD9rAb1f6rmvmgpwS4
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.88 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 6417.6438, 50 iterations in 29.5845 sec
Iteration  100, KL divergence 6289.8428, 50 iterations in 29.6887 sec
Iteration  150, KL divergence 6251.9629, 50 iterations in 31.1008 sec
Iteration  200, KL divergence 6236.5268, 50 iterations in 29.7457 sec
Iteration  250, KL divergence 6233.2843, 50 iterations in 29.7176 sec
   --> Time elapsed: 149.84 seconds


  outlier_vec = outlier_membership_vector(


nrows: 447
ncols: 470
Number of nan rows: 4 for 3iydyD9rAb1f6rmvmgpwS4


51.06% of the variance has been removed by PCA for transcript 3RT2j2BG8ILNYKjxsNhfvZ
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.91 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 5084.4325, 50 iterations in 29.4411 sec
Iteration  100, KL divergence 5016.6478, 50 iterations in 29.7401 sec
Iteration  150, KL divergence 4981.7215, 50 iterations in 29.5939 sec
Iteration  200, KL divergence 4967.6708, 50 iterations in 29.7924 sec
Iteration  250, KL divergence 4961.9712, 50 iterations in 29.6422 sec
   --> Time elapsed: 148.21 seconds


  outlier_vec = outlier_membership_vector(


nrows: 368
ncols: 470
Number of nan rows: 11 for 3RT2j2BG8ILNYKjxsNhfvZ


51.06% of the variance has been removed by PCA for transcript 4y67J0Fmgm5L7TPPsUunwo
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.84 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 3794.2064, 50 iterations in 30.9500 sec
Iteration  100, KL divergence 3711.3756, 50 iterations in 30.3168 sec
Iteration  150, KL divergence 3684.9336, 50 iterations in 31.5529 sec
Iteration  200, KL divergence 3678.0256, 50 iterations in 30.2357 sec
Iteration  250, KL divergence 3671.0109, 50 iterations in 29.7529 sec
   --> Time elapsed: 152.81 seconds


  outlier_vec = outlier_membership_vector(


nrows: 270
ncols: 470
Number of nan rows: 2 for 4y67J0Fmgm5L7TPPsUunwo


51.06% of the variance has been removed by PCA for transcript 6preEOWrgR9eRr938upFgv
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 1.09 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 8697.8447, 50 iterations in 29.7332 sec
Iteration  100, KL divergence 8520.1343, 50 iterations in 30.3540 sec
Iteration  150, KL divergence 8475.6687, 50 iterations in 29.7837 sec
Iteration  200, KL divergence 8464.7138, 50 iterations in 29.6736 sec
Iteration  250, KL divergence 8464.4083, 50 iterations in 29.8396 sec
   --> Time elapsed: 149.39 seconds


  outlier_vec = outlier_membership_vector(


nrows: 599
ncols: 470
Number of nan rows: 20 for 6preEOWrgR9eRr938upFgv


51.06% of the variance has been removed by PCA for transcript 28IWswylk2FvkebOehoCkL
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.38 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 4968.1503, 50 iterations in 30.8625 sec
Iteration  100, KL divergence 4889.5513, 50 iterations in 34.1287 sec
Iteration  150, KL divergence 4857.2517, 50 iterations in 33.1419 sec
Iteration  200, KL divergence 4851.3162, 50 iterations in 30.4855 sec
Iteration  250, KL divergence 4846.4950, 50 iterations in 29.5409 sec
   --> Time elapsed: 158.16 seconds


  outlier_vec = outlier_membership_vector(


nrows: 351
ncols: 470
Number of nan rows: 12 for 28IWswylk2FvkebOehoCkL


51.06% of the variance has been removed by PCA for transcript 2Bp5vd9GAmEpZzjEtGQBFD
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.08 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 614.9070, 50 iterations in 29.0636 sec
Iteration  100, KL divergence 612.2848, 50 iterations in 29.1315 sec
Iteration  150, KL divergence 609.9257, 50 iterations in 29.0124 sec
Iteration  200, KL divergence 609.8837, 50 iterations in 29.1229 sec
Iteration  250, KL divergence 609.8836, 50 iterations in 30.2615 sec
   --> Time elapsed: 146.59 seconds
nrows: 53
ncols: 470
Number of nan rows: 0 for 2Bp5vd9GAmEpZzjEtGQBFD


51

  outlier_vec = outlier_membership_vector(


nrows: 390
ncols: 470
Number of nan rows: 7 for 4pFaG2QLnDr95gqDQFEWoh


51.06% of the variance has been removed by PCA for transcript 53DrbE5nPJskpPT0PtOi9O
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.06 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 946.8635, 50 iterations in 29.1961 sec
Iteration  100, KL divergence 927.2398, 50 iterations in 29.1431 sec
Iteration  150, KL divergence 921.0208, 50 iterations in 29.2254 sec
Iteration  200, KL divergence 920.6364, 50 iterations in 29.2002 sec
Iteration  250, KL divergence 920.6362, 50 iterations in 29.2409 sec
   --> Time elapsed: 146.01 seconds
nrows: 75
ncols: 470
Number of nan rows: 0 for 53DrbE5nPJskpPT0PtOi9O


51.

  outlier_vec = outlier_membership_vector(


nrows: 601
ncols: 470
Number of nan rows: 22 for 0ZGQ63222rqX5TD5ZrMmcN


51.06% of the variance has been removed by PCA for transcript 7mv5E2yb2yVQU34OiQ1vqv
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 1.18 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 11041.5129, 50 iterations in 33.9136 sec
Iteration  100, KL divergence 10846.0460, 50 iterations in 33.9683 sec
Iteration  150, KL divergence 10773.4136, 50 iterations in 33.3661 sec
Iteration  200, KL divergence 10747.1482, 50 iterations in 31.9839 sec
Iteration  250, KL divergence 10737.6245, 50 iterations in 32.7934 sec
   --> Time elapsed: 166.03 seconds


  outlier_vec = outlier_membership_vector(


nrows: 747
ncols: 470
Number of nan rows: 32 for 7mv5E2yb2yVQU34OiQ1vqv


51.06% of the variance has been removed by PCA for transcript 3p9FLEH5V5sCGHhGubaYZc
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.98 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 2612.3163, 50 iterations in 32.2327 sec
Iteration  100, KL divergence 2565.5355, 50 iterations in 31.6417 sec
Iteration  150, KL divergence 2548.1779, 50 iterations in 31.7197 sec
Iteration  200, KL divergence 2546.6050, 50 iterations in 31.9051 sec
Iteration  250, KL divergence 2544.1366, 50 iterations in 31.6155 sec
   --> Time elapsed: 159.12 seconds
nrows: 187
ncols: 470
Number of nan rows: 0 for 3p9FLEH5V5sCGHhGubaYZ

  outlier_vec = outlier_membership_vector(


nrows: 560
ncols: 470
Number of nan rows: 6 for 1VBbCB6ja5pPdU2wrBy27N




### Code for individual transcript predicitons

In [2]:
# load dataframe of one transcript with sentences as instances
sentences = pd.read_csv('first_podcast.csv.gz', compression='gzip')
documents = sentences.transcript_subset.to_list()

In [7]:
# embedd
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embedded_sentences = sentence_model.encode(documents)

In [8]:
# reduce dimensions with PCA and t-SNE
PCA_model = PCA(n_components = 50)
PCA_data = PCA_model.fit_transform(embedded_sentences)
print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA")

tsne_test = tsne_data.transform(PCA_data)

38.83% of the variance has been removed by PCA
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 11.78 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 9527.4070, 50 iterations in 70.7268 sec
Iteration  100, KL divergence 9287.1780, 50 iterations in 92.7301 sec
Iteration  150, KL divergence 9191.3463, 50 iterations in 104.2941 sec
Iteration  200, KL divergence 9151.7852, 50 iterations in 87.6444 sec
Iteration  250, KL divergence 9132.1676, 50 iterations in 86.1700 sec
   --> Time elapsed: 441.57 seconds


In [9]:
# get probability vectors for each cluster via soft clustering
probabilities = hdbscan.membership_vector(hdbscan_model, tsne_test)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')
# remedy for HDBSCAN problem
inds = np.where(np.isnan(probabilities))
probabilities[inds] = 0
print(f"Number of nan rows: {len(set(list(inds[:][0])))}")

  outlier_vec = outlier_membership_vector(


nrows: 599
ncols: 156


In [11]:
# save probabilities to use in segmentation
np.save('probabilities.npy', probabilities)