# First step in pipeline
**Load HDBSCAN and t-SNE models and find probabilities for all sentences in a document. The output of this file will be used in the segmentation**

In [1]:
import os
import pickle
import hdbscan
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [6]:
# load PCA class object
with open('pca_model_word50.pkl', 'rb') as inp:
    PCA_model = pickle.load(inp)

In [8]:
# load t-SNE class object
with open('word_tsne_data.pkl', 'rb') as inp:
    tsne_data = pickle.load(inp)

In [10]:
# loop through all of the annotated transcripts and save dim reduced vectors
directory = '../Thesis/annotated_transcripts_input'

for dirpath, _, files in os.walk(directory):
    for file in files:
        episode_id = file.split('_')[0]
        path = os.path.join(dirpath, file)
        with open(path, errors='replace') as f: 
            prediction_documents = pd.read_csv(f)
            docs = prediction_documents.transcript_subset

            # embedd
            sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
            embedded_sentences = sentence_model.encode(docs)

            # reduce dimensions with PCA and t-SNE
            PCA_data = PCA_model.transform(embedded_sentences)
            print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA for transcript {episode_id}")
            tsne_prediction = tsne_data.transform(PCA_data)

            # Write transcripts to files
            save_path = '../Thesis/annotated_dimreduced/'
            name_of_file = f'dimreduced_word50_{episode_id}.npy'
            complete_path = os.path.join(save_path, name_of_file)
            np.save(complete_path, tsne_prediction)

41.98% of the variance has been removed by PCA for transcript 7A7swZJL0AtFghauiGLadV
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 1.99 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 1274.4259, 50 iterations in 17.2795 sec
Iteration  100, KL divergence 1226.8112, 50 iterations in 19.3756 sec
Iteration  150, KL divergence 1217.9773, 50 iterations in 22.1012 sec
Iteration  200, KL divergence 1208.8139, 50 iterations in 19.8186 sec
Iteration  250, KL divergence 1202.6663, 50 iterations in 16.8164 sec
   --> Time elapsed: 95.39 seconds
41.98% of the variance has been removed by PCA for transcript 5Sg6efUjypR4m6p9eYBXpm
===> Finding 15 nearest neighbors in existing embedding usi

In [12]:
# load cluster model
with open('word50_hdbscan_model.pkl', 'rb') as inp:
    hdbscan_model = pickle.load(inp)

In [13]:
# loop through all of the annotated dim reduced embeddings and save topic probability density vector
dir = '../Thesis/annotated_dimreduced'

for dirpath, _, files in os.walk(dir):
    for file in files:
        episode_id = file.split('_')[2]
        path = os.path.join(dirpath, file)
        tsne_prediction = np.load(path)

        # get probability vectors for each cluster via soft clustering
        probabilities = hdbscan.membership_vector(hdbscan_model, tsne_prediction)
        print(f'nrows: {len(probabilities)}\nncols: {len(probabilities[0])}')
        # remedy HDBSCAN problem
        inds = np.where(np.isnan(probabilities))
        probabilities[inds] = 0
        print(f"Number of nan rows: {len(set(list(inds[:][0])))} for {episode_id}\n\n")

        # save probabilities to use in downstream segmentation
        name_of_file = f'../Thesis/annotated_probabilities/topic_probability_density_vector_{episode_id}'
        np.save(name_of_file, probabilities)

KeyboardInterrupt: 

Exception ignored in: 'hdbscan._prediction_utils.prob_in_some_cluster'
Traceback (most recent call last):
  File "/Users/oskarmunckafrosenschold/Documents/Thesis/thesisenv/lib/python3.10/site-packages/numpy/core/_methods.py", line 38, in _amax
    def _amax(a, axis=None, out=None, keepdims=False,
KeyboardInterrupt: 


KeyboardInterrupt: 

### Code for individual transcript predicitons

In [2]:
# load dataframe of one transcript with sentences as instances
sentences = pd.read_csv('first_podcast.csv.gz', compression='gzip')
documents = sentences.transcript_subset.to_list()

In [7]:
# embedd
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embedded_sentences = sentence_model.encode(documents)

In [8]:
# reduce dimensions with PCA and t-SNE
PCA_model = PCA(n_components = 50)
PCA_data = PCA_model.fit_transform(embedded_sentences)
print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA")

tsne_test = tsne_data.transform(PCA_data)

38.83% of the variance has been removed by PCA
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 11.78 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 9527.4070, 50 iterations in 70.7268 sec
Iteration  100, KL divergence 9287.1780, 50 iterations in 92.7301 sec
Iteration  150, KL divergence 9191.3463, 50 iterations in 104.2941 sec
Iteration  200, KL divergence 9151.7852, 50 iterations in 87.6444 sec
Iteration  250, KL divergence 9132.1676, 50 iterations in 86.1700 sec
   --> Time elapsed: 441.57 seconds


In [9]:
# get probability vectors for each cluster via soft clustering
probabilities = hdbscan.membership_vector(hdbscan_model, tsne_test)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')
# remedy for HDBSCAN problem
inds = np.where(np.isnan(probabilities))
probabilities[inds] = 0
print(f"Number of nan rows: {len(set(list(inds[:][0])))}")

  outlier_vec = outlier_membership_vector(


nrows: 599
ncols: 156


In [11]:
# save probabilities to use in segmentation
np.save('probabilities.npy', probabilities)