# First step in pipeline
# Get probabilities of podcasts which you want to find segments and topics of
**Load HDBSCAN and t-SNE models and find probabilities for all sentences in a document. The output of this file will be used in the segmentation**

In [1]:
import os
import pickle
import hdbscan
import numpy as np
import pandas as pd
from openTSNE import TSNE
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

In [2]:
# load PCA class object
with open('../Thesis/Final_models/pca_model_word100.pkl', 'rb') as inp:
    PCA_model = pickle.load(inp)

In [3]:
1-sum(PCA_model.explained_variance_ratio_)

0.24606509894744488

In [4]:
# load t-SNE class object
with open('../Thesis/Final_models/tsne_data_word100.pkl', 'rb') as inp:
    tsne_data = pickle.load(inp)

In [5]:
# loop through all of the annotated transcripts and save dim reduced vectors
directory = '../Thesis/annotated_transcripts_input'

for dirpath, _, files in os.walk(directory):
    for file in files:
        episode_id = file.split('_')[0]
        path = os.path.join(dirpath, file)
        with open(path, errors='replace') as f: 
            prediction_documents = pd.read_csv(f)
            docs = prediction_documents.transcript_subset

            # embedd
            sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
            embedded_sentences = sentence_model.encode(docs)

            # reduce dimensions with PCA and t-SNE
            PCA_data = PCA_model.transform(embedded_sentences)
            print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA for transcript {episode_id}")
            tsne_prediction = tsne_data.transform(PCA_data)

            # Write transcripts to files
            save_path = '../Thesis/annotated_dimreduced/'
            name_of_file = f'dimreduced_word100_{episode_id}.npy'
            complete_path = os.path.join(save_path, name_of_file)
            np.save(complete_path, tsne_prediction)

24.61% of the variance has been removed by PCA for transcript 7A7swZJL0AtFghauiGLadV
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 3.50 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.06 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 1166.6498, 50 iterations in 15.8317 sec
Iteration  100, KL divergence 1149.5606, 50 iterations in 15.7309 sec
Iteration  150, KL divergence 1138.6246, 50 iterations in 15.8113 sec
Iteration  200, KL divergence 1133.7704, 50 iterations in 19.1794 sec
Iteration  250, KL divergence 1133.1779, 50 iterations in 18.4420 sec
   --> Time elapsed: 85.00 seconds
24.61% of the variance has been removed by PCA for transcript 5Sg6efUjypR4m6p9eYBXpm
===> Finding 15 nearest neighbors in existing embedding usi

In [3]:
# load cluster model
with open('../Thesis/Final_models/hdbscan_msize15.pkl', 'rb') as inp:
    hdbscan_model = pickle.load(inp)

In [5]:
# loop through all of the annotated dim reduced embeddings and save topic probability density vector
dir = '../Thesis/annotated_dimreduced'

for dirpath, _, files in os.walk(dir):
    for file in files:
        episode_id = file[:-4].split('_')[2]
        path = os.path.join(dirpath, file)
        tsne_prediction = np.load(path)

        # get probability vectors for each cluster via soft clustering
        print(f'Starting {episode_id}')
        probabilities = hdbscan.membership_vector(hdbscan_model, tsne_prediction)
        print(f'nrows: {len(probabilities)}\nncols: {len(probabilities[0])}')
        # remedy HDBSCAN problem
        inds = np.where(np.isnan(probabilities))
        probabilities[inds] = 0
        print(f"Number of nan rows: {len(set(list(inds[:][0])))} for {episode_id}\n\n")

        # save probabilities to use in downstream segmentation
        name_of_file = f'../Thesis/annotated_probabilities_msize15/proba_dens_vec_msize15_{episode_id}'
        np.save(name_of_file, probabilities)

Starting 4y67J0Fmgm5L7TPPsUunwo
nrows: 270
ncols: 2880
Number of nan rows: 0 for 4y67J0Fmgm5L7TPPsUunwo


Starting 3RT2j2BG8ILNYKjxsNhfvZ
nrows: 368
ncols: 2880
Number of nan rows: 0 for 3RT2j2BG8ILNYKjxsNhfvZ


Starting 3p9FLEH5V5sCGHhGubaYZc
nrows: 187
ncols: 2880
Number of nan rows: 0 for 3p9FLEH5V5sCGHhGubaYZc


Starting 5Sg6efUjypR4m6p9eYBXpm
nrows: 379
ncols: 2880
Number of nan rows: 0 for 5Sg6efUjypR4m6p9eYBXpm


Starting 28IWswylk2FvkebOehoCkL
nrows: 351
ncols: 2880
Number of nan rows: 0 for 28IWswylk2FvkebOehoCkL


Starting 3DR5Qa40Mc17AiBYfmC29U
nrows: 223
ncols: 2880
Number of nan rows: 0 for 3DR5Qa40Mc17AiBYfmC29U


Starting 0bXWB28GwN8OiqC1ykRrRX
nrows: 563
ncols: 2880
Number of nan rows: 0 for 0bXWB28GwN8OiqC1ykRrRX


Starting 4DUIcbw3EZpeYUC2mcxV0D
nrows: 229
ncols: 2880
Number of nan rows: 0 for 4DUIcbw3EZpeYUC2mcxV0D


Starting 5ts4p0QlyePWCgIB2W1wLf
nrows: 234
ncols: 2880
Number of nan rows: 0 for 5ts4p0QlyePWCgIB2W1wLf


Starting 7mv5E2yb2yVQU34OiQ1vqv
nrows: 747
nco

### Code for individual transcript predicitons

In [19]:
# load dataframe of one transcript with sentences as instances
sentences = pd.read_csv('first_podcast.csv.gz', compression='gzip')
documents = sentences.transcript_subset.to_list()

In [20]:
# embedd
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embedded_sentences = sentence_model.encode(documents)

In [21]:
# reduce dimensions with PCA and t-SNE
PCA_model = PCA(n_components = 50)
PCA_data = PCA_model.fit_transform(embedded_sentences)
print(f"{1-sum(PCA_model.explained_variance_ratio_):.2%} of the variance has been removed by PCA")

tsne_test = tsne_data.transform(PCA_data)

38.88% of the variance has been removed by PCA
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 15.46 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.04 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 9553.1733, 50 iterations in 68.5026 sec
Iteration  100, KL divergence 9385.8024, 50 iterations in 75.2073 sec
Iteration  150, KL divergence 9315.4148, 50 iterations in 81.2504 sec
Iteration  200, KL divergence 9280.9488, 50 iterations in 58.4335 sec
Iteration  250, KL divergence 9266.8228, 50 iterations in 46.4842 sec
   --> Time elapsed: 329.89 seconds


In [22]:
# get probability vectors for each cluster via soft clustering
probabilities = hdbscan.membership_vector(hdbscan_model, tsne_test)
print(f'nrows: {len(probabilities)}')
print(f'ncols: {len(probabilities[0])}')
# remedy for HDBSCAN problem
inds = np.where(np.isnan(probabilities))
probabilities[inds] = 0
print(f"Number of nan rows: {len(set(list(inds[:][0])))}")

nrows: 599
ncols: 156
Number of nan rows: 0


In [23]:
# save probabilities to use in segmentation
np.save('probabilities_new.npy', probabilities)