In [1]:
# Imports
import pandas as pd
import numpy as np

from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [8]:
# Load model 
topic_model = BERTopic.load("BERT_v1")

# Load sample data
transcripts = pd.read_csv('transcripts_sample.csv.gz', compression='gzip')

# Create list of transcripts
docs = list(transcripts['transcript'])

# Get document info
topic_doc = topic_model.get_document_info(docs)

In [21]:
# Filter out outlier

topic_doc_filter = topic_doc[topic_doc['Topic'] != -1]
topic_doc_filter.head()

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,Hello and welcome along to the property Academ...,13,13_filipinos_filipino_hindi_philippines,filipinos - filipino - hindi - philippines - i...,0.382109,False
1,"Good morning, everyone. This is Trinity here a...",62,62_ginger_deck_tarot_tepper,ginger - deck - tarot - tepper - 2020 - 2019 -...,0.957442,False
2,"Hey guys, it's Peter fry and welcome to the li...",5,5_anchor_podcast_album_weezer,anchor - podcast - album - weezer - song - guy...,1.0,False
6,You have turned into functional fun. I'm Mike ...,14,14_insulin_fasting_keto_fat,insulin - fasting - keto - fat - intermittent ...,0.138242,False
8,"Hey everyone, before we continue with the show...",1,1_anxiety_mental_self_yourself,anxiety - mental - self - yourself - life - de...,0.915268,False


In [22]:
unique_topics = list(topic_doc_filter.Topic.unique())
print(unique_topics)

[13, 62, 5, 14, 1, 0, 20, 17, 67, 80, 49, 28, 58, 85, 61, 10, 18, 47, 95, 9, 87, 3, 104, 22, 2, 15, 39, 101, 26, 4, 29, 90, 56, 68, 55, 23, 74, 31, 69, 21, 51, 112, 12, 54, 16, 7, 6, 32, 102, 100, 33, 38, 45, 25, 11, 27, 71, 94, 40, 36, 24, 19, 50, 37, 65, 59, 53, 48, 93, 30, 91, 8, 60, 109, 43, 83, 57, 76, 105, 103, 113, 96, 89, 72, 35, 73, 77, 64, 106, 34, 84, 46, 92, 66, 52, 82, 98, 63, 110, 79, 88, 41, 44, 97, 86, 99, 81, 42, 70, 108, 111, 107, 75, 78]


In [10]:
doc_embeddings = topic_model.transform(docs)

In [None]:
# Compute cosine similiarity/distance with document embeddings

sim_matrix = cosine_similarity(doc_embeddings)
df_sim = pd.DataFrame(sim_matrix, columns=transcripts.episode_id.values(), columns=transcripts.episode_id.values())

In [None]:
# Convert cosine distance matrix to pair-wise dataframe
df_tri = pd.DataFrame(np.triu(df_sim), columns=df_sim.columns, index=df_sim.columns)
df_long = df_tri.stack().reset_index()

# Rename the columns
df_long.columns = ['Pair_1', 'Pair_2', 'Topic' 'Cosine_Distance']

# Remove rows where Pair_1 is equal to Pair_2
df_long = df_long[df_long['Pair_1'] != df_long['Pair_2']]
df_long = df_long[df_long.Cosine_Distance !=0]

print('Shape:', df_long.shape)
df_long.head(10)

In [23]:
# Get document embeddings for each topic
topic_embeddings = []
for topic_id in unique_topics:
    docs = topic_doc_filter.Document.where('Topic' == topic_id)
    embeddings = topic_model.transform(docs)
    topic_embeddings.append(embeddings)

# Compute cosine similarity matrix for each topic
similarity_matrices = []
for embeddings in topic_embeddings:
    sim_matrix = cosine_similarity(embeddings)
    similarity_matrices.append(sim_matrix)

# Create DataFrame to store similarity matrices
topic_labels = topic_model.get_topic_freq().index
dfs = []
for i, sim_matrix in enumerate(similarity_matrices):
    topic_label = topic_labels[i]
    df = pd.DataFrame(sim_matrix, columns=range(len(topic_model.get_documents(topic_label))),
                      index=range(len(topic_model.get_documents(topic_label))))
    df.columns = [f'{topic_label}_doc_{idx}' for idx in df.columns]
    df.index = [f'{topic_label}_doc_{idx}' for idx in df.index]
    dfs.append(df)

# Combine similarity matrices for all topics
df_cosine_similarities = pd.concat(dfs, axis=0)


ValueError: Array conditional must be same shape as self