In [26]:
# Imports
import pandas as pd
import numpy as np


from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import pairwise_distances

In [3]:
# Read the gzip-compressed transcripts sample into a DataFrame
transcripts = pd.read_csv(
    'transcripts_sample.csv.gz',
    compression='gzip',
)

In [6]:
# Pull the transcript column out as a plain Python list of documents
docs = transcripts['transcript'].tolist()

# Report how many documents were loaded
print(len(docs))

15000


In [8]:
# Load the BERTopic model stored under "BERT_v1"
saved_model_path = "BERT_v1"
topic_model = BERTopic.load(saved_model_path)

In [9]:
# Show the per-topic summary table (Topic id, Count, Name columns in the output below)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,7432,-1_oh_okay_thing_kind
1,0,530,0_god_jesus_lord_church
2,1,372,1_anxiety_mental_self_yourself
3,2,350,2_players_league_season_game
4,3,312,3_weight_diet_training_fitness
...,...,...,...
110,109,16,109_frequency_guru_baba_spiritual
111,110,16,110_asmr_breed_finian_relax
112,111,15,111_trans_bisexual_queer_gay
113,112,15,112_jewelry_premier_jewelers_jeweler


In [10]:
# Render BERTopic's built-in heatmap figure for the loaded model
# NOTE(review): presumably a topic-to-topic similarity matrix — confirm against the BERTopic docs
topic_model.visualize_heatmap()

In [15]:
# Topic embedding vectors as a NumPy array (one entry per topic).
# `topic_model.embedding_model` is the embedding backend object, not the
# vectors themselves, so wrapping it in np.array() does not give topic
# embeddings; the fitted vectors live in `topic_embeddings_`.
embeddings = np.array(topic_model.topic_embeddings_)


In [18]:

# Topic names ordered by ascending topic id, skipping the first entry
# (the lowest topic id after sorting).
topic_info_sorted = topic_model.get_topic_info().sort_values("Topic", ascending=True)
labels = topic_info_sorted["Name"][1:]


In [24]:

# Topic-by-topic cosine similarity and cosine distance computed on the
# c-TF-IDF rows, wrapped in DataFrames labelled with the topic names.
topic_names = topic_model.topic_labels_.values()
sim_matrix = cosine_similarity(topic_model.c_tf_idf_)
dis_matrix = cosine_distances(topic_model.c_tf_idf_)
df_sim = pd.DataFrame(sim_matrix, index=topic_names, columns=topic_names)
df_dis = pd.DataFrame(dis_matrix, index=topic_names, columns=topic_names)

In [25]:
# Preview the distance table (at most the first 50 rows)
df_dis.head(50)

Unnamed: 0,-1_oh_okay_thing_kind,0_god_jesus_lord_church,1_anxiety_mental_self_yourself,2_players_league_season_game,3_weight_diet_training_fitness,4_police_murder_her_crime,5_anchor_podcast_album_weezer,6_podcast_life_am_feel,7_chompers_brushing_brush_mouth,8_fucking_shit_fuck_dude,...,104_leona_goose_serena_helmsley,105_flyers_dude_shit_fucking,106_data_hr_bayesian_learning,107_kobe_bryant_basketball_nba,108_nintendo_mario_castlevania_donkey,109_frequency_guru_baba_spiritual,110_asmr_breed_finian_relax,111_trans_bisexual_queer_gay,112_jewelry_premier_jewelers_jeweler,113_aquarium_botanical_fishes_aquariums
-1_oh_okay_thing_kind,0.0,0.206815,0.191612,0.180846,0.180448,0.238697,0.137323,0.153728,0.394715,0.147091,...,0.423833,0.326515,0.388975,0.381609,0.385323,0.429204,0.479043,0.368586,0.379364,0.523234
0_god_jesus_lord_church,0.206815,0.0,0.265291,0.325747,0.308875,0.327624,0.271817,0.275085,0.47154,0.293449,...,0.483322,0.418151,0.472774,0.479704,0.480546,0.445629,0.524004,0.426377,0.449712,0.583245
1_anxiety_mental_self_yourself,0.191612,0.265291,0.0,0.309939,0.213186,0.344524,0.231206,0.214936,0.44831,0.264602,...,0.468158,0.383942,0.392599,0.457325,0.453644,0.364276,0.512213,0.386554,0.401231,0.533989
2_players_league_season_game,0.180846,0.325747,0.309939,0.0,0.281368,0.344936,0.259034,0.272394,0.474246,0.250891,...,0.48819,0.383645,0.454946,0.406395,0.445574,0.514288,0.531013,0.456629,0.4441,0.579923
3_weight_diet_training_fitness,0.180448,0.308875,0.213186,0.281368,0.0,0.367451,0.228919,0.222179,0.44639,0.254278,...,0.49374,0.384486,0.403058,0.44893,0.452386,0.4337,0.524667,0.414639,0.413457,0.527106
4_police_murder_her_crime,0.238697,0.327624,0.344524,0.344936,0.367451,0.0,0.314444,0.330925,0.460822,0.332979,...,0.471661,0.481746,0.536212,0.519736,0.519562,0.557482,0.533468,0.498573,0.502337,0.609054
5_anchor_podcast_album_weezer,0.137323,0.271817,0.231206,0.259034,0.228919,0.314444,0.0,0.192192,0.425778,0.19346,...,0.451618,0.330361,0.416296,0.394313,0.406063,0.431478,0.496899,0.370464,0.392447,0.547994
6_podcast_life_am_feel,0.153728,0.275085,0.214936,0.272394,0.222179,0.330925,0.192192,0.0,0.42482,0.202922,...,0.451637,0.341306,0.403914,0.417251,0.417216,0.407543,0.502918,0.362755,0.385386,0.548226
7_chompers_brushing_brush_mouth,0.394715,0.47154,0.44831,0.474246,0.44639,0.460822,0.425778,0.42482,0.0,0.43126,...,0.576679,0.515564,0.575296,0.573126,0.563158,0.586223,0.605404,0.549342,0.553207,0.639816
8_fucking_shit_fuck_dude,0.147091,0.293449,0.264602,0.250891,0.254278,0.332979,0.19346,0.202922,0.43126,0.0,...,0.456023,0.331119,0.45178,0.39343,0.409227,0.471943,0.516456,0.380799,0.416604,0.570015


In [30]:
# Cosine distances between all c-TF-IDF rows via sklearn's generic
# pairwise_distances API; the resulting frame keeps default integer labels.
ctfidf_cosine_dist = pairwise_distances(topic_model.c_tf_idf_, metric='cosine')
df_pair = pd.DataFrame(ctfidf_cosine_dist)

In [32]:
# Inspect the c-TF-IDF matrix; the repr below reports a 115x158875 sparse matrix in CSR format
topic_model.c_tf_idf_

<115x158875 sparse matrix of type '<class 'numpy.float64'>'
	with 1356138 stored elements in Compressed Sparse Row format>

In [31]:
# Preview the pairwise-distance table (at most the first 50 rows)
df_pair.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,105,106,107,108,109,110,111,112,113,114
0,0.0,0.206815,0.191612,0.180846,0.180448,0.238697,0.137323,0.153728,0.394715,0.147091,...,0.423833,0.326515,0.388975,0.381609,0.385323,0.429204,0.479043,0.368586,0.379364,0.523234
1,0.206815,0.0,0.265291,0.325747,0.308875,0.327624,0.271817,0.275085,0.47154,0.293449,...,0.483322,0.418151,0.472774,0.479704,0.480546,0.445629,0.524004,0.426377,0.449712,0.583245
2,0.191612,0.265291,0.0,0.309939,0.213186,0.344524,0.231206,0.214936,0.44831,0.264602,...,0.468158,0.383942,0.392599,0.457325,0.453644,0.364276,0.512213,0.386554,0.401231,0.533989
3,0.180846,0.325747,0.309939,0.0,0.281368,0.344936,0.259034,0.272394,0.474246,0.250891,...,0.48819,0.383645,0.454946,0.406395,0.445574,0.514288,0.531013,0.456629,0.4441,0.579923
4,0.180448,0.308875,0.213186,0.281368,0.0,0.367451,0.228919,0.222179,0.44639,0.254278,...,0.49374,0.384486,0.403058,0.44893,0.452386,0.4337,0.524667,0.414639,0.413457,0.527106
5,0.238697,0.327624,0.344524,0.344936,0.367451,0.0,0.314444,0.330925,0.460822,0.332979,...,0.471661,0.481746,0.536212,0.519736,0.519562,0.557482,0.533468,0.498573,0.502337,0.609054
6,0.137323,0.271817,0.231206,0.259034,0.228919,0.314444,0.0,0.192192,0.425778,0.19346,...,0.451618,0.330361,0.416296,0.394313,0.406063,0.431478,0.496899,0.370464,0.392447,0.547994
7,0.153728,0.275085,0.214936,0.272394,0.222179,0.330925,0.192192,0.0,0.42482,0.202922,...,0.451637,0.341306,0.403914,0.417251,0.417216,0.407543,0.502918,0.362755,0.385386,0.548226
8,0.394715,0.47154,0.44831,0.474246,0.44639,0.460822,0.425778,0.42482,0.0,0.43126,...,0.576679,0.515564,0.575296,0.573126,0.563158,0.586223,0.605404,0.549342,0.553207,0.639816
9,0.147091,0.293449,0.264602,0.250891,0.254278,0.332979,0.19346,0.202922,0.43126,0.0,...,0.456023,0.331119,0.45178,0.39343,0.409227,0.471943,0.516456,0.380799,0.416604,0.570015


In [38]:
# topic_model.topic_embeddings_

# Length of the first topic embedding vector (384 in the output below)
len(topic_model.topic_embeddings_[0])

384

In [53]:

# Topic-by-topic cosine similarity and cosine distance computed on the topic
# embedding vectors. Note: this rebinds sim_matrix / dis_matrix / df_sim /
# df_dis, which an earlier cell built from topic_model.c_tf_idf_.
topic_vectors = topic_model.topic_embeddings_
topic_names = topic_model.topic_labels_.values()
sim_matrix = cosine_similarity(topic_vectors)
dis_matrix = cosine_distances(topic_vectors)
df_sim = pd.DataFrame(sim_matrix, index=topic_names, columns=topic_names)
df_dis = pd.DataFrame(dis_matrix, index=topic_names, columns=topic_names)

In [54]:
# Display the cosine-distance table; the commented-out name below is the
# similarity table that could be shown instead.
df_dis

# df_sim

Unnamed: 0,-1_oh_okay_thing_kind,0_god_jesus_lord_church,1_anxiety_mental_self_yourself,2_players_league_season_game,3_weight_diet_training_fitness,4_police_murder_her_crime,5_anchor_podcast_album_weezer,6_podcast_life_am_feel,7_chompers_brushing_brush_mouth,8_fucking_shit_fuck_dude,...,104_leona_goose_serena_helmsley,105_flyers_dude_shit_fucking,106_data_hr_bayesian_learning,107_kobe_bryant_basketball_nba,108_nintendo_mario_castlevania_donkey,109_frequency_guru_baba_spiritual,110_asmr_breed_finian_relax,111_trans_bisexual_queer_gay,112_jewelry_premier_jewelers_jeweler,113_aquarium_botanical_fishes_aquariums
-1_oh_okay_thing_kind,0.000000,0.432465,0.366232,0.391750,0.532748,0.346066,0.273120,0.206625,0.459488,0.212507,...,0.406574,0.248400,0.426821,0.545792,0.389261,0.384023,0.345217,0.472542,0.500448,0.557012
0_god_jesus_lord_church,0.432465,0.000000,0.387190,0.479840,0.567623,0.394085,0.361907,0.357840,0.538385,0.436245,...,0.442340,0.379910,0.510473,0.534461,0.491535,0.350137,0.396731,0.558699,0.511456,0.617318
1_anxiety_mental_self_yourself,0.366232,0.387190,0.000000,0.383244,0.412560,0.342759,0.359051,0.213249,0.501371,0.481384,...,0.438316,0.410736,0.440568,0.504033,0.432365,0.305488,0.370496,0.442369,0.499640,0.548635
2_players_league_season_game,0.391750,0.479840,0.383244,0.000000,0.433805,0.336230,0.327319,0.323966,0.494066,0.499977,...,0.361072,0.356844,0.415047,0.297133,0.310695,0.422083,0.357534,0.476947,0.435410,0.533875
3_weight_diet_training_fitness,0.532748,0.567623,0.412560,0.433805,0.000000,0.457931,0.436647,0.437302,0.547709,0.648945,...,0.519065,0.529034,0.413844,0.543809,0.495557,0.410656,0.473548,0.563701,0.539737,0.462271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109_frequency_guru_baba_spiritual,0.384023,0.350137,0.305488,0.422083,0.410656,0.373471,0.312503,0.281873,0.528798,0.494854,...,0.376122,0.384556,0.355618,0.546014,0.380834,0.000000,0.350659,0.473440,0.401953,0.469444
110_asmr_breed_finian_relax,0.345217,0.396731,0.370496,0.357534,0.473548,0.334784,0.308539,0.292706,0.409398,0.473583,...,0.322000,0.383232,0.340775,0.530351,0.322883,0.350659,0.000000,0.489397,0.410282,0.444051
111_trans_bisexual_queer_gay,0.472542,0.558699,0.442369,0.476947,0.563701,0.430451,0.452750,0.380930,0.564855,0.580460,...,0.516977,0.485257,0.542509,0.592679,0.502592,0.473440,0.489397,0.000000,0.548354,0.600787
112_jewelry_premier_jewelers_jeweler,0.500448,0.511456,0.499640,0.435410,0.539737,0.403923,0.379592,0.396968,0.494665,0.598170,...,0.377154,0.456657,0.432139,0.471886,0.434402,0.401953,0.410282,0.548354,0.000000,0.521945


In [42]:
# Cosine distances between all topic embedding vectors via pairwise_distances;
# the resulting frame keeps default integer row/column labels.
embedding_cosine_dist = pairwise_distances(topic_model.topic_embeddings_, metric='cosine')
df_pair = pd.DataFrame(embedding_cosine_dist)

In [43]:
# Display the pairwise cosine-distance table
df_pair

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,105,106,107,108,109,110,111,112,113,114
0,0.000000,0.432465,0.366232,0.391750,0.532748,0.346066,0.273120,0.206625,0.459488,0.212507,...,0.406574,0.248400,0.426821,0.545792,0.389261,0.384023,3.452168e-01,0.472542,5.004477e-01,0.557012
1,0.432465,0.000000,0.387190,0.479840,0.567623,0.394085,0.361907,0.357840,0.538385,0.436245,...,0.442340,0.379910,0.510473,0.534461,0.491535,0.350137,3.967311e-01,0.558699,5.114559e-01,0.617318
2,0.366232,0.387190,0.000000,0.383244,0.412560,0.342759,0.359051,0.213249,0.501371,0.481384,...,0.438316,0.410736,0.440568,0.504033,0.432365,0.305488,3.704964e-01,0.442369,4.996402e-01,0.548635
3,0.391750,0.479840,0.383244,0.000000,0.433805,0.336230,0.327319,0.323966,0.494066,0.499977,...,0.361072,0.356844,0.415047,0.297133,0.310695,0.422083,3.575338e-01,0.476947,4.354101e-01,0.533875
4,0.532748,0.567623,0.412560,0.433805,0.000000,0.457931,0.436647,0.437302,0.547709,0.648945,...,0.519065,0.529034,0.413844,0.543809,0.495557,0.410656,4.735481e-01,0.563701,5.397374e-01,0.462271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,0.384023,0.350137,0.305488,0.422083,0.410656,0.373471,0.312503,0.281873,0.528798,0.494854,...,0.376122,0.384556,0.355618,0.546014,0.380834,0.000000,3.506590e-01,0.473440,4.019528e-01,0.469444
111,0.345217,0.396731,0.370496,0.357534,0.473548,0.334784,0.308539,0.292706,0.409398,0.473583,...,0.322000,0.383232,0.340775,0.530351,0.322883,0.350659,1.110223e-16,0.489397,4.102823e-01,0.444051
112,0.472542,0.558699,0.442369,0.476947,0.563701,0.430451,0.452750,0.380930,0.564855,0.580460,...,0.516977,0.485257,0.542509,0.592679,0.502592,0.473440,4.893966e-01,0.000000,5.483535e-01,0.600787
113,0.500448,0.511456,0.499640,0.435410,0.539737,0.403923,0.379592,0.396968,0.494665,0.598170,...,0.377154,0.456657,0.432139,0.471886,0.434402,0.401953,4.102823e-01,0.548354,3.330669e-16,0.521945


In [70]:

# Zero out everything below the main diagonal of the distance table so each
# unordered pair keeps its value in only one cell; labels are taken from df_dis.
upper_triangle = np.triu(df_dis.values)
df_tri = pd.DataFrame(upper_triangle, index=df_dis.columns, columns=df_dis.columns)


In [71]:
# Reshape the square table to long form: one row per (row label, column label, value)
stacked_distances = df_tri.stack()
df_long = stacked_distances.reset_index()

In [75]:
# Rename the columns
df_long.columns = ['Pair_1', 'Pair_2', 'Cosine_Distance']

# Remove rows where Pair_1 is equal to Pair_2
df_long = df_long[df_long['Pair_1'] != df_long['Pair_2']]

df_long = df_long[df_long.Cosine_Distance !=0]

In [77]:
# Check the size of the long-form pair table: (rows, columns)
df_long.shape

(6555, 3)

In [76]:
# Show rows at positions 110-119 of the long-form pair table
df_long.iloc[110:120]

Unnamed: 0,Pair_1,Pair_2,Cosine_Distance
111,-1_oh_okay_thing_kind,110_asmr_breed_finian_relax,0.345217
112,-1_oh_okay_thing_kind,111_trans_bisexual_queer_gay,0.472542
113,-1_oh_okay_thing_kind,112_jewelry_premier_jewelers_jeweler,0.500448
114,-1_oh_okay_thing_kind,113_aquarium_botanical_fishes_aquariums,0.557012
117,0_god_jesus_lord_church,1_anxiety_mental_self_yourself,0.38719
118,0_god_jesus_lord_church,2_players_league_season_game,0.47984
119,0_god_jesus_lord_church,3_weight_diet_training_fitness,0.567623
120,0_god_jesus_lord_church,4_police_murder_her_crime,0.394085
121,0_god_jesus_lord_church,5_anchor_podcast_album_weezer,0.361907
122,0_god_jesus_lord_church,6_podcast_life_am_feel,0.35784
