In [2]:
# Imports
import pandas as pd
import numpy as np

from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances


In [3]:
# Restore the previously fitted BERTopic model from its saved location.
# NOTE(review): if this checkpoint was saved with pickle serialization, loading it
# can execute arbitrary code — confirm the file comes from a trusted source.
MODEL_PATH = "BERT_v1"
topic_model = BERTopic.load(MODEL_PATH)

# Show the per-topic summary table for the loaded model.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,7432,-1_oh_okay_thing_kind
1,0,530,0_god_jesus_lord_church
2,1,372,1_anxiety_mental_self_yourself
3,2,350,2_players_league_season_game
4,3,312,3_weight_diet_training_fitness
...,...,...,...
110,109,16,109_frequency_guru_baba_spiritual
111,110,16,110_asmr_breed_finian_relax
112,111,15,111_trans_bisexual_queer_gay
113,112,15,112_jewelry_premier_jewelers_jeweler


In [4]:
# Pairwise cosine similarity and cosine distance between the topic embeddings,
# each wrapped in a DataFrame labelled by topic on both axes.
# NOTE(review): assumes topic_labels_ iterates in the same order as the rows of
# topic_embeddings_ — confirm against the BERTopic version in use.
topic_embeddings = topic_model.topic_embeddings_
topic_names = topic_model.topic_labels_.values()

sim_matrix = cosine_similarity(topic_embeddings)
dis_matrix = cosine_distances(topic_embeddings)

# Same labelling for both matrices: topic labels as row index and column header.
df_sim, df_dis = (
    pd.DataFrame(matrix, index=topic_names, columns=topic_names)
    for matrix in (sim_matrix, dis_matrix)
)

df_dis

Unnamed: 0,-1_oh_okay_thing_kind,0_god_jesus_lord_church,1_anxiety_mental_self_yourself,2_players_league_season_game,3_weight_diet_training_fitness,4_police_murder_her_crime,5_anchor_podcast_album_weezer,6_podcast_life_am_feel,7_chompers_brushing_brush_mouth,8_fucking_shit_fuck_dude,...,104_leona_goose_serena_helmsley,105_flyers_dude_shit_fucking,106_data_hr_bayesian_learning,107_kobe_bryant_basketball_nba,108_nintendo_mario_castlevania_donkey,109_frequency_guru_baba_spiritual,110_asmr_breed_finian_relax,111_trans_bisexual_queer_gay,112_jewelry_premier_jewelers_jeweler,113_aquarium_botanical_fishes_aquariums
-1_oh_okay_thing_kind,0.000000,0.432465,0.366232,0.391750,0.532748,0.346066,0.273120,0.206625,0.459488,0.212507,...,0.406574,0.248400,0.426821,0.545792,0.389261,0.384023,0.345217,0.472542,0.500448,0.557012
0_god_jesus_lord_church,0.432465,0.000000,0.387190,0.479840,0.567623,0.394085,0.361907,0.357840,0.538385,0.436245,...,0.442340,0.379910,0.510473,0.534461,0.491535,0.350137,0.396731,0.558699,0.511456,0.617318
1_anxiety_mental_self_yourself,0.366232,0.387190,0.000000,0.383244,0.412560,0.342759,0.359051,0.213249,0.501371,0.481384,...,0.438316,0.410736,0.440568,0.504033,0.432365,0.305488,0.370496,0.442369,0.499640,0.548635
2_players_league_season_game,0.391750,0.479840,0.383244,0.000000,0.433805,0.336230,0.327319,0.323966,0.494066,0.499977,...,0.361072,0.356844,0.415047,0.297133,0.310695,0.422083,0.357534,0.476947,0.435410,0.533875
3_weight_diet_training_fitness,0.532748,0.567623,0.412560,0.433805,0.000000,0.457931,0.436647,0.437302,0.547709,0.648945,...,0.519065,0.529034,0.413844,0.543809,0.495557,0.410656,0.473548,0.563701,0.539737,0.462271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109_frequency_guru_baba_spiritual,0.384023,0.350137,0.305488,0.422083,0.410656,0.373471,0.312503,0.281873,0.528798,0.494854,...,0.376122,0.384556,0.355618,0.546014,0.380834,0.000000,0.350659,0.473440,0.401953,0.469444
110_asmr_breed_finian_relax,0.345217,0.396731,0.370496,0.357534,0.473548,0.334784,0.308539,0.292706,0.409398,0.473583,...,0.322000,0.383232,0.340775,0.530351,0.322883,0.350659,0.000000,0.489397,0.410282,0.444051
111_trans_bisexual_queer_gay,0.472542,0.558699,0.442369,0.476947,0.563701,0.430451,0.452750,0.380930,0.564855,0.580460,...,0.516977,0.485257,0.542509,0.592679,0.502592,0.473440,0.489397,0.000000,0.548354,0.600787
112_jewelry_premier_jewelers_jeweler,0.500448,0.511456,0.499640,0.435410,0.539737,0.403923,0.379592,0.396968,0.494665,0.598170,...,0.377154,0.456657,0.432139,0.471886,0.434402,0.401953,0.410282,0.548354,0.000000,0.521945


In [5]:
# Convert cosine distance matrix to pair-wise dataframe: one row per unordered
# pair of distinct topics, with columns Pair_1, Pair_2, Cosine_Distance.

# Upper-triangle copy of the distance matrix (everything below the diagonal zeroed).
df_tri = pd.DataFrame(np.triu(df_dis), columns=df_dis.columns, index=df_dis.columns)

# Select the strictly-upper-triangle cells by position (k=1 skips the diagonal)
# instead of filtering on `Cosine_Distance != 0` / `Pair_1 != Pair_2`: a value-based
# filter would also discard genuine pairs whose distance is exactly 0, and a
# label-based one would discard distinct topics that happen to share a label.
n_topics = len(df_dis.columns)
row_pos, col_pos = np.triu_indices(n_topics, k=1)
pair_labels = df_dis.columns.to_numpy()

df_long = pd.DataFrame(
    {
        'Pair_1': pair_labels[row_pos],
        'Pair_2': pair_labels[col_pos],
        'Cosine_Distance': df_tri.to_numpy()[row_pos, col_pos],
    },
    # Row labels are the flat row-major positions in the full matrix, i.e. the
    # same labels a stack().reset_index() followed by filtering would leave.
    index=row_pos * n_topics + col_pos,
)

# Sanity check: n topics must yield exactly n * (n - 1) / 2 unordered pairs.
assert len(df_long) == n_topics * (n_topics - 1) // 2

print('Shape:', df_long.shape)
df_long.head(10)

Shape: (6555, 3)


Unnamed: 0,Pair_1,Pair_2,Cosine_Distance
1,-1_oh_okay_thing_kind,0_god_jesus_lord_church,0.432465
2,-1_oh_okay_thing_kind,1_anxiety_mental_self_yourself,0.366232
3,-1_oh_okay_thing_kind,2_players_league_season_game,0.39175
4,-1_oh_okay_thing_kind,3_weight_diet_training_fitness,0.532748
5,-1_oh_okay_thing_kind,4_police_murder_her_crime,0.346066
6,-1_oh_okay_thing_kind,5_anchor_podcast_album_weezer,0.27312
7,-1_oh_okay_thing_kind,6_podcast_life_am_feel,0.206625
8,-1_oh_okay_thing_kind,7_chompers_brushing_brush_mouth,0.459488
9,-1_oh_okay_thing_kind,8_fucking_shit_fuck_dude,0.212507
10,-1_oh_okay_thing_kind,9_afterbuzz_tv_she_her,0.283148
