## Objective

- To evaluate whether representing a user as a point in the embedding space allows us to make accurate inferences about their preferences

### Conclusion

- It doesn't. Even setting aside the potentially slow updates of preferences from feedback, if the user's preferences are captured in clusters that are far away from each other, the resulting user embedding ends up farther from those clusters than from a few other clusters - giving an inaccurate picture of user preferences
- Hence we'll go with TS based representation per cluster for each user

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from copy import deepcopy
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from datasets import load_dataset
from umap import UMAP
import re
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import plotly.io as pio
pio.renderers.default = 'iframe'
import dill

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [139]:
class EmbeddingsClusterTopics:
    """Cluster precomputed document embeddings into topics with BERTopic.

    Constructing an instance loads a parquet dataset, strips numeric tokens
    from the documents, fits BERTopic on the stored embeddings and builds the
    topic hierarchy. Results are exposed as ``bertopic_model``,
    ``hierarchical_topics`` and ``topic_labels``.
    """

    def __init__(self, model_name, dataset_path, documents_column_name, embeddings_column_name, clustering_type = 'hdbscan', random_state = None):
        """Load the data and fit the topic model.

        Args:
            model_name: SentenceTransformer model name to load.
            dataset_path: Path of the parquet file holding documents and embeddings.
            documents_column_name: Column containing the document text.
            embeddings_column_name: Column containing the precomputed embeddings.
            clustering_type: ``'hdbscan'`` or ``'kmeans'``.
            random_state: Seed forwarded to UMAP (and to KMeans when used).

        Raises:
            ValueError: If ``clustering_type`` is not a supported value.
        """
        # Validate before loading any model so a typo fails fast.
        if clustering_type not in ('hdbscan', 'kmeans'):
            raise ValueError(
                f"Unsupported clustering_type {clustering_type!r}; expected 'hdbscan' or 'kmeans'"
            )

        self.model_name = model_name
        self.embeddings_model = SentenceTransformer(self.model_name)
        custom_umap_model = UMAP(n_neighbors=15, n_components=10, random_state=random_state)  # Change 10 to the desired number of dimensions
        if clustering_type == 'hdbscan':
            # prediction_data=True is needed so BERTopic can compute the
            # per-topic membership probabilities requested below.
            custom_cluster_model = HDBSCAN(metric='manhattan', prediction_data=True)
        else:
            custom_cluster_model = KMeans(n_clusters=25, random_state=random_state)
        vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

        # Previously these custom models were built but never handed to
        # BERTopic, so clustering_type and random_state had no effect.
        # The KeyBERTInspired representation model stays disabled, as before.
        self.bertopic_model = BERTopic(umap_model=custom_umap_model,
                                       hdbscan_model=custom_cluster_model,
                                       vectorizer_model=vectorizer_model,
                                       calculate_probabilities=True)
        self.dataset = load_dataset('parquet', data_files=dataset_path)['train']
        self.documents = self._load_documents_from_parquet(documents_column_name)
        self.documents = [self._remove_numeric_words(doc) for doc in self.documents]
        self.embeddings = self._load_embeddings_from_parquet(embeddings_column_name)
        self.create_clusters_topics()
        # Keep the generated labels instead of discarding the return value.
        self.topic_labels = self.generate_topic_names()

    def _remove_numeric_words(self, text):
        """Return ``text`` with currency amounts and standalone numbers removed.

        Amounts prefixed with ``$``, ``€`` or ``₹`` and numbers delimited by
        word boundaries are deleted. Numbers directly followed by ``%`` are
        kept, as are digits embedded in a word (e.g. ``abc123``).
        """
        # Remove currency-based numbers like $123.1, currency symbols like €, and rupee symbol ₹
        currency_pattern = r'\$\s*\d+(\.\d+)?|\€\s*\d+(\.\d+)?|₹\s*\d+(\.\d+)?'

        # Match standalone numeric words
        numeric_pattern = r'\b\d+(\.\d+)?\b'

        # Combine both patterns using negative lookahead to exclude percentages
        combined_pattern = rf'(?!(?:\d+(\.\d+)?%))({currency_pattern}|{numeric_pattern})'

        return re.sub(combined_pattern, '', text)

    def _load_embeddings_from_parquet(self, embeddings_column_name):
        """Return the embeddings column of the loaded dataset as a numpy array."""
        return np.array(self.dataset[embeddings_column_name])

    def _load_documents_from_parquet(self, documents_column_name):
        """Return the documents column of the loaded dataset."""
        return self.dataset[documents_column_name]

    def create_clusters_topics(self):
        """Fit BERTopic on the documents/embeddings and build the topic hierarchy."""
        # The fitted model keeps the assignments itself, so the return value is not needed.
        self.bertopic_model.fit_transform(documents=self.documents, embeddings=self.embeddings)
        self.hierarchical_topics = self.bertopic_model.hierarchical_topics(self.documents)

    def generate_topic_names(self):
        """Return BERTopic topic labels built from 5 words joined by ", "."""
        return self.bertopic_model.generate_topic_labels(nr_words=5, separator=", ")

In [3]:
class TopicHierarchy:
    """Walk a binary topic hierarchy stored as a DataFrame.

    ``df`` must have ``Parent_ID``, ``Child_Left_ID`` and ``Child_Right_ID``
    columns. ``topic_to_doc_indices`` is a sequence whose i-th entry is the
    topic id assigned to document i. Ids that never appear in ``Parent_ID``
    are treated as leaf topics and converted with ``int()`` to look up their
    documents.
    """

    def __init__(self, df, topic_to_doc_indices):
        self.df = df
        # Memoised results of compute_raw_leaf_points, keyed by node id.
        self.raw_leaf_points_count = {}
        self.raw_leaf_points_list = {}
        self.topic_to_doc_indices = topic_to_doc_indices
        # topic id -> list of document indices; built lazily on first use.
        self._docs_by_topic = None

    def _children_of(self, parent_id):
        """Return the hierarchy rows whose ``Parent_ID`` equals ``parent_id``."""
        return self.df[self.df['Parent_ID'] == parent_id]

    def _root_rows(self):
        """Return the rows whose ``Parent_ID`` is never referenced as a child."""
        all_child_ids = set(self.df['Child_Left_ID']).union(set(self.df['Child_Right_ID']))
        return self.df[~self.df['Parent_ID'].isin(all_child_ids)]

    def _doc_indices_for_topic(self, topic_id):
        """Return a fresh list of document indices assigned to ``topic_id``."""
        if self._docs_by_topic is None:
            # One pass over the assignments instead of rescanning them for every leaf.
            index = {}
            for doc_idx, topic in enumerate(self.topic_to_doc_indices):
                index.setdefault(topic, []).append(doc_idx)
            self._docs_by_topic = index
        return list(self._docs_by_topic.get(topic_id, []))

    def compute_levels(self, parent_id, level, levels):
        """Record in ``levels`` the depth of ``parent_id`` and of all its descendants."""
        levels[parent_id] = level
        for _, child in self._children_of(parent_id).iterrows():
            self.compute_levels(child['Child_Left_ID'], level + 1, levels)
            self.compute_levels(child['Child_Right_ID'], level + 1, levels)

    def compute_raw_leaf_points(self, parent_id):
        """Return ``(count, doc_indices)`` of the documents under ``parent_id``.

        Results are cached per node. A leaf topic with no assigned documents
        yields ``(0, [])``.
        """
        if parent_id in self.raw_leaf_points_count:
            return self.raw_leaf_points_count[parent_id], self.raw_leaf_points_list[parent_id]

        children = self._children_of(parent_id)

        if children.empty:
            # Leaf node: its id is the topic id itself.
            doc_indices = self._doc_indices_for_topic(int(parent_id))
            count = len(doc_indices)
            self.raw_leaf_points_count[parent_id] = count
            self.raw_leaf_points_list[parent_id] = doc_indices
            return count, doc_indices

        total_leaf_points = 0
        all_leaf_points = []

        for _, child in children.iterrows():
            left_count, left_list = self.compute_raw_leaf_points(child['Child_Left_ID'])
            right_count, right_list = self.compute_raw_leaf_points(child['Child_Right_ID'])

            total_leaf_points += left_count + right_count
            all_leaf_points.extend(left_list)
            all_leaf_points.extend(right_list)

        self.raw_leaf_points_count[parent_id] = total_leaf_points
        self.raw_leaf_points_list[parent_id] = all_leaf_points

        return total_leaf_points, all_leaf_points

    def get_levels(self):
        """Return ``{node_id: depth}`` for every node, with roots at depth 0."""
        levels = {}
        for _, root in self._root_rows().iterrows():
            self.compute_levels(root['Parent_ID'], 0, levels)
        return levels

    def get_raw_leaf_points(self):
        """Return ``(counts, doc_index_lists)`` dicts keyed by node id for the whole tree."""
        for _, root in self._root_rows().iterrows():
            self.compute_raw_leaf_points(root['Parent_ID'])

        return self.raw_leaf_points_count, self.raw_leaf_points_list

In [4]:
def get_balanced_clusters(df, parent_id, max_points=4000):
    """Split a hierarchy node into sub-clusters of at most ``max_points`` documents.

    Starting at ``parent_id``, a node is kept whole when its ``num_points``
    is within the limit; otherwise its left and right children are processed
    recursively.

    Args:
        df: Hierarchy DataFrame with ``Parent_ID``, ``Child_Left_ID``,
            ``Child_Right_ID`` and ``num_points`` columns.
        parent_id: Id of the node to start from.
        max_points: Largest allowed ``num_points`` for a returned cluster.

    Returns:
        List of node ids. A leaf topic (an id with no ``Parent_ID`` row) cannot
        be split further, so it is returned as-is even though its size is not
        known from ``df``.
    """
    node_rows = df[df['Parent_ID'] == parent_id]

    # Leaf topics have no row of their own; previously this raised IndexError.
    if node_rows.empty:
        return [parent_id]

    cluster_row = node_rows.iloc[0]
    if cluster_row['num_points'] <= max_points:
        return [cluster_row['Parent_ID']]

    balanced_clusters = []
    for _, child in node_rows.iterrows():
        balanced_clusters += get_balanced_clusters(df, child['Child_Left_ID'], max_points)
        balanced_clusters += get_balanced_clusters(df, child['Child_Right_ID'], max_points)

    return balanced_clusters

In [23]:
def assign_outliers_to_balanced_clusters(balanced_clusters_df, document_topic, probabilities):
    """Pick, for each document, the balanced cluster with the highest summed topic probability.

    Despite the name, every document whose topic id is greater than -2 is
    processed, not only the outliers (topic -1).

    Args:
        balanced_clusters_df: Rows with a ``Parent_ID`` and a ``Topics`` list.
        document_topic: Topic id per document.
        probabilities: Per-document sequence of topic probabilities, where
            position ``t`` is treated as topic ``t``.

    Returns:
        DataFrame with ``original_index``, ``new_cluster`` and
        ``total_probability`` columns. Topics not covered by any balanced
        cluster are pooled under the cluster label ``-1``.
    """
    # topic id -> id of the balanced cluster containing it (later rows win on overlap)
    cluster_of_topic = {
        topic: row['Parent_ID']
        for _, row in balanced_clusters_df.iterrows()
        for topic in row['Topics']
    }

    # Threshold of -2 keeps outliers (-1) as well as regular topics.
    selected = np.nonzero(np.array(document_topic) > -2)[0]

    doc_positions, winners, winner_totals = [], [], []
    for doc_idx in selected:
        doc_probs = np.array(probabilities[doc_idx])
        labels = [cluster_of_topic.get(topic_id, -1) for topic_id in range(len(doc_probs))]

        # Total probability mass per cluster for this document
        per_cluster = pd.DataFrame({'Cluster': labels, 'Probability': doc_probs})
        totals = per_cluster.groupby('Cluster')['Probability'].sum()
        winner = totals.idxmax()

        doc_positions.append(doc_idx)
        winners.append(winner)
        winner_totals.append(totals.loc[winner])

    return pd.DataFrame({
        'original_index': doc_positions,
        'new_cluster': winners,
        'total_probability': winner_totals
    })

In [6]:
# Load the previously serialized object (presumably a fitted EmbeddingsClusterTopics instance).
# NOTE(review): dill/pickle deserialization can execute arbitrary code - only load files from a trusted source.
with open("ect_bge_small.dill", "rb") as f:
    ect_bge_small = dill.load(f)

In [11]:
# folder_path = '/Users/ravi.tej/Desktop/ML/Recommendations/Embedding Model Selection/Embeddings/'
# file = 'formatted_articles_data_2023_embeddings_bge_small_en.parquet'
# ect_bge_small = EmbeddingsClusterTopics(model_name = 'BAAI/bge-small-en', 
#                                         dataset_path = folder_path + file, 
#                                         documents_column_name = 'title_summary',
#                                         embeddings_column_name = 'embeddings', 
#                                         clustering_type='hdbscan',
#                                         random_state=86)

In [12]:
# Work on a deep copy so the columns added to bge_hierarchy later do not modify the loaded object's hierarchy.
bge_hierarchy = deepcopy(ect_bge_small.hierarchical_topics)

In [13]:
# Compute, for every node of the hierarchy, its depth and the documents that fall under it.
bge_topic_hierarchy = TopicHierarchy(bge_hierarchy, ect_bge_small.bertopic_model.topics_)
bge_levels = bge_topic_hierarchy.get_levels()
num_points, points = bge_topic_hierarchy.get_raw_leaf_points()

# Attach depth, document count and document indices to each row, keyed by its Parent_ID
bge_hierarchy['Level'] = bge_hierarchy['Parent_ID'].map(bge_levels)
bge_hierarchy['num_points'] = bge_hierarchy['Parent_ID'].map(num_points)
bge_hierarchy['points'] = bge_hierarchy['Parent_ID'].map(points)

In [14]:
# Split the hierarchy into clusters of at most 3000 documents each.
# NOTE(review): '1892' is presumably the root Parent_ID of this particular fitted hierarchy - confirm if the model is refit.
balanced_clusters = get_balanced_clusters(bge_hierarchy,parent_id = '1892', max_points = 3000)
len(balanced_clusters)

In [26]:
# For each document, pick the balanced cluster with the highest summed topic probability.
document_cluster_df = assign_outliers_to_balanced_clusters(bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)],
                                                         document_topic=ect_bge_small.bertopic_model.topics_,
                                                         probabilities=ect_bge_small.bertopic_model.probabilities_)

In [32]:
# Map each topic id to the Parent_ID of the balanced cluster that lists it in its Topics.
topic_cluster_map = {}
for _, row in bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)].iterrows():
    for topic in row['Topics']:
        topic_cluster_map[topic] = row['Parent_ID']

In [31]:
bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)]

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance,Level,num_points,points
932,1879,hdfc_bajaj_sbi_finserv_merger,"[47, 378, 386, 387, 393, 422, 426, 471, 537, 5...",1571,bajaj_finserv_auto_finance_closed,1847,hdfc_sbi_merger_life_closed,1.921579,3,529,"[2601, 6438, 8479, 9768, 10925, 11193, 11272, ..."
929,1876,closed_yesterdays_stock_reacts_monitor,"[64, 101, 116, 140, 148, 182, 252, 255, 271, 2...",1875,closed_yesterdays_stock_reacts_monitor,1032,bharti_airtel_airtels_closed_yesterdays,1.845908,3,2571,"[3487, 6294, 6941, 7265, 7477, 7576, 7601, 811..."
919,1866,nifty_sensex_pts_indices_stocks,"[41, 56, 57, 58, 75, 122, 124, 125, 138, 150, ...",1842,nifty_sgx_stocks_nse_resistance,1672,sensex_pts_nifty_points_indices,1.734695,5,1851,"[1373, 1670, 1772, 1786, 3558, 4327, 15474, 17..."
918,1865,tcs_infosys_lrs_remittance_remittances,"[112, 243, 366, 503, 510, 643, 677, 742, 766, ...",1311,tcs_lrs_remittance_remittances_liberalised,1131,infosys_tcs_danske_closed_q4,1.726611,2,304,"[290, 491, 524, 619, 631, 815, 1339, 1500, 159..."
917,1864,trump_ukraine_covid_biden_president,"[12, 29, 53, 74, 91, 105, 120, 145, 153, 176, ...",1839,trump_ukraine_president_biden_donald,1519,covid_cases_xbb_virus_deaths,1.726429,7,2079,"[1378, 4715, 12398, 15908, 18413, 18995, 19952..."
916,1863,visa_result_college_students_class,"[33, 68, 70, 104, 109, 128, 161, 179, 201, 204...",1836,visa_result_students_class_exam,1814,college_forex_ways_money_your,1.715118,10,1699,"[18837, 22963, 36200, 36722, 39596, 44655, 447..."
913,1860,gold_crude_dollar_inflation_oil,"[20, 21, 25, 40, 42, 51, 60, 69, 76, 77, 98, 1...",1744,crude_oil_barrel_cents_brent,1858,gold_dollar_inflation_rupee_rate,1.690665,9,2601,"[2204, 4171, 9056, 10473, 10995, 11100, 11455,..."
912,1859,vande_train_bharat_rainfall_air,"[18, 19, 30, 45, 50, 59, 90, 96, 97, 134, 156,...",1822,rainfall_cyclone_imd_heavy_biparjoy,1852,vande_train_bharat_air_railways,1.686271,8,2361,"[526, 563, 1010, 1981, 2140, 3164, 5577, 14144..."
902,1849,insurance_multibagger_dividend_fiis_shares,"[16, 17, 24, 43, 78, 80, 114, 141, 149, 151, 1...",1812,insurance_life_lic_insurers_irdai,1802,multibagger_dividend_fiis_shares_smallcap,1.599241,12,2490,"[2555, 10351, 11354, 28943, 29184, 29602, 2975..."
899,1846,maruti_suzuki_hyundai_exshowroom_suv,"[52, 95, 137, 183, 186, 187, 188, 192, 217, 23...",1787,maruti_suzuki_jimny_fronx_toyota,1818,hyundai_hero_honda_kia_exshowroom,1.590759,5,1393,"[8979, 12218, 12316, 14310, 14390, 14399, 1471..."


In [35]:
# Record each document's original topic and the balanced cluster that contains that topic.
# Documents with a negative topic id get -1 as their top-level cluster.
document_cluster_df['original_cluster'] = ect_bge_small.bertopic_model.topics_
document_cluster_df['original_top_level_cluster'] = document_cluster_df['original_cluster'].apply(lambda x: topic_cluster_map[x] if x > -1 else -1)

### Observation

- there are 3.5k (8.3%) documents where the top level cluster with highest total probability is not the same as the top level cluster which has their topic cluster
- we'll still refer to the original topic cluster in these cases so that the topic can be retained

In [37]:
document_cluster_df[(document_cluster_df.original_cluster != -1) & (document_cluster_df.new_cluster != document_cluster_df.original_top_level_cluster)]

Unnamed: 0,original_index,new_cluster,total_probability,original_cluster,original_top_level_cluster
33,33,1757,0.136920,9,1815
58,58,1832,0.117213,542,1834
64,64,1757,0.029588,165,1737
75,75,1832,0.082457,147,1834
81,81,1757,0.053063,98,1860
...,...,...,...,...,...
67808,67808,1846,0.189128,6,1815
67846,67846,1757,0.067500,35,1840
67853,67853,1849,0.045679,62,1840
67861,67861,1757,0.059107,208,1781


In [42]:
ect_bge_small.bertopic_model.topic_embeddings_

array([[-0.19513488, -0.14188642,  0.0054219 , ..., -0.30389524,
         0.06779389,  0.25872425],
       [-0.32999562, -0.05970995, -0.09521461, ..., -0.07728252,
         0.21497855,  0.32369579],
       [-0.12909589, -0.22685474,  0.02451297, ..., -0.31514002,
         0.09486991,  0.34918982],
       ...,
       [ 0.07565057, -0.42969355, -0.09827704, ..., -0.23712488,
         0.33542192,  0.31270502],
       [-0.45427415, -0.10068685, -0.08600654, ..., -0.41354904,
        -0.25940723,  0.41266429],
       [-0.06838451, -0.35422466, -0.0555469 , ..., -0.48855583,
        -0.21515214,  0.20441793]])

In [43]:
# Keep only the hierarchy rows that were selected as balanced clusters.
balanced_cluster_df = bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)]

In [53]:
# Balanced cluster id -> list of topic ids it contains
balanced_cluster_mapping_dict = dict(zip(balanced_cluster_df['Parent_ID'], balanced_cluster_df['Topics']))

### Creating an embedding for each cluster

- Find the embedding of each topic and the number of points per topic
- Do a weighted average of embeddings to find the embedding of the top level cluster

In [121]:
def calculate_embedding_for_balanced_clusters(cluster_topic_mapping, topic_embeddings, topic_sizes, weighted=False):
    """Compute one embedding per cluster by averaging its topics' embeddings.

    Args:
        cluster_topic_mapping: ``{cluster_id: [topic_id, ...]}``.
        topic_embeddings: Embeddings indexable by topic id (array or dict).
        topic_sizes: ``{topic_id: size}``; only read when ``weighted`` is True.
        weighted: If True, weight each topic embedding by its size. The
            default (False) keeps the plain unweighted mean.

    Returns:
        ``{cluster_id: embedding}`` with float arrays. Clusters with no topics
        (or zero total weight) are left out instead of producing a NaN
        embedding from a division by zero.

    NOTE(review): topics are looked up as ``topic_embeddings[topic]``. If the
    embeddings array also holds a row for an outlier topic (-1), positions may
    be shifted by one relative to topic ids - confirm against the caller.
    """
    cluster_embedding_mappings = {}
    for cluster, topics in cluster_topic_mapping.items():
        total_weight = 0
        weighted_sum = None
        for topic in topics:
            weight = topic_sizes[topic] if weighted else 1
            contribution = weight * np.asarray(topic_embeddings[topic], dtype=float)
            weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
            total_weight += weight
        # Nothing to average: skip rather than divide by zero.
        if weighted_sum is None or total_weight == 0:
            continue
        cluster_embedding_mappings[cluster] = weighted_sum / total_weight
    return cluster_embedding_mappings

In [119]:
def test_calculate_embedding_for_balanced_clusters():
    """Sanity checks for calculate_embedding_for_balanced_clusters with its default arguments."""
    # Test 1: Basic Functionality
    cluster_topic_mapping = {0: [0, 1]}
    topic_embeddings = {0: np.array([1, 1]), 1: np.array([1, 1])}
    topic_sizes = {0: 1, 1: 1}
    output = calculate_embedding_for_balanced_clusters(cluster_topic_mapping, topic_embeddings, topic_sizes)
    assert np.array_equal(output[0], np.array([1, 1]))

    # Test 2: Multiple topics per cluster
    cluster_topic_mapping = {0: [0, 1]}
    topic_embeddings = {0: np.array([1, 2]), 1: np.array([2, 3])}
    topic_sizes = {0: 1, 1: 1}
    output = calculate_embedding_for_balanced_clusters(cluster_topic_mapping, topic_embeddings, topic_sizes)
    assert np.array_equal(output[0], np.array([1.5, 2.5]))

    # Test 3: Different topic sizes
    # Called this way the function takes a plain (unweighted) mean of the topic
    # embeddings, so differing topic sizes must not change the result. The old
    # expectation of [1.67, 2.67] assumed size weighting and failed.
    cluster_topic_mapping = {0: [0, 1]}
    topic_embeddings = {0: np.array([1, 2]), 1: np.array([2, 3])}
    topic_sizes = {0: 1, 1: 2}
    output = calculate_embedding_for_balanced_clusters(cluster_topic_mapping, topic_embeddings, topic_sizes)
    assert np.allclose(output[0], np.array([1.5, 2.5]))

    # TODO: Test 4 (empty clusters or topics) is kept disabled
    # cluster_topic_mapping = {0: []}
    # topic_embeddings = {}
    # topic_sizes = {}
    # output = calculate_embedding_for_balanced_clusters(cluster_topic_mapping, topic_embeddings, topic_sizes)
    # assert 0 not in output  # No embedding should be created for empty clusters

    print("All tests passed!")

In [120]:
test_calculate_embedding_for_balanced_clusters()

All tests passed!


In [122]:
# Build one embedding per balanced cluster from the embeddings of its topics.
balanced_cluster_embeddings = calculate_embedding_for_balanced_clusters(cluster_topic_mapping=balanced_cluster_mapping_dict, 
                                          topic_embeddings=ect_bge_small.bertopic_model.topic_embeddings_,
                                         topic_sizes=ect_bge_small.bertopic_model.topic_sizes_)

In [89]:
from sklearn.metrics.pairwise import cosine_similarity

In [127]:
from sklearn.metrics.pairwise import manhattan_distances

In [142]:
# Scratch example of the intended result layout; not referenced by the other cells shown in this notebook.
dct = {'cluster_id': ['1894','001'],
      'within_dist': ['12.01','22'],
      'outside_dist': ['32','63']}

In [159]:
def calc_cluster_dist_to_internal_external_topics(cluster_mapping_dict, cluster_embeddings, topic_embeddings, metric_type = 'cosine'):
    """Compare each cluster embedding with its own topics and with all other topics.

    Args:
        cluster_mapping_dict: ``{cluster_id: [topic_id, ...]}``.
        cluster_embeddings: ``{cluster_id: 1-D embedding array}``.
        topic_embeddings: Embeddings indexable by positions ``0..len-1``.
        metric_type: ``'cosine'`` or ``'manhattan'``. Note that ``'cosine'``
            reports cosine *similarity* (higher means closer), whereas
            ``'manhattan'`` reports a distance (lower means closer).

    Returns:
        DataFrame with columns ``cluster``, ``avg_internal_dist``,
        ``avg_external_dist`` and ``metric``. An average over an empty topic
        list is reported as NaN.

    Raises:
        ValueError: If ``metric_type`` is not supported.

    NOTE(review): every position of ``topic_embeddings`` is treated as a topic
    id. If the array also holds a row for an outlier topic (-1), positions may
    be shifted by one relative to topic ids - confirm against the caller.
    """
    if metric_type == 'cosine':
        pairwise_metric = cosine_similarity
    elif metric_type == 'manhattan':
        pairwise_metric = manhattan_distances
    else:
        raise ValueError(f"Unsupported metric_type {metric_type!r}; expected 'cosine' or 'manhattan'")

    def average_metric_to_topics(cluster_embedding, topic_list):
        """Mean metric value between one cluster embedding and the given topics."""
        if not topic_list:
            return float('nan')
        topic_matrix = np.array([topic_embeddings[topic] for topic in topic_list])
        # One pairwise call per cluster instead of one call per topic.
        return pairwise_metric(cluster_embedding.reshape(1, -1), topic_matrix).mean()

    clusters = []
    internal_avg_dist = []
    external_avg_dist = []
    for cluster, internal_topics in cluster_mapping_dict.items():
        internal_lookup = set(internal_topics)  # O(1) membership tests
        external_topics = [idx for idx in range(len(topic_embeddings)) if idx not in internal_lookup]
        cluster_embedding = cluster_embeddings[cluster]
        clusters.append(cluster)
        internal_avg_dist.append(average_metric_to_topics(cluster_embedding, list(internal_topics)))
        external_avg_dist.append(average_metric_to_topics(cluster_embedding, external_topics))

    return pd.DataFrame({'cluster':clusters,'avg_internal_dist':internal_avg_dist,'avg_external_dist':external_avg_dist,'metric':metric_type})

In [157]:
# Average manhattan distance from each balanced cluster embedding to its own topics vs. all other topics.
cluster_dist_df = calc_cluster_dist_to_internal_external_topics(cluster_mapping_dict=balanced_cluster_mapping_dict,
                                             cluster_embeddings=balanced_cluster_embeddings,
                                             topic_embeddings=ect_bge_small.bertopic_model.topic_embeddings_,
                                                               metric_type='manhattan')

In [158]:
cluster_dist_df

Unnamed: 0,cluster,avg_internal_dist,avg_external_dist,metric
0,1879,52.65903,51.636442,manhattan
1,1876,53.926575,50.710567,manhattan
2,1866,49.091567,50.983998,manhattan
3,1865,48.399433,53.083837,manhattan
4,1864,50.235918,50.9167,manhattan
5,1863,48.578503,51.161086,manhattan
6,1860,48.872054,51.135667,manhattan
7,1859,48.861626,51.173745,manhattan
8,1849,48.097671,51.011006,manhattan
9,1846,49.596567,51.290958,manhattan


In [87]:
balanced_cluster_embeddings

{'1879': array([-2.44659218e-01, -1.01923618e-01, -1.16392809e-01, -1.58716622e-01,
         7.57260473e-02,  1.68122763e-01,  3.18022083e-01,  2.06535104e-01,
        -1.81305221e-02, -1.51357941e-02,  1.30032592e-01, -3.58643962e-01,
         2.04730699e-01,  2.44755915e-01,  1.58592682e-01, -4.21267864e-02,
        -9.21571357e-03, -1.80453824e-01, -2.36213773e-01,  3.65622342e-01,
         2.51913819e-01, -2.07567801e-01, -7.50876206e-02, -4.26076399e-01,
         2.56536562e-01,  1.08543936e-01, -3.07217021e-02, -3.49655557e-01,
        -3.39404342e-01, -1.70134872e+00,  1.41704580e-01, -2.62004165e-01,
         1.59992352e-01, -1.73427806e-01, -1.87668249e-01, -7.39073981e-02,
        -2.19997878e-01,  2.97064383e-01,  2.70545627e-02, -4.54106735e-03,
        -1.61184355e-01,  1.76082697e-01,  9.33720111e-03, -2.27398097e-01,
        -2.02120448e-01, -2.29388080e-01, -1.81906575e-01, -6.06467847e-03,
         1.53276849e-01, -1.44278167e-01,  3.22101633e-01, -3.62738537e-02,
    

In [172]:
# Pairwise cluster-to-cluster matrices; rows and columns follow the iteration order of balanced_cluster_embeddings.
cosine_similarity_matrix = cosine_similarity(list(balanced_cluster_embeddings.values()))
manhattan_distance_matrix = manhattan_distances(list(balanced_cluster_embeddings.values()))

In [165]:
# `similarity_matrix` is not defined by any visible cell; the matrix computed above is `cosine_similarity_matrix`.
len(cosine_similarity_matrix[0])

36

In [167]:
# Convert dictionary to list and keep track of keys (cluster IDs)
cluster_ids = list(balanced_cluster_embeddings.keys())
embeddings_list = [balanced_cluster_embeddings[cluster_id] for cluster_id in list(balanced_cluster_embeddings.keys())]

In [174]:
# Label both matrices with cluster ids. `similarity_matrix` is not defined by any
# visible cell, so use the `cosine_similarity_matrix` computed above.
cosine_sim_df = pd.DataFrame(cosine_similarity_matrix, index=cluster_ids, columns=cluster_ids)
manhattan_dist_df = pd.DataFrame(manhattan_distance_matrix, index=cluster_ids, columns=cluster_ids)

In [181]:
manhattan_dist_df

Unnamed: 0,1879,1876,1866,1865,1864,1863,1860,1859,1849,1846,...,1737,1724,1607,1496,1285,1209,1205,1130,1115,1101
1879,0.0,11.630096,14.493501,18.947385,14.39013,15.966011,14.888051,15.945691,13.210536,12.967184,...,15.340656,17.575663,21.647161,17.924276,20.681493,24.175877,29.427538,27.577283,32.625487,29.494876
1876,11.630096,0.0,11.054044,17.313771,11.690858,14.352954,13.678311,14.401018,8.719037,10.327059,...,14.075006,15.748293,19.797966,15.64123,18.32691,22.896431,25.55013,25.790564,32.699409,26.490696
1866,14.493501,11.054044,0.0,17.25489,10.670208,11.706075,10.454599,12.064553,10.132692,11.558878,...,13.45707,15.698903,17.842187,17.544084,15.654221,19.493958,22.554085,28.004373,31.065442,22.846399
1865,18.947385,17.313771,17.25489,0.0,19.861345,18.944362,18.168958,19.969792,17.911156,17.821307,...,19.296648,21.758056,25.461548,23.12401,21.281339,24.056027,27.369518,33.068779,34.598019,25.369496
1864,14.39013,11.690858,10.670208,19.861345,0.0,10.515641,11.032955,10.193201,9.98812,11.703294,...,15.88691,17.362668,19.943281,16.953319,16.870071,22.223452,23.570305,27.827672,30.468595,25.156599
1863,15.966011,14.352954,11.706075,18.944362,10.515641,0.0,11.11526,10.435063,12.430018,13.570659,...,15.178757,17.9936,21.351988,17.905217,16.729228,22.117612,24.573126,29.827329,28.258201,24.069909
1860,14.888051,13.678311,10.454599,18.168958,11.032955,11.11526,0.0,10.047144,12.697806,14.167119,...,14.818649,16.311287,20.047754,18.777164,16.573283,20.794477,22.039886,29.874087,30.725693,23.108557
1859,15.945691,14.401018,12.064553,19.969792,10.193201,10.435063,10.047144,0.0,12.962947,14.17652,...,16.508327,17.617954,20.019581,18.940115,17.617799,22.605711,23.877814,29.669614,29.773465,24.723327
1849,13.210536,8.719037,10.132692,17.911156,9.98812,12.430018,12.697806,12.962947,0.0,10.604591,...,14.016253,16.920264,18.471413,16.360433,17.508816,22.629648,23.695403,26.628065,31.221462,24.825544
1846,12.967184,10.327059,11.558878,17.821307,11.703294,13.570659,14.167119,14.17652,10.604591,0.0,...,16.127956,18.499883,19.492477,16.667362,18.830753,24.205469,25.407863,25.818535,34.686895,27.049819


### Summary

- Avg cluster distance to internal topics and external topics are very similar 
- This is more so using cosine, while there is a slight difference using manhattan (which is the distance metric configured for the custom HDBSCAN model)
- the inter-cluster distances are about 10-20 using manhattan, but very close to each other using cosine similarity

### Plotting of clusters using UMAP

In [191]:
import plotly.express as px

In [239]:
import umap
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# balanced_cluster_embeddings is the dictionary mapping cluster ids to their embeddings

# Split the dictionary into parallel sequences of cluster ids and embeddings
clusters, embeddings = zip(*balanced_cluster_embeddings.items())
embeddings = np.array(embeddings)

# Reduce dimensionality to 3 components with UMAP using the manhattan metric
# NOTE(review): no random_state is passed, so the layout may differ between runs
reducer = umap.UMAP(n_components=3,metric = 'manhattan')
embeddings_3d = reducer.fit_transform(embeddings)

# Create a DataFrame for easier plotting
# (an earlier 2-D variant of this plot used columns ['x', 'y'])
df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
df['cluster_id'] = clusters

# 3-D scatter plot with one labelled point per cluster
fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster_id', text='cluster_id')
fig.show()

In [206]:
manhattan_dist_df['1737']['1866']

13.457070112346667

In [207]:
manhattan_dist_df['1737']['1741']

19.557891340898735

#### Observations

- The representation of the embeddings in 3 dimensions is not really representative
- We can observe instances where visually closer clusters actually have a higher distance than visually farther ones (like 1101, 1115 & 1101, 1790)
- But in many other cases it is representative

### User Representation as an embedding

In [211]:
# Simulate a user who picks 5 distinct clusters. np.random.choice samples with
# replacement by default, which could select the same cluster more than once.
user_selections = np.random.choice(list(balanced_cluster_embeddings.keys()), size=5, replace=False)

In [212]:
user_selections

array(['1496', '1809', '1285', '1101', '1790'], dtype='<U4')

In [238]:
# Plot only the clusters selected for the simulated user, using the 3-D coordinates computed earlier.
fig = px.scatter_3d(df[df.cluster_id.isin(user_selections)], x='x', y='y', z='z', color='cluster_id', text='cluster_id')
fig.show()

In [219]:
# Represent the user as the element-wise mean of the embeddings of the selected clusters.
user_embedding = np.mean([balanced_cluster_embeddings[a] for a in user_selections],axis = 0)

In [220]:
user_embedding

array([-2.07276369e-01, -1.86723157e-01,  7.79418283e-03, -1.04073014e-01,
        1.67860993e-01,  1.38818730e-01,  2.79725477e-01,  2.56273865e-01,
        1.84514202e-02, -3.93223732e-02,  1.49221140e-01, -2.70369794e-01,
        1.42469848e-01,  2.64898933e-01,  1.07005567e-01, -1.10378288e-02,
        2.48175922e-02, -3.19430678e-01, -3.21237152e-01,  2.86915371e-01,
        2.85393484e-01, -2.61446781e-01, -8.08351660e-02, -3.49510923e-01,
        2.64901297e-01,  7.54123821e-02, -9.93723125e-02, -2.36707052e-01,
       -3.36133199e-01, -1.73432450e+00,  2.20172341e-02, -2.29125968e-01,
        2.68205384e-01, -8.89783424e-02, -1.68518792e-01, -4.74730395e-02,
       -2.19302212e-01,  2.62048496e-01, -1.27103838e-03,  8.12927029e-02,
       -9.51722666e-02,  2.35804983e-01, -3.69523594e-02, -2.08503305e-01,
       -1.43576290e-01, -1.85382976e-01, -1.47485067e-01, -2.62426979e-02,
        2.57056023e-01, -9.20783986e-02,  2.49664877e-01, -1.34997246e-01,
        1.07646998e-01,  

In [225]:
# Manhattan distance from the user embedding to every balanced cluster embedding
user_distance_dist = {
    cluster_id: manhattan_distances(user_embedding.reshape(1, -1), cluster_vector.reshape(1, -1))[0][0]
    for cluster_id, cluster_vector in balanced_cluster_embeddings.items()
}

In [236]:
# Mean distance from the user embedding to the clusters the user selected.
np.mean([user_distance_dist[x] for x in user_selections])

14.342342890608922

In [237]:
# Mean distance from the user embedding to all clusters the user did not select.
np.mean([user_distance_dist[x] for x in balanced_cluster_embeddings.keys() if x not in user_selections])

14.88557550333404

In [232]:
user_distance_dist

{'1879': 14.599250861413925,
 '1876': 10.692957164579337,
 '1866': 8.8251342687513,
 '1865': 17.52164232206409,
 '1864': 9.43552442875954,
 '1863': 10.698302586721791,
 '1860': 10.448683117510285,
 '1859': 11.446369997500765,
 '1849': 9.563887127260035,
 '1846': 11.510300255877732,
 '1840': 10.177777647167696,
 '1837': 14.976650984821223,
 '1834': 11.273026519978034,
 '1832': 9.515031429265893,
 '1815': 12.910310560611007,
 '1809': 10.193830429674028,
 '1795': 16.853679315426376,
 '1790': 14.154353683149054,
 '1781': 13.12648167554495,
 '1776': 10.39233775984451,
 '1772': 16.33310122068577,
 '1760': 17.039389620748405,
 '1759': 16.079921962118352,
 '1757': 8.405692149960002,
 '1748': 17.182665695731878,
 '1741': 18.635405147890765,
 '1737': 14.347568125865473,
 '1724': 16.532592040340475,
 '1607': 19.336693082229505,
 '1496': 14.776970292891615,
 '1285': 12.871177361032139,
 '1209': 21.67200004681271,
 '1205': 21.448913015334863,
 '1130': 28.25981693322195,
 '1115': 32.21173353931662,


In [231]:
bge_hierarchy[bge_hierarchy.Parent_ID.isin(user_selections)]

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance,Level,num_points,points
862,1809,chatgpt_ai_chatbot_openai_google,"[8, 119, 305, 322, 329, 336, 448, 469, 511, 62...",1691,google_bard_googles_cci_antitrust,1639,chatgpt_ai_openai_chatbot_bing,1.403728,9,679,"[11241, 16085, 17032, 17233, 34517, 42055, 429..."
843,1790,adani_hindenburg_ports_enterprises_group,"[63, 210, 233, 263, 282, 351, 379, 399, 428, 4...",1407,ports_port_sonowal_cargo_adani,1645,adani_hindenburg_enterprises_group_fpo,1.324405,8,511,"[6670, 7481, 7537, 7750, 8709, 8896, 11292, 16..."
549,1496,mutual_funds_sip_fund_returns,"[73, 143, 172, 173, 234, 260, 394, 409, 601, 7...",1484,mutual_funds_sip_fund_returns,601,elss_taxsaving_saving_schemes_returns,0.941064,7,513,"[1221, 1372, 1416, 1554, 1609, 1624, 1649, 166..."
338,1285,diesel_petrol_retailing_litre_cities,"[346, 465, 493, 532, 536, 744, 813, 863]",863,diesel_petrol_tonnes_consumed_sales,998,petrol_diesel_retailing_litre_cities,0.79416,3,173,"[523, 735, 8884, 11178, 13254, 18570, 34109, 3..."
154,1101,fixed_fd_deposit_rates_deposits,"[79, 86, 441, 905]",1052,fd_rates_fixed_deposits_lender,1023,fixed_deposit_fd_citizens_senior,0.595617,10,242,"[3628, 3721, 3975, 4132, 4794, 5252, 7286, 729..."


In [235]:
bge_hierarchy[bge_hierarchy.Parent_ID == '1849']

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance,Level,num_points,points
902,1849,insurance_multibagger_dividend_fiis_shares,"[16, 17, 24, 43, 78, 80, 114, 141, 149, 151, 1...",1812,insurance_life_lic_insurers_irdai,1802,multibagger_dividend_fiis_shares_smallcap,1.599241,12,2490,"[2555, 10351, 11354, 28943, 29184, 29602, 2975..."


#### Summary

- When 5 random clusters are selected, the user's embedding is in fact closer to a few other clusters than it is to any of the selected clusters
- It could be likely because the user selected clusters are quite far from each other in the embedding space
- Overall, embedding space representation for users doesn't seem to be the right approach as it doesn't capture the rank order preferences of users
- **As a result, we'll go ahead with TS based representation of cluster preferences for users as well**

#### Secondary outcome

- One advantage of the embedding space representation is that distances to every other cluster are available instantaneously (albeit inaccurately)
- To extend that ability to the TS based representation, we can create a correlation matrix across clusters and derive implicit preferences for clusters which are not declared explicitly by the user, even at the time of onboarding