## Objective

1. Creating balanced clusters along with outliers in HDBSCAN. 
2. Evaluating merging of smaller clusters into one of the bigger clusters
3. Evaluation of outlier reduction by assigning to a top level cluster

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from copy import deepcopy
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from datasets import load_dataset
from umap import UMAP
import re
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import plotly.io as pio
pio.renderers.default = 'iframe'

In [26]:
class EmbeddingsClusterTopics:
    def __init__(self, model_name, dataset_path, documents_column_name, embeddings_column_name, clustering_type = 'hdbscan', random_state = None):
        self.model_name = model_name
        self.embeddings_model = SentenceTransformer(self.model_name)
        custom_umap_model = UMAP(n_neighbors=15, n_components=10, random_state=random_state)  # Change 10 to the desired number of dimensions
        if clustering_type == 'hdbscan':
            custom_hdbscan_model = HDBSCAN(metric = 'manhattan')
        elif clustering_type == 'kmeans':
            custom_hdbscan_model = KMeans(n_clusters=25, random_state=random_state)
        vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
        representation_model = KeyBERTInspired()
        self.bertopic_model = BERTopic(
                                       # representation_model=representation_model,
                                      calculate_probabilities = True)
        self.dataset = load_dataset('parquet',data_files =dataset_path)['train']
        self.documents = self._load_documents_from_parquet(documents_column_name)
        self.documents = [self._remove_numeric_words(doc) for doc in self.documents]
        self.embeddings = self._load_embeddings_from_parquet(embeddings_column_name)
        self.create_clusters_topics()
        self.generate_topic_names()

    def _remove_numeric_words(self, text):
        # Remove currency-based numbers like $123.1, currency symbols like €, and rupee symbol ₹
        currency_pattern = r'\$\s*\d+(\.\d+)?|\€\s*\d+(\.\d+)?|₹\s*\d+(\.\d+)?'

        # Match numeric words or currency-based numbers
        numeric_pattern = r'\b\d+(\.\d+)?\b'

        # Combine both patterns using negative lookahead to exclude percentages
        combined_pattern = rf'(?!(?:\d+(\.\d+)?%))({currency_pattern}|{numeric_pattern})'

        cleaned_text = re.sub(combined_pattern, '', text)
        return cleaned_text

    def _load_embeddings_from_parquet(self, embeddings_column_name):
        return np.array(self.dataset[embeddings_column_name])

    def _load_documents_from_parquet(self, documents_column_name):
        return self.dataset[documents_column_name]

    def create_clusters_topics(self):
        topics, _ = self.bertopic_model.fit_transform(documents = self.documents, embeddings = self.embeddings)
        self.hierarchical_topics = self.bertopic_model.hierarchical_topics(self.documents)

    def generate_topic_names(self):
        return self.bertopic_model.generate_topic_labels(nr_words=5, separator=", ")

    def reduce_clusters(self):
        return self.bertopic_model.reduce_topics(self.documents, nr_topics = 12)

In [27]:
folder_path = '/Users/ravi.tej/Desktop/ML/Recommendations/Embedding Model Selection/Embeddings/'
file = 'formatted_articles_data_2023_embeddings_bge_small_en.parquet'
ect_bge_small = EmbeddingsClusterTopics(model_name = 'BAAI/bge-small-en', 
                                        dataset_path = folder_path + file, 
                                        documents_column_name = 'title_summary',
                                        embeddings_column_name = 'embeddings', 
                                        clustering_type='hdbscan',
                                        random_state=86)

100%|██████████| 946/946 [00:05<00:00, 184.44it/s]


In [7]:
class TopicHierarchy:
    def __init__(self, df, topic_to_doc_indices):
        self.df = df
        # self.levels = {}
        self.raw_leaf_points_count = {}
        self.raw_leaf_points_list = {}
        self.topic_to_doc_indices = topic_to_doc_indices

    def compute_levels(self, parent_id, level, levels):
        levels[parent_id] = level
        children = self.df[self.df['Parent_ID'] == parent_id]

        for _, child in children.iterrows():
            self.compute_levels(child['Child_Left_ID'], level + 1, levels)
            self.compute_levels(child['Child_Right_ID'], level + 1, levels)

    def compute_raw_leaf_points(self, parent_id):
        if parent_id in self.raw_leaf_points_count:
            return self.raw_leaf_points_count[parent_id], self.raw_leaf_points_list[parent_id]

        children = self.df[self.df['Parent_ID'] == parent_id]

        if children.empty:
            parent_id_int = int(parent_id)
            if parent_id_int in self.topic_to_doc_indices:
                doc_indices = [idx for idx, x in enumerate(self.topic_to_doc_indices) if x == parent_id_int]
                count = len(doc_indices)
            else:
                count = 0
                doc_indices = []
            self.raw_leaf_points_count[parent_id] = count
            self.raw_leaf_points_list[parent_id] = doc_indices
            return count, doc_indices

        total_leaf_points = 0
        all_leaf_points = []

        for _, child in children.iterrows():
            left_count, left_list = self.compute_raw_leaf_points(child['Child_Left_ID'])
            right_count, right_list = self.compute_raw_leaf_points(child['Child_Right_ID'])

            total_leaf_points += left_count + right_count
            all_leaf_points.extend(left_list)
            all_leaf_points.extend(right_list)

        self.raw_leaf_points_count[parent_id] = total_leaf_points
        self.raw_leaf_points_list[parent_id] = all_leaf_points

        return total_leaf_points, all_leaf_points

    def get_levels(self):
        levels = {}
        all_child_ids = set(self.df['Child_Left_ID']).union(set(self.df['Child_Right_ID']))
        roots = self.df[~self.df['Parent_ID'].isin(all_child_ids)]
        for _, root in roots.iterrows():
            self.compute_levels(root['Parent_ID'], 0, levels)
        return levels

    def get_raw_leaf_points(self):
        roots = self.df[~self.df['Parent_ID'].isin(self.df['Child_Left_ID']) & ~self.df['Parent_ID'].isin(self.df['Child_Right_ID'])]
        for _, root in roots.iterrows():
            self.compute_raw_leaf_points(root['Parent_ID'])

        return self.raw_leaf_points_count, self.raw_leaf_points_list

In [9]:
def get_balanced_clusters(df, parent_id, max_points=4000):
    balanced_clusters = []
    cluster_row = df[df['Parent_ID'] == parent_id].iloc[0]
    num_points = cluster_row['num_points']

    if num_points <= max_points:
        balanced_clusters.append(cluster_row['Parent_ID'])
        return balanced_clusters

    children = df[df['Parent_ID'] == parent_id]

    for _, child in children.iterrows():
        balanced_clusters += get_balanced_clusters(df, child['Child_Left_ID'], max_points)
        balanced_clusters += get_balanced_clusters(df, child['Child_Right_ID'], max_points)

    return balanced_clusters

In [85]:
!pip install dill



In [87]:
import dill

In [88]:
with open("ect_bge_small.dill", "wb") as f:
    dill.dump(ect_bge_small, f)

In [28]:
bge_hierarchy = deepcopy(ect_bge_small.hierarchical_topics)

In [29]:
bge_topic_hierarchy = TopicHierarchy(bge_hierarchy, ect_bge_small.bertopic_model.topics_)
bge_levels = bge_topic_hierarchy.get_levels()
num_points, points = bge_topic_hierarchy.get_raw_leaf_points()

# Updating DataFrame as before
bge_hierarchy['Level'] = bge_hierarchy['Parent_ID'].map(bge_levels)
bge_hierarchy['num_points'] = bge_hierarchy['Parent_ID'].map(num_points)
bge_hierarchy['points'] = bge_hierarchy['Parent_ID'].map(points)

In [37]:
len([idx for idx, i in enumerate(ect_bge_small.bertopic_model.topics_) if i == -1])

25063

In [72]:
balanced_clusters = get_balanced_clusters(bge_hierarchy,parent_id = '1892', max_points = 3000)

In [73]:
len(balanced_clusters)

36

In [74]:
outlier_cluster_df = assign_outliers_to_balanced_clusters(bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)],
                                                         document_topic=ect_bge_small.bertopic_model.topics_,
                                                         probabilities=ect_bge_small.bertopic_model.probabilities_)

In [75]:
outlier_cluster_df

Unnamed: 0,OriginalIndex,NewCluster,TotalProbability
0,4,1776,0.155119
1,5,1757,0.014368
2,6,1834,0.121728
3,8,1849,0.010529
4,10,1876,0.012001
...,...,...,...
25058,67925,1864,0.111744
25059,67926,1834,0.018266
25060,67930,1757,0.149894
25061,67931,1757,0.114813


In [51]:
len(ect_bge_small.bertopic_model.topic_labels_)

948

In [57]:
ect_bge_small.documents[4]

'Who is Piyush Gupta, the CEO of DBS Group who earns Rs  lakh per day? Know about his education, family, net worth & more DBS Group chief executive Piyush Gupta saw his annual earnings climb  per cent to SGD  million in '

In [84]:
outlier_cluster_df.TotalProbability.describe([0.25,0.5,0.6,0.66,0.7,0.75,0.9,0.95,0.99])

count    25063.000000
mean         0.082536
std          0.067160
min          0.000004
25%          0.031188
50%          0.068167
60%          0.084084
66%          0.097148
70%          0.106249
75%          0.117285
90%          0.170139
95%          0.213876
99%          0.308906
max          0.620156
Name: TotalProbability, dtype: float64

In [76]:
outlier_cluster_df.groupby('NewCluster')['NewCluster'].count().nlargest(30)

NewCluster
1757    6803
1849    2519
1863    1906
1776    1872
1832    1730
1846    1446
1864    1310
1859    1300
1860     868
1724     854
1834     666
1876     511
1866     484
1790     456
1809     349
1748     290
1837     289
1496     232
1607     230
1840     202
1795     149
1772     134
1781     124
1759      78
1130      65
1741      57
1815      49
1205      35
1101      20
1865      13
Name: NewCluster, dtype: int64

In [None]:
ect_bge_small.topic_label

In [48]:
np.sum(ect_bge_small.bertopic_model.probabilities_[6])

0.4869679368070318

In [41]:
def assign_outliers_to_balanced_clusters(balanced_clusters_df, document_topic, probabilities):
    # Prepare a topic-cluster mapping for fast lookups
    topic_cluster_map = {}
    for _, row in balanced_clusters_df.iterrows():
        for topic in row['Topics']:
            topic_cluster_map[topic] = row['Parent_ID']

    # Initialize lists to store results
    original_indices = []
    new_clusters = []
    total_probs = []

    # Find the indices of documents that are outliers (-1)
    outlier_indices = np.where(np.array(document_topic) == -1)[0]

    # For each outlier, find the most probable cluster
    for idx in outlier_indices:
        topic_probs = np.array(probabilities[idx])
        cluster_indices = [topic_cluster_map.get(t, -1) for t in range(len(topic_probs))]

        # Create a DataFrame for aggregation
        df = pd.DataFrame({
            'Cluster': cluster_indices,
            'Probability': topic_probs
        })

        # Sum probabilities by cluster
        df_grouped = df.groupby('Cluster').sum()

        # Find the cluster with the maximum total probability
        best_cluster = df_grouped['Probability'].idxmax()

        # Append to lists
        original_indices.append(idx)
        new_clusters.append(best_cluster)
        total_probs.append(df_grouped.loc[best_cluster, 'Probability'])

    # Create a DataFrame for the results
    return pd.DataFrame({
        'OriginalIndex': original_indices,
        'NewCluster': new_clusters,
        'TotalProbability': total_probs
    })

# Uncomment the following lines to test the function
# df_hierarchy = your_df_hierarchy_here
# document_topic = your_document_topic_here
# probabilities = your_probabilities_here
# result_df = assign_outliers_to_clusters(df_hierarchy, document_topic, probabilities)

In [35]:
bge_hierarchy[bge_hierarchy.Parent_ID.isin(balanced_clusters)]

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance,Level,num_points,points
938,1885,closed_yesterdays_stock_reacts_monitor,"[47, 64, 101, 112, 116, 140, 148, 182, 243, 25...",1865,tcs_infosys_lrs_remittance_remittances,1881,closed_yesterdays_stock_reacts_monitor,2.068772,1,3404,"[290, 491, 524, 619, 631, 815, 1339, 1500, 159..."
921,1868,insurance_dividend_crore_cr_net,"[15, 16, 17, 24, 35, 43, 44, 55, 61, 62, 78, 7...",1101,fixed_fd_deposit_rates_deposits,1862,insurance_dividend_crore_net_profit,1.753843,9,4677,"[3628, 3721, 3975, 4132, 4794, 5252, 7286, 729..."
919,1866,nifty_sensex_pts_indices_stocks,"[41, 56, 57, 58, 75, 122, 124, 125, 138, 150, ...",1842,nifty_sgx_stocks_nse_resistance,1672,sensex_pts_nifty_points_indices,1.734695,5,1851,"[1373, 1670, 1772, 1786, 3558, 4327, 15474, 17..."
917,1864,trump_ukraine_covid_biden_president,"[12, 29, 53, 74, 91, 105, 120, 145, 153, 176, ...",1839,trump_ukraine_president_biden_donald,1519,covid_cases_xbb_virus_deaths,1.726429,7,2079,"[1378, 4715, 12398, 15908, 18413, 18995, 19952..."
916,1863,visa_result_college_students_class,"[33, 68, 70, 104, 109, 128, 161, 179, 201, 204...",1836,visa_result_students_class_exam,1814,college_forex_ways_money_your,1.715118,10,1699,"[18837, 22963, 36200, 36722, 39596, 44655, 447..."
914,1861,train_vande_bharat_police_pawar,"[18, 19, 30, 31, 45, 46, 50, 59, 65, 67, 71, 7...",1859,vande_train_bharat_rainfall_air,1832,pawar_police_earthquake_killed_manipur,1.706618,7,4796,"[526, 563, 1010, 1981, 2140, 3164, 5577, 14144..."
913,1860,gold_crude_dollar_inflation_oil,"[20, 21, 25, 40, 42, 51, 60, 69, 76, 77, 98, 1...",1744,crude_oil_barrel_cents_brent,1858,gold_dollar_inflation_rupee_rate,1.690665,9,2601,"[2204, 4171, 9056, 10473, 10995, 11100, 11455,..."
899,1846,maruti_suzuki_hyundai_exshowroom_suv,"[52, 95, 137, 183, 186, 187, 188, 192, 217, 23...",1787,maruti_suzuki_jimny_fronx_toyota,1818,hyundai_hero_honda_kia_exshowroom,1.590759,5,1393,"[8979, 12218, 12316, 14310, 14390, 14399, 1471..."
890,1837,5g_oneplus_galaxy_samsung_nord,"[100, 102, 115, 123, 164, 216, 218, 262, 266, ...",1699,jio_5g_telecom_vodafone_bsnl,1745,oneplus_galaxy_5g_samsung_nord,1.528749,11,868,"[3814, 4119, 4418, 6999, 7555, 9136, 9764, 107..."
887,1834,gst_tax_income_section_sebi,"[0, 22, 27, 32, 36, 147, 163, 178, 180, 184, 2...",1793,sebi_insolvency_nclt_zee_resolution,1810,gst_tax_income_section_itr,1.524643,13,2401,"[7550, 8206, 9489, 9613, 9720, 9988, 10332, 11..."
