In [1]:
import os
import pandas as pd
import pickle

import hdbscan
import pandas as pd

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
root_path = "../data"
experiment_path = os.path.join(root_path,"07_model_output","all-MiniLM-L6-v2","experiment-5b-weighted-similarities-3_title_7_body")
weighted_embeddings_path = os.path.join(root_path, "04_feature", "weighted_embeddings.pkl")
neo4j_predicted_cluster_pkl_path = os.path.join(experiment_path, "neo4j_predicted_clusters.pkl")

with open(weighted_embeddings_path, "rb") as f:
    weighted_embeddings = pickle.load(f)

with open(neo4j_predicted_cluster_pkl_path, "rb") as f:
    neo4j_predicted_cluster_pkl = pickle.load(f)

pred_cluster_df = pd.read_csv(os.path.join(experiment_path, "predicted_cluster.csv"))

In [3]:
cluster_size_count = pred_cluster_df.cluster.value_counts()
to_keep = cluster_size_count[cluster_size_count >10].index
cluster_morethan10 = pred_cluster_df[pred_cluster_df.cluster.isin(to_keep)]
print('No. of cluster to do 2nd level clustering: ', cluster_morethan10.cluster.nunique())

No. of cluster to do 2nd level clustering:  10


In [4]:
cluster_morethan10_embeddings = pd.merge(
    cluster_morethan10,
    weighted_embeddings[['id','vector_extracted_content_body']],
    how='left',
    on='id')

print(cluster_morethan10.shape[0] == cluster_morethan10.shape[0])

True


In [5]:
def get_embeddings(cluster_df):
    embeddings = np.array(cluster_df.vector_extracted_content_body.to_list())
    doc_titles = cluster_df.title.to_list()
    docs = cluster_df.body_content.to_list()
    ids = cluster_df.id.to_list()
    umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
    umap_embeddings = umap_model.fit_transform(embeddings)

    return embeddings, doc_titles, docs, ids, umap_embeddings

In [6]:
def hyperparameter_tuning(embeddings):
    best_score = 0

    for min_cluster_size in [2,3,4,5,6]:
        for min_samples in [1,2,3,4,5,6,7]:
            for cluster_selection_method in ['leaf']:
                for metric in ['euclidean','manhattan']:
                    # for each combination of parameters of hdbscan
                    hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,min_samples=min_samples,
                                        cluster_selection_method=cluster_selection_method, metric=metric, 
                                        gen_min_span_tree=True).fit(embeddings)
                    # DBCV score
                    score = hdb.relative_validity_
                    if score > best_score:
                        best_score = score
                        best_parameters = {'min_cluster_size': min_cluster_size, 
                                'min_samples':  min_samples, 'cluster_selection_method': cluster_selection_method,
                                'metric': metric}

    print("Best DBCV score: {:.3f}".format(best_score))
    print("Best parameters: {}".format(best_parameters))
    return best_parameters

In [7]:
def topic_modelling(hyperparameters):
    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=hyperparameters['min_cluster_size'], min_samples=hyperparameters['min_samples'], metric=hyperparameters['metric'], cluster_selection_method=hyperparameters['cluster_selection_method'], prediction_data=True, gen_min_span_tree=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with 
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    # All steps together
    topic_model = BERTopic(
    # embedding_model=embedding_model,          # Step 1 - Extract embeddings
    # umap_model=umap_model,                    # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
    representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics="auto" #default is none, will auto reduce topics using HDBSCAN
    )
    return topic_model

In [8]:
def create_topic_assigner(start_counter):
    counter = start_counter
    
    def assign_new_topic(x):
        nonlocal counter
        if x == -1:
            new_topic = counter
            counter += 1
            return new_topic
        else:
            return x

    return assign_new_topic


def process_cluster(cluster_df):
    # Step 1: Extract embeddings and umap_embeddings
    embeddings, doc_titles, docs, ids, umap_embeddings = get_embeddings(cluster_df)

    # Step 2: Perform hyperparameter tuning for berttopic
    hyperparameters = hyperparameter_tuning(umap_embeddings)

    # Step 3: Create and fit topic model 
    topic_model = topic_modelling(hyperparameters)
    topics, _ = topic_model.fit_transform(docs, embeddings)

    ###############
    # Visualisation 
    ################
    
    # Uncomment and adjust as needed for visualization purposes

    # top_n = 50
    # top_topics = topic_model.get_topic_freq().head(top_n)['Topic'].tolist()

    # reduced_embeddings = topic_model.umap_model.embedding_
    # hover_data = [f"{title} - Topic {topic}" for title, topic in zip(doc_titles, topics)]
    # visualization = topic_model.visualize_documents(hover_data, reduced_embeddings=reduced_embeddings, topics=top_topics, title=f'Top {top_n} Topics') 
    # visualization.show() 

    # visualization_barchart = topic_model.visualize_barchart(top_n_topics=top_n)
    # visualization_barchart.show()

    # Step 4: Create a DataFrame with assigned topics, titles and ids.
    result_df = pd.DataFrame({"Assigned Topic": topics, "Title": doc_titles, "id": ids})
    
    # Step 5: Extract topic information and get top 5 keywords, if article is unclustered where Topic is -1, topic representation/kws will be removed
    topic_kws = topic_model.get_topic_info()[['Topic', 'Representation']]
    topic_kws['top_5_kws'] = topic_kws.apply(lambda row: row['Representation'][:5] if row['Topic'] != -1 else np.nan, axis=1)
    
    # Step 6: Merge results with the top keywords
    result_df_kws = pd.merge(result_df, topic_kws, how='left', left_on='Assigned Topic', right_on='Topic')
    result_df_kws = result_df_kws.drop(['Representation', 'Topic'], axis=1)
    result_df_kws = result_df_kws[['id', 'Title', 'Assigned Topic', 'top_5_kws']]

    # Step 7: Assign new topic numbers to topics that are -1, starting from the max assigned topic in the results_df_kws. 
    max_topic = result_df_kws['Assigned Topic'].max()
    new_topic_counter = max_topic + 1
    assign_new_topic_func = create_topic_assigner(new_topic_counter)
    result_df_kws['Assigned Topic'] = result_df_kws['Assigned Topic'].apply(assign_new_topic_func)

    # Step 8: Update the 'Assigned Topic' column with cluster information to prevent repeat cluster numbers
    cluster_id = cluster_df['cluster'].unique()[0]
    result_df_kws['Assigned Topic'] = result_df_kws['Assigned Topic'].apply(lambda x: 'Cluster_' + str(cluster_id) + '_' + str(x))

    return result_df_kws

def process_all_clusters(cluster_morethan10_embeddings):
    unique_clusters = cluster_morethan10_embeddings['cluster'].unique()
    all_results = []

    for cluster_id in unique_clusters:
        cluster_df = cluster_morethan10_embeddings[cluster_morethan10_embeddings['cluster'] == cluster_id]
        result_df_kws = process_cluster(cluster_df)
        all_results.append(result_df_kws)

    combined_df = pd.concat(all_results, ignore_index=True)
    return combined_df

def assign_unique_numbers_to_topics(final_result_df, pred_cluster_df):
    """
    Assigns unique numbers to each unique 'Assigned Topic' in the final_result_df
    based on the maximum cluster value from the pred_cluster_df.

    Parameters:
    final_result_df (pd.DataFrame): DataFrame containing the final results with an 'Assigned Topic' column.
    pred_cluster_df (pd.DataFrame): DataFrame containing the predicted clusters with a 'cluster' column.

    Returns:
    pd.DataFrame: Updated final_result_df with an additional 'Assigned Topic Number' column.
    """
    max_cluster_value = pred_cluster_df['cluster'].max()
    unique_assigned_topics = final_result_df['Assigned Topic'].unique()
    topic_number_mapping = {topic: idx + max_cluster_value + 1 for idx, topic in enumerate(unique_assigned_topics)}
    
    final_result_df['Assigned Topic Number'] = final_result_df['Assigned Topic'].map(topic_number_mapping)
    return final_result_df

final_result_df = process_all_clusters(cluster_morethan10_embeddings)
final_result_df_with_numbers = assign_unique_numbers_to_topics(final_result_df, pred_cluster_df)

Best DBCV score: 0.258
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.420
Best parameters: {'min_cluster_size': 3, 'min_samples': 6, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.403
Best parameters: {'min_cluster_size': 2, 'min_samples': 4, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.254
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.324
Best parameters: {'min_cluster_size': 5, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.318
Best parameters: {'min_cluster_size': 3, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.910
Best parameters: {'min_cluster_size': 5, 'min_samples': 6, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV sco

In [9]:
new_cluster_to_merge = final_result_df_with_numbers[['id','top_5_kws','Assigned Topic Number']]
new_cluster_to_merge.columns = ['id','cluster_kws','new_cluster']
updated_pred_cluster = pd.merge(pred_cluster_df, new_cluster_to_merge, how='left', on='id')

In [10]:
updated_pred_cluster['new_cluster'] = updated_pred_cluster['new_cluster'].fillna(updated_pred_cluster['cluster']).apply(int)
updated_pred_cluster

Unnamed: 0,id,title,url,body_content,cluster,cluster_kws,new_cluster
0,1445475,Emotional Support for Those Living with HIV,https://www.healthhub.sg/live-healthy/being_a_...,"As you may know, World AIDS Day is not only ab...",0,,0
1,1445725,The Facts of Childhood Fractures,https://www.healthhub.sg/live-healthy/Child-fr...,"What is a Fracture?\nFractures are breaks, cra...",1,,1
2,1445247,30 Ways to Go from Uncle to Hunkle,https://www.healthhub.sg/live-healthy/mens-hea...,Redefine and Reinvent Guys Night Out\nAccordin...,3,,3
3,1445260,"Age is only a Number - Move it, Feel young",https://www.healthhub.sg/live-healthy/move-it-...,"Age is only a Number - Move It, Feel Young!\nD...",5,,5
4,1444610,Slip-Ups: Dos and Dont's,https://www.healthhub.sg/live-healthy/slip-ups...,A \n \nResources for Quitting\nJoin the I Quit...,13,,13
...,...,...,...,...,...,...,...
663,1444872,Q&A: Is My Baby's Spit-Up Normal?,https://www.healthhub.sg/live-healthy/is-my-ba...,By Health Promotion Board in collaboration wit...,658,,658
664,1444991,Baby’s Second Year: Bite-sized Reminders for M...,https://www.healthhub.sg/live-healthy/baby-sec...,Read these next:\n- Your Baby Needs Soft Skill...,659,,659
665,1443375,Stay Active As You Travel,https://www.healthhub.sg/live-healthy/stay-act...,"Instead, pick a picturesque destination with p...",660,,660
666,1445000,Child’s Fourth Year: Bite-sized Reminders for ...,https://www.healthhub.sg/live-healthy/child-fo...,Read these next:\n- Boosting Your Child's Ment...,661,,661


In [11]:
first_level_pred_cluster = pd.DataFrame(neo4j_predicted_cluster_pkl)
first_level_cluster_dict = dict(zip(first_level_pred_cluster['cluster'], first_level_pred_cluster['cluster_keywords']))

mask = updated_pred_cluster['cluster'] == updated_pred_cluster['new_cluster']
updated_pred_cluster.loc[mask, 'cluster_kws'] = updated_pred_cluster.loc[mask, 'cluster_kws'].fillna(
    updated_pred_cluster['cluster'].map(first_level_cluster_dict)
)

# Formatting
updated_pred_cluster.rename(columns={'cluster':'first_level_cluster','new_cluster':'second_level_cluster','cluster_kws':'second_level_cluster_kws'}, inplace=True)
updated_pred_cluster =updated_pred_cluster[['id','title','url','body_content','first_level_cluster','second_level_cluster','second_level_cluster_kws']]

In [12]:
updated_pred_cluster

Unnamed: 0,id,title,url,body_content,first_level_cluster,second_level_cluster,second_level_cluster_kws
0,1445475,Emotional Support for Those Living with HIV,https://www.healthhub.sg/live-healthy/being_a_...,"As you may know, World AIDS Day is not only ab...",0,0,
1,1445725,The Facts of Childhood Fractures,https://www.healthhub.sg/live-healthy/Child-fr...,"What is a Fracture?\nFractures are breaks, cra...",1,1,
2,1445247,30 Ways to Go from Uncle to Hunkle,https://www.healthhub.sg/live-healthy/mens-hea...,Redefine and Reinvent Guys Night Out\nAccordin...,3,3,
3,1445260,"Age is only a Number - Move it, Feel young",https://www.healthhub.sg/live-healthy/move-it-...,"Age is only a Number - Move It, Feel Young!\nD...",5,5,
4,1444610,Slip-Ups: Dos and Dont's,https://www.healthhub.sg/live-healthy/slip-ups...,A \n \nResources for Quitting\nJoin the I Quit...,13,13,"[milestone, worth, 28th, supporter, evouchers]"
...,...,...,...,...,...,...,...
663,1444872,Q&A: Is My Baby's Spit-Up Normal?,https://www.healthhub.sg/live-healthy/is-my-ba...,By Health Promotion Board in collaboration wit...,658,658,
664,1444991,Baby’s Second Year: Bite-sized Reminders for M...,https://www.healthhub.sg/live-healthy/baby-sec...,Read these next:\n- Your Baby Needs Soft Skill...,659,659,
665,1443375,Stay Active As You Travel,https://www.healthhub.sg/live-healthy/stay-act...,"Instead, pick a picturesque destination with p...",660,660,
666,1445000,Child’s Fourth Year: Bite-sized Reminders for ...,https://www.healthhub.sg/live-healthy/child-fo...,Read these next:\n- Boosting Your Child's Ment...,661,661,


In [13]:
adjusted_cluster = updated_pred_cluster[updated_pred_cluster['first_level_cluster'] != updated_pred_cluster['second_level_cluster']]
adjusted_cluster.head()

Unnamed: 0,id,title,url,body_content,first_level_cluster,second_level_cluster,second_level_cluster_kws
49,1437295,Influenza,https://www.healthhub.sg/a-z/diseases-and-cond...,Update: You can book a flu vaccination appoint...,74,665,"[flu, vaccination, symptoms, nose, people]"
50,1437304,"Mumps: Causes, Symptoms, and Treatment",https://www.healthhub.sg/a-z/diseases-and-cond...,Mumps is a highly contagious viral infection t...,74,666,"[mumps, pain, glands, parotid, swelling]"
51,1437510,Avian Influenza H5N1 (Bird Flu),https://www.healthhub.sg/a-z/diseases-and-cond...,"Domestic poultry, including chickens and turke...",74,667,"[avian, influenza, birds, bird, poultry]"
52,1437505,Avian Influenza H7N9 (Bird Flu),https://www.healthhub.sg/a-z/diseases-and-cond...,"There are three types of influenza viruses A, ...",74,667,"[avian, influenza, birds, bird, poultry]"
53,1443987,All You Need to Know About Childhood Immunisat...,https://www.healthhub.sg/live-healthy/all-you-...,By Associate Professor TAN Thiam Chye Head & S...,74,668,"[vaccines, injection, baby, immunisation, vacc..."


In [14]:
updated_pred_cluster.to_csv(os.path.join(experiment_path,"predicted_cluster_2nd_level_clustering.csv"), index=False)