In [140]:
import os
import pandas as pd
import pickle

import hdbscan
import pandas as pd

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

import numpy as np
import pyvis

In [2]:
# Seed NumPy's legacy global random generator for this session.
# NOTE(review): the UMAP model in get_embeddings is given its own random_state;
# confirm which later steps actually depend on this global seed.
np.random.seed(1234)

In [18]:
# Locations of the artefacts consumed by this notebook.
root_path = "../data"
experiment_path = os.path.join(
    root_path,
    "07_model_output",
    "all-MiniLM-L6-v2",
    "experiment-5b-weighted-similarities-3_title_7_body",
)
weighted_embeddings_path = os.path.join(root_path, "04_feature", "weighted_embeddings.pkl")
neo4j_predicted_cluster_pkl_path = os.path.join(experiment_path, "neo4j_predicted_clusters.pkl")

# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
with open(weighted_embeddings_path, "rb") as embeddings_file:
    weighted_embeddings = pickle.load(embeddings_file)

with open(neo4j_predicted_cluster_pkl_path, "rb") as clusters_file:
    neo4j_predicted_cluster_pkl = pickle.load(clusters_file)

# First-level cluster assignment, one row per article.
pred_cluster_df = pd.read_csv(os.path.join(experiment_path, "predicted_cluster.csv"))

In [19]:
# Only first-level clusters holding more than 10 articles go through second-level clustering.
cluster_size_count = pred_cluster_df["cluster"].value_counts()
to_keep = cluster_size_count.loc[cluster_size_count.gt(10)].index
cluster_morethan10 = pred_cluster_df.loc[pred_cluster_df["cluster"].isin(to_keep)]
print('No. of cluster to do 2nd level clustering: ', cluster_morethan10["cluster"].nunique())

No. of cluster to do 2nd level clustering:  10


In [20]:
# Attach each article's body embedding to the clusters selected for second-level clustering.
cluster_morethan10_embeddings = pd.merge(
    cluster_morethan10,
    weighted_embeddings[['id', 'vector_extracted_content_body']],
    how='left',
    on='id')

# A left merge adds rows when an id occurs more than once on the right-hand side and
# leaves NaN where an id has no embedding. Fail fast instead of only printing the
# check, because both cases would otherwise corrupt the arrays handed to UMAP.
rows_preserved = cluster_morethan10.shape[0] == cluster_morethan10_embeddings.shape[0]
print(rows_preserved)
assert rows_preserved, "merge changed the number of rows: duplicate ids in weighted_embeddings?"
assert cluster_morethan10_embeddings['vector_extracted_content_body'].notna().all(), \
    "some articles have no embedding in weighted_embeddings"

True


In [21]:
def get_embeddings(cluster_df):
    """
    Pulls the per-article fields out of one cluster and reduces its embeddings with UMAP.

    Parameters:
    cluster_df (pd.DataFrame): Rows of a single cluster. The columns
        'vector_extracted_content_body', 'title', 'body_content' and 'id' are read.

    Returns:
    tuple: (embeddings, doc_titles, docs, ids, umap_embeddings) where embeddings is the
        array built from the stored vectors, the next three are plain lists, and
        umap_embeddings is the output of UMAP.fit_transform on embeddings.
    """
    reducer = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

    vectors = np.array(cluster_df['vector_extracted_content_body'].to_list())
    titles = cluster_df['title'].to_list()
    bodies = cluster_df['body_content'].to_list()
    article_ids = cluster_df['id'].to_list()

    reduced_vectors = reducer.fit_transform(vectors)
    return vectors, titles, bodies, article_ids, reduced_vectors

In [22]:
def hyperparameter_tuning(embeddings):
    """
    Grid-searches HDBSCAN settings and returns the combination with the highest
    relative validity (DBCV approximation) score.

    Parameters:
    embeddings (array-like): Points to cluster; passed unchanged to HDBSCAN.fit.

    Returns:
    dict: The best 'min_cluster_size', 'min_samples', 'cluster_selection_method' and
        'metric'. On ties the combination tried first is kept.

    Raises:
    ValueError: If no combination yields a comparable score (e.g. every score is NaN).
    """
    # The score can be negative, so the running best must start below any real score.
    # Starting at 0 left best_parameters unassigned (UnboundLocalError) whenever no
    # candidate scored strictly above 0.
    best_score = float("-inf")
    best_parameters = None

    for min_cluster_size in [2,3,4,5,6]:
        for min_samples in [1,2,3,4,5,6,7]:
            for cluster_selection_method in ['leaf']:
                for metric in ['euclidean','manhattan']:
                    # for each combination of parameters of hdbscan
                    hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,min_samples=min_samples,
                                        cluster_selection_method=cluster_selection_method, metric=metric,
                                        gen_min_span_tree=True).fit(embeddings)
                    # DBCV score
                    score = hdb.relative_validity_
                    # A NaN score compares False here and is therefore skipped.
                    if score > best_score:
                        best_score = score
                        best_parameters = {'min_cluster_size': min_cluster_size,
                                'min_samples':  min_samples, 'cluster_selection_method': cluster_selection_method,
                                'metric': metric}

    if best_parameters is None:
        raise ValueError("No HDBSCAN parameter combination produced a valid DBCV score")

    print("Best DBCV score: {:.3f}".format(best_score))
    print("Best parameters: {}".format(best_parameters))
    return best_parameters

In [23]:
def topic_modelling(hyperparameters):
    """
    Assembles an unfitted BERTopic pipeline around an HDBSCAN model built from the
    tuned settings.

    Parameters:
    hyperparameters (dict): Must contain 'min_cluster_size', 'min_samples', 'metric'
        and 'cluster_selection_method'.

    Returns:
    BERTopic: Model configured with custom clustering, vectorizer, c-TF-IDF and
        representation components. No embedding_model, umap_model or nr_topics is
        passed, so BERTopic falls back to its own defaults for those.
    """
    # Step 3 - Cluster reduced embeddings
    clusterer = HDBSCAN(
        min_cluster_size=hyperparameters['min_cluster_size'],
        min_samples=hyperparameters['min_samples'],
        metric=hyperparameters['metric'],
        cluster_selection_method=hyperparameters['cluster_selection_method'],
        prediction_data=True,
        gen_min_span_tree=True,
    )

    # Steps 1-2 (embedding extraction, dimensionality reduction) are not overridden.
    return BERTopic(
        hdbscan_model=clusterer,                                        # Step 3 - Cluster reduced embeddings
        vectorizer_model=CountVectorizer(stop_words="english"),         # Step 4 - Tokenize topics
        ctfidf_model=ClassTfidfTransformer(),                           # Step 5 - Extract topic words
        representation_model=MaximalMarginalRelevance(diversity=0.3),   # Step 6 - (Optional) Fine-tune topic representations
    )

In [24]:
def create_topic_assigner(start_counter):
    """
    Builds a callable that replaces every outlier label (-1) with a fresh topic number.

    Parameters:
    start_counter: First number to hand out; each replaced -1 gets the next one.

    Returns:
    callable: assign_new_topic(x) returns x unchanged unless x == -1, in which case it
        returns the next unused number. The counter is private to the returned callable.
    """
    next_free = [start_counter]  # one-slot list so the closure can advance the counter

    def assign_new_topic(x):
        if x == -1:
            issued = next_free[0]
            next_free[0] = issued + 1
            return issued
        return x

    return assign_new_topic


def process_cluster(cluster_df):
    """
    Runs second-level topic clustering on the articles of one first-level cluster.

    Parameters:
    cluster_df (pd.DataFrame): Rows of a single first-level cluster. Must provide the
        columns read by get_embeddings plus 'cluster'.

    Returns:
    pd.DataFrame: One row per article with columns 'id', 'Title', 'Assigned Topic' and
        'top_5_kws'. 'Assigned Topic' is a string 'Cluster_<cluster>_<topic>'; every
        article BERTopic labelled -1 receives its own new topic number and NaN keywords.
    """
    # Step 1: Extract embeddings and umap_embeddings
    embeddings, doc_titles, docs, ids, umap_embeddings = get_embeddings(cluster_df)

    # Step 2: Perform hyperparameter tuning for berttopic
    hyperparameters = hyperparameter_tuning(umap_embeddings)

    # Step 3: Create and fit topic model
    # NOTE(review): tuning ran on umap_embeddings, but the model is fitted on the raw
    # embeddings and topic_modelling passes no umap_model, so BERTopic reduces them
    # with its own UMAP configuration - confirm this mismatch is intended.
    topic_model = topic_modelling(hyperparameters)
    topics, _ = topic_model.fit_transform(docs, embeddings)

    ###############
    # Visualisation
    ################

    # Uncomment and adjust as needed for visualization purposes

    # top_n = 50
    # top_topics = topic_model.get_topic_freq().head(top_n)['Topic'].tolist()

    # reduced_embeddings = topic_model.umap_model.embedding_
    # hover_data = [f"{title} - Topic {topic}" for title, topic in zip(doc_titles, topics)]
    # visualization = topic_model.visualize_documents(hover_data, reduced_embeddings=reduced_embeddings, topics=top_topics, title=f'Top {top_n} Topics')
    # visualization.show()

    # visualization_barchart = topic_model.visualize_barchart(top_n_topics=top_n)
    # visualization_barchart.show()

    # Step 4: Create a DataFrame with assigned topics, titles and ids.
    result_df = pd.DataFrame({"Assigned Topic": topics, "Title": doc_titles, "id": ids})

    # Step 5: Extract topic information and keep the top 5 keywords; the outlier topic
    # (-1) gets NaN. The explicit copy keeps the new column off a slice of the
    # topic-info frame (SettingWithCopyWarning), and building the column directly
    # also works when the topic table is empty.
    topic_kws = topic_model.get_topic_info()[['Topic', 'Representation']].copy()
    topic_kws['top_5_kws'] = pd.Series(
        [
            representation[:5] if topic != -1 else np.nan
            for topic, representation in zip(topic_kws['Topic'], topic_kws['Representation'])
        ],
        index=topic_kws.index,
        dtype=object,
    )

    # Step 6: Merge results with the top keywords
    result_df_kws = pd.merge(result_df, topic_kws, how='left', left_on='Assigned Topic', right_on='Topic')
    result_df_kws = result_df_kws.drop(['Representation', 'Topic'], axis=1)
    result_df_kws = result_df_kws[['id', 'Title', 'Assigned Topic', 'top_5_kws']]

    # Step 7: Give every -1 article its own topic number, continuing after the largest assigned topic.
    max_topic = result_df_kws['Assigned Topic'].max()
    new_topic_counter = max_topic + 1
    assign_new_topic_func = create_topic_assigner(new_topic_counter)
    result_df_kws['Assigned Topic'] = result_df_kws['Assigned Topic'].apply(assign_new_topic_func)

    # Step 8: Prefix with the first-level cluster id so topic labels stay unique across clusters.
    cluster_id = cluster_df['cluster'].unique()[0]
    result_df_kws['Assigned Topic'] = result_df_kws['Assigned Topic'].apply(lambda x: 'Cluster_' + str(cluster_id) + '_' + str(x))

    return result_df_kws

def process_all_clusters(cluster_morethan10_embeddings):
    """
    Applies process_cluster to every first-level cluster and stacks the results.

    Parameters:
    cluster_morethan10_embeddings (pd.DataFrame): Articles of all clusters to split;
        grouped by the 'cluster' column in order of first appearance.

    Returns:
    pd.DataFrame: Concatenation of the per-cluster results with a fresh 0..n-1 index.
    """
    cluster_column = cluster_morethan10_embeddings['cluster']
    per_cluster_results = [
        process_cluster(cluster_morethan10_embeddings[cluster_column == cluster_id])
        for cluster_id in cluster_column.unique()
    ]
    return pd.concat(per_cluster_results, ignore_index=True)

def assign_unique_numbers_to_topics(final_result_df, pred_cluster_df):
    """
    Numbers every distinct 'Assigned Topic' label, continuing after the largest
    first-level cluster id.

    Parameters:
    final_result_df (pd.DataFrame): Second-level results with an 'Assigned Topic' column.
        The frame is modified in place.
    pred_cluster_df (pd.DataFrame): First-level predictions with a 'cluster' column.

    Returns:
    pd.DataFrame: The same final_result_df object, now carrying an
        'Assigned Topic Number' column. Labels are numbered in order of first appearance.
    """
    first_new_number = pred_cluster_df['cluster'].max() + 1

    topic_number_mapping = {}
    for offset, topic in enumerate(final_result_df['Assigned Topic'].unique()):
        topic_number_mapping[topic] = offset + first_new_number

    final_result_df['Assigned Topic Number'] = final_result_df['Assigned Topic'].map(topic_number_mapping)
    return final_result_df

# Run second-level clustering on every large first-level cluster.
final_result_df = process_all_clusters(cluster_morethan10_embeddings)
# assign_unique_numbers_to_topics adds its column in place and returns the same frame,
# so both names below refer to one object.
final_result_df_with_numbers = assign_unique_numbers_to_topics(final_result_df, pred_cluster_df)

Best DBCV score: 0.148
Best parameters: {'min_cluster_size': 3, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.675
Best parameters: {'min_cluster_size': 3, 'min_samples': 5, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.451
Best parameters: {'min_cluster_size': 2, 'min_samples': 4, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.271
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}
Best DBCV score: 0.312
Best parameters: {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.231
Best parameters: {'min_cluster_size': 3, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV score: 0.960
Best parameters: {'min_cluster_size': 4, 'min_samples': 6, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}
Best DBCV sco

In [149]:
# Bring the second-level result into the first-level table's vocabulary, then attach it by article id.
new_cluster_to_merge = final_result_df_with_numbers[['id', 'top_5_kws', 'Assigned Topic Number']].rename(
    columns={'top_5_kws': 'cluster_kws', 'Assigned Topic Number': 'new_cluster'}
)
updated_pred_cluster = pred_cluster_df.merge(new_cluster_to_merge, how='left', on='id')

In [150]:
# Articles that were not re-clustered keep their first-level cluster id.
updated_pred_cluster['new_cluster'] = (
    updated_pred_cluster['new_cluster']
    .combine_first(updated_pred_cluster['cluster'])
    .apply(int)
)
updated_pred_cluster.head(1)

Unnamed: 0,id,title,url,body_content,cluster,cluster_kws,new_cluster
0,1445475,Emotional Suppor...,https://www.heal...,"As you may know,...",0,,0


In [27]:
# Keyword lookup for the first-level clusters: cluster id -> cluster_keywords.
first_level_pred_cluster = pd.DataFrame(neo4j_predicted_cluster_pkl)
first_level_cluster_dict = dict(
    zip(first_level_pred_cluster['cluster'], first_level_pred_cluster['cluster_keywords'])
)

# Rows whose cluster id is unchanged were not re-clustered; where they have no
# keywords yet, take the keywords of their first-level cluster.
mask = updated_pred_cluster['cluster'].eq(updated_pred_cluster['new_cluster'])
first_level_kws = updated_pred_cluster['cluster'].map(first_level_cluster_dict)
updated_pred_cluster.loc[mask, 'cluster_kws'] = updated_pred_cluster.loc[mask, 'cluster_kws'].fillna(first_level_kws)

# Formatting
updated_pred_cluster = updated_pred_cluster.rename(
    columns={
        'cluster': 'first_level_cluster',
        'new_cluster': 'second_level_cluster',
        'cluster_kws': 'second_level_cluster_kws',
    }
)[['id', 'title', 'url', 'body_content', 'first_level_cluster', 'second_level_cluster', 'second_level_cluster_kws']]

In [29]:
# Persist the second-level assignment; the Evaluation section reads this file back.
updated_pred_cluster.to_csv(os.path.join(experiment_path,"predicted_cluster_2nd_level_clustering.csv"), index=False)

### Evaluation

In [145]:
pd.set_option('display.max_colwidth',20)
updated_pred_cluster = pd.read_csv(os.path.join(experiment_path, "predicted_cluster_2nd_level_clustering.csv"))

# Articles whose cluster id changed, i.e. members of first-level clusters that were split.
# .copy() makes this an independent frame so the helper column below is not written
# to a slice of updated_pred_cluster (SettingWithCopyWarning).
broken_down_groups = updated_pred_cluster[
    updated_pred_cluster['first_level_cluster'] != updated_pred_cluster['second_level_cluster']
].copy()
# Vectorised string concatenation; unlike a row-wise apply it also works on an empty frame.
broken_down_groups['second_level_cluster:kws'] = (
    broken_down_groups['second_level_cluster'].astype(str)
    + ' : '
    + broken_down_groups['second_level_cluster_kws'].astype(str)
)

# NOTE(review): number_of_clusters counts distinct keyword values, so two second-level
# clusters with identical keywords would be counted once - confirm this is acceptable.
agg_result = broken_down_groups.groupby('first_level_cluster').agg(
    number_of_articles_in_first_level = ('first_level_cluster','size'),
    number_of_clusters=('second_level_cluster_kws', 'nunique'),
    number_of_single_articles=('second_level_cluster_kws', lambda x: x.isna().sum()),
    second_level_cluster_article_counts=('second_level_cluster', lambda x: [v for v in x.value_counts().to_dict().values() if v > 1])
    # second_level_clusters = ('second_level_cluster:kws',set)
)

# Label each row with the original cluster id and its first-level keywords.
agg_result["OG_keywords"] = agg_result.index.map(first_level_cluster_dict)
agg_result['OG_cluster'] = agg_result.index.astype(str) + ' - ' + agg_result['OG_keywords'].astype(str)
agg_result =agg_result[['OG_cluster','number_of_articles_in_first_level','number_of_clusters','second_level_cluster_article_counts','number_of_single_articles']]
agg_result


Unnamed: 0_level_0,OG_cluster,number_of_articles_in_first_level,number_of_clusters,second_level_cluster_article_counts,number_of_single_articles
first_level_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
74,"74 - ['flu', 'in...",13,2,"[5, 3]",5
117,117 - ['toddler'...,63,5,"[24, 10, 9, 8, 7]",5
171,171 - ['nutritio...,50,5,"[12, 10, 8, 5, 5]",10
195,"195 - ['stress',...",12,2,"[7, 5]",0
196,196 - ['protein'...,35,3,"[17, 6, 5]",7
275,275 - ['resilien...,13,2,"[7, 6]",0
461,461 - ['aerobic'...,53,2,"[37, 16]",0
482,"482 - ['teeth', ...",21,4,"[5, 3, 3, 2]",8
606,"606 - ['quit', '...",39,7,"[9, 4, 4, 3, 3, ...",11
629,629 - ['wholegra...,27,3,"[12, 9, 6]",0


In [146]:
# results = []
# for group_id in broken_down_groups['first_level_cluster'].unique():
#     print('First Level Group ID:',group_id)
#     print(first_level_cluster_dict[group_id])
#     df = broken_down_groups[broken_down_groups['first_level_cluster'] == group_id]
#     print('First Level Group Size: ', df.shape[0])
#     mt_1 = df['second_level_cluster'].value_counts() > 1
#     eq_1 = df['second_level_cluster'].value_counts() == 1
#     print('Single Nodes: ', eq_1.sum())
#     display(df[df['second_level_cluster'].isin(eq_1[eq_1].index)].title.to_list())
#     grouped_df = df.groupby(['second_level_cluster', 'second_level_cluster_kws']).size()[mt_1[mt_1].index].reset_index(name='No. of Articles')
#     display(grouped_df)
#     print('-----------')

#     for index, row in grouped_df.iterrows():
#         results.append([
#             group_id, 
#             first_level_cluster_dict[group_id], 
#             df.shape[0], 
#             row['second_level_cluster'], 
#             row['second_level_cluster_kws'], 
#             row['No. of Articles']
#         ])
#     results.append([
#             group_id, 
#             first_level_cluster_dict[group_id], 
#             df.shape[0], 
#             "Single Articles", 
#             "", 
#             eq_1.sum()]
#         )

# result_df = pd.DataFrame(results, columns=['Group ID', 'Group Description', 'First Level Group Size', 'Second Level Cluster', 'Second Level Cluster Keywords', 'No. of Articles'])
# result_df.to_csv(os.path.join(experiment_path,'break_down_group_clusters.csv'), index=False)


In [147]:
def get_cluster_size(pred_cluster):
    """
    Summarises how many second-level clusters fall into each size band.

    Parameters:
    pred_cluster (pd.DataFrame): One row per article; must contain a
        'second_level_cluster' column.

    Returns:
    pd.DataFrame: Columns 'Cluster size' (band label) and 'Num of clusters'. The first
        row is the single-article band '1', followed by '2-10', '11-20', ... up to the
        band containing the largest cluster. Bands with no cluster are kept with a
        count of 0. When no cluster has more than one article only the '1' row is
        returned.
    """
    sizes = pred_cluster.groupby('second_level_cluster').size()
    single_nodes = int((sizes == 1).sum())
    multi_sizes = sizes[sizes != 1]

    rows = [{'Cluster size': '1', 'Num of clusters': single_nodes}]

    if not multi_sizes.empty:
        # Band k covers sizes 10k+1 .. 10k+10 (band 0 is shown as '2-10' because size 1
        # is reported separately). Integer arithmetic guarantees the largest cluster
        # always lands in a band; the previous bin edges range(1, max + 10, 10) with
        # right-open bins dropped it whenever max % 10 == 1 (e.g. 11, 21, 41).
        band_of = (multi_sizes - 1) // 10
        per_band = band_of.value_counts()
        for band in range(int(band_of.max()) + 1):
            lower = 2 if band == 0 else band * 10 + 1
            upper = band * 10 + 10
            rows.append({
                'Cluster size': f"{lower}-{upper}",
                'Num of clusters': int(per_band.get(band, 0)),
            })

    return pd.DataFrame(rows, columns=['Cluster size', 'Num of clusters'])

# Tabulate the size bands of the second-level clusters; the printed total sums the
# 'Num of clusters' column, single-article clusters included.
clusters_size = get_cluster_size(updated_pred_cluster)
display(clusters_size)
print(clusters_size['Num of clusters'].sum())

Unnamed: 0,Cluster size,Num of clusters
0,1,247
1,2-10,81
2,11-20,4
3,21-30,1
4,31-40,1


334


In [47]:
# Headline numbers: multi-article clusters, their size range, and single-article nodes.
grouped_counts = updated_pred_cluster.groupby('second_level_cluster').size()
filtered_grouped_counts = grouped_counts.loc[grouped_counts.ne(1)]
print(f"no. of clusters: {len(filtered_grouped_counts)}")
print(f"min, max cluster size: {filtered_grouped_counts.min()}, {filtered_grouped_counts.max()}")
print(f"no. of single nodes: {int(grouped_counts.eq(1).sum())}")


no. of clusters: 87
min, max cluster size: 2, 37
no. of single nodes: 247


### Update neo4j files and pyvis
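- Drop every article that became a single (unclustered) node after second-level clustering from `neo4j_clustered_df`
- Update the cluster numbers and keywords in `neo4j_clustered_df`
- Add all of those single articles to `neo4j_unclustered_df`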


In [142]:
def cluster_viz(
    clustered_nodes: pd.DataFrame,
    unclustered_nodes: pd.DataFrame,
    output_path=None,
):
    """
    Renders the clustered edge list and the unclustered nodes as a pyvis HTML graph.

    Parameters:
    clustered_nodes (pd.DataFrame): One row per edge. Reads 'node_<n>_title',
        'node_<n>_pred_cluster' and 'node_<n>_cluster_kws' for n in (1, 2), plus
        'edge_weight'.
    unclustered_nodes (pd.DataFrame): One row per solo node; reads 'node_title'.
    output_path (str, optional): Where to write the HTML file. Defaults to
        neo4j_cluster_viz_updated.html under the module-level experiment_path.

    Returns:
    Whatever pyvis' Network.show returns for the written file.

    Note: node titles are used as node identifiers, so two articles sharing a title are
    drawn as a single node.
    """
    if output_path is None:
        output_path = experiment_path+"/neo4j_cluster_viz_updated.html"

    visual_graph = pyvis.network.Network(select_menu=True, filter_menu=True)

    def add_clustered_node(row, prefix):
        # Both endpoints of an edge are described by the same three prefixed columns.
        node_title = row[prefix + "_title"]
        pred_cluster = row[prefix + "_pred_cluster"]
        cluster_kws = row[prefix + "_cluster_kws"]
        visual_graph.add_node(
            node_title,
            label=node_title,
            title=f"Predicted group: {pred_cluster}\nGroup keywords: {cluster_kws}\nTitle: {node_title}",
            group=pred_cluster,
        )

    # Add nodes-nodes pair
    for _, row in clustered_nodes.iterrows():
        add_clustered_node(row, "node_1")
        add_clustered_node(row, "node_2")

        # Add edge
        visual_graph.add_edge(
            row["node_1_title"],
            row["node_2_title"],
            title=f"Edge Weight: {row['edge_weight']}",
        )

    # Add solo nodes
    for _, row in unclustered_nodes.iterrows():
        visual_graph.add_node(
            row["node_title"],
            label=row["node_title"],
            title=f"Predicted group: No Community\nTitle: {row['node_title']}",
        )

    return visual_graph.show(output_path, notebook=False)

In [128]:
# An article without second-level keywords is treated as a single (unclustered) node.
updated_single = updated_pred_cluster.loc[updated_pred_cluster['second_level_cluster_kws'].isna()]
updated_clustered = updated_pred_cluster.loc[~updated_pred_cluster['second_level_cluster_kws'].isna()]

update_single_title_list = updated_single['title'].tolist()

In [130]:
# First-level graph exports: one row per edge (clustered) and one row per solo node (unclustered).
neo4j_predicted_cluster_csv_path = os.path.join(experiment_path, 'neo_4j_clustered_data.csv')
neo4j_unclustered_csv_path = os.path.join(experiment_path, 'neo_4j_unclustered_data.csv')

neo4j_clustered_df = pd.read_csv(neo4j_predicted_cluster_csv_path)
neo4j_unclustered_df = pd.read_csv(neo4j_unclustered_csv_path)

# Remove those that are labelled as single in the second level clustering:
# an edge is kept only when neither endpoint is a single.
filtered_clustered_df = neo4j_clustered_df[
    ~(
        neo4j_clustered_df['node_1_title'].isin(update_single_title_list)
        | neo4j_clustered_df['node_2_title'].isin(update_single_title_list)
    )
]

# Articles whose cluster id changed in the second level.
adjusted_cluster = updated_pred_cluster.loc[
    updated_pred_cluster['first_level_cluster'].ne(updated_pred_cluster['second_level_cluster'])
]


In [131]:
# Update clustered_df

def update_clusters(row, update_dict):
    """
    Looks up the refreshed cluster id and keywords for both endpoints of one edge row.

    Parameters:
    row (pd.Series): Edge row providing 'node_<n>_title', 'node_<n>_pred_cluster' and
        'node_<n>_cluster_kws' for n in (1, 2).
    update_dict (dict): Maps a title to a (cluster, keywords) pair.

    Returns:
    pd.Series: [node_1 cluster, node_1 keywords, node_2 cluster, node_2 keywords]; an
        endpoint whose title is not in update_dict keeps the values already in the row.
    """
    refreshed = []
    for node in ('node_1', 'node_2'):
        title = row[node + '_title']
        if title in update_dict:
            replacement = update_dict[title]
            refreshed.append(replacement[0])
            refreshed.append(replacement[1])
        else:
            refreshed.append(row[node + '_pred_cluster'])
            refreshed.append(row[node + '_cluster_kws'])
    return pd.Series(refreshed)

# Format - title : (second_level_cluster, second_level_cluster_kws)
# Only include those groups that is adjusted
updated_clusters_dict = {
    title: (second_level_cluster, second_level_cluster_kws)
    for title, second_level_cluster, second_level_cluster_kws in zip(
        adjusted_cluster['title'],
        adjusted_cluster['second_level_cluster'],
        adjusted_cluster['second_level_cluster_kws']
    )
}

# filtered_clustered_df is a boolean-mask selection of neo4j_clustered_df; take an
# explicit copy before writing columns so the update does not target a slice
# (SettingWithCopyWarning) and neo4j_clustered_df is guaranteed to stay untouched.
filtered_clustered_df = filtered_clustered_df.copy()

# With no rows there is nothing to update, and a row-wise apply on an empty frame
# does not return the four expected columns.
if not filtered_clustered_df.empty:
    filtered_clustered_df[['node_1_pred_cluster', 'node_1_cluster_kws', 'node_2_pred_cluster', 'node_2_cluster_kws']] = filtered_clustered_df.apply(
        update_clusters, update_dict=updated_clusters_dict, axis=1
    )

In [136]:
# Update unclustered_df
# Singles produced by the second level that are not already listed as unclustered nodes.
to_add = updated_single[~updated_single.title.isin(neo4j_unclustered_df.node_title.unique())]
to_add = to_add[['title','second_level_cluster']].rename(columns={'title':'node_title','second_level_cluster':'node_community'})
# ignore_index=True: the appended rows would otherwise keep their row labels from
# updated_pred_cluster, which can repeat labels already present in neo4j_unclustered_df
# and make later label-based alignment ambiguous.
neo4j_unclustered_df_updated = pd.concat([neo4j_unclustered_df, to_add], ignore_index=True).drop(columns='node_meta_desc')

# Title -> ground truth lookup built from both endpoints of the original clustered edges.
df_node_1 = neo4j_clustered_df[['node_1_title', 'node_1_ground_truth']].rename(columns={'node_1_title': 'title', 'node_1_ground_truth': 'ground_truth'})
df_node_2 = neo4j_clustered_df[['node_2_title', 'node_2_ground_truth']].rename(columns={'node_2_title': 'title', 'node_2_ground_truth': 'ground_truth'})
combined_node_1_2 = pd.concat([df_node_1, df_node_2]).drop_duplicates()
combined_node_1_2_dict = dict(zip(combined_node_1_2['title'], combined_node_1_2['ground_truth']))

# Missing ground-truth count before and after filling from the lookup; a value found
# in the lookup takes precedence, otherwise the existing value is kept.
print(neo4j_unclustered_df_updated.node_ground_truth.isna().sum())
neo4j_unclustered_df_updated['node_ground_truth'] = neo4j_unclustered_df_updated['node_title'].map(combined_node_1_2_dict).fillna(neo4j_unclustered_df_updated['node_ground_truth'])
print(neo4j_unclustered_df_updated.node_ground_truth.isna().sum())

224
205


In [135]:
# Save the updated graph tables under the experiment folder.
OUTPUT_PATH_CLUSTERED = os.path.join(experiment_path, 'neo_4j_clustered_data_2nd_level_cluster.csv')
OUTPUT_PATH_UNCLUSTERED = os.path.join(experiment_path, 'neo_4j_unclustered_data_2nd_level_cluster.csv')

# NOTE(review): unlike the predicted_cluster_2nd_level_clustering.csv export, these
# calls do not pass index=False, so the DataFrame index is written as an extra
# unnamed column - confirm that is intended.
filtered_clustered_df.to_csv(OUTPUT_PATH_CLUSTERED)
neo4j_unclustered_df_updated.to_csv(OUTPUT_PATH_UNCLUSTERED)

In [143]:
# Render the updated graph; cluster_viz writes the HTML file under experiment_path.
cluster_viz(filtered_clustered_df,neo4j_unclustered_df_updated)

../data\07_model_output\all-MiniLM-L6-v2\experiment-5b-weighted-similarities-3_title_7_body/neo4j_cluster_viz_updated.html
