Note: In this notebook, clustering is performed only on articles that have ground-truth labels.


In [83]:
import os
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

In [84]:
# Parameters
# Name of the embedding/similarity variant being evaluated. It is interpolated
# into the similarity-score input file name and into every output file name
# built in this notebook, and it labels this run's column in the metrics table.

EMBEDDING_MODEL: str = "tfidf_cosine"

In [85]:
# Every artifact written by this notebook lives in the same directory and is
# prefixed with the embedding model name, so the directory parts are shared.
_OUTPUT_DIR_PARTS = ("..", "artifacts", "outputs")

# Interactive histogram of same-group vs. different-group similarities.
OUTPUT_THRES_PATH = os.path.join(
    *_OUTPUT_DIR_PARTS, f"{EMBEDDING_MODEL}_cosine_similarity_histogram.html"
)

# Pickled networkx graph built from the thresholded similarity matrix.
OUTPUT_GRAPH_PATH = os.path.join(*_OUTPUT_DIR_PARTS, f"{EMBEDDING_MODEL}_graph")

# Ground-truth table with the predicted Louvain cluster label attached.
OUTPUT_CSV_PATH = os.path.join(
    *_OUTPUT_DIR_PARTS, f"{EMBEDDING_MODEL}_graph_louvain_cluster.csv"
)

In [86]:
# Excel workbook that holds the ground-truth article groupings; it is read
# with pd.read_excel when the ground-truth table is loaded.
INPUT_GROUNDTRUTH_PATH = os.path.join(
    "..",
    "data",
    "Synapxe Content Prioritisation - Live Healthy_020724.xlsx",
)

In [87]:
# Load the ground-truth sheet (third sheet of the workbook) and keep only the
# Health Promotion Board articles that have been assigned a group id.
ground_truth = pd.read_excel(INPUT_GROUNDTRUTH_PATH, sheet_name=2)

# na=False: a missing "Owner" would otherwise put NaN into the mask, and
# pandas refuses to index with a boolean mask that contains NaN.
is_hpb_owned = ground_truth["Owner"].str.contains("Health Promotion Board", na=False)
has_group_id = ground_truth["Combine Group ID"].notna()

# .copy() makes this an independent frame, so the later
# `ground_truth["cluster_label"] = ...` assignment does not write to a slice.
ground_truth = ground_truth.loc[
    is_hpb_owned & has_group_id, ["Page Title", "Combine Group ID", "URL"]
].copy()

In [88]:
# Change cosine matrix to long format & keep only unique (unordered) pairs

cosine_sim_df = pd.read_csv(
    f"../artifacts/outputs/{EMBEDDING_MODEL}_similarity_score.csv"
)
# The first CSV column carries the row titles; give it an explicit name.
cosine_sim_df = cosine_sim_df.rename(
    columns={cosine_sim_df.columns[0]: "Page Title"}
)
all_col_minus_pagetitle = [e for e in cosine_sim_df.columns if e != "Page Title"]
cosine_melt = pd.melt(
    cosine_sim_df, id_vars=["Page Title"], value_vars=all_col_minus_pagetitle
)

# A frozenset is order-insensitive like a set ((A, B) == (B, A)) but hashable,
# so drop_duplicates does not depend on pandas tolerating unhashable cells.
# A self-pair (A, A) collapses to a single-element frozenset, which the later
# `len(x) != 1` self-match filter relies on.
cosine_melt["Pair"] = [
    frozenset(pair)
    for pair in zip(cosine_melt["Page Title"], cosine_melt["variable"])
]
cosine_melt = cosine_melt.drop_duplicates("Pair").rename(
    columns={"Page Title": "Page Title 1", "variable": "Page Title 2"}
)
print(cosine_melt.shape)

(17578, 4)


In [89]:
# Attach the ground-truth record to each side of every pair. The second merge
# repeats the ground-truth column names, so pandas suffixes them with _x (for
# 'Page Title 1') and _y (for 'Page Title 2').
cosine_melt_ground_truth = cosine_melt.merge(
    ground_truth, how="inner", left_on="Page Title 1", right_on="Page Title"
).merge(
    ground_truth, how="inner", left_on="Page Title 2", right_on="Page Title"
)

# Drop self match: a pair of an article with itself collapses to one element
is_distinct_pair = cosine_melt_ground_truth["Pair"].map(len) != 1
cosine_melt_ground_truth = cosine_melt_ground_truth[is_distinct_pair]

# Flag the pairs whose two articles share a ground-truth community
in_same_group = (
    cosine_melt_ground_truth["Combine Group ID_x"]
    == cosine_melt_ground_truth["Combine Group ID_y"]
)

# Pairwise comparisons within the same ground-truth community
same_group_cosine = cosine_melt_ground_truth[in_same_group]
same_group_median = np.median(same_group_cosine["value"])
same_group_mean = np.mean(same_group_cosine["value"])

# Pairwise comparisons across different ground-truth communities
diff_group_cosine = cosine_melt_ground_truth[~in_same_group]
diff_group_median = np.median(diff_group_cosine["value"])
diff_group_mean = np.mean(diff_group_cosine["value"])

same_group_summary = (
    f"Same Groundtruth Community\n Mean: {same_group_mean}\n Median: {same_group_median}\n"
)
diff_group_summary = (
    f"Diff Groundtruth Community\n Mean: {diff_group_mean}\n Median: {diff_group_median}\n"
)
print(same_group_summary)
print(diff_group_summary)

Same Groundtruth Community
 Mean: 0.585072926898532
 Median: 0.6061922576531102

Diff Groundtruth Community
 Mean: 0.11988543465128203
 Median: 0.08002242633251011



In [90]:
# Preview a few pairs together with the ground-truth group of each side.
preview_columns = [
    "Page Title 1",
    "Page Title 2",
    "Combine Group ID_x",
    "Combine Group ID_y",
]
cosine_melt_ground_truth[preview_columns].head(5)

Unnamed: 0,Page Title 1,Page Title 2,Combine Group ID_x,Combine Group ID_y
1,Getting Your Baby Started on Solids,Getting ready for solids,1.0,1.0
2,Feeding Your Baby Solid Food: Baby's First Foo...,Getting ready for solids,1.0,1.0
3,"Ready, Steady, Wean! Why, When and How to Move...",Getting ready for solids,1.0,1.0
4,What A Weaning Baby Needs For Strong Immunity,Getting ready for solids,1.0,1.0
5,Baby’s First Year: Go Steady With Solids,Getting ready for solids,1.0,1.0


In [91]:
# Derive one set of bins from BOTH distributions. Using only the same-group
# values would limit the explicit bin range to the same-group min/max, leaving
# diff-group values outside that range without a bin.
all_pair_values = np.concatenate(
    [
        same_group_cosine["value"].to_numpy(),
        diff_group_cosine["value"].to_numpy(),
    ]
)
bin_edges = np.histogram_bin_edges(all_pair_values, bins=100)
# Both traces share the same bin spec so their bars line up in overlay mode.
shared_xbins = dict(
    start=bin_edges[0], end=bin_edges[-1], size=bin_edges[1] - bin_edges[0]
)

hist_diff_group = go.Histogram(
    x=diff_group_cosine["value"],
    xbins=shared_xbins,
    opacity=0.7,
    name="Diff Group",
    marker=dict(color="skyblue", line=dict(color="black", width=1)),
)

hist_same_group = go.Histogram(
    x=same_group_cosine["value"],
    xbins=shared_xbins,
    opacity=0.5,
    name="Same Group",
    marker=dict(color="orange", line=dict(color="black", width=1)),
)

fig = go.Figure(data=[hist_diff_group, hist_same_group])

vertical_line_value_median = same_group_median
vertical_line_value_mean = same_group_mean

# Mark the same-group median and mean with dotted full-height vertical lines.
# Each entry: (x position, label, line colour, horizontal offset of the label).
reference_lines = [
    (vertical_line_value_median, "Median", "red", 20),
    (vertical_line_value_mean, "Mean", "blue", -20),
]
for line_x, line_label, line_color, label_dx in reference_lines:
    fig.add_shape(
        dict(
            type="line",
            x0=line_x,
            y0=0,
            x1=line_x,
            y1=1,
            xref="x",
            yref="paper",  # y in paper coordinates: 0..1 spans the plot height
            line=dict(color=line_color, width=2, dash="dot"),
        )
    )
    fig.add_annotation(
        dict(
            x=line_x,
            y=1,
            xref="x",
            yref="paper",
            text=line_label,
            showarrow=True,
            arrowhead=2,
            ax=label_dx,
            ay=-20,
        )
    )

fig.update_layout(
    title=f"Histogram of Cosine Similarity Distribution for {EMBEDDING_MODEL}",
    xaxis_title="Value",
    yaxis_title="Frequency",
    barmode="overlay",
    template="plotly_white",
)

pio.write_html(fig, file=OUTPUT_THRES_PATH)
fig.show()

In [92]:
def construct_graph(df, threshold):
    """Create an edgeless graph with one node per row of ``df``.

    Args:
        df: DataFrame with a "Page Title" column. Every column of a row
            (including "Page Title") is stored as a node attribute.
        threshold: Value recorded as the graph-level ``threshold`` attribute.

    Returns:
        The new ``nx.Graph`` whose nodes are keyed by page title.
    """
    article_graph = nx.Graph(threshold=threshold)
    for _, article in df.iterrows():
        title = article["Page Title"]
        # Unpack the row so each column becomes a keyword node attribute.
        article_graph.add_node(title, **article)

    return article_graph

In [93]:
def construct_edges(graph, cosine_matrix, threshold=0.5):
    """Add a weighted edge for every pair whose similarity exceeds ``threshold``.

    Only the strict upper triangle of ``cosine_matrix`` is visited, so each
    unordered pair is considered once and the diagonal is skipped.

    Args:
        graph: Graph object exposing ``add_edge``; it is modified in place.
        cosine_matrix: DataFrame of pairwise similarities. The row label and
            the column label of a cell name the two endpoints of its edge.
        threshold: An edge is added only when similarity > threshold.

    Returns:
        The same ``graph`` object, with the new edges added.
    """
    n_items = len(cosine_matrix)
    similarities = cosine_matrix.to_numpy()
    row_labels = cosine_matrix.index
    col_labels = cosine_matrix.columns

    # triu_indices yields (row, col) positions in row-major order, which is
    # the same visiting order as a nested "for i / for j > i" loop.
    upper_rows, upper_cols = np.triu_indices(n_items, k=1)
    for row_pos, col_pos in zip(upper_rows, upper_cols):
        score = similarities[row_pos, col_pos]
        if not score > threshold:
            continue
        graph.add_edge(row_labels[row_pos], col_labels[col_pos], weight=score)

    return graph

In [94]:
def louvain_cluster(graph, resolution=1):
    """Partition ``graph`` into communities with networkx's Louvain routine.

    Args:
        graph: Graph whose edges carry a "weight" attribute.
        resolution: Passed through as the ``resolution`` argument.

    Returns:
        Whatever ``nx.community.louvain_communities`` returns for the graph.
    """
    # The seed is fixed so repeated runs produce the same partition.
    louvain_options = {"seed": 123, "weight": "weight", "resolution": resolution}
    return nx.community.louvain_communities(graph, **louvain_options)

In [95]:
# def get_exact_match(pred_df):
#     pred_cluster_labels = set(pred_df["cluster_label"].tolist())
#     ground_cluster_labels = set(pred_df["Combine Group ID"].tolist())

#     pred_set = []
#     ground_set = []

#     for i in pred_cluster_labels:
#         temp = pred_df[pred_df["cluster_label"] == i]
#         pred_set.append(temp["Page Title"].tolist())

#     for i in ground_cluster_labels:
#         temp = pred_df[pred_df["Combine Group ID"] == i]
#         ground_set.append(temp["Page Title"].tolist())

#     exact_match = 0

#     for i in ground_set:
#         if i in pred_set:
#             exact_match += 1

#     return exact_match


def get_exact_match(ground_truth):
    """Count predicted clusters that coincide exactly with a ground-truth group.

    Args:
        ground_truth: DataFrame with "Page Title", "cluster_label" (predicted)
            and "Combine Group ID" (ground truth) columns.

    Returns:
        Number of predicted clusters whose set of page titles equals the set
        of page titles of some ground-truth group.
    """

    def _title_sets(group_column):
        # One set of page titles per distinct value of ``group_column``.
        grouped_titles = ground_truth.groupby(group_column)["Page Title"]
        return [set(titles) for _, titles in grouped_titles]

    predicted_groups = _title_sets("cluster_label")
    truth_groups = _title_sets("Combine Group ID")

    return sum(1 for predicted in predicted_groups if predicted in truth_groups)


def fill_single(series):
    """Give every missing label its own new, unique label.

    Missing entries are replaced, in order of appearance, by max + 1,
    max + 2, ... where ``max`` is the largest existing label, so each
    unlabelled item ends up in a group of its own.

    Args:
        series: pandas Series of numeric labels, possibly containing NaN.

    Returns:
        A list with the same length and order as ``series`` and no missing
        values. The input series is not modified.
    """
    missing_positions = np.flatnonzero(series.isna().to_numpy())
    if missing_positions.size == 0:
        return series.to_list()

    max_val = series.max()
    # With no existing label at all, max() is NaN and NaN + 1 would stay NaN;
    # start counting from 0 instead so the new labels are 1, 2, ...
    start = 0 if pd.isna(max_val) else max_val

    filled_series = series.copy()
    # Write by position rather than by index label, so a duplicated index
    # label cannot cause several rows to be overwritten at once.
    filled_series.iloc[missing_positions] = start + np.arange(
        1, missing_positions.size + 1
    )
    return filled_series.to_list()


def compute_vmeasure(pred_df):
    """Score predicted clusters against the ground-truth groups.

    Missing labels in either column are first replaced by unique labels via
    ``fill_single``.

    Args:
        pred_df: DataFrame with "Combine Group ID" (ground truth) and
            "cluster_label" (prediction) columns.

    Returns:
        Tuple ``(homogeneity, completeness, v_measure)``.
    """
    labels_true = fill_single(pred_df["Combine Group ID"])
    labels_pred = fill_single(pred_df["cluster_label"])

    scorers = (homogeneity_score, completeness_score, v_measure_score)
    return tuple(scorer(labels_true, labels_pred) for scorer in scorers)

In [96]:
# Index the similarity matrix by page title so that row labels and column
# labels both name articles. The guard keeps this cell re-runnable: once the
# column has been moved into the index there is nothing left to move, and an
# unguarded second run would fail with KeyError: 'Page Title'.
if "Page Title" in cosine_sim_df.columns:
    cosine_sim_df = cosine_sim_df.set_index("Page Title")

# Edge threshold: the median same-group similarity, rounded to 2 decimals.
thresh = round(same_group_median, 2)
ground_truth_nona = ground_truth[ground_truth["Combine Group ID"].notna()]
graph = construct_graph(ground_truth_nona, thresh)
graph = construct_edges(graph, cosine_sim_df, thresh)

In [97]:
# Persist the constructed graph as a pickle file.
output_file_path = os.path.join(OUTPUT_GRAPH_PATH)

with open(output_file_path, mode="wb") as graph_file:
    graph_file.write(pickle.dumps(graph))

In [98]:
# Run Louvain on the graph and number the resulting communities 0, 1, 2, ...
clusters = louvain_cluster(graph)

# Map every node (page title) to the number of the community it belongs to.
cluster_mapping = {}
for label, cluster in enumerate(clusters):
    cluster_mapping.update(dict.fromkeys(cluster, label))

# Attach the predicted label to the ground-truth table and export it.
ground_truth["cluster_label"] = ground_truth["Page Title"].map(cluster_mapping)
ground_truth.to_csv(OUTPUT_CSV_PATH)
ground_truth.sample(1)

Unnamed: 0,Page Title,Combine Group ID,URL,cluster_label
46,4 Ways to Reduce Your Daily Sugar Intake,11.0,https://www.healthhub.sg/live-healthy/cut-the-...,8


In [99]:
# Evaluate the predicted clustering against the ground truth.
exact_match = get_exact_match(ground_truth)
homogeneity, completeness, v_measure = compute_vmeasure(ground_truth)

# Count clusters with more than one member, and single-member clusters
# (articles that were not grouped with any other article).
members_per_cluster = [len(members) for members in clusters]
cluster_size = sum(n_members > 1 for n_members in members_per_cluster)
articles_not_clustered = members_per_cluster.count(1)

### Save metrics


In [100]:
# Compiled metrics table: one row per metric, one column per model variation.
file_path = "../artifacts/outputs/compiled_model_variation_metrics.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path, index_col=0)
else:
    df = pd.DataFrame()

data = pd.DataFrame(
    {
        "Exact cluster match": [exact_match],
        "Homogeneity": [round(homogeneity, 4)],
        "Completeness": [round(completeness, 4)],
        "V-measure": [round(v_measure, 4)],
        "Number of clusters": [cluster_size],
        "Number of articles not clustered": [articles_not_clustered],
    }
)

# Transpose so the metrics become rows and this run becomes a single column.
data = data.T
data.columns = [EMBEDDING_MODEL]

# Replace this model's column if it already exists. Appending unconditionally
# leaves two columns with the same name after a re-run, which read_csv then
# loads back with a ".1" suffix on the second one.
df = df.drop(columns=[EMBEDDING_MODEL], errors="ignore")
df = pd.concat([df, data], axis=1)
df.to_csv(file_path)

Unnamed: 0,mxbai-embed-large-v1_mean,bge-large-en-v1.5-quant_mean,bge-large-en-v1.5_mean,mxbai-embed-large-v1_mean.1,bge-large-en-v1.5-quant_mean.1,bge-large-en-v1.5_mean.1,d2v,multi-qa-mpnet-base-cos-v1_mean_euclidean,multi-qa-mpnet-base-dot-v1_cls_dot,multi-qa-mpnet-base-cos-v1_mean_dot,...,tfidf_manhattan,lsa_cosine,lsa_euclidean,lsa_dot,lsa_manhattan,lda_cosine,lda_euclidean,lda_dot,lda_manhattan,tfidf_cosine
Exact cluster match,9.0,8.0,9.0,9.0,8.0,9.0,6.0,7.0,14.0,8.0,...,1.0,11.0,11.0,11.0,1.0,0.0,0.0,1.0,0.0,8.0
Homogeneity,0.9786,0.9731,0.9762,0.9786,0.9731,0.9762,0.972,0.9784,0.9816,0.981,...,0.9588,0.9815,0.9821,0.9815,0.949,0.9442,0.8987,0.9491,0.9125,0.8592
Completeness,0.9609,0.9628,0.9622,0.9609,0.9628,0.9622,0.9625,0.962,0.9631,0.9584,...,0.9576,0.96,0.9597,0.96,0.9653,0.9575,0.957,0.9557,0.959,0.8319
V-measure,0.9697,0.9679,0.9692,0.9697,0.9679,0.9692,0.9672,0.9702,0.9723,0.9696,...,0.9582,0.9706,0.9708,0.9706,0.9571,0.9508,0.927,0.9524,0.9352,0.8453
Number of clusters,25.0,23.0,27.0,25.0,23.0,27.0,23.0,25.0,30.0,25.0,...,9.0,27.0,27.0,27.0,9.0,12.0,2.0,16.0,9.0,27.0
Number of articles not clustered,150.0,141.0,139.0,150.0,141.0,139.0,136.0,148.0,139.0,160.0,...,158.0,151.0,153.0,151.0,120.0,115.0,115.0,115.0,115.0,65.0
