In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import pyarrow as pa
import pyarrow.parquet as pq
from alive_progress import alive_bar
from sklearn.metrics import homogeneity_completeness_v_measure
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Parameters
# Name of the embedding model under evaluation. It is interpolated into the
# similarity-matrix input path and into every output path written by this notebook.

embedding_model: str = 'tfidf'

In [3]:
# Load the ground-truth grouping (sheet at zero-based position 2 of the workbook) and
# keep only the page title and its ground-truth group ID.
# `.copy()` makes the column subset an independent frame, so the `cluster_label`
# column added to it later does not raise SettingWithCopyWarning.
ground_truth = pd.read_excel(
    '../data/Synapxe Content Prioritisation - Live Healthy_020724.xlsx',
    sheet_name=2,
)[['Page Title', 'Combine Group ID']].copy()

In [4]:
# Change cosine matrix to long format & keep only unique (unordered) pairs

cosine_sim_df = pd.read_csv(f'../data/{embedding_model}_sim.csv')
all_col_minus_pagetitle = [e for e in cosine_sim_df.columns if e != 'Page Title']
cosine_melt = pd.melt(cosine_sim_df, id_vars=['Page Title'], value_vars=all_col_minus_pagetitle)

# Represent each pair as a frozenset: order-insensitive like a set, but hashable, so
# de-duplication does not depend on how pandas treats unhashable objects.
# A page paired with itself collapses to a single-element frozenset.
cosine_melt['Pair'] = [
    frozenset(pair) for pair in zip(cosine_melt['Page Title'], cosine_melt['variable'])
]
cosine_melt = (
    cosine_melt
    .drop_duplicates('Pair')
    .rename(columns={'Page Title': 'Page Title 1', 'variable': 'Page Title 2'})
)
print(cosine_melt.shape)

(17391, 4)


In [5]:
# Attach the ground-truth group of each page in the pair. The second merge collides on
# the ground-truth column names, so pandas suffixes them with `_x` (page 1) and `_y` (page 2).
cosine_melt_ground_truth = (
    cosine_melt
    .merge(ground_truth, how='inner', left_on='Page Title 1', right_on='Page Title')
    .merge(ground_truth, how='inner', left_on='Page Title 2', right_on='Page Title')
)

# Discard pairs made of a single page (a page compared with itself)
is_self_pair = cosine_melt_ground_truth['Pair'].map(len) == 1
cosine_melt_ground_truth = cosine_melt_ground_truth[~is_self_pair]

# Pairs whose two pages carry the same ground-truth group ID vs. all remaining pairs
in_same_group = (
    cosine_melt_ground_truth['Combine Group ID_x'] == cosine_melt_ground_truth['Combine Group ID_y']
)

same_group_cosine = cosine_melt_ground_truth[in_same_group]
same_group_median = np.median(same_group_cosine['value'])
same_group_mean = np.mean(same_group_cosine['value'])

diff_group_cosine = cosine_melt_ground_truth[~in_same_group]
diff_group_median = np.median(diff_group_cosine['value'])
diff_group_mean = np.mean(diff_group_cosine['value'])

print(f'Same Groundtruth Community\n Mean: {same_group_mean}\n Median: {same_group_median}\n' )
print(f'Diff Groundtruth Community\n Mean: {diff_group_mean}\n Median: {diff_group_median}\n' )

Same Groundtruth Community
 Mean: 0.5668592554706916
 Median: 0.5906424924365843

Diff Groundtruth Community
 Mean: 0.12012280707438341
 Median: 0.0802431382432036



In [6]:
# Overlaid histograms of pairwise cosine similarity for same-group and diff-group pairs,
# with the same-group median and mean marked as vertical lines.

# Derive the shared bins from BOTH groups. Explicit xbins limit the range that is binned,
# so edges computed from the same-group values alone can clip the diff-group histogram.
all_similarity_values = np.concatenate([
    same_group_cosine['value'].to_numpy(dtype=float),
    diff_group_cosine['value'].to_numpy(dtype=float),
])
bin_edges = np.histogram_bin_edges(
    all_similarity_values[np.isfinite(all_similarity_values)], bins=100
)
shared_xbins = dict(start=bin_edges[0], end=bin_edges[-1], size=bin_edges[1] - bin_edges[0])

hist_diff_group = go.Histogram(
    x=diff_group_cosine['value'],
    xbins=shared_xbins,
    opacity=0.7,
    name='Diff Group',
    marker=dict(color='skyblue', line=dict(color='black', width=1))
)

hist_same_group = go.Histogram(
    x=same_group_cosine['value'],
    xbins=shared_xbins,
    opacity=0.5,
    name='Same Group',
    marker=dict(color='orange', line=dict(color='black', width=1))
)

fig = go.Figure(data=[hist_diff_group, hist_same_group])

vertical_line_value_median = same_group_median
vertical_line_value_mean = same_group_mean

# One dotted full-height line plus an arrow label per statistic. The arrows point in
# opposite horizontal directions so the two labels do not overlap when the values are close.
for line_x, line_label, line_colour, arrow_dx in [
    (vertical_line_value_median, "Median", "red", 20),
    (vertical_line_value_mean, "Mean", "blue", -20),
]:
    fig.add_shape(
        type="line",
        x0=line_x,
        y0=0,
        x1=line_x,
        y1=1,
        xref='x',
        yref='paper',  # y in paper coordinates: the line spans the full plot height
        line=dict(color=line_colour, width=2, dash="dot")
    )
    fig.add_annotation(
        x=line_x,
        y=1,
        xref='x',
        yref='paper',
        text=line_label,
        showarrow=True,
        arrowhead=2,
        ax=arrow_dx,
        ay=-20
    )

fig.update_layout(
    title=f'Histogram of Cosine Similarity Distribution for {embedding_model}',
    xaxis_title='Value',
    yaxis_title='Frequency',
    barmode='overlay',
    template='plotly_white'
)

fig.show()
pio.write_html(fig, file=f'../graph/cosine_similarity_histogram_{embedding_model}.html')

In [7]:
def construct_graph(df):
    """Build an edgeless undirected graph with one node per row of ``df``.

    Each node is keyed by the row's ``'Page Title'`` value, and every column of the
    row (including ``'Page Title'`` itself) is stored as a node attribute.

    Args:
        df: DataFrame containing at least a ``'Page Title'`` column.

    Returns:
        The populated ``networkx.Graph``.
    """
    network = nx.Graph()
    for _, record in df.iterrows():
        network.add_node(record['Page Title'], **record)
    return network

In [8]:
def construct_edges(graph, cosine_matrix, threshold=0.5):
    """Add a weighted edge for every pair whose similarity exceeds ``threshold``.

    Only entries strictly above the diagonal are examined, so each unordered pair is
    considered once and self-similarities are ignored. The comparison is strict
    (``similarity > threshold``). Edges are added in row-major order.

    Args:
        graph: Graph-like object exposing ``add_edge(u, v, weight=...)``; it is
            mutated in place.
        cosine_matrix: DataFrame of pairwise similarities whose row index and column
            labels are the node identifiers. Only the leading ``len(cosine_matrix)``
            columns are examined.
        threshold: Minimum similarity (exclusive) for an edge to be created.

    Returns:
        The same ``graph`` object, for chaining.
    """
    n_items = len(cosine_matrix)
    # Work on the raw array: one vectorised comparison replaces a scalar ``.iloc``
    # lookup per pair, which is very slow on large matrices.
    similarities = cosine_matrix.to_numpy()[:, :n_items]
    above_threshold = np.triu(similarities > threshold, k=1)

    row_labels = cosine_matrix.index
    column_labels = cosine_matrix.columns
    # np.nonzero yields indices in row-major order, matching a nested i/j loop.
    for i, j in zip(*np.nonzero(above_threshold)):
        graph.add_edge(row_labels[i], column_labels[j], weight=similarities[i, j])
    return graph

In [9]:
def louvain_cluster(graph, resolution=1, seed=123):
    """Partition ``graph`` into communities with the Louvain method.

    Edge weights are read from the ``'weight'`` edge attribute.

    Args:
        graph: The networkx graph to partition.
        resolution: Resolution passed through to ``louvain_communities``.
        seed: Random seed passed through to ``louvain_communities``. The default of
            123 keeps existing calls reproducible with their previous results.

    Returns:
        Whatever ``nx.community.louvain_communities`` returns for the graph
        (the communities as collections of nodes).
    """
    return nx.community.louvain_communities(
        graph, weight='weight', resolution=resolution, seed=seed
    )

In [10]:
def get_exact_match(pred_df):
    """Count ground-truth groups that are reproduced exactly by a predicted cluster.

    A ground-truth group (rows sharing a ``'Combine Group ID'``) is an exact match when
    some predicted cluster (rows sharing a ``'cluster_label'``) contains precisely the
    same page titles.

    Rows whose label is missing (NaN) belong to no group on that side. Comparing
    against NaN would yield empty member lists for both sides, and those would be
    counted as spurious matches.

    Args:
        pred_df: DataFrame with ``'Page Title'``, ``'Combine Group ID'`` and
            ``'cluster_label'`` columns.

    Returns:
        int: Number of ground-truth groups with an identical predicted cluster.
    """
    def _members_by(label_column):
        # groupby skips NaN keys by default and keeps the frame's row order inside each
        # group, so identical memberships yield identical tuples on both sides.
        titles_by_label = pred_df.groupby(label_column, sort=False)['Page Title']
        return [tuple(titles) for _, titles in titles_by_label]

    # A set of tuples gives constant-time membership tests instead of scanning a
    # list of lists for every ground-truth group.
    predicted_groups = set(_members_by('cluster_label'))
    return sum(1 for group in _members_by('Combine Group ID') if group in predicted_groups)

In [11]:
# Index the similarity matrix by page title so that row and column labels are both
# page titles. The guard makes the cell safe to re-run: once 'Page Title' has become
# the index there is no such column left to move.
if 'Page Title' in cosine_sim_df.columns:
    cosine_sim_df = cosine_sim_df.set_index('Page Title')

# Edge threshold: median similarity of same-group pairs, rounded to two decimals.
thresh = round(same_group_median, 2)
graph = construct_graph(ground_truth)
graph = construct_edges(graph, cosine_sim_df, thresh)
# Persisting the graph is left to the next cell, which writes this same file after
# creating the output directory and closes the file handle when done.

In [12]:
# Pickle the thresholded similarity graph under ../graph/<embedding_model>/.
output_dir = f'../graph/{embedding_model}/'

# exist_ok=True replaces the separate existence check and is safe if the directory
# appears between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

# The threshold is embedded in the file name with '.' replaced by '_'.
output_file_path = os.path.join(output_dir, f'{embedding_model}_graph_' + str(thresh).replace(".", "_"))

with open(output_file_path, 'wb') as f:
    pickle.dump(graph, f)

In [13]:
# Run Louvain on the similarity graph and give each community an integer label
# (its position in the returned list).
clusters = louvain_cluster(graph)
cluster_mapping = {source: label for label, cluster in enumerate(clusters) for source in cluster}

# `assign` builds a new frame instead of writing a column into `ground_truth` in place.
# This avoids SettingWithCopyWarning when the frame is a column slice, and keeps the
# cell idempotent on re-run.
ground_truth = ground_truth.assign(
    cluster_label=ground_truth['Page Title'].map(cluster_mapping)
)
ground_truth.to_csv(f"../graph/{embedding_model}/{embedding_model}_graph_louvain_cluster.csv")
ground_truth.sample(1)

Unnamed: 0,Page Title,Combine Group ID,cluster_label
67,Understanding Stress,15.0,36


In [14]:
# Summary: exact ground-truth matches, plus the number of predicted clusters
# (the "Cluster Size" lines report cluster counts, with and without singletons).
exact_match = get_exact_match(ground_truth)
multi_member_cluster_count = sum(1 for members in clusters if len(members) > 1)

print('Exact cluster match: ', exact_match)
print('Cluster Size (including cluster size = 1)', len(clusters))
print('Cluster Size (excluding cluster size = 1): ', multi_member_cluster_count)

Exact cluster match:  5
Cluster Size (including cluster size = 1) 920
Cluster Size (excluding cluster size = 1):  26
