In [62]:
import os
import pickle

import pandas as pd
import pyvis

In [221]:
# NOTE(review): machine-specific absolute path; point this at your own copy of
# the experiment folder before running.
root_path = r"C:\Users\Joycelyn\Documents\Synapxe\07 GenAI for healthhub\Github backup\cluster experiments\weighted emb vs sim (658)"
emb_path = os.path.join(root_path, "nomic_weighted_emb (0.7 body, 0.3 title)")
sim_path = os.path.join(root_path, "nomic_weighted_sim (0.7 body, 0.3 title)")

# Predicted clusters for each method. pickle.load can execute arbitrary code,
# so only load files produced by a trusted pipeline.
emb_pickle_path = os.path.join(emb_path, "neo4j_predicted_clusters.pkl")
with open(emb_pickle_path, "rb") as emb_file:
    emb_results = pickle.load(emb_file)

sim_pickle_path = os.path.join(sim_path, "neo4j_predicted_clusters.pkl")
with open(sim_pickle_path, "rb") as sim_file:
    sim_results = pickle.load(sim_file)

## Identify common and unique clusters

In [247]:
def find_common_clusters(output1, output2):
    """Return the clusters of ``output1`` whose title list also appears in ``output2``.

    Two clusters are considered the same only when their ``'titles'`` sequences
    are identical, including order (they are compared as tuples).

    Args:
        output1: iterable of cluster dicts, each with a ``'titles'`` sequence.
        output2: iterable of cluster dicts, each with a ``'titles'`` sequence.

    Returns:
        DataFrame with one row per common cluster (taken from ``output1``, in
        ``output1`` order) plus a ``num_articles`` column holding
        ``len(titles)``. When nothing is shared, an empty DataFrame that still
        has ``titles`` and ``num_articles`` columns is returned.
    """
    clusters1 = {tuple(cluster['titles']): cluster for cluster in output1}
    titles2 = {tuple(cluster['titles']) for cluster in output2}

    # Iterate the dict rather than a set intersection so row order is
    # deterministic (set order of strings changes between interpreter runs).
    common_clusters = [cluster for titles, cluster in clusters1.items() if titles in titles2]
    df_common_clusters = pd.DataFrame(common_clusters)

    # An empty list yields a column-less frame; make sure 'titles' exists so
    # the length computation below does not raise KeyError.
    if 'titles' not in df_common_clusters.columns:
        df_common_clusters['titles'] = pd.Series(dtype=object)

    df_common_clusters['num_articles'] = df_common_clusters['titles'].apply(len).astype(int)

    return df_common_clusters

def find_different_clusters(output1, output2):
    """Return the clusters that appear in only one of the two outputs.

    Clusters are matched on their ``'titles'`` sequence compared as a tuple,
    so the same titles in a different order count as a different cluster.

    Args:
        output1: iterable of cluster dicts, each with a ``'titles'`` sequence.
        output2: iterable of cluster dicts, each with a ``'titles'`` sequence.

    Returns:
        ``(df_unique_clusters1, df_unique_clusters2)``: the clusters found only
        in ``output1`` and only in ``output2`` respectively, each in its input
        order and with a ``num_articles`` column holding ``len(titles)``. A
        side with no unique clusters yields an empty DataFrame that still has
        ``titles`` and ``num_articles`` columns.
    """
    def _to_frame(clusters):
        # Build the result frame; tolerate an empty cluster list, which would
        # otherwise produce a column-less frame and a KeyError on 'titles'.
        frame = pd.DataFrame(clusters)
        if 'titles' not in frame.columns:
            frame['titles'] = pd.Series(dtype=object)
        frame['num_articles'] = frame['titles'].apply(len).astype(int)
        return frame

    clusters1 = {tuple(cluster['titles']): cluster for cluster in output1}
    clusters2 = {tuple(cluster['titles']): cluster for cluster in output2}

    # Dict iteration keeps input order, so the row order is reproducible
    # (a set difference would iterate in hash order).
    unique_clusters1 = [cluster for titles, cluster in clusters1.items() if titles not in clusters2]
    unique_clusters2 = [cluster for titles, cluster in clusters2.items() if titles not in clusters1]

    return _to_frame(unique_clusters1), _to_frame(unique_clusters2)

In [248]:
# Split the two clusterings into identical clusters and method-specific ones.
df_common_clusters = find_common_clusters(emb_results, sim_results)
df_unique_emb, df_unique_sim = find_different_clusters(emb_results, sim_results)

# Report how many clusters fall into each bucket.
print(
    f"Number of exact same clusters: {len(df_common_clusters)} "
    f"    \nNumber of unique clusters from EMB: {len(df_unique_emb)} "
    f"    \nNumber of unique clusters from SIM: {len(df_unique_sim)}"
)

Number of exact same clusters: 24     
Number of unique clusters from EMB: 48     
Number of unique clusters from SIM: 30


In [249]:
# Match unique clusters: take independent copies of both unique-cluster frames
# for the overlap matching in this cell.
df_unique_emb_1, df_unique_sim_1 = df_unique_emb.copy(), df_unique_sim.copy()

def calculate_overlap(row1, row2):
    """Count the distinct items that occur in both ``row1`` and ``row2``."""
    shared_items = set(row1).intersection(row2)
    return len(shared_items)

# Score every (EMB cluster, SIM cluster) pair by the number of titles they share.
matches = [
    (i, j, calculate_overlap(row1['titles'], row2['titles']))
    for i, row1 in df_unique_emb_1.iterrows()
    for j, row2 in df_unique_sim_1.iterrows()
]
matches_df = pd.DataFrame(matches, columns=['embeddings_index', 'sim_index', 'overlap'])

# For each EMB cluster keep the SIM cluster with the highest overlap.
max_matches_df = matches_df.loc[matches_df.groupby('embeddings_index')['overlap'].idxmax()]
# Discard zero-overlap pairs; when several EMB clusters chose the same SIM
# cluster, keep only the pairing with the largest overlap for that SIM cluster.
max_matches_df = (
    max_matches_df[max_matches_df['overlap'] != 0]
    .sort_values(['sim_index', 'overlap'], ascending=[True, False])
    .drop_duplicates(subset=['sim_index'], keep='first')
)

# Full outer join EMB clusters <-> matched pairs <-> SIM clusters, so unmatched
# clusters from either method still appear (with NaN on the other side).
# The join runs on the working copies and uses the frames' own indexes: no
# helper 'key' column is written onto df_unique_emb / df_unique_sim, which are
# exported to Excel later and should stay unmodified.
merged_df_title = pd.merge(
    df_unique_emb_1,
    max_matches_df[['embeddings_index', 'sim_index', 'overlap']],
    left_index=True,
    right_on='embeddings_index',
    how='outer'
).merge(
    df_unique_sim_1,
    left_on='sim_index',
    right_index=True,
    how='outer',
    suffixes=('_embeddings', '_sim')
).drop(columns=['embeddings_index', 'sim_index'])

In [250]:
# Export every result frame to its own sheet of a single workbook.
save_path = os.path.join(root_path, "weighted_methods_eval.xlsx")
sheets_to_export = {
    'Common Clusters': df_common_clusters,
    'Weighted emb unique articles': df_unique_emb,
    'Weighted sim unique articles': df_unique_sim,
    'match_unique_clusters_keywords': merged_df_title,
}
with pd.ExcelWriter(save_path) as writer:
    for sheet_name, frame in sheets_to_export.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

## Visualise unique clusters

In [106]:
def visualize_result(clustered_df, method: str):
    """Render the node pairs in ``clustered_df`` as an interactive pyvis graph.

    Each row supplies two nodes (``node_1_*`` / ``node_2_*`` columns: title,
    predicted cluster, cluster keywords) and one edge between them carrying
    ``edge_weight`` in its hover text. Nodes are identified and labelled by
    title and grouped by cluster keywords.

    Args:
        clustered_df: DataFrame with the ``node_1_*``, ``node_2_*`` and
            ``edge_weight`` columns described above.
        method: tag used in the output file name, ``neo4j_{method}.html``.
    """
    graph = pyvis.network.Network(select_menu=True, filter_menu=True)

    for _, pair in clustered_df.iterrows():
        # Register both endpoints before connecting them.
        for side in ("node_1", "node_2"):
            node_title = pair[f"{side}_title"]
            pred_cluster = pair[f"{side}_pred_cluster"]
            cluster_kws = pair[f"{side}_cluster_kws"]
            graph.add_node(
                node_title,
                label=node_title,
                title=f"\nPredicted: {pred_cluster}\nTitle: {node_title}\nKeywords: {cluster_kws}",
                group=cluster_kws,
                cluster_num=pred_cluster,
            )

        graph.add_edge(
            pair["node_1_title"],
            pair["node_2_title"],
            title=f"Edge Weight: {pair['edge_weight']}",
        )

    graph.show(f"neo4j_{method}.html", notebook=False)

def get_unique_inter_grp(df):
    """Count the distinct cluster pairs joined by at least one cross-cluster edge.

    Only rows whose ``node_1_pred_cluster`` differs from ``node_2_pred_cluster``
    are considered. Pairs are ordered: ``(a, b)`` and ``(b, a)`` are counted as
    two different pairs.
    """
    pair_columns = ['node_1_pred_cluster', 'node_2_pred_cluster']
    crosses_clusters = df["node_1_pred_cluster"].ne(df["node_2_pred_cluster"])
    distinct_pairs = df.loc[crosses_clusters, pair_columns].drop_duplicates()
    return len(distinct_pairs)

In [81]:
# Edge lists (node pairs with their predicted clusters) for each method.
clustered_nodes_emb, clustered_nodes_sim = (
    pd.read_csv(os.path.join(method_dir, "neo_4j_clustered_data.csv"))
    for method_dir in (emb_path, sim_path)
)

In [104]:
# Boolean masks marking edges whose two endpoints sit in different predicted clusters.
# They are computed outside the f-strings because reusing double quotes inside
# a double-quoted f-string is a SyntaxError before Python 3.12.
emb_cross_cluster = clustered_nodes_emb["node_1_pred_cluster"] != clustered_nodes_emb["node_2_pred_cluster"]
sim_cross_cluster = clustered_nodes_sim["node_1_pred_cluster"] != clustered_nodes_sim["node_2_pred_cluster"]

print("Number of intergroup connections")
print(f"EMB method: {int(emb_cross_cluster.sum())}")
print(f"SIM method: {int(sim_cross_cluster.sum())}")

print("\nNumber of unique interconnected groups")
print(f"EMB method: {get_unique_inter_grp(clustered_nodes_emb)}")
print(f"SIM method: {get_unique_inter_grp(clustered_nodes_sim)}")

Number of intergroup connections
EMB method: 15
SIM method: 35

Number of unique interconnected groups
EMB method: 7
SIM method: 15


In [107]:
# Visualisation for unique EMB clusters: keep only edges whose two endpoints
# both belong to clusters that are unique to the EMB method.
unique_emb_list = list(df_unique_emb["cluster"])
emb_node_1_is_unique = clustered_nodes_emb['node_1_pred_cluster'].isin(unique_emb_list)
emb_node_2_is_unique = clustered_nodes_emb['node_2_pred_cluster'].isin(unique_emb_list)
clustered_nodes_emb_filtered = clustered_nodes_emb[emb_node_1_is_unique & emb_node_2_is_unique]
visualize_result(clustered_nodes_emb_filtered, 'emb')

neo4j_emb.html


In [108]:
# Visualisation for unique SIM clusters: keep only edges whose two endpoints
# both belong to clusters that are unique to the SIM method.
unique_sim_list = list(df_unique_sim["cluster"])
sim_node_1_is_unique = clustered_nodes_sim['node_1_pred_cluster'].isin(unique_sim_list)
sim_node_2_is_unique = clustered_nodes_sim['node_2_pred_cluster'].isin(unique_sim_list)
clustered_nodes_sim_filtered = clustered_nodes_sim[sim_node_1_is_unique & sim_node_2_is_unique]
visualize_result(clustered_nodes_sim_filtered, 'sim')

neo4j_sim.html


## Comparing unique clusters

In [246]:
# Cluster ids to compare side by side (one from each method's unique clusters).
emb_cluster_num = 532
sim_cluster_num = 592

# Title list of the selected cluster on each side; .iloc[0] raises IndexError
# if the id is not among that method's unique clusters.
emb_titles_list = df_unique_emb.loc[df_unique_emb["cluster"] == emb_cluster_num, "titles"].iloc[0]
sim_titles_list = df_unique_sim.loc[df_unique_sim["cluster"] == sim_cluster_num, "titles"].iloc[0]

emb_set, sim_set = set(emb_titles_list), set(sim_titles_list)
common_articles = emb_set & sim_set

print(
    f"Emb cluster size: {len(emb_titles_list)} "
    f"    \nSim cluster size: {len(sim_titles_list)} "
    f"    \nNumber of overlap articles: {len(common_articles)}"
)

# Articles that appear in only one of the two clusters.
print(
    f"\nUnique articles in EMB cluster: "
    f"    \n{list(emb_set - sim_set)} "
    f"    \n\nUnique articles in SIM cluster: "
    f"    \n{list(sim_set - emb_set)}"
)

Emb cluster size: 41     
Sim cluster size: 36     
Number of overlap articles: 31

Unique articles in EMB cluster:     
["Life is Better When You're Sober", 'Alcohol and Health—Set Your Drinking Limits', 'Why is Binge Drinking Bad for You?', 'Know Your Alcohol Limit: Don’t Be a Party Pooper!', 'Drinking (or not) to a Healthy Chinese New Year', 'Alcohol — More than Meets the Eye', 'Drinking Myths Busted!', 'Staying Sober and Within the Alcohol Limit', 'Responsible Drinking: Know Your Alcohol Limit', 'Ditch Both that Cigarette and Drink!']     

Unique articles in SIM cluster:     
['Environmental Tobacco Smoke', 'Are e-cigarettes harmful?', 'Effects of Secondhand Smoke on Your Child’s Health', 'Smoke-free Environment for a Healthier Family', '"Vaping is not smoking", and Other Tobacco Myths']


## End