In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import os

In [22]:
# Parameters
# These three values select which precomputed similarity-score CSV is analysed
# and how the artefacts written by this notebook are named.
# Embedding model name: prefix of the input/output filenames, shown in the plot
# title and recorded in the summary row.
EMBEDDING_MODEL: str = "mxbai-embed-large-v1"
# When truthy, this value is inserted into the similarity-score CSV filename;
# a falsy value (e.g. "") selects the filename without a pooling component.
POOLING_STRATEGY: str = "mean"
# Similarity metric label: part of the input/output filenames, shown in the
# plot title and recorded in the summary row.
SIMILARITY_METRIC: str = "cos"

In [23]:
# Destination of the interactive histogram (HTML), relative to the notebook's
# working directory.
# NOTE(review): the filename carries the model and metric but not
# POOLING_STRATEGY, although the similarity-score CSV name does include it --
# runs that differ only in pooling write to the same file. Confirm intended.
histogram_filename = f"{EMBEDDING_MODEL}_{SIMILARITY_METRIC}_histogram.html"
OUTPUT_THRES_PATH = os.path.join(os.pardir, "artifacts", "outputs", histogram_filename)

# Excel workbook holding the ground-truth grouping of pages.
groundtruth_filename = "Synapxe Content Prioritisation - Live Healthy_020724.xlsx"
INPUT_GROUNDTRUTH_PATH = os.path.join(os.pardir, "data", groundtruth_filename)

In [24]:
# Read the ground-truth workbook. An integer sheet_name is positional and
# zero-based, so 2 selects the third sheet.
ground_truth_raw = pd.read_excel(INPUT_GROUNDTRUTH_PATH, sheet_name=2)

# Keep only the title, the ground-truth group id and the URL of each page.
ground_truth_columns = ["Page Title", "Combine Group ID", "URL"]
ground_truth = ground_truth_raw[ground_truth_columns]

In [25]:
# Pick the precomputed pairwise similarity matrix. The pooling strategy is part
# of the filename only when one is configured.
if POOLING_STRATEGY:
    similarity_score_filename = (
        f"{EMBEDDING_MODEL}_{POOLING_STRATEGY}_{SIMILARITY_METRIC}_similarity_score.csv"
    )
else:
    similarity_score_filename = (
        f"{EMBEDDING_MODEL}_{SIMILARITY_METRIC}_similarity_score.csv"
    )

cosine_sim_df = pd.read_csv(f"../artifacts/outputs/{similarity_score_filename}")

# Every column other than "Page Title" holds the scores against one page.
all_col_minus_pagetitle = [e for e in cosine_sim_df.columns if e != "Page Title"]

# Wide matrix -> long format: one row per (row title, column title, score).
cosine_melt = pd.melt(
    cosine_sim_df, id_vars=["Page Title"], value_vars=all_col_minus_pagetitle
)

# Order-insensitive key for each pair, so (A, B) and (B, A) compare equal and a
# page paired with itself yields a key of length 1.
# frozenset rather than set: drop_duplicates needs hashable values, and a plain
# set is unhashable (it only de-duplicated by accident of pandas internals).
# Built with zip instead of a row-wise apply, which is much slower.
cosine_melt["Pair"] = [
    frozenset(titles)
    for titles in zip(cosine_melt["Page Title"], cosine_melt["variable"])
]

# Keep the first occurrence of each unordered pair.
# NOTE(review): this presumes score(A, B) == score(B, A) -- confirm the matrix
# is symmetric for every SIMILARITY_METRIC used.
cosine_melt = cosine_melt.drop_duplicates("Pair").rename(
    columns={"Page Title": "Page Title 1", "variable": "Page Title 2"}
)
print(cosine_melt.shape)

(17578, 4)


In [26]:
# Titles listed more than once in the ground truth make the inner joins below
# fan out (one output row per matching ground-truth row), so every pair that
# involves such a title is counted several times in the statistics that follow.
# Report it instead of letting it pass silently.
# NOTE(review): decide which ground-truth row should win and de-duplicate.
duplicated_titles = ground_truth.loc[
    ground_truth["Page Title"].duplicated(), "Page Title"
].unique()
if len(duplicated_titles) > 0:
    print(
        f"WARNING: {len(duplicated_titles)} page title(s) occur more than once in "
        f"the ground truth; pairs involving them are duplicated by the merges: "
        f"{list(duplicated_titles)}"
    )

# Attach the ground-truth columns of both pages of each pair. The first join
# adds them for 'Page Title 1'; the second adds them again for 'Page Title 2',
# and the clashing column names receive the '_1' / '_2' suffixes. Inner joins
# drop pairs where either title is missing from the ground truth.
cosine_melt_ground_truth = cosine_melt.merge(
    ground_truth,
    how="inner",
    left_on="Page Title 1",
    right_on="Page Title",
).merge(
    ground_truth,
    how="inner",
    left_on="Page Title 2",
    right_on="Page Title",
    suffixes=("_1", "_2"),
)

# Drop self matches: a page paired with itself has a one-element 'Pair'.
is_distinct_pair = cosine_melt_ground_truth["Pair"].map(len).ne(1)
cosine_melt_ground_truth = cosine_melt_ground_truth[is_distinct_pair]

# Pairs whose two pages share the same ground-truth group ...
in_same_group = (
    cosine_melt_ground_truth["Combine Group ID_1"]
    == cosine_melt_ground_truth["Combine Group ID_2"]
)
same_group_cosine = cosine_melt_ground_truth[in_same_group]

# ... and pairs whose pages belong to different ground-truth groups. A missing
# group id never compares equal, so such pairs land here.
diff_group_cosine = cosine_melt_ground_truth[~in_same_group]

same_group_cosine.shape, diff_group_cosine.shape

((383, 10), (17569, 10))

In [27]:
# Median similarity score of the same-group pairs and of the different-group
# pairs; both values are reused by the plot and the summary table below.
same_group_scores = same_group_cosine["value"]
diff_group_scores = diff_group_cosine["value"]
same_group_median = np.median(same_group_scores)
diff_group_median = np.median(diff_group_scores)
print(
    f"Same group median: {same_group_median}",
    f"Different group median: {diff_group_median}",
    sep="\n",
)

Same group median: 0.8355293
Different group median: 0.5968283


In [41]:
def _xbins_from_edges(edges):
    """Build a Plotly ``xbins`` spec from NumPy bin edges.

    Uses the first edge as start, the last edge as end and the width of the
    first bin as the bin size.
    """
    return {"start": edges[0], "end": edges[-1], "size": edges[1] - edges[0]}


def _score_histogram(scores, edges, name, color, opacity):
    """Return a black-outlined histogram trace of ``scores`` binned per ``edges``."""
    return go.Histogram(
        x=scores,
        xbins=_xbins_from_edges(edges),
        opacity=opacity,
        name=name,
        marker={"color": color, "line": {"color": "black", "width": 1}},
    )


def _mark_median(figure, median, label):
    """Draw a full-height dotted red line at ``median`` with an arrowed label on top."""
    figure.add_shape(
        type="line",
        x0=median,
        y0=0,
        x1=median,
        y1=1,
        xref="x",
        yref="paper",  # y in 0..1 of the plot area, independent of the counts
        line={"color": "red", "width": 2, "dash": "dot"},
    )
    figure.add_annotation(
        x=median,
        y=1,
        xref="x",
        yref="paper",
        text=label,
        showarrow=True,
        arrowhead=2,
        ax=20,
        ay=-20,
    )


# Each group gets 100 equal-width bins spanning its own score range.
bin_edges_same = np.histogram_bin_edges(same_group_cosine["value"], bins=100)
bin_edges_diff = np.histogram_bin_edges(diff_group_cosine["value"], bins=100)

hist_diff_group = _score_histogram(
    diff_group_cosine["value"], bin_edges_diff, "Diff Group (DG)", "skyblue", 0.7
)
hist_same_group = _score_histogram(
    same_group_cosine["value"], bin_edges_same, "Same Group (SG)", "orange", 0.5
)

fig = go.Figure(data=[hist_diff_group, hist_same_group])

# Median markers: same group first, then different group.
_mark_median(fig, same_group_median, f"SG Median:{round(same_group_median,2)}")
_mark_median(fig, diff_group_median, f"DG Median:{round(diff_group_median,2)}")

# Dotted grey connector between the two medians at 70% of the plot height,
# labelled at its midpoint with the size of the gap.
fig.add_shape(
    type="line",
    x0=diff_group_median,
    y0=0.7,
    x1=same_group_median,
    y1=0.7,
    xref="x",
    yref="paper",
    line={"color": "gray", "width": 1, "dash": "dot"},
)
fig.add_annotation(
    x=(same_group_median - diff_group_median)/2 +diff_group_median,
    y=0.7,
    xref="x",
    yref="paper",
    text=f"Difference:{round(same_group_median - diff_group_median,2)}",
    ax=20,
    ay=-20,
)

# Overlay (rather than stack) the two histograms and label the figure.
fig.update_layout(
    title=f"Histogram of {SIMILARITY_METRIC} distribution for {EMBEDDING_MODEL}",
    xaxis_title="Value",
    yaxis_title="Frequency",
    barmode="overlay",
    template="plotly_white",
)

# Save a standalone HTML copy, then render inline.
pio.write_html(fig, file=OUTPUT_THRES_PATH)
fig.show()

In [29]:
# One-row summary of this run: which model/metric was analysed, the two group
# medians and the gap between them.
model_results = {
    "model": [EMBEDDING_MODEL],
    "similarity_method": [SIMILARITY_METRIC],
    "diff_grp_median": [diff_group_median],
    "same_grp_median": [same_group_median],
    "median difference": [same_group_median - diff_group_median],
}
model_results_df = pd.DataFrame(model_results)

# Built with os.path.join like the other paths in this notebook: a hard-coded
# backslash path only resolves on Windows (elsewhere the backslashes become
# part of a file name created in the working directory).
results_filepath = os.path.join(
    "..", "artifacts", "outputs", "median_comparison_summary.csv"
)

# Append to the running comparison table if it already exists, otherwise
# start a new one.
# NOTE(review): re-running this cell appends the same row again; de-duplicate
# on the identifying columns if the summary should hold one row per run.
if os.path.exists(results_filepath):
    results_df = pd.concat(
        [pd.read_csv(results_filepath), model_results_df], ignore_index=True
    )
else:
    results_df = model_results_df

results_df.to_csv(results_filepath, index=False)
print("file saved")

file saved


In [30]:
# Reload the summary CSV from disk and display it; this rebinds results_df to
# the on-disk contents rather than the frame built in the previous cell.
results_df = pd.read_csv(results_filepath)
results_df

Unnamed: 0,model,similarity_method,diff_grp_median,same_grp_median,median difference
0,doc2vec,cos,0.23245,0.70638,0.473931
1,multi-qa-mpnet-base-dot-v1,dot,0.344353,0.623807,0.279454
2,multi-qa-mpnet-base-cos-v1,dot,0.202332,0.589718,0.387386
3,multi-qa-mpnet-base-cos-v1,cos,0.252856,0.737943,0.485087
4,multi-qa-mpnet-base-cos-v1,euclidean,0.477552,0.603712,0.126161
5,bge-large-en-v1.5,cos,0.63653,0.829381,0.192851
6,bge-large-en-v1.5-quant,cos,0.644356,0.831734,0.187378
7,mxbai-embed-large-v1,cos,0.596828,0.835529,0.238701
