In [1]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
# Parameters
# `method` selects which similarity run is analysed: it is the filename prefix
# of the score workbook read later (f"{method}_similarity_score.xlsx").
method = "sbert_embeddings"

In [26]:
def plot_histogram(
    same_group_median,
    diff_group_median,
    same_group_values=None,
    diff_group_values=None,
    sheet_name=None,
):
    """Plot overlaid same-group / diff-group score histograms and save as HTML.

    The figure shows both score distributions (100 bins each), a dotted red
    line at each median, and a grey dotted connector labelled with the
    difference between the two medians. It is written to
    ``../artifacts/outputs/<sheet_name>_histogram.html`` and then displayed.

    Args:
        same_group_median: Median score of the same-group pairs.
        diff_group_median: Median score of the different-group pairs.
        same_group_values: Scores of the same-group pairs. Defaults to the
            notebook-level ``same_group_cosine["value"]``.
        diff_group_values: Scores of the different-group pairs. Defaults to
            the notebook-level ``diff_group_cosine["value"]``.
        sheet_name: Label used in the title and the output filename. Defaults
            to the notebook-level ``sheet``.

    Raises:
        NameError: If an optional argument is omitted and the corresponding
            notebook-level variable has not been defined yet.
    """
    # Backward-compatible fallbacks: older call sites pass only the medians
    # and rely on the loop's global variables for everything else.
    if same_group_values is None:
        same_group_values = same_group_cosine["value"]
    if diff_group_values is None:
        diff_group_values = diff_group_cosine["value"]
    if sheet_name is None:
        sheet_name = sheet

    # Diff-group trace goes first so the same-group trace is drawn on top.
    fig = go.Figure(
        data=[
            _histogram_trace(diff_group_values, "Diff Group (DG)", "skyblue", 0.7),
            _histogram_trace(same_group_values, "Same Group (SG)", "orange", 0.5),
        ]
    )

    _add_median_marker(fig, same_group_median, "SG")
    _add_median_marker(fig, diff_group_median, "DG")

    # Annotate difference in median: horizontal connector at 70% plot height.
    fig.add_shape(
        dict(
            type="line",
            x0=diff_group_median,
            y0=0.7,
            x1=same_group_median,
            y1=0.7,
            xref="x",
            yref="paper",
            line=dict(color="gray", width=1, dash="dot"),
        )
    )
    fig.add_annotation(
        dict(
            # Midpoint between the two medians.
            x=(same_group_median - diff_group_median) / 2 + diff_group_median,
            y=0.7,
            xref="x",
            yref="paper",
            text=f"Difference:{round(same_group_median - diff_group_median,2)}",
            ax=20,
            ay=-20,
        )
    )

    fig.update_layout(
        title=f"Histogram of similarity score distribution for {sheet_name}",
        xaxis_title="Value",
        yaxis_title="Frequency",
        barmode="overlay",
        template="plotly_white",
    )

    output_dir = os.path.join("..", "artifacts", "outputs")
    # Make sure the target directory exists so the export cannot fail on it.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{sheet_name}_histogram.html")
    pio.write_html(fig, file=output_path)
    fig.show()


def _histogram_trace(values, name, color, opacity):
    """Build one histogram trace with 100 equal-width bins spanning ``values``."""
    edges = np.histogram_bin_edges(values, bins=100)
    return go.Histogram(
        x=values,
        xbins=dict(
            start=edges[0],
            end=edges[-1],
            size=edges[1] - edges[0],
        ),
        opacity=opacity,
        name=name,
        marker=dict(color=color, line=dict(color="black", width=1)),
    )


def _add_median_marker(fig, median, label):
    """Draw a full-height dotted red line at ``median`` with a labelled arrow."""
    fig.add_shape(
        dict(
            type="line",
            x0=median,
            y0=0,
            x1=median,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="red", width=2, dash="dot"),
        )
    )
    fig.add_annotation(
        dict(
            x=median,
            y=1,
            xref="x",
            yref="paper",
            text=f"{label} Median:{round(median,2)}",
            showarrow=True,
            arrowhead=2,
            ax=20,
            ay=-20,
        )
    )

In [20]:
# Ground-truth workbook with the page groupings.
INPUT_GROUNDTRUTH_PATH = os.path.join(
    "..", "data", "Synapxe Content Prioritisation - Live Healthy_020724.xlsx"
)

# Similarity-score workbook produced for the selected `method`.
INPUT_SIMILARITY_SCORE_PATH = os.path.join(
    "..", "artifacts", "outputs", f"{method}_similarity_score.xlsx"
)

In [17]:
# Load the sheet at position 2 (zero-based, i.e. the third sheet) of the
# ground-truth workbook and keep only the columns used downstream.
ground_truth = pd.read_excel(INPUT_GROUNDTRUTH_PATH, sheet_name=2)[
    ["Page Title", "Combine Group ID", "URL"]
]

In [27]:
# For every sheet of the similarity workbook: split the pairwise scores into
# same-group and different-group pairs, plot both distributions, and record
# the two medians in a summary CSV.

# Built with os.path.join like every other path in this notebook; a literal
# backslash path only resolves on Windows.
results_filepath = os.path.join(
    "..",
    "artifacts",
    "outputs",
    "median_comparison_summary2.csv",
)

# One summary row per sheet; the CSV is written once after the loop instead
# of being re-read and rewritten on every iteration.
model_results = []

# Open the workbook once and make sure the handle is closed afterwards.
with pd.ExcelFile(INPUT_SIMILARITY_SCORE_PATH) as xls:
    sheet_names = xls.sheet_names

    for sheet in sheet_names:
        cosine_sim_df = pd.read_excel(xls, sheet_name=sheet)

        # The first two "_"-separated parts of the sheet name are taken as
        # the embedding model and the similarity metric.
        sheet_name_parts = sheet.split("_")
        EMBEDDING_MODEL = sheet_name_parts[0]
        SIMILARITY_METRIC = sheet_name_parts[1]

        # Wide matrix -> long table: one row per (first column, other column).
        cosine_melt = pd.melt(
            cosine_sim_df,
            id_vars=cosine_sim_df.columns[0],
            value_vars=cosine_sim_df.columns[1:].tolist(),
        )
        cosine_melt = cosine_melt.rename(
            columns={cosine_melt.columns[0]: "Page Title"}
        )

        # Order-independent key so (A, B) and (B, A) collapse to one pair.
        # frozenset is hashable; a plain set is not, which pandas cannot
        # de-duplicate reliably.
        cosine_melt["Pair"] = cosine_melt[["Page Title", "variable"]].apply(
            frozenset, axis=1
        )
        cosine_melt = cosine_melt.drop_duplicates("Pair").rename(
            columns={"Page Title": "Page Title 1", "variable": "Page Title 2"}
        )

        # Get ground truth for both 'Page Title 1' and 'Page Title 2'
        cosine_melt_ground_truth = pd.merge(
            cosine_melt,
            ground_truth,
            how="inner",
            left_on="Page Title 1",
            right_on="Page Title",
        )
        cosine_melt_ground_truth = pd.merge(
            cosine_melt_ground_truth,
            ground_truth,
            how="inner",
            left_on="Page Title 2",
            right_on="Page Title",
            suffixes=("_1", "_2"),
        )

        # Drop self match: a page paired with itself yields a one-element key.
        cosine_melt_ground_truth = cosine_melt_ground_truth[
            cosine_melt_ground_truth["Pair"].map(len) != 1
        ]

        # Split pairs by whether both pages share a ground-truth group ID.
        is_same_group = (
            cosine_melt_ground_truth["Combine Group ID_1"]
            == cosine_melt_ground_truth["Combine Group ID_2"]
        )
        same_group_cosine = cosine_melt_ground_truth[is_same_group]
        diff_group_cosine = cosine_melt_ground_truth[~is_same_group]

        # Get median of both groups
        same_group_median = np.median(same_group_cosine["value"])
        diff_group_median = np.median(diff_group_cosine["value"])

        # Plot histogram (plot_histogram reads same_group_cosine,
        # diff_group_cosine and sheet from this scope).
        plot_histogram(same_group_median, diff_group_median)

        model_results.append(
            {
                "model": EMBEDDING_MODEL,
                "similarity_method": SIMILARITY_METRIC,
                "diff_grp_median": diff_group_median,
                "same_grp_median": same_group_median,
                "median difference": same_group_median - diff_group_median,
            }
        )

# Save results: append this run's rows to any existing summary file.
if model_results:
    model_results_df = pd.DataFrame(model_results)

    if os.path.exists(results_filepath):
        results_df = pd.concat([pd.read_csv(results_filepath), model_results_df])
    else:
        results_df = model_results_df

    results_df.to_csv(results_filepath, index=False)