In [None]:
from typing import Any, Iterator, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Load all recorded decisions; missing values in the "model" column are
# replaced by an empty string so the column can be compared and grouped on.
all_decisions_df = pd.read_csv("results/all_decisions_df.csv").fillna(
    {"model": ""}
)

In [None]:
def iterate_experiments(
    df: pd.DataFrame,
) -> Iterator[Tuple[Any, Any, Any, Any, pd.DataFrame]]:
    """Yield the decision rows of every experiment contained in ``df``.

    An experiment is identified by the combination of the ``model``,
    ``dataset``, ``task_scope`` and ``experiment_run`` columns. Combinations
    are visited in nested order (model, then dataset, then task scope, then
    run), each level following the order of first appearance of its values.
    Combinations without any rows are skipped.

    For the ``"n-gram"`` task scope the rows are *not* split by run: every run
    of that scope yields the same sub-frame containing all of its runs.

    Args:
        df: Frame with at least the columns ``model``, ``dataset``,
            ``task_scope`` and ``experiment_run``.

    Yields:
        Tuples ``(model, dataset, task_scope, experiment_run, experiment_df)``
        where ``experiment_df`` holds the rows belonging to that experiment.
    """
    # These do not depend on the outer loop variables, so compute them once.
    datasets = df["dataset"].unique()
    task_scopes = df["task_scope"].unique()

    for model in df["model"].unique():
        model_df = df.query("model == @model")
        for dataset in datasets:
            dataset_df = model_df.query("dataset == @dataset")
            for task_scope in task_scopes:
                task_scope_df = dataset_df.query("task_scope == @task_scope")
                # An empty sub-frame has no runs, so nothing is yielded for it.
                for experiment_run in task_scope_df["experiment_run"].unique():
                    if task_scope == "n-gram":
                        # n-gram rows are deliberately kept together across runs.
                        experiment_df = task_scope_df
                    else:
                        experiment_df = task_scope_df.query(
                            "experiment_run == @experiment_run"
                        )
                    yield model, dataset, task_scope, experiment_run, experiment_df

In [None]:
def _positive_pairs(experiment_df: pd.DataFrame) -> set:
    """Return the set of (source, target) pairs whose decision is 'yes'."""
    yes_pairs = experiment_df.query("(decision == 'yes')")[
        ["source", "target"]
    ].values
    return set((p[0], p[1]) for p in yes_pairs)


combinations = []

# Collapse all datasets into a single pseudo-dataset so that experiments are
# only distinguished by model, task scope and run.
ignore_datasets_df = all_decisions_df.copy()
ignore_datasets_df["dataset"] = "all"

# Materialize every experiment and its positive pairs once. Re-running
# iterate_experiments (and all of its queries) inside the inner loop would
# repeat the same work for every outer experiment.
experiments = [
    (exp_model, exp_scope, exp_run, _positive_pairs(exp_df))
    for exp_model, _, exp_scope, exp_run, exp_df in iterate_experiments(
        ignore_datasets_df
    )
]

for model, task_scope, experiment_run, _scope_P in experiments:
    for (
        other_model,
        other_task_scope,
        other_experiment_run,
        _other_scope_P,
    ) in experiments:
        # skip inter model combinations (GPT-3.5 paired with GPT-4)
        if {model, other_model} == {"GPT-3.5", "GPT-4"}:
            continue

        # Within the same task scope only pair a run with itself; the
        # per-scope value is later obtained by averaging over the runs.
        if (task_scope == other_task_scope) and (
            experiment_run != other_experiment_run
        ):
            continue

        combinations.append(
            {
                "from": task_scope,
                "from_run": experiment_run,
                "from_model": model,
                "from_P": len(_scope_P),
                "to": other_task_scope,
                "to_run": other_experiment_run,
                "to_model": other_model,
                "to_P": len(_other_scope_P),
                # size of the union of both sets of 'yes' pairs
                "comb_P": len(_scope_P | _other_scope_P),
            }
        )

combinations_df = pd.DataFrame(combinations)
combinations_df

In [None]:
# Order in which the task scopes appear on the rows and, per model, the columns.
scope_order = ["n-gram", "1-to-1", "1-to-n", "n-to-1", "n-to-n"]

# Mean combined positive count per (from, to, to_model), averaged over runs.
mean_comb_P_df = (
    combinations_df.groupby(["from", "to", "to_model"])["comb_P"]
    .mean()
    .reset_index()
)

# (to_model, to) columns to keep, in display order: the model-less first
# scope, every remaining scope for GPT-3.5, and the last three for GPT-4.
column_order = (
    [("", scope) for scope in scope_order[:1]]
    + [("GPT-3.5", scope) for scope in scope_order[1:]]
    + [("GPT-4", scope) for scope in scope_order[2:]]
)

combination_table = mean_comb_P_df.pivot(
    index="from",
    columns=["to_model", "to"],
    values="comb_P",
).loc[scope_order, column_order]

In [None]:
import plotly.graph_objects as go


def should_visualize(from_scope: str, to_scope: str) -> bool:
    """Decide whether the cell (from_scope, to_scope) should be drawn.

    A cell is drawn when ``from_scope`` does not come after ``to_scope`` in
    ``scope_order``, so only one triangle of the table (including the
    diagonal) is shown. Raises ``ValueError`` if either scope is not listed
    in ``scope_order``.
    """
    from_rank = scope_order.index(from_scope)
    to_rank = scope_order.index(to_scope)
    return not from_rank > to_rank


# Build the heatmap matrix and its cell labels in a single pass over the
# table. Cells that should not be drawn get a zero value and an empty label.
values = []
texts = []
for from_scope, row in combination_table.iterrows():
    value_row = []
    text_row = []
    for (to_model, to_scope), value in row.items():
        if should_visualize(from_scope, to_scope):
            value_row.append(value)
            text_row.append(f"{value:.1f}")
        else:
            value_row.append(0.0)
            text_row.append("")
    values.append(value_row)
    texts.append(text_row)

# Heatmap of the combination table. The x axis is a two-level category axis
# (model, task scope); the y axis is reversed so the first scope is on top.
fig = go.Figure(
    layout=dict(
        # The cells hold the mean size of the union of 'yes' decisions of two
        # task scopes (comb_P) on a single-hue scale, not F1-scores.
        title="Mean number of combined positive decisions per task-scope pair",
        height=600,
        width=1000,
        yaxis={"autorange": "reversed"},
    ),
    data=go.Heatmap(
        x=[
            combination_table.columns.get_level_values(0),
            combination_table.columns.get_level_values(1),
        ],
        y=combination_table.index,
        z=values,
        text=texts,
        texttemplate="%{text}",
        textfont={"size": 16},
        colorscale="greens",
        # Hidden cells are encoded as 0, so anchor the color scale there.
        zmin=0,
        zmax=combination_table.max().max(),
        showscale=False,
    ),
)
fig.show()