In [None]:
import pandas as pd

In [None]:
# Load the decisions table that the analysis below iterates over.
# The path is relative, so it resolves against the kernel's working directory.
all_decisions_df = pd.read_csv(filepath_or_buffer="results/all_decisions_df.csv")

In [None]:
def _as_pair_set(frame):
    """Return the distinct (source, target) pairs of ``frame`` as a set of tuples."""
    return set(map(tuple, frame[["source", "target"]].values))


# One record per (dataset, task_scope, experiment_run); converted to a DataFrame at the end.
complementarity = []
for dataset in all_decisions_df["dataset"].unique():
    _dataset_df = all_decisions_df.query("dataset == @dataset")

    # False negatives of the 'jaro' task scope: rows flagged `benchmark` whose decision is
    # not 'yes'. They depend only on the dataset, so they are computed once here instead of
    # once per experiment run.
    # NOTE(review): `benchmark` is presumably a boolean ground-truth flag — confirm.
    _jaro_fn_df = _dataset_df.query("task_scope == 'jaro'").query("benchmark & (decision != 'yes')")
    _jaro_FN = _as_pair_set(_jaro_fn_df)

    for task_scope in _dataset_df["task_scope"].unique():
        if task_scope == "jaro":
            # The 'jaro' rows are reported as a single pseudo-scope "FN" with run id 0.
            # NOTE(review): this is a row count, whereas the counts for the other scopes
            # are unique-pair counts; they only line up if each (source, target) pair
            # occurs once in the 'jaro' rows — confirm.
            complementarity.append({
                "dataset": dataset,
                "task_scope": "FN",
                "experiment_run": 0,
                "count": _jaro_fn_df.shape[0],
            })
            continue

        _task_scope_df = _dataset_df.query("task_scope == @task_scope")
        for experiment_run in _task_scope_df["experiment_run"].unique():
            _experiment_df = _task_scope_df.query("experiment_run == @experiment_run")
            # TODO: carried over from the original code, which left an unspecified open item here.
            # Pairs this run answered 'yes' to.
            _llm_P = _as_pair_set(_experiment_df.query("(decision == 'yes')"))
            complementarity.append({
                "dataset": dataset,
                "task_scope": task_scope,
                "experiment_run": experiment_run,
                # 'jaro' false negatives that this run answered 'yes' to.
                "count": len(_llm_P & _jaro_FN),
            })

complementarity_df = pd.DataFrame(complementarity)

In [None]:
# Median count per (dataset, task_scope) over the experiment runs, in long format.
_median_counts = (
    complementarity_df
    .groupby(["dataset", "task_scope"])["count"]
    .median()
    .reset_index()
)

# Reshape to one row per dataset and one column per task scope.
_wide_counts = _median_counts.pivot(index="dataset", columns="task_scope", values="count")

# Fix the column order (FN first) and cast to integers; the cast drops any
# fractional part of a median.
_scope_order = ["FN", "1-to-1", "1-to-n", "n-to-1", "n-to-n"]
complementarity_table = _wide_counts[_scope_order].astype(int)
complementarity_table

In [None]:
import numpy as np
import plotly.graph_objects as go

# Datasets in descending index order; the same ordering feeds the y axis, the cell
# colours and the cell labels.
_sorted_table = complementarity_table.sort_index(ascending=False)

# Colour intensity per cell: each count divided by the row's first column (the FN count).
# The first column itself, and any row whose FN count is not positive, is set to 0.
values = [
    [
        count / counts[0] if position > 0 and counts[0] > 0 else 0
        for position, count in enumerate(counts)
    ]
    for counts in _sorted_table.values
]

_heatmap = go.Heatmap(
    x=["baseline FN", "1-to-1", "1-to-N", "N-to-1", "N-to-M"],
    y=_sorted_table.index,
    z=values,
    # Cells are labelled with the raw integer counts rather than the ratios in `z`.
    text=_sorted_table.values,
    texttemplate="%{text:d}",
    textfont={"size": 16},
    colorscale="PRGn",
    # Symmetric range around 0, so a ratio of 0 sits at the midpoint of the colour scale.
    zmin=-1.0,
    zmax=1.0,
    showscale=False,
)
_layout = dict(
    title="Baseline False Negatives (FN) retrieved per task scope (first column shows the baseline FN count)",
    height=600,
    width=1000,
)

fig = go.Figure(data=_heatmap, layout=_layout)
fig.show()