In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Ground truth: every row is one correct source -> target attribute match.
benchmark = pd.read_csv("benchmark/ground_truth.csv")
for side in ("source", "target"):
    qualified_name = benchmark[side]
    # Break the dotted identifier into three parts *before* lower-casing, so
    # the part columns keep the spelling found in the CSV.
    name_parts = qualified_name.str.split(".", expand=True)
    benchmark[[f"{side}_schema", f"{side}_relation", f"{side}_attribute"]] = name_parts
    # The full identifier itself is stored lower-cased.
    benchmark[side] = qualified_name.str.lower()
# Constant marker column: every row of this frame is flagged True.
benchmark["benchmark"] = True

# Result files, read in the same order as before: GPT-3.5, GPT-4, baseline.
gpt35_results, gpt4_results, baseline_results = map(
    pd.read_csv,
    (
        "results/gpt35_results.csv",
        "results/gpt4_results.csv",
        "results/baseline_results.csv",
    ),
)

In [None]:
def abbreviate(source: str, target: str) -> str:
    """Return the four-letter dataset code for a source/target relation pair.

    The lookup is case-insensitive: both names are lower-cased before the
    pair is looked up.

    Args:
        source: Name of the source relation, e.g. ``"patients"``.
        target: Name of the target relation, e.g. ``"person"``.

    Returns:
        The abbreviation registered for the pair, e.g. ``"PaPe"``.

    Raises:
        KeyError: If the lower-cased ``(source, target)`` pair is not one of
            the known relation pairs.
    """
    relation_pair = (source.lower(), target.lower())
    codes_by_pair = {
        ("patients", "person"): "PaPe",
        ("admissions", "visit_occurrence"): "AdVO",
        ("prescriptions", "drug_exposure"): "PrDE",
        ("admissions", "condition_occurrence"): "AdCO",
        ("diagnoses_icd", "condition_occurrence"): "DiCO",
        ("labevents", "measurement"): "LaMe",
        ("admissions", "visit_detail"): "AdVD",
        ("services", "visit_detail"): "SeVD",
        ("transfers", "visit_detail"): "TrVD",
    }
    return codes_by_pair[relation_pair]


# Tag every result frame with its dataset abbreviation. The two LLM frames
# additionally get a run number (three consecutive decision_index values share
# one run) and a display name for the model; the baseline frame gets neither
# in this cell.
for _frame, _model_label in (
    (gpt35_results, "GPT-3.5"),
    (gpt4_results, "GPT-4"),
    (baseline_results, None),
):
    _frame["dataset"] = _frame.apply(
        lambda row: abbreviate(row["source_relation"], row["target_relation"]),
        axis="columns",
    )
    if _model_label is not None:
        _frame["experiment_run"] = _frame["decision_index"] // 3
        _frame["model"] = _model_label

In [None]:
# Columns that identify one candidate attribute pair within one experiment run.
VOTE_GROUP_KEYS = ["model", "task_scope", "experiment_run", "dataset", "source", "target"]
# Answers the tally must always have a column for, even if one never occurs.
VOTE_OPTIONS = ["no", "unknown", "yes"]
# Number of identical answers needed to win a run (a run is formed from
# decision_index // 3 where the result frames are tagged).
MAJORITY_VOTES = 2

llm_results = pd.concat((gpt35_results, gpt4_results), axis="index")

# Tally the answers per group: one row per group, one column per answer.
# * size() counts rows, so the tally does not depend on another column being
#   non-null.
# * reindex() pins the answer columns to VOTE_OPTIONS; an answer that never
#   occurs becomes an all-zero column instead of breaking the column layout,
#   and any answer outside VOTE_OPTIONS is ignored by the vote.
majority_vote_df = (
    llm_results
    .groupby(by=VOTE_GROUP_KEYS + ["decision"])
    .size()
    .unstack("decision", fill_value=0)
    .reindex(columns=VOTE_OPTIONS, fill_value=0)
    .reset_index()
)
majority_vote_df.columns.name = None

# Majority decision per group; "unknown" unless "no" or "yes" reaches the
# majority. "yes" is applied last, so it wins if both conditions hold.
majority_vote_df["decision"] = "unknown"
majority_vote_df.loc[majority_vote_df["no"] >= MAJORITY_VOTES, "decision"] = "no"
majority_vote_df.loc[majority_vote_df["yes"] >= MAJORITY_VOTES, "decision"] = "yes"

# Attach the ground-truth flag. Pairs absent from the benchmark come back as
# NaN from the left join; eq(True) maps them to False and yields a real bool
# column without relying on fillna() downcasting an object column.
majority_vote_df = majority_vote_df.merge(
    benchmark[["source", "target", "benchmark"]],
    on=["source", "target"],
    how="left",
)
majority_vote_df["benchmark"] = majority_vote_df["benchmark"].eq(True)

In [None]:
# Baseline: convert each n-gram similarity score into a yes/no decision with
# one threshold per dataset, chosen to maximise F1 (ties broken by recall).
# NOTE(review): the threshold is selected using the ground-truth labels of the
# same rows it is then applied to.
baseline_results["decision"] = "unknown"

for dataset in baseline_results["dataset"].unique():
    in_dataset = baseline_results["dataset"] == dataset
    dataset_rows = baseline_results[in_dataset]

    # Candidate thresholds are the scores of the true matches; duplicates
    # would only repeat the same evaluation.
    candidate_thresholds = dataset_rows.query("benchmark")["n-gram"].unique()
    if len(candidate_thresholds) == 0:
        # No true match in this dataset: there is nothing to score, and the
        # sort below would fail on an empty table. Rows stay "unknown".
        continue

    all_f1_scores = []
    for threshold in candidate_thresholds:
        p, r, f1, _ = precision_recall_fscore_support(
            dataset_rows["benchmark"],
            dataset_rows["n-gram"] >= threshold,
            average="binary",
            pos_label=True,
            zero_division=0.0,
        )
        all_f1_scores.append({
            "threshold": threshold,
            "precision": p,
            "recall": r,
            "f1-score": f1,
        })
    all_f1_scores = pd.DataFrame(all_f1_scores)
    best_f1 = all_f1_scores.sort_values(["f1-score", "recall"], ascending=[False, False]).iloc[0]

    # Rows whose score is missing match neither comparison and stay "unknown".
    scores = baseline_results["n-gram"]
    baseline_results.loc[in_dataset & (scores >= best_f1["threshold"]), "decision"] = "yes"
    baseline_results.loc[in_dataset & (scores < best_f1["threshold"]), "decision"] = "no"

# Fill the remaining grouping columns for the baseline rows.
baseline_results["model"] = ""
baseline_results["task_scope"] = "n-gram"
baseline_results["experiment_run"] = 0

In [None]:
# Stack the LLM majority votes on top of the baseline decisions, keep only
# the listed columns, write the table to disk, and display it.
decision_columns = [
    "model",
    "task_scope",
    "experiment_run",
    "dataset",
    "source",
    "target",
    "decision",
    "benchmark",
]
stacked_decisions = pd.concat([majority_vote_df, baseline_results], axis="index")
all_decisions_df = stacked_decisions.loc[:, decision_columns]
all_decisions_df.to_csv("results/all_decisions_df.csv", index=False)
all_decisions_df