# PR-AUCs of RWD$^-$

Load the experiment results and compute the data for Table 1 (PR-AUCs on RWD$^-$) and Figure 2 (a) of the paper.

In [None]:
import os
import sys

import pandas as pd
from sklearn.metrics import auc, precision_recall_curve

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Directory holding the per-run result files, relative to the notebook's
# working directory.
results_path = "../../results"

# Collect all matching CSV files first and concatenate them in one call:
# growing a DataFrame with pd.concat inside a loop re-copies every previously
# loaded row on each iteration. Sorting the names makes the row order
# independent of the arbitrary order returned by os.listdir.
result_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith("rwd_results_") and name.endswith(".csv")
)
if not result_files:
    # Fail here with a clear message instead of with a KeyError further down
    # when the measure columns are looked up on an empty frame.
    raise FileNotFoundError(
        f"no 'rwd_results_*.csv' files found in {results_path!r}"
    )
rwd_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in result_files]
)

# RWD^-: keep only rows that have a value for every measure listed in
# afd_utils.measure_order and whose `exact_fd` column is False.
has_all_measures = (
    rwd_results[afd_utils.measure_order].notna().all(axis="columns")
)
rwd_minus = rwd_results[has_all_measures].query("exact_fd == False").copy()

## Table 1 - PR-AUCs of RWD$^-$

In [None]:
def _pr_auc(labels, scores):
    """Return the area under the precision-recall curve of *scores* vs. *labels*."""
    precision, recall, _thresholds = precision_recall_curve(labels, scores)
    return auc(recall, precision)


# One PR-AUC per measure, computed over all rows of RWD^- with the `afd`
# column as ground truth and the measure's column as the score.
rwd_aucs = {
    measure: _pr_auc(rwd_minus["afd"], rwd_minus[measure])
    for measure in afd_utils.measure_order
}

pd.Series(rwd_aucs)

## Figure 2 (a) - PR-AUCs per relation

In [None]:
# Maps the `table` value found in the results to the label used as column
# header in the per-relation table.
name_mapping = {
    "adult.csv": "adultData",
    "claims.csv": "claimsData",
    "dblp10k.csv": "dblpData",
    "hospital.csv": "hospitalData",
    "tax.csv": "taxData",
    "t_biocase_gathering_agent_r72738_c18.csv": "gathAgentData",
    "t_biocase_gathering_namedareas_r137711_c11.csv": "gathAreaData",
    "t_biocase_gathering_r90992_c35.csv": "gathData",
    "t_biocase_identification_highertaxon_r562959_c3.csv": "identTaxonData",
    "t_biocase_identification_r91800_c38.csv": "identData",
}
# Column holding the PR-AUC over all of RWD^- (taken from `rwd_aucs`).
overall_column = "\\rwdminus"

# One record per measure: its PR-AUC on every relation plus the overall value.
table_aucs = []
for measure in afd_utils.measure_order:
    aucs = {"measure": measure}
    for table in name_mapping:
        _df = rwd_minus[rwd_minus["table"] == table]
        if not _df["afd"].any():
            # No row of this relation is labelled positive: every measure is
            # assigned 1.0 instead of computing a curve.
            aucs[table] = 1.0
        else:
            # Arguments are passed positionally because scikit-learn renamed
            # the score keyword from `probas_pred` to `y_score`; the
            # positional form works with both old and new releases.
            precision, recall, _ = precision_recall_curve(
                _df["afd"], _df[measure]
            )
            aucs[table] = round(auc(recall, precision), 3)
    aucs[overall_column] = rwd_aucs[measure]
    table_aucs.append(aucs)

table_aucs = (
    pd.DataFrame(table_aucs)
    .set_index("measure", drop=True)
    .rename(index=afd_utils.measure_map, columns=name_mapping)
)
# Put the overall column first, followed by the relations in `name_mapping`
# order, and express all values as percentages. Selecting by label keeps this
# correct if relations are added to or removed from `name_mapping`.
table_aucs = table_aucs[[overall_column, *name_mapping.values()]].copy() * 100

# "best %": share of relations on which a measure reaches the column maximum.
per_relation = table_aucs.iloc[:, 1:]
max_values = per_relation.max()
# Summing the boolean matches yields 0 for a measure that is never the best,
# where looking up `True` in value_counts() would raise a KeyError.
best_counts = per_relation.eq(max_values).sum(axis="columns")
# Integer floor division avoids float artefacts such as 28.999... being
# truncated to 28 by the final integer cast.
table_aucs["best %"] = (best_counts * 100 // max_values.count()).astype(int)
table_aucs