# PR-AUCs of RWD

Compute and format the data for Table 5 (PR-AUCs on RWD); these values are used for comparison against RWD$^-$, which is discussed in the main part of the paper.

In [None]:
import os
import sys

import pandas as pd
from sklearn.metrics import auc, precision_recall_curve

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Directory containing the RWD result files, relative to the working directory.
results_path = "../../results"

# Read every "rwd_results_*.csv" file and stack the rows into one frame.
# The file names are sorted because os.listdir() returns entries in arbitrary
# order, and the frames are concatenated in a single call instead of growing
# the result (and re-copying all earlier rows) once per file.
_rwd_frames = [
    pd.read_csv(os.path.join(results_path, file_name))
    for file_name in sorted(os.listdir(results_path))
    if file_name.startswith("rwd_results_") and file_name.endswith(".csv")
]
# pd.concat() rejects an empty list, so fall back to an empty frame when no
# result file is present.
rwd_results = pd.concat(_rwd_frames) if _rwd_frames else pd.DataFrame()

## Table 5 - PR-AUCs per relation on RWD

In [None]:
# Maps the file name of each RWD relation (the values matched against the
# "table" column of the results) to the label used as its column header.
name_mapping = {
    "adult.csv": "adultData",
    "claims.csv": "claimsData",
    "dblp10k.csv": "dblpData",
    "hospital.csv": "hospitalData",
    "tax.csv": "taxData",
    "t_biocase_gathering_agent_r72738_c18.csv": "gathAgentData",
    "t_biocase_gathering_namedareas_r137711_c11.csv": "gathAreaData",
    "t_biocase_gathering_r90992_c35.csv": "gathData",
    "t_biocase_identification_highertaxon_r562959_c3.csv": "identTaxonData",
    "t_biocase_identification_r91800_c38.csv": "identData",
}


def _pr_auc(frame, measure):
    """Return the area under the precision-recall curve of one measure.

    The "afd" column of `frame` is used as the ground truth and the column
    named `measure` as the score.
    """
    # Both arrays are passed positionally: the keyword for the scores was
    # renamed from `probas_pred` to `y_score` in scikit-learn 1.5, so neither
    # keyword works on every release.
    precision, recall, _ = precision_recall_curve(frame["afd"], frame[measure])
    return auc(recall, precision)


# One dict per measure: its name, the PR-AUC over all of RWD and the PR-AUC
# on each individual relation listed in name_mapping.
table_aucs = []
for measure in afd_utils.measure_order:
    # Keep only the rows that are not exact FDs and have a score for this measure.
    measure_results = rwd_results.query(
        f"(exact_fd == False) & {measure}.notna()"
    )
    aucs = {
        "measure": measure,
        "RWD": round(_pr_auc(measure_results, measure), 3),
    }
    for table in name_mapping:
        table_results = measure_results.query("table == @table")
        if table_results["afd"].any():
            aucs[table] = round(_pr_auc(table_results, measure), 3)
        else:
            # No positive "afd" label among this relation's rows: report 1.0
            # instead of computing a curve.
            aucs[table] = 1.0
    table_aucs.append(aucs)

# One row per measure (renamed via measure_map) and one column per data set
# (file names replaced by their labels from name_mapping).
table_aucs = (
    pd.DataFrame(table_aucs)
    .set_index("measure", drop=True)
    .rename(index=afd_utils.measure_map, columns=name_mapping)
)

# The first column holds the AUC over all of RWD; "best %" is computed on the
# remaining per-relation columns only.
per_table_aucs = table_aucs.iloc[:, 1:]
max_values = per_table_aucs.max()

# Number of relations on which each measure reaches the column maximum.
# Summing the boolean matches yields 0 for a measure that is never the best,
# whereas value_counts()[True] raises KeyError in that case.
best_counts = per_table_aucs.eq(max_values, axis="columns").sum(axis="columns")

# Multiply before dividing so that exact percentages are not pushed just
# below an integer by floating-point error before astype(int) truncates.
table_aucs["best %"] = (best_counts * 100 / max_values.count()).astype(int)
table_aucs