# PR-AUCs of RWD$^e$

Load the result files and compute the data for Table 8: the PR-AUC of each measure on RWD$^e$.

In [None]:
import os
import sys

import pandas as pd
from sklearn.metrics import auc, precision_recall_curve

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

results_path = "../../results"


def _load_results(prefix, directory=results_path):
    """Read and concatenate every ``<prefix>*.csv`` file in ``directory``.

    Args:
        prefix: File-name prefix selecting the result files to load.
        directory: Folder that is scanned (non-recursively) for CSV files.

    Returns:
        A single DataFrame holding the rows of all matching files, or an
        empty DataFrame when no file matches.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order of the combined frame reproducible between runs.
    file_names = sorted(
        name
        for name in os.listdir(directory)
        if name.startswith(prefix) and name.endswith(".csv")
    )
    if not file_names:
        # pd.concat raises on an empty list, so return the empty frame
        # explicitly.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame file by file, which
    # re-copies all previously loaded rows on every iteration.
    return pd.concat(
        [pd.read_csv(os.path.join(directory, name)) for name in file_names]
    )


# results on the original real-world data
rwd_results = _load_results("rwd_results_")
# results on the polluted real-world data (RWD^e)
rwd_polluted_results = _load_results("rwd_e_results_")

## Table 8 - PR-AUCs of RWD$^e$

In [None]:
def _count_afds(df):
    """Return how many rows of ``df`` have ``afd`` set to True (0 if none)."""
    # value_counts()[True] raises a KeyError when no row is True; summing the
    # comparison result yields 0 in that case.
    return int(df["afd"].eq(True).sum())


# Baseline subset: rows where every measure has a value and that are not
# exact FDs.
subsets = {
    "rwdminus": rwd_results[
        rwd_results[afd_utils.measure_order].notna().all(axis="columns")
    ].query("exact_fd == False"),
}

data = {}
rwd_e_n = {}
rwd_e_afd = {}

# The "all measures present" filter does not depend on the noise settings,
# so it is applied once instead of once per (noise_type, noise_level) pair.
_complete_polluted = rwd_polluted_results[
    rwd_polluted_results[afd_utils.measure_order].notna().all(axis="columns")
]

for noise_type in ("copy", "bogus", "typo"):
    for noise_level in (0.01, 0.02, 0.05, 0.1):
        _df = _complete_polluted.query(
            "(exact_fd == False) & (noise_level == @noise_level) & (noise_type == @noise_type)"
        ).copy()
        # add the polluted FDs to the ground truth
        _df["afd"] = _df["afd"] | _df["fd_polluted"]
        # round() instead of int(): float products such as 0.1 * 100 are not
        # exact, and truncation could yield an off-by-one label.
        subset_key = f"{noise_type}{round(noise_level * 100)}"
        subsets[subset_key] = _df
        rwd_e_n[subset_key] = _df.shape[0]
        rwd_e_afd[subset_key] = _count_afds(_df)

# One column per subset: its size, its number of positive AFDs and the
# PR-AUC of every measure.
for subset_name, subset_df in subsets.items():
    data[subset_name] = {
        "n": subset_df.shape[0],
        "AFD(R)": _count_afds(subset_df),
    }
    for measure in afd_utils.measure_order:
        # only score rows for which this measure has a value
        _df = subset_df[subset_df[measure].notna()]
        precision, recall, _ = precision_recall_curve(
            _df.loc[:, "afd"], _df.loc[:, measure]
        )
        # area under the precision-recall curve
        _auc = auc(recall, precision)
        data[subset_name][measure] = round(_auc, 3)

rwdminus_e_aucs = pd.DataFrame(data)
# last expression of the cell: show the table with the row labels mapped
# through afd_utils.measure_map
rwdminus_e_aucs.rename(index=afd_utils.measure_map)