# Create PR-AUC table

## Load the data
Load both the datasets themselves and the results of the AFD measures.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

data_path = "../../data"
gt_path = "../../data/ground_truth.csv"
results_path = "../../results"


def _load_datasets(directory):
    """Read every ``*.csv`` file in ``directory`` into a dict keyed by file name.

    Each column name of a loaded frame is passed through
    ``afd_utils.clean_colname``.
    """
    datasets = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".csv"):
            continue
        frame = pd.read_csv(os.path.join(directory, file))
        frame.columns = [afd_utils.clean_colname(c) for c in frame.columns]
        datasets[file] = frame
    return datasets


def _load_results(directory, prefix):
    """Concatenate all ``<prefix>*.csv`` files found in ``directory``.

    Returns an empty DataFrame when no file matches.
    """
    frames = [
        pd.read_csv(os.path.join(directory, file))
        for file in sorted(os.listdir(directory))
        if file.startswith(prefix) and file.endswith(".csv")
    ]
    # One concat over all frames instead of growing a frame inside the loop:
    # avoids re-copying the accumulated rows on every iteration and avoids
    # concatenating with an empty seed frame.  pd.concat rejects an empty
    # list, hence the explicit fallback.
    return pd.concat(frames) if frames else pd.DataFrame()


# datasets and measure results without added noise
rwd_data = _load_datasets(os.path.join(data_path, "rwd"))
rwd_results = _load_results(results_path, "rwd_results_")

# datasets and measure results from the "rwd_e" directory / "rwd_e_results_" files
rwd_polluted_data = _load_datasets(os.path.join(data_path, "rwd_e"))
rwd_polluted_results = _load_results(results_path, "rwd_e_results_")

## Generate AUC table

Generate the AUC table for both RWD and RWD$^e$.

In [None]:
from sklearn.metrics import auc, precision_recall_curve

from afd_measures import utils as afd_utils

def _pr_auc(frame, measure):
    """Area under the precision-recall curve of ``measure`` against ``afd``."""
    # rows without a value for this measure are left out of the curve
    scored = frame.dropna(subset=[measure])
    precision, recall, _ = precision_recall_curve(scored["afd"], scored[measure])
    return auc(recall, precision)


# One evaluation subset for the unpolluted results, plus one per
# (noise type, noise level) combination of the polluted results.
subsets = {"RWD": rwd_results.query("exact_fd == False")}
for noise_type in ("copy", "bogus", "typo"):
    for noise_level in (0.01, 0.02, 0.05, 0.1):
        # NOTE: the query string refers to the loop variables via '@', so
        # their names must stay `noise_level` / `noise_type`.
        subsets[f"{noise_type} ${int(noise_level*100)}$"] = rwd_polluted_results.query(
            "(exact_fd == False) & (noise_level == @noise_level) & (noise_type == @noise_type)"
        ).assign(
            # add the polluted FDs to the ground truth
            afd=lambda d: d["afd"] | d["fd_polluted"]
        )

# subset name -> {measure display name -> PR-AUC rounded to 3 decimals}
_data = {
    subset_name: {
        afd_utils.measure_map[measure]: round(_pr_auc(subset_df, measure), 3)
        for measure in afd_utils.measure_order
    }
    for subset_name, subset_df in subsets.items()
}

df = pd.DataFrame(_data)

# String version of the table: per column, the two highest values are set in
# bold and the highest one is additionally underlined.
str_df = df.astype(str)
for column in df.columns:
    ranking = df[column].sort_values(ascending=False).index
    best, runner_up = ranking[0:2]
    for label in (best, runner_up):
        str_df.at[label, column] = f"\\textbf{{{str_df.at[label, column]}}}"
    str_df.at[best, column] = f"\\underline{{{str_df.at[best, column]}}}"

str_df.to_csv("../../paper/table3_auc_overview.csv", index_label="measure")