# Create PR-AUC table

## Load the data
Load both the datasets themselves and the results of the AFD measures.

In [1]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

results_path = "../../results"
# NOTE(review): `data_path` was referenced below without ever being defined,
# so this cell failed with a NameError on a fresh kernel. The value is assumed
# to mirror `results_path` -- TODO confirm the actual location of the datasets.
data_path = "../../data"


def _load_datasets(folder):
    """Read every ``.csv`` file in ``folder`` into a dict keyed by file name.

    The column names of each frame are passed through
    ``afd_utils.clean_colname``.
    """
    datasets = {}
    for file in os.listdir(folder):
        if not file.endswith(".csv"):
            continue
        frame = pd.read_csv(os.path.join(folder, file))
        frame.columns = [afd_utils.clean_colname(c) for c in frame.columns]
        datasets[file] = frame
    return datasets


def _load_results(prefix):
    """Concatenate all ``<prefix>*.csv`` files found in ``results_path``.

    Returns an empty DataFrame when no file matches, which is what the
    previous incremental ``pd.concat`` loop produced in that case.
    """
    frames = [
        pd.read_csv(os.path.join(results_path, file))
        for file in os.listdir(results_path)
        if file.startswith(prefix) and file.endswith(".csv")
    ]
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames) if frames else pd.DataFrame()


rwd_data = _load_datasets(os.path.join(data_path, "rwd"))
rwd_results = _load_results("rwd_results_")

rwd_polluted_data = _load_datasets(os.path.join(data_path, "rwd_e"))
rwd_polluted_results = _load_results("rwd_e_results_")

## Generate AUC table

Generate the AUC table for both RWD and RWD$^e$.

In [2]:
from sklearn.metrics import auc, precision_recall_curve

from afd_measures import utils as afd_utils

# Result frames per table column; the unpolluted RWD results come first.
_data = {}
subsets = {"RWD": rwd_results.query("exact_fd == False")}

# One further column for every (noise type, noise level) combination.
for noise_type in ("copy", "bogus", "typo"):
    for noise_level in (0.01, 0.02, 0.05, 0.1):
        polluted = rwd_polluted_results.query(
            "(exact_fd == False) & (noise_level == @noise_level) & (noise_type == @noise_type)"
        ).copy()
        # A candidate counts as positive if it is an AFD or a polluted FD.
        polluted["afd"] = polluted["afd"] | polluted["fd_polluted"]
        column_label = f"{noise_type} ${int(noise_level*100)}$"
        subsets[column_label] = polluted

# Area under the precision-recall curve for every measure on every subset.
for subset_name, subset_df in subsets.items():
    auc_by_measure = {}
    for measure in afd_utils.measure_order:
        # Only rows for which the measure produced a score are evaluated.
        scored = subset_df[subset_df[measure].notna()]
        precision, recall, _ = precision_recall_curve(
            scored.loc[:, "afd"], scored.loc[:, measure]
        )
        auc_by_measure[afd_utils.measure_map[measure]] = round(
            auc(recall, precision), 3
        )
    _data[subset_name] = auc_by_measure

df = pd.DataFrame(_data)

# String version for the paper: the two best measures per column are bold,
# the best one is additionally underlined.
str_df = pd.DataFrame(index=df.index)
for col in df.columns:
    str_df[col] = df[col].astype(str)
    ranking = df[col].sort_values(ascending=False).index
    best, runner_up = ranking[0:2]
    for label in (best, runner_up):
        str_df.loc[label, col] = f"\\textbf{{{str_df.loc[label, col]}}}"
    str_df.loc[best, col] = f"\\underline{{{str_df.loc[best, col]}}}"

str_df.to_csv("../../paper/table3_auc_overview.csv", index_label="measure")
df

Unnamed: 0,RWD,copy $1$,copy $2$,copy $5$,copy $10$,bogus $1$,bogus $2$,bogus $5$,bogus $10$,typo $1$,typo $2$,typo $5$,typo $10$
$\rho$,0.411,0.39,0.26,0.177,0.091,0.268,0.184,0.108,0.062,0.307,0.223,0.142,0.081
$g_2$,0.497,0.355,0.272,0.206,0.127,0.264,0.232,0.165,0.111,0.269,0.238,0.165,0.111
$g_3$,0.669,0.63,0.466,0.331,0.225,0.538,0.375,0.257,0.196,0.54,0.377,0.257,0.195
$g'_3$,0.901,0.601,0.483,0.356,0.275,0.598,0.441,0.281,0.241,0.581,0.458,0.283,0.241
$g_1$,0.399,0.363,0.312,0.247,0.174,0.325,0.295,0.215,0.148,0.32,0.296,0.218,0.149
$g'_1$,0.398,0.362,0.312,0.247,0.173,0.325,0.295,0.215,0.148,0.32,0.296,0.217,0.149
$\pdep$,0.642,0.548,0.413,0.292,0.195,0.463,0.354,0.236,0.169,0.461,0.359,0.237,0.17
$\tau$,0.623,0.662,0.503,0.34,0.212,0.502,0.384,0.251,0.187,0.506,0.391,0.254,0.186
$\mu'$,0.946,0.78,0.653,0.553,0.38,0.661,0.523,0.4,0.308,0.662,0.542,0.402,0.309
\FI,0.396,0.49,0.38,0.277,0.182,0.407,0.337,0.223,0.162,0.414,0.345,0.227,0.163


In [14]:
def highlight_top2(s, props=''):
    """Return CSS styles that emphasise the two largest values of a Series.

    Intended for ``Styler.apply``: the result has the same index as ``s`` and
    holds a CSS string for highlighted entries and ``None`` everywhere else.

    Parameters
    ----------
    s : pandas.Series
        Numeric values of one column (or row) of the styled frame.
    props : str, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    pandas.Series
        ``"text-decoration: underline;font-weight: bold"`` for every entry
        equal to the largest value, ``"font-weight: bold"`` for every entry
        equal to the second entry of the descending ranking, ``None`` otherwise.
    """
    top_values = s.sort_values(ascending=False).iloc[0:2].tolist()
    styles = pd.Series([None] * len(s), index=s.index, dtype=object)

    if len(top_values) > 1:
        styles.loc[(s == top_values[1]).to_numpy()] = "font-weight: bold"
    if top_values:
        # Applied last so that the best value keeps its underline even when
        # the two top entries are tied (previously the bold-only style
        # overwrote it in that case).
        styles.loc[(s == top_values[0]).to_numpy()] = (
            "text-decoration: underline;font-weight: bold"
        )
    return styles

# Display the AUC table; with axis="index" the styling function is applied to one column at a time.
df.style.apply(highlight_top2, axis="index")

Unnamed: 0,RWD,copy $1$,copy $2$,copy $5$,copy $10$,bogus $1$,bogus $2$,bogus $5$,bogus $10$,typo $1$,typo $2$,typo $5$,typo $10$
$\rho$,0.411,0.39,0.26,0.177,0.091,0.268,0.184,0.108,0.062,0.307,0.223,0.142,0.081
$g_2$,0.497,0.355,0.272,0.206,0.127,0.264,0.232,0.165,0.111,0.269,0.238,0.165,0.111
$g_3$,0.669,0.63,0.466,0.331,0.225,0.538,0.375,0.257,0.196,0.54,0.377,0.257,0.195
$g'_3$,0.901,0.601,0.483,0.356,0.275,0.598,0.441,0.281,0.241,0.581,0.458,0.283,0.241
$g_1$,0.399,0.363,0.312,0.247,0.174,0.325,0.295,0.215,0.148,0.32,0.296,0.218,0.149
$g'_1$,0.398,0.362,0.312,0.247,0.173,0.325,0.295,0.215,0.148,0.32,0.296,0.217,0.149
$\pdep$,0.642,0.548,0.413,0.292,0.195,0.463,0.354,0.236,0.169,0.461,0.359,0.237,0.17
$\tau$,0.623,0.662,0.503,0.34,0.212,0.502,0.384,0.251,0.187,0.506,0.391,0.254,0.186
$\mu'$,0.946,0.78,0.653,0.553,0.38,0.661,0.523,0.4,0.308,0.662,0.542,0.402,0.309
\FI,0.396,0.49,0.38,0.277,0.182,0.407,0.337,0.223,0.162,0.414,0.345,0.227,0.163
