# Rank @ max recall for RWD$^e$

Investigate, for each measure, the rank at which it reaches max recall for a table in RWD$^e$.

In [None]:
import os
import sys

import numpy as np
import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Maps the CSV file name of each table to a short identifier for that data set.
# NOTE(review): not referenced in the cells visible here — presumably used for
# labelling output elsewhere; confirm before removing.
name_mapping = {
    "adult.csv": "adultData",
    "claims.csv": "claimsData",
    "dblp10k.csv": "dblpData",
    "hospital.csv": "hospitalData",
    "tax.csv": "taxData",
    "t_biocase_gathering_agent_r72738_c18.csv": "gathAgentData",
    "t_biocase_gathering_namedareas_r137711_c11.csv": "gathAreaData",
    "t_biocase_gathering_r90992_c35.csv": "gathData",
    "t_biocase_identification_highertaxon_r562959_c3.csv": "identTaxonData",
    "t_biocase_identification_r91800_c38.csv": "identData",
}

# Load every "rwd_e_results_*" file from the results directory into one frame.
results_dir = "../../results"

# os.listdir returns names in arbitrary order; sort them so the row order of
# the combined frame is reproducible across machines and runs.
result_files = sorted(
    f for f in os.listdir(results_dir) if f.startswith("rwd_e_results_")
)
if not result_files:
    # Fail with a clear message instead of a KeyError on "afd" further down.
    raise FileNotFoundError(
        f"no 'rwd_e_results_*' files found in {results_dir!r}"
    )

# Concatenate once: growing a frame with pd.concat inside a loop copies all
# previously loaded rows on every iteration, and starting from an empty
# DataFrame can interfere with dtype inference of the result.
rwd_e_results = pd.concat(
    (pd.read_csv(os.path.join(results_dir, file)) for file in result_files),
    ignore_index=True,
)

# A row is flagged as "polluted_afd" when either the "afd" or the
# "fd_polluted" flag is set.
rwd_e_results["polluted_afd"] = rwd_e_results["afd"] | rwd_e_results["fd_polluted"]

## Table 7 - percentage of tables won per noise type

In [None]:
# Collect one "win" record for every measure that reaches max recall at the
# lowest rank, per (noise type, noise level, table) combination. Measures that
# tie for the lowest rank each receive a win.
win_records = []
for noise_type in ("copy", "bogus", "typo"):
    for noise_level in (0.01, 0.02, 0.05, 0.1):
        # Non-exact candidates of this noise setting ...
        candidates = rwd_e_results.query(
            "(exact_fd == False) & (noise_type == @noise_type) & (noise_level == @noise_level)"
        )
        # ... restricted to rows for which every measure has a value.
        has_all_measures = (
            candidates[afd_utils.measure_order].notna().all(axis="columns")
        )
        candidates = candidates[has_all_measures]

        # Only tables that contain at least one polluted AFD are evaluated.
        tables_with_afds = candidates.loc[
            candidates["polluted_afd"], "table"
        ].unique()
        for table in tables_with_afds:
            table_rows = candidates.query("table == @table")
            afd_rows = table_rows[table_rows["polluted_afd"]]

            # Rank at max recall: the number of candidates scoring at least as
            # high as the lowest-scoring polluted AFD of this table.
            ranks = pd.Series({
                measure: (table_rows[measure] >= afd_rows[measure].min()).sum()
                for measure in afd_utils.measure_order
            })

            best_measures = ranks.index[ranks == ranks.min()]
            win_records.extend(
                {
                    "table": table,
                    "noise_type": noise_type,
                    "noise_level": noise_level,
                    "measure": measure,
                    "win": 1,
                }
                for measure in best_measures
            )

rankAtMaxRecall = pd.DataFrame(win_records)

# Table 7: for every noise type, the share of (noise level, table) settings in
# which each measure attains the lowest rank at max recall. Tied measures all
# receive a win, so the shares within one column may sum to more than 1.
(
    pd.pivot_table(
        rankAtMaxRecall,
        values="win",
        columns="noise_type",
        index="measure",
        # The string alias avoids the pandas 2.x deprecation of passing np.sum.
        aggfunc="sum",
    )
    # reindex, unlike .loc, tolerates labels missing from the pivot: a measure
    # that never wins gets a NaN row, which fillna then turns into 0 wins.
    .reindex(
        index=list(afd_utils.measure_order),
        columns=["copy", "bogus", "typo"],
    )
    .fillna(0.0)
    .rename(index=afd_utils.measure_map)
) / (
    # Number of distinct (noise level, table) settings per noise type. The
    # division aligns this Series with the pivot's noise-type columns; a noise
    # type without any records yields NaN in its column.
    rankAtMaxRecall.groupby(by=["noise_type", "noise_level", "table"])
    .size()
    .groupby(level="noise_type")
    .size()
    .reindex(["copy", "bogus", "typo"])
)