# Rank @ max Recall

For each measure, investigate the rank at which it reaches maximum recall on each table.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils


# Load every "rwd_results_*" file from the results directory into one frame.
results_dir = "../../results"
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the combined frame reproducible between runs and machines.
result_files = sorted(
    name for name in os.listdir(results_dir) if name.startswith("rwd_results_")
)
if not result_files:
    # Fail with a clear message instead of a KeyError on the column selection below.
    raise FileNotFoundError(
        f"No 'rwd_results_*' files found in {results_dir!r}"
    )
# A single concat over all frames avoids re-copying the accumulated frame once
# per file; ignore_index=True yields the same fresh 0..n-1 index that a
# trailing reset_index(drop=True) would.
rwd_results = pd.concat(
    (pd.read_csv(os.path.join(results_dir, name)) for name in result_files),
    ignore_index=True,
)
# Keep only rows where every measure in afd_utils.measure_order has a value
# and where exact_fd is False.
has_all_measures = rwd_results[afd_utils.measure_order].notna().all(axis="columns")
rwd_minus = rwd_results[has_all_measures].query("exact_fd == False").copy()

## Figure 2 (b) - rank at max recall per measure per relation

In [None]:
# Maps each result table's file name to the label used for it in the figure.
# Insertion order matters: it determines the order in which tables are processed.
_plain_table_names = {
    "adult.csv": "adultData",
    "claims.csv": "claimsData",
    "dblp10k.csv": "dblpData",
    "hospital.csv": "hospitalData",
    "tax.csv": "taxData",
}
# Tables whose file names carry the "t_biocase_" prefix.
_biocase_table_names = {
    "t_biocase_gathering_agent_r72738_c18.csv": "gathAgentData",
    "t_biocase_gathering_namedareas_r137711_c11.csv": "gathAreaData",
    "t_biocase_gathering_r90992_c35.csv": "gathData",
    "t_biocase_identification_highertaxon_r562959_c3.csv": "identTaxonData",
    "t_biocase_identification_r91800_c38.csv": "identData",
}
name_mapping = {**_plain_table_names, **_biocase_table_names}

# Rows of the final table, one dict per row; the first row ("AFD(R)") holds,
# per table, the number of rows whose "afd" flag is True.
afds_count = {"measure": "AFD(R)"}
eligible_tables = []
rankAtMaxRecall = []

for table in name_mapping:
    afds = rwd_minus.loc[rwd_minus["table"] == table, "afd"]
    if not afds.any():
        # No row flagged "afd" for this table: leave it out of the figure.
        continue
    eligible_tables.append(table)
    afds_count[table] = afds.value_counts()[True]
rankAtMaxRecall.append(afds_count)

# Slice the results once per eligible table. The slice does not depend on the
# measure, so filtering inside the measure loop would repeat identical work
# for every measure.
results_by_table = {
    table: rwd_minus[rwd_minus["table"] == table] for table in eligible_tables
}

for measure in afd_utils.measure_order:
    # One row of the final table: the measure name plus one rank per table.
    ranks = {"measure": measure}
    for table, results_table in results_by_table.items():
        # Highest score first; among equal scores, rows with afd == False sort
        # before rows with afd == True, so ties push flagged rows further down.
        ranked_table = (
            results_table[[measure, "afd"]]
            .sort_values(by=[measure, "afd"], ascending=[False, True])
            .reset_index(drop=True)
        )
        # After reset_index the row label is the 0-based position, so the label
        # of the last flagged row plus one is the 1-based rank at which every
        # flagged row has been seen.
        ranks[table] = ranked_table[ranked_table["afd"]].iloc[-1].name + 1
    rankAtMaxRecall.append(ranks)

# Turn the collected row dicts into a frame with one row per "measure" entry.
rankAtMaxRecall = pd.DataFrame(data=rankAtMaxRecall).set_index(keys="measure")
# Final expression of the cell: shows the table with display labels for
# measures and tables. The renamed frame is not assigned, so rankAtMaxRecall
# itself keeps the raw labels.
rankAtMaxRecall.rename(columns=name_mapping, index=afd_utils.measure_map)