# Create rank at maximum recall (R@MR) tables for RWD and RWDe

## Setup

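Import the required modules and set the configuration variables (LaTeX labels for relations and measures, noise levels, output paths). The CSV files containing the results of the measures on the RWD are loaded by the functions in the next section.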

In [None]:
import os
import sys
import pandas as pd
sys.path.append(os.path.join(os.getcwd(), '..'))
from afd_measures import utils as afd_utils

# Relations of the RWD benchmark. Keys are the table names (they are also used
# to build the result-file names "../../results/<key>.csv"); values are the
# LaTeX macros written into the first column of the generated tables.
# Every backslash is escaped explicitly: sequences such as "\c" or "\g" are
# invalid escapes and trigger a SyntaxWarning on recent Python versions.
rwd_tables = {
    'adult': "{\\adultData}",
    'claims': "{\\claimsData}",
    'dblp10k': "{\\dblpData}",
    'fars': "{\\farsData}",
    'hospital': "{\\hospitalData}",
    'tax': "{\\taxData}",
    't_biocase_gathering_agent_r72738_c18': "{\\gathAgentData}",
    't_biocase_gathering_namedareas_r137711_c11': "{\\gathAreaData}",
    't_biocase_gathering_r90992_c35': "{\\gathData}",
    't_biocase_identification_highertaxon_r562959_c3': "{\\identTaxonData}",
    't_biocase_identification_r91800_c38': "{\\identData}",
}

# Relations of the polluted RWDe benchmark: the same mapping as above without
# 'fars', 't_biocase_gathering_r90992_c35' and
# 't_biocase_identification_highertaxon_r562959_c3'.
rwd_e_tables = {
    'adult': "{\\adultData}",
    'claims': "{\\claimsData}",
    'dblp10k': "{\\dblpData}",
    'hospital': "{\\hospitalData}",
    'tax': "{\\taxData}",
    't_biocase_gathering_agent_r72738_c18': "{\\gathAgentData}",
    't_biocase_gathering_namedareas_r137711_c11': "{\\gathAreaData}",
    't_biocase_identification_r91800_c38': "{\\identData}",
}

# LaTeX label for every measure column of the result files; raw strings keep
# the LaTeX backslashes readable.
measure_map = {
    "mu_corrected": r"$\mu'$",
    "g3_giannella": r"$g'_3$",
    "g3": r"$g_3$",
    "pdep": r"$\pdep$",
    "tau": r"$\tau$",
    "rho": r"$\rho$",
    "g2": r"$g_2$",
    "fraction_of_information": r"\FI",
    "g1_prime": r"$g'_1$",
    "g1": r"$g_1$",
    "rfi_corrected": r"\RFI'",
    "smoothed_fraction_of_information_a0.5": r"\SFI",
}

# Order in which the measures appear as columns of the generated tables.
measure_order = [
    "rho",
    "g2",
    "g3",
    "g3_giannella",
    "g1",
    "g1_prime",
    "pdep",
    "tau",
    "mu_corrected",
    "fraction_of_information",
    "rfi_corrected",
    "smoothed_fraction_of_information_a0.5",
]

# Noise levels, kept as strings because they are inserted verbatim into the
# names of the polluted result files.
noise_levels = [
    "0.01",
    "0.02",
    "0.05",
    "0.1",
]

# Output files of this notebook, one per benchmark / noise type.
rwd_filename = "../../results/r@mr_rwd.csv"
rwd_e_typo_filename = "../../results/r@mr_rwdtypo.csv"
rwd_e_bogus_filename = "../../results/r@mr_rwdbogus.csv"
rwd_e_copy_filename = "../../results/r@mr_rwdcopy.csv"

## Generate rank @ maximum recall table

In [None]:
def rwd_r_at_mr(table, outputfile, tables, measure_order, measure_map):
    """Write one CSV row with the rank at maximum recall of every measure.

    Reads ``../../results/<table>.csv``, keeps the rows whose ``exact_fd`` and
    ``trivial_fd`` flags are both False and passes them through
    ``afd_utils.add_ground_truth`` (the result is expected to carry a ``gt``
    column). For each measure the rows are ranked by descending measure value
    and the smallest ``k`` is reported for which the top-``k`` rows contain
    every row with ``gt == True``. Ties on the measure value place
    ``gt == False`` rows first, so the reported rank is the worst case.

    The written row is ``<label>,<#ground-truth rows>,<rank>,<rank>,...``.
    Nothing is written when there is no ground-truth row.

    Args:
        table: Table name; selects the result file and the label in ``tables``.
        outputfile: Writable text file object receiving the row.
        tables: Mapping from table name to the label written in the first column.
        measure_order: Measure column names, in output order.
        measure_map: Unused; kept so the call signature stays unchanged.
    """
    df = pd.read_csv("../../results/{}.csv".format(table))
    afds = df.loc[(df.exact_fd == False) & (df.trivial_fd == False)]
    afds = afd_utils.add_ground_truth(table, afds)
    gt_len = int((afds["gt"] == True).sum())

    if gt_len == 0:
        return

    row = [str(gt_len)]

    for measure in measure_order:
        ranked = afds.sort_values(by=[measure, 'gt'], ascending=[False, True])
        # Recall reaches 1 exactly at the last ground-truth row of the ranking,
        # so its 1-based position is the rank at maximum recall. This replaces
        # a scan that rebuilt and re-filtered head(k) for every k.
        gt_positions = (ranked["gt"] == True).to_numpy().nonzero()[0]
        row.append(str(int(gt_positions[-1]) + 1))

    outputfile.write("{},{}\n".format(tables[table], ",".join(row)))

def rwd_e_r_at_mr(table, outputfile, tables, measure_order, measure_map, noise_levels, noise_type):
    """Write one CSV row with per-noise-level ranks at maximum recall.

    For every noise level the file
    ``../../results/polluted_<noise_type>_<noise>_<table>.csv`` is read once and
    used for two things:

    * the count column: number of ``gt == True`` rows among the rows that are
      not exact FDs and are either flagged ``polluted_fd`` or flagged ``afd``
      with neither ``polluted_lhs`` nor ``polluted_rhs`` set;
    * the ranking candidates: all rows with ``exact_fd`` and ``trivial_fd``
      both False.

    ``afd_utils.add_ground_truth`` is applied to both selections (its result is
    expected to carry a ``gt`` column). For each measure and noise level the
    candidates are ranked by descending measure value, ties placing
    ``gt == False`` rows first, and the smallest ``k`` whose top-``k`` rows
    contain every ground-truth row is reported.

    The written row is ``<label>,<counts>,<ranks of measure 1>,...`` where each
    cell joins its per-noise-level values with ``|``. Nothing is written when
    the candidates of some noise level contain no ground-truth row.

    Args:
        table: Table name; selects the result files and the label in ``tables``.
        outputfile: Writable text file object receiving the row.
        tables: Mapping from table name to the label written in the first column.
        measure_order: Measure column names, in output order.
        measure_map: Unused; kept so the call signature stays unchanged.
        noise_levels: Noise levels as they appear in the result-file names.
        noise_type: Pollution type as it appears in the result-file names.
    """
    design_counts = []
    candidates = []  # one (afds, gt_len) pair per noise level

    # Reading and annotating a result file does not depend on the measure, so
    # it is done once per noise level instead of once per measure.
    for noise in noise_levels:
        df = pd.read_csv("../../results/polluted_{}_{}_{}.csv".format(noise_type, noise, table))

        design = df.loc[(df.exact_fd == False) & ((df.polluted_fd == True) | ((df.afd == True) & (df.polluted_lhs == False) & (df.polluted_rhs == False)))]
        design = afd_utils.add_ground_truth(table, design)
        design_counts.append(str(int((design["gt"] == True).sum())))

        afds = df[df["exact_fd"] == False]
        afds = afds[afds["trivial_fd"] == False]
        afds = afd_utils.add_ground_truth(table, afds)
        candidates.append((afds, int((afds["gt"] == True).sum())))

    cells = [design_counts]

    for measure in measure_order:
        ranks = []

        for afds, gt_len in candidates:
            if gt_len == 0:
                return

            ranked = afds.sort_values(by=[measure, 'gt'], ascending=[False, True])
            # Recall reaches 1 exactly at the last ground-truth row of the
            # ranking; its 1-based position is the rank at maximum recall.
            gt_positions = (ranked["gt"] == True).to_numpy().nonzero()[0]
            ranks.append(str(int(gt_positions[-1]) + 1))

        cells.append(ranks)

    outputstring = ",".join("|".join(cell) for cell in cells)
    outputfile.write("{},{}\n".format(tables[table], outputstring))


# Header row shared by all four tables: relation label, number of design AFDs,
# then one column per measure. Backslashes are escaped explicitly ("\#" and
# "\d" are invalid escape sequences).
table_header = "Relation $R$,\\#$\\designAFD(R)$,{}\n".format(
    ",".join(measure_map[measure] for measure in measure_order))

# The context manager closes all four files even if a table fails to process.
with open(rwd_filename, "w") as rwd_output, \
        open(rwd_e_typo_filename, "w") as rwd_e_typo_output, \
        open(rwd_e_bogus_filename, "w") as rwd_e_bogus_output, \
        open(rwd_e_copy_filename, "w") as rwd_e_copy_output:

    for output in (rwd_output, rwd_e_typo_output, rwd_e_bogus_output, rwd_e_copy_output):
        output.write(table_header)

    # Dicts iterate in insertion order, so the rows follow the order in which
    # the tables are listed in rwd_tables / rwd_e_tables.
    for table in rwd_tables:
        rwd_r_at_mr(table, rwd_output, rwd_tables, measure_order, measure_map)

    for table in rwd_e_tables:
        rwd_e_r_at_mr(table, rwd_e_copy_output, rwd_e_tables, measure_order, measure_map, noise_levels, "copy")
        rwd_e_r_at_mr(table, rwd_e_typo_output, rwd_e_tables, measure_order, measure_map, noise_levels, "typo")
        rwd_e_r_at_mr(table, rwd_e_bogus_output, rwd_e_tables, measure_order, measure_map, noise_levels, "bogus")


# Transpose the tables in place so that relations become columns and measures
# become rows. Each file is read completely into a DataFrame before to_csv
# rewrites the same path, so no separately opened file handle is needed (and
# none can be left open if reading or writing fails).
for csv_path in (rwd_filename, rwd_e_copy_filename, rwd_e_typo_filename, rwd_e_bogus_filename):
    pd.read_csv(csv_path, index_col=0).T.to_csv(csv_path)