# $RWD$ benchmark overview

Create an overview of all relations in $RWD$.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Locations of the benchmark relations and of the pre-computed result files,
# relative to the notebook's working directory.
data_path = "../../data"
results_path = "../../results"

# Load every CSV file found in <data_path>/rwd, keyed by its file name, and
# pass each column name through afd_utils.clean_colname.
# The listing is sorted because os.listdir returns entries in arbitrary order.
rwd_dir = os.path.join(data_path, "rwd")
rwd_data = {}
for file in sorted(f for f in os.listdir(rwd_dir) if f.endswith(".csv")):
    relation = pd.read_csv(os.path.join(rwd_dir, file))
    relation.columns = [afd_utils.clean_colname(c) for c in relation.columns]
    rwd_data[file] = relation

# Read all "rwd_results_*.csv" files and concatenate them in one call.
# Growing a DataFrame via repeated pd.concat re-copies all accumulated rows on
# every iteration and starts from an empty frame, which newer pandas versions
# warn about; a single concat over the collected frames avoids both.
result_frames = [
    pd.read_csv(os.path.join(results_path, file))
    for file in sorted(os.listdir(results_path))
    if file.startswith("rwd_results_") and file.endswith(".csv")
]
# pd.concat raises on an empty list, so keep the original fallback of an
# empty DataFrame when no result file is present.
rwd_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

## Table 3 - overview of relations in $RWD$

In [None]:
# Short display name for each relation, keyed by the CSV file name under which
# the relation was loaded into rwd_data.
name_mapping = {
    "adult.csv": "adultData",
    "claims.csv": "claimsData",
    "dblp10k.csv": "dblpData",
    "hospital.csv": "hospitalData",
    "tax.csv": "taxData",
    "t_biocase_gathering_agent_r72738_c18.csv": "gathAgentData",
    "t_biocase_gathering_namedareas_r137711_c11.csv": "gathAreaData",
    "t_biocase_gathering_r90992_c35.csv": "gathData",
    "t_biocase_identification_highertaxon_r562959_c3.csv": "identTaxonData",
    "t_biocase_identification_r91800_c38.csv": "identData",
}

# Query expressions whose matching row counts fill the three FD columns;
# each one is evaluated against the result rows of a single relation.
fd_queries = {
    "candFDs": "g3_prime >= .5",
    "FDs": "(exact_fd == True) & (afd == True)",
    "AFDs": "(exact_fd == False) & (afd == True)",
}

data = {}
for table, mapped_table in name_mapping.items():
    # Size of the relation itself: number of rows and number of columns.
    n_rows, n_attributes = rwd_data[table].shape
    # Restrict the results to the rows belonging to the current relation.
    _results = rwd_results.query("table == @table")
    summary = {"rows": n_rows, "attributes": n_attributes}
    for label, expression in fd_queries.items():
        summary[label] = len(_results.query(expression))
    data[mapped_table] = summary
# Transpose so that each relation becomes one row of the displayed table.
pd.DataFrame(data).T

## additional numbers reported in the paper

In [None]:
# Summary figures computed over all rows of rwd_results. The dict is the
# cell's final expression, so the notebook displays it as the cell output.
{
    # Distinct values of the "table" column.
    "number of tables in RWD": rwd_results["table"].unique().size,
    # Total number of rows in rwd_results.
    "total number of column combinations": rwd_results.shape[0],
    "number of non exact FDs": rwd_results.query("(exact_fd == False)").shape[0],
    # Same threshold as the "candFDs" column of the per-relation overview.
    "number of candidate FDs in RWD": rwd_results.query("g3_prime >= .5").shape[0],
    # Rows flagged afd == True, split below by the exact_fd flag.
    "ground truth size": rwd_results.query("afd == True").shape[0],
    "number of exact FDs in ground truth": rwd_results.query(
        "(exact_fd == True) and (afd == True)"
    ).shape[0],
    "number of AFDs in ground truth": rwd_results.query(
        "(exact_fd == False) and (afd == True)"
    ).shape[0],
    "number of measures": len(afd_utils.measure_order),
}