# RWD \ RWD$^-$

Investigate the properties of all candidates in RWD for which $\text{RFI}^+$ and $\text{RFI}^{'+}$ could not be computed in time.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils
from synthetic_data import inferrence as syn_inferrence
from synthetic_data import utils as syn_utils

# Locations of the real-world datasets (RWD) and of the result files,
# relative to the notebook's working directory.
data_path = "../../data/rwd"
results_path = "../../results"

# Load every dataset into a dict keyed by its file name. The key must stay
# the plain directory entry name because tables are looked up by the value
# of the results' "table" column further down.
rwd_data = {}
for file in os.listdir(data_path):
    file_path = os.path.join(data_path, file)
    # Skip sub-directories (e.g. ".ipynb_checkpoints") and hidden files
    # (e.g. ".DS_Store", ".gitkeep"): they are not datasets and would make
    # pd.read_csv fail.
    if file.startswith(".") or not os.path.isfile(file_path):
        continue
    table = pd.read_csv(file_path)
    # Normalise the column names with the project's helper.
    table.columns = [afd_utils.clean_colname(c) for c in table.columns]
    rwd_data[file] = table

# Read all result files named "rwd_results_*.csv" and combine them with a
# single concat call. Growing a DataFrame inside the loop re-copies every
# previously loaded row on each iteration and starts by concatenating onto
# an empty frame.
result_frames = [
    pd.read_csv(os.path.join(results_path, file))
    for file in os.listdir(results_path)
    if file.startswith("rwd_results_") and file.endswith(".csv")
]
# pd.concat raises on an empty list, so keep the empty-DataFrame result for
# the case that no result file is present. Row labels are kept as read
# (no ignore_index), exactly as before.
rwd_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

# Keep the candidates that are not exact FDs and whose RFI+ value is missing.
# The copy lets later cells add columns without touching rwd_results.
not_exact_fd = rwd_results["exact_fd"] == False  # noqa: E712 - element-wise comparison
rfi_plus_missing = rwd_results["reliable_fraction_of_information_plus"].isna()
missing_candidates = rwd_results[not_exact_fd & rfi_plus_missing].copy()

## Table 6, upper part - descriptive statistics of the candidates for which RFI is missing

In [None]:
# Summary statistics (mean, std, min, median, max) of the columns listed in
# afd_utils.measure_order, computed over the missing candidates. Columns are
# relabelled via afd_utils.measure_map and the table is transposed so that
# each measure becomes one row.
(
    missing_candidates[afd_utils.measure_order]
    .describe()
    .loc[["mean", "std", "min", "50%", "max"], :]
    .rename(columns=afd_utils.measure_map)
    .T
)

In [None]:
# Infer data characteristics for every missing candidate from its LHS/RHS
# columns; LHS uniqueness and RHS skew are derived from these values later.
# NOTE(review): infer_settings is expected to yield one value per column
# listed below, in this order - confirm against synthetic_data.inferrence.
inferred_columns = [
    "tuples",
    "lhs_cardinality",
    "rhs_cardinality",
    "lhs_dist_alpha",
    "lhs_dist_beta",
    "rhs_dist_alpha",
    "rhs_dist_beta",
    "noise",
]


def _infer_candidate_settings(row):
    """Run the settings inference on the (lhs, rhs) projection of the row's table."""
    table = rwd_data[row["table"]]
    return syn_inferrence.infer_settings(
        table.loc[:, [row["lhs"], row["rhs"]]],
        noise=False,
    )


# result_type="expand" spreads each returned sequence over separate columns,
# which are then stored under the names in inferred_columns.
missing_candidates[inferred_columns] = missing_candidates.apply(
    _infer_candidate_settings,
    axis="columns",
    result_type="expand",
)

def _rhs_skewness(row):
    """Pass the row's RHS distribution parameters to syn_utils.beta_skewness."""
    return syn_utils.beta_skewness(row["rhs_dist_alpha"], row["rhs_dist_beta"])


# LHS uniqueness: LHS cardinality divided by the number of tuples.
missing_candidates["lhs_uniqueness"] = missing_candidates["lhs_cardinality"].div(
    missing_candidates["tuples"]
)
# RHS skewness, computed row by row from the RHS alpha/beta parameters.
missing_candidates["rhs_skewness"] = missing_candidates.apply(
    _rhs_skewness,
    axis="columns",
)

# Summary statistics (mean, std, min, median, max) of the table size, LHS
# uniqueness and RHS skewness of the missing candidates, one row per
# characteristic.
(
    missing_candidates[["tuples", "lhs_uniqueness", "rhs_skewness"]]
    .describe()
    .loc[["mean", "std", "min", "50%", "max"]]
    .T
)