# Challenging relations

Investigate challenging relations with regard to LHS uniqueness and RHS skew.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils
from synthetic_data import inferrence as syn_inferrence
from synthetic_data import utils as syn_utils

# CSV file names of the relations examined in this notebook; the trailing
# comments give the short labels (R3, R6) used for them in the figures.
difficult_relations = [
    "dblp10k.csv",  # R3
    "t_biocase_gathering_agent_r72738_c18.csv",  # R6
]
# Relation number -> file name, paired in the order the files are listed above.
R = dict(zip((3, 6), difficult_relations))

# Locations of the relation CSV files and of the result files, as relative paths.
data_path = "../../data/rwd"
results_path = "../../results"
    
# Collect every "rwd_results_*.csv" file in the results directory. The names
# are sorted so that the row order of the combined frame does not depend on
# the (arbitrary) order in which os.listdir returns them.
result_files = sorted(
    f
    for f in os.listdir(results_path)
    if f.startswith("rwd_results_") and f.endswith(".csv")
)
if not result_files:
    # Fail here with a clear message instead of a KeyError on the measure
    # columns further down.
    raise FileNotFoundError(
        f"no 'rwd_results_*.csv' files found in {results_path!r}"
    )

# Concatenate once: growing a frame inside the loop re-copies all previous
# rows on every iteration, and concatenating onto an empty DataFrame is
# deprecated behaviour in recent pandas versions.
rwd_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, f)) for f in result_files]
)

# Keep only rows that have a value for every measure and are not exact FDs.
has_all_measures = rwd_results[afd_utils.measure_order].notna().all(axis="columns")
rwd_minus = rwd_results[has_all_measures].query("exact_fd == False").copy()

# For each difficult relation, keep two frames keyed by its file name:
#   difficult_data    - the relation itself, with cleaned column names
#   difficult_results - the rows of rwd_minus that belong to that relation
difficult_data = {}
difficult_results = {}
for difficult_relation in difficult_relations:
    relation_frame = pd.read_csv(os.path.join(data_path, difficult_relation))
    relation_frame.columns = list(
        map(afd_utils.clean_colname, relation_frame.columns)
    )
    difficult_data[difficult_relation] = relation_frame

    # Copy so that columns added later do not write into rwd_minus.
    difficult_results[difficult_relation] = rwd_minus.query(
        "table == @difficult_relation"
    ).copy()

In [None]:
# add LHS uniqueness and RHS skew to the results
# Column names under which the expanded output of infer_settings is stored.
settings_columns = [
    "tuples",
    "lhs_cardinality",
    "rhs_cardinality",
    "lhs_dist_alpha",
    "lhs_dist_beta",
    "rhs_dist_alpha",
    "rhs_dist_beta",
    "noise",
]
for table, results in difficult_results.items():
    relation = difficult_data[table]

    # Infer the settings for every candidate from its two columns (LHS, RHS)
    # of the relation; the frame is extended in place.
    inferred = results.apply(
        lambda row: syn_inferrence.infer_settings(
            relation.loc[:, [row["lhs"], row["rhs"]]],
            noise=False,
        ),
        axis="columns",
        result_type="expand",
    )
    results[settings_columns] = inferred

    # LHS uniqueness: LHS cardinality divided by the number of tuples.
    results["lhs_uniqueness"] = results["lhs_cardinality"] / results["tuples"]
    # RHS skew: beta_skewness of the RHS alpha/beta parameters, row by row.
    results["rhs_skewness"] = results.apply(
        lambda row: syn_utils.beta_skewness(
            row["rhs_dist_alpha"], row["rhs_dist_beta"]
        ),
        axis="columns",
    )

In [None]:
# For every (relation, measure) pair, rank the candidates by the measure
# (descending; ties put non-AFDs before AFDs), find the position of the
# lowest-ranked AFD, and describe the non-AFD candidates ranked at or above it.
summaries = []
for table, results in difficult_results.items():
    for measure in afd_utils.measure_order:
        ranked = results.sort_values(
            by=[measure, "afd"], ascending=[False, True]
        ).reset_index()
        # Positional index (after reset_index) of the last AFD in the ranking.
        max_afd = ranked.loc[ranked.afd].index.max()
        candidates = ranked.head(max_afd + 1).query("afd == False")

        record = {
            "table": table,
            "measure": measure,
            "max_afd_rank": max_afd + 1,
        }
        # Flatten the describe() statistics into "<property>_<statistic>" keys.
        for prop in ("lhs_uniqueness", "rhs_skewness"):
            stats = candidates[prop].describe()
            record.update(
                {f"{prop}_{stat}": value for stat, value in stats.items()}
            )
        summaries.append(record)

mislabelled = pd.DataFrame(summaries)

## Figure 2 (c), upper part - means of mislabelled candidates in R3 and R6

In [None]:
# Display labels for the pivot columns: file name -> "R<k>".
table_map = {
    v: f"R{k}" for k, v in R.items()
}

# Build a filtered *copy* of the measure order. Assigning
# afd_utils.measure_order directly and calling .remove() on it would mutate
# the module-level list for the rest of the session and make this cell fail
# with a ValueError when it is run a second time.
excluded_measures = {"reliable_fraction_of_information_norm", "mu_plus"}
measures = [m for m in afd_utils.measure_order if m not in excluded_measures]

# Mean LHS uniqueness / RHS skew of the mislabelled candidates, one row per
# remaining measure (in measure order) and one column per relation.
pd.pivot_table(
    mislabelled,
    index="measure",
    columns="table",
    values=["lhs_uniqueness_mean", "rhs_skewness_mean"],
    sort=False,
).loc[measures, :].rename(columns=table_map)

## Figure 2 (c), lower part - means of AFDs and non-AFDs in RWD$^-$

In [None]:
# Load every CSV file in the data directory, keyed by file name, with cleaned
# column names.
rwd_data = {}
for csv_name in os.listdir(data_path):
    if not csv_name.endswith(".csv"):
        continue
    relation = pd.read_csv(os.path.join(data_path, csv_name))
    relation.columns = list(map(afd_utils.clean_colname, relation.columns))
    rwd_data[csv_name] = relation

# Column names under which the expanded output of infer_settings is stored.
settings_columns = [
    "tuples",
    "lhs_cardinality",
    "rhs_cardinality",
    "lhs_dist_alpha",
    "lhs_dist_beta",
    "rhs_dist_alpha",
    "rhs_dist_beta",
    "noise",
]

# Split rwd_minus into AFDs and the rest, then compute the mean LHS uniqueness
# and mean RHS skew of each group.
groups = {
    "AFD(R)": "afd == True",
    "rest": "afd == False",
}
comparison = []
for name, condition in groups.items():
    subset = rwd_minus.query(condition).copy()

    # Infer the settings for every candidate from its two columns (LHS, RHS)
    # of the relation it belongs to.
    subset[settings_columns] = subset.apply(
        lambda row: syn_inferrence.infer_settings(
            rwd_data[row["table"]].loc[:, [row["lhs"], row["rhs"]]],
            noise=False,
        ),
        axis="columns",
        result_type="expand",
    )

    subset["lhs_uniqueness"] = subset["lhs_cardinality"] / subset["tuples"]
    subset["rhs_skewness"] = subset.apply(
        lambda row: syn_utils.beta_skewness(
            row["rhs_dist_alpha"], row["rhs_dist_beta"]
        ),
        axis="columns",
    )

    comparison.append(
        {
            "name": name,
            "lhs_uniqueness_mean": subset["lhs_uniqueness"].mean(),
            "rhs_skewness_mean": subset["rhs_skewness"].mean(),
        }
    )

pd.DataFrame(comparison).set_index("name", drop=True)