# Sensitivity Analysis

Generate the data for the plots showing the separation on the synthetic datasets $Err$, $Uniq$ and $Skew$.

In [None]:
import os
import sys
from typing import Dict, Iterable, Optional, Tuple

import pandas as pd
import numpy as np

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils
from synthetic_data import utils as syn_utils

# Directory that holds the `syn_*_results_*.csv` files read by the cells below
# (a relative path, so it is resolved against the current working directory).
results_path = "../../results"

def make_SYN_data(
    df: pd.DataFrame,
    x: str,
    bins: int = 21,
    min_val: float = 0.0,
    max_val: float = 1.0,
    measures: Optional[Iterable[str]] = None,
) -> Dict[str, pd.DataFrame]:
    """Bin the rows of `df` by column `x` and average each measure per bin.

    `np.linspace(min_val, max_val, num=bins)` yields `bins` edges, i.e.
    `bins - 1` equal-width intervals. Every interval is labelled with its left
    edge and the lowest edge is inclusive. Rows whose `x` value lies outside
    `[min_val, max_val]` fall into no interval and do not contribute.

    Args:
        df: Results table. Must contain the column `x`, a column `fd` that is
            compared against `True`/`False`, and one column per measure.
            The frame is not modified.
        x: Name of the column to bin on.
        bins: Number of bin edges (not the number of intervals).
        min_val: Lowest bin edge.
        max_val: Highest bin edge.
        measures: Column names to aggregate. Defaults to
            `afd_utils.measure_order` when omitted.

    Returns:
        A dict mapping every measure to a DataFrame indexed by the left bin
        edge with two columns: `"fd"` (mean of the measure over rows with
        `fd == True`) and `"random"` (mean over rows with `fd == False`).
        Bins for which either mean is missing are dropped.
    """
    if measures is None:
        measures = afd_utils.measure_order

    edges = np.linspace(min_val, max_val, num=bins)

    # Work on a copy so the caller's frame never gains the helper column.
    binned = df.copy()
    binned["group"] = pd.cut(
        binned[x], bins=edges, include_lowest=True, labels=edges[:-1]
    )

    # The fd / random split does not depend on the measure, so group once.
    # `observed=False` pins the historical pandas behaviour for categorical
    # keys (empty bins yield NaN) instead of relying on the changing default.
    grouped_by_class = []
    for column, is_fd in (("fd", True), ("random", False)):
        subset = binned.query("fd == @is_fd")
        grouped_by_class.append((column, subset.groupby("group", observed=False)))

    dataset_dfs = {}
    for measure in measures:
        # Index on all edges; the last edge labels no bin, stays NaN and is
        # removed by dropna() together with bins lacking data in either class.
        table = pd.DataFrame(index=edges)
        for column, grouped in grouped_by_class:
            table[column] = grouped[measure].mean()
        dataset_dfs[measure] = table.dropna().copy()
    return dataset_dfs

## Figure 1 - separation ability of SYN

### $Err$ data

In [None]:
# Guard: without the first result shard there is nothing to aggregate.
if not os.path.exists(os.path.join(results_path, "syn_e_results_0.csv")):
    raise ValueError("No SYN results found. Execute `generate_syn_e.ipynb` first.")

# Collect all result shards and concatenate them in a single call. The listing
# is sorted because os.listdir returns entries in arbitrary order; sorting
# keeps the row order of the combined frame reproducible.
noisy_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith("syn_e_results_") and name.endswith(".csv")
)
noisy_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in noisy_files]
)

# Only rows whose `n_type` equals 'copy' are binned (on the `noise` column).
_df = noisy_results.query("n_type == 'copy'").copy()
noisy_bins = make_SYN_data(_df, "noise", bins=21, min_val=0.0, max_val=0.1)

# One tab-separated file per measure; the bin index is written as `error`.
for measure, df in noisy_bins.items():
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_error_{measure}.dat",
        sep="\t",
        index_label="error",
    )

### $Uniq$ data

In [None]:
# Guard: without the first result shard there is nothing to aggregate.
if not os.path.exists(os.path.join(results_path, "syn_u_results_0.csv")):
    raise ValueError("No SYN results found. Execute `generate_syn_u.ipynb` first.")

# Collect all result shards and concatenate them in a single call. The listing
# is sorted because os.listdir returns entries in arbitrary order; sorting
# keeps the row order of the combined frame reproducible.
keylike_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith("syn_u_results_") and name.endswith(".csv")
)
keylike_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in keylike_files]
)

# Binning variable: inferred LHS cardinality divided by inferred tuple count.
keylike_results["lhs_relative_uniqueness"] = (
    keylike_results["lhs_cardinality_inferred"] / keylike_results["tuples_inferred"]
)

keylike_bins = make_SYN_data(
    keylike_results, "lhs_relative_uniqueness", bins=11, min_val=0.0, max_val=1.0
)

# One tab-separated file per measure; the bin index is written as `lhs_uniq`.
for measure, df in keylike_bins.items():
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_keylike_{measure}.dat",
        sep="\t",
        index_label="lhs_uniq",
    )

### $Skew$ data

In [None]:
# Guard: without the first result shard there is nothing to aggregate.
# NOTE(review): this hint names `create_syn_s.ipynb`, whereas the $Err$ and
# $Uniq$ cells point at `generate_syn_*.ipynb` -- confirm the notebook name.
if not os.path.exists(os.path.join(results_path, "syn_s_results_0.csv")):
    raise ValueError("No SYN results found. Execute `create_syn_s.ipynb` first.")

# Collect all result shards and concatenate them in a single call. The listing
# is sorted because os.listdir returns entries in arbitrary order; sorting
# keeps the row order of the combined frame reproducible.
predominant_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith("syn_s_results_") and name.endswith(".csv")
)
predominant_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in predominant_files]
)

# Binning variable: `syn_utils.beta_skewness` applied row by row to the
# inferred alpha / beta parameters of the RHS distribution.
predominant_results["rhs_skew"] = predominant_results[
    ["rhs_dist_alpha_inferred", "rhs_dist_beta_inferred"]
].apply(
    lambda row: syn_utils.beta_skewness(
        row["rhs_dist_alpha_inferred"], row["rhs_dist_beta_inferred"]
    ),
    axis="columns",
)

skew_bins = make_SYN_data(
    predominant_results, "rhs_skew", bins=11, min_val=0.0, max_val=10.0
)

# One tab-separated file per measure; the bin index is written as `rhs_skew`.
for measure, df in skew_bins.items():
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_skew_{measure}.dat",
        sep="\t",
        index_label="rhs_skew",
    )