# Sensitivity Analysis

Generate the plots showing the separation on the synthetic datasets $Err$, $Uniq$, and $Skew$.

In [None]:
import os
import sys
from typing import Dict, Iterable, Optional, Tuple

import pandas as pd
import numpy as np

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils
from synthetic_data import utils as syn_utils

# Directory (relative to the current working directory) that the cells below
# scan for `syn_*_results_*.csv` files.
results_path = "../../results"

def make_SYN_data(
    df: pd.DataFrame,
    x: str,
    bins: int = 21,
    min_val: float = 0.0,
    max_val: float = 1.0,
    measures: Optional[Iterable[str]] = None,
) -> Dict[str, pd.DataFrame]:
    """Bin the results along column `x` and average every measure per bin.

    Args:
        df: Results table. It must contain the column `x`, a column `fd`
            that compares equal to `True`/`False`, and one column per measure.
            The input is copied and never modified.
        x: Name of the column whose values are binned.
        bins: Number of bin *edges* (so `bins - 1` bins are created).
        min_val: Left-most bin edge (inclusive).
        max_val: Right-most bin edge (inclusive).
        measures: Measure columns to aggregate. Defaults to
            `afd_utils.measure_order` when omitted.

    Returns:
        A dictionary mapping each measure name to a DataFrame indexed by the
        left edge of each bin with the columns `fd` and `random`, holding the
        mean measure value of the rows with `fd == True` and `fd == False`
        respectively. Bins that lack a mean for either of the two columns are
        dropped.
    """
    if measures is None:
        measures = afd_utils.measure_order

    _df = df.copy()
    _bins = np.linspace(min_val, max_val, num=bins)
    # Every bin is labelled with its left edge; values outside
    # [min_val, max_val] get no group and therefore do not enter any mean.
    _df["group"] = pd.cut(_df[x], bins=_bins, include_lowest=True, labels=_bins[:-1])

    dataset_dfs = {}
    for measure in measures:
        _local_df = pd.DataFrame(index=_bins)
        for fd in (True, False):
            subset = _df[_df["fd"] == fd]
            # observed=False keeps empty bins as NaN rows; they are removed by
            # the dropna() below.
            means = subset.groupby("group", observed=False)[measure].mean()
            # Replace the categorical index by plain floats so that the
            # assignment aligns on the numeric bin edges of `_local_df`.
            means.index = means.index.astype(float)
            _local_df["fd" if fd else "random"] = means
        dataset_dfs[measure] = _local_df.dropna().copy()
    return dataset_dfs

## Figure 3 - measure values on SYN

### $Err$ data

In [None]:
# The first result chunk acts as a sentinel for "results have been generated".
if not os.path.exists(os.path.join(results_path, "syn_e_results_0.csv")):
    raise ValueError("No SYN results found. Execute `generate_syn_e.ipynb` first.")

# Read all result chunks and concatenate them once; growing a DataFrame with
# `pd.concat` inside a loop copies the accumulated rows on every iteration.
noisy_results = pd.concat(
    [
        pd.read_csv(os.path.join(results_path, file))
        for file in os.listdir(results_path)
        if file.startswith("syn_e_results_") and file.endswith(".csv")
    ]
)

# Only rows with n_type == 'copy' are binned, by their `noise` value.
_df = noisy_results.query("n_type == 'copy'").copy()
noisy_bins = make_SYN_data(_df, "noise", bins=21, min_val=0.0, max_val=0.1)
for measure, df in noisy_bins.items():
    # Per-bin gap between the mean value on FD rows and on random rows.
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_error_{measure}.dat",
        sep="\t",
        index_label="error",
    )

### $Uniq$ data

In [None]:
# The first result chunk acts as a sentinel for "results have been generated".
if not os.path.exists(os.path.join(results_path, "syn_u_results_0.csv")):
    raise ValueError("No SYN results found. Execute `generate_syn_u.ipynb` first.")

# Read all result chunks and concatenate them once; growing a DataFrame with
# `pd.concat` inside a loop copies the accumulated rows on every iteration.
keylike_results = pd.concat(
    [
        pd.read_csv(os.path.join(results_path, file))
        for file in os.listdir(results_path)
        if file.startswith("syn_u_results_") and file.endswith(".csv")
    ]
)

# Relative uniqueness of the LHS: inferred LHS cardinality per inferred tuple.
keylike_results.loc[:, "lhs_relative_uniqueness"] = (
    keylike_results.loc[:, "lhs_cardinality_inferred"]
    / keylike_results.loc[:, "tuples_inferred"]
)

keylike_bins = make_SYN_data(
    keylike_results, "lhs_relative_uniqueness", bins=11, min_val=0.0, max_val=1.0
)
for measure, df in keylike_bins.items():
    # Per-bin gap between the mean value on FD rows and on random rows.
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_keylike_{measure}.dat",
        sep="\t",
        index_label="lhs_uniq",
    )

### $Skew$ data

In [None]:
# The first result chunk acts as a sentinel for "results have been generated".
# NOTE(review): the sibling cells point to `generate_syn_*.ipynb`; confirm that
# `create_syn_s.ipynb` is the intended notebook name in the message below.
if not os.path.exists(os.path.join(results_path, "syn_s_results_0.csv")):
    raise ValueError("No SYN results found. Execute `create_syn_s.ipynb` first.")

# Read all result chunks and concatenate them once; growing a DataFrame with
# `pd.concat` inside a loop copies the accumulated rows on every iteration.
predominant_results = pd.concat(
    [
        pd.read_csv(os.path.join(results_path, file))
        for file in os.listdir(results_path)
        if file.startswith("syn_s_results_") and file.endswith(".csv")
    ]
)

# Derive the RHS skew row by row from the inferred alpha/beta parameters via
# `syn_utils.beta_skewness`.
predominant_results.loc[:, "rhs_skew"] = predominant_results.loc[
    :, ["rhs_dist_alpha_inferred", "rhs_dist_beta_inferred"]
].apply(
    lambda row: syn_utils.beta_skewness(
        row["rhs_dist_alpha_inferred"], row["rhs_dist_beta_inferred"]
    ),
    axis="columns",
)

skew_bins = make_SYN_data(
    predominant_results, "rhs_skew", bins=11, min_val=0.0, max_val=10.0
)
for measure, df in skew_bins.items():
    # Per-bin gap between the mean value on FD rows and on random rows.
    df["difference"] = df["fd"] - df["random"]
    df.to_csv(
        f"../../paper/syn_skew_{measure}.dat",
        sep="\t",
        index_label="rhs_skew",
    )

## Interactive plots

Using `plotly`, we plot the same data as interactive plots, making it possible to inspect them in more detail.

In [None]:
from typing import Dict, List

import plotly.graph_objects as go
import plotly.colors as plotly_colors

# RGB triples for the traces; `create_interactive_plots` picks one by the
# position of a measure within its class, so there must be at least as many
# entries as measures in the largest class.
colors = [
    (55, 126, 184),
    (77, 175, 74),
    (152, 78, 163),
    (255, 127, 0),
    (166, 86, 40),
]

# Measures grouped by class; one figure is drawn per class.
measure_classes = {
    "Violation": ["rho", "g2", "g3", "g3_prime"],
    "Shannon": [
        "shannon_g1_plus",
        "fraction_of_information",
        "reliable_fraction_of_information_plus",
        "reliable_fraction_of_information_norm",
        "smoothed_fraction_of_information",
    ],
    "Logical": ["g1_prime", "pdep", "tau", "mu_plus"],
}

# LaTeX legend label for every measure column. All labels containing a
# backslash are raw strings so that no sequence is read as a string escape.
measure_map = {
    "mu_plus": r"$\mu^+$",
    "g3_prime": "$g'_3$",
    "g3": r"$g_3$",
    "pdep": r"$\text{pdep}$",
    "tau": r"$\tau$",
    "rho": r"$\rho$",
    "g2": r"$g_2$",
    "fraction_of_information": r"$\text{FI}$",
    "reliable_fraction_of_information_norm": r"$\text{RFI}^{'+}$",
    "g1_prime": r"$g_1, g'_1$",
    "g1": r"$g_1$",
    "reliable_fraction_of_information_plus": r"$\text{RFI}^+$",
    "smoothed_fraction_of_information": r"$\text{SFI}$",
    "shannon_g1_plus": r"$g_1^{S}$",
}


def create_interactive_plots(data_tables: Dict[str, pd.DataFrame], x_axis_title: str, syn_name: str) -> List[go.Figure]:
    """
    Build one plotly line plot per measure class listed in `measure_classes`.

    `data_tables` maps a measure name to a table with the columns `fd` and
    `random` (as produced by `make_SYN_data`). Each measure contributes two
    traces in the same colour: a solid line for `fd` and a dashed line for
    `random`. Only the `fd` trace gets a legend entry; both traces share a
    legend group named after the measure.

    Returns the figures in the iteration order of `measure_classes`.
    """
    dash_by_version = {"fd": "solid", "random": "dash"}
    legend = {
        "orientation": "h",
        "yanchor": "top",
        "y": 1.05,
        "xanchor": "center",
        "x": 0.5,
        "entrywidth": 50,
    }

    figures = []
    for measure_class, measures in measure_classes.items():
        layout = {
            "width": 1000,
            "height": 750,
            "xaxis": {"title": x_axis_title},
            "yaxis": {"title": "measure values", "range": [-0.05, 1.05]},
            "legend": dict(legend),
            "title": f"Differences of binned measure values of class {measure_class} for ${syn_name}$",
        }
        fig = go.Figure(layout=layout)
        for position, measure in enumerate(measures):
            table = data_tables[measure]
            for version, dash in dash_by_version.items():
                trace = go.Scatter(
                    x=table.index,
                    y=table.loc[:, version],
                    mode="lines",
                    # Colour is chosen by the measure's position in its class.
                    line={"color": f"rgb{colors[position]}", "dash": dash},
                    name=measure_map[measure],
                    showlegend=version == "fd",
                    legendgroup=measure,
                )
                fig.add_trace(trace)
        figures.append(fig)
    return figures

### $Err$ data

In [None]:
# Build the per-class figures for the error-rate bins, then display each one.
err_figures = create_interactive_plots(noisy_bins, x_axis_title="Error rate", syn_name="SYN^e")
for fig in err_figures:
    fig.show()

### $Uniq$ data

In [None]:
# Build the per-class figures for the LHS-uniqueness bins, then display each one.
uniq_figures = create_interactive_plots(keylike_bins, x_axis_title="LHS uniqueness", syn_name="SYN^u")
for fig in uniq_figures:
    fig.show()

### $Skew$ data

In [None]:
# Build the per-class figures for the RHS-skew bins, then display each one.
skew_figures = create_interactive_plots(skew_bins, x_axis_title="RHS skew", syn_name="SYN^s")
for fig in skew_figures:
    fig.show()