# SYN plot

In this notebook, we generate the data to plot the figures for SYN.

## Setup

Set the execution path for Python and set the results path.

In [None]:
import sys
import os

# for Jupyter notebooks: add the path of 'code' (the parent of the current
# working directory) to the module search path to allow importing modules.
_code_path = os.path.join(os.getcwd(), '..')
# Only append once, so re-running this cell does not keep growing sys.path.
if _code_path not in sys.path:
    sys.path.append(_code_path)

# Directory the result CSV files are read from, relative to the working directory.
results_path = "../../results"

## Define generic method

In [None]:
from typing import Dict, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

# Column names of the measures that are aggregated, in the order in which
# their tables are produced by `make_SYN_err_data`.
measure_order = [
    "rho",
    "g2",
    "g3",
    "g3_prime",
    "fraction_of_information",
    "reliable_fraction_of_information_prime",
    "smoothed_fraction_of_information",
    "g1",
    "g1_prime",
    "pdep",
    "tau",
    "mu_prime",
]

def make_SYN_err_data(
    df: pd.DataFrame,
    x: str,
    bins: int = 21,
    min_val: float = 0.0,
    max_val: float = 1.0,
    measures: Optional[Sequence[str]] = None,
) -> Dict[str, pd.DataFrame]:
    """Average every measure per bin of ``x``, separately for ``fd`` True/False.

    The values of column ``x`` are cut into equal-width intervals whose edges
    are ``bins`` evenly spaced points between ``min_val`` and ``max_val``
    (i.e. ``bins - 1`` intervals). Each interval is labelled with its left
    edge. Rows whose ``x`` value lies outside ``[min_val, max_val]`` fall into
    no interval and are ignored.

    Parameters
    ----------
    df
        Input rows. Must contain the column ``x``, a column ``fd`` and one
        column per requested measure. The frame is not modified.
    x
        Name of the column that is binned.
    bins
        Number of bin *edges* (not intervals).
    min_val, max_val
        Lowest and highest bin edge.
    measures
        Names of the measure columns to aggregate. Defaults to the
        module-level ``measure_order``.

    Returns
    -------
    Dict[str, pd.DataFrame]
        One table per measure, keyed by the measure name. Each table is
        indexed by the left bin edges and has the columns ``fd`` (mean over
        rows with ``fd == True``) and ``random`` (mean over rows with
        ``fd == False``). Bins without a mean in either column are dropped.
    """
    if measures is None:
        measures = measure_order

    edges = np.linspace(min_val, max_val, num=bins)
    binned = df.copy()
    binned["group"] = pd.cut(binned[x], bins=edges, include_lowest=True, labels=edges[:-1])

    tables: Dict[str, pd.DataFrame] = {}
    for measure in measures:
        table = pd.DataFrame(index=edges)
        for is_fd, column in ((True, "fd"), (False, "random")):
            # `observed=False` keeps the result independent of the pandas
            # version's default; empty bins yield NaN and are dropped below.
            means = (
                binned[binned["fd"] == is_fd]
                .groupby("group", observed=False)[measure]
                .mean()
            )
            # The group labels are the left edges; convert the categorical
            # index to floats so the alignment with `edges` is explicit.
            means.index = means.index.astype(float)
            table[column] = means.reindex(table.index)
        # The last edge never labels a bin, so it is always removed here.
        tables[measure] = table.dropna().copy()
    return tables


## Create the SYN$^e$ plot data

First, collect the results of SYN$^e$ and generate the plotting data for it.

In [None]:
import os

import pandas as pd

# Fail early when the first result file is missing.
if not os.path.exists(os.path.join(results_path, 'syn_e_results_0.csv')):
    raise ValueError('No SYN results found. Execute `create_syn_e.ipynb` first.')

# Read all partial result files and concatenate them once. Sorting the file
# names makes the row order reproducible (os.listdir order is unspecified).
syn_e_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith('syn_e_results_') and name.endswith('.csv')
)
noisy_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in syn_e_files],
    ignore_index=True,
)

# Write one table per noise type and measure, binned by the `noise` column.
for noise_type in ("copy", "bogus", "typo"):
    noise_subset = noisy_results.query("n_type == @noise_type")
    data_tables = make_SYN_err_data(noise_subset, 'noise', bins=21, min_val=0.0, max_val=0.1)
    for measure, df in data_tables.items():
        df.to_csv(f'../../paper/figure2_syn_error_{noise_type}_{measure}.dat', sep='\t', index_label='error')

## Create the SYN$^u$ plot data

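Next, collect the results of SYN$^u$, derive the relative uniqueness of the LHS (inferred LHS cardinality divided by the inferred number of tuples), and generate the plotting data for it.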

In [None]:
import os

import pandas as pd

# Fail early when the first result file is missing.
if not os.path.exists(os.path.join(results_path, 'syn_u_results_0.csv')):
    # NOTE(review): the message previously named `create_syn_s.ipynb`; changed
    # to `create_syn_u.ipynb` by analogy with the other sections — confirm the
    # notebook name.
    raise ValueError('No SYN results found. Execute `create_syn_u.ipynb` first.')

# Read all partial result files and concatenate them once. Sorting the file
# names makes the row order reproducible (os.listdir order is unspecified).
syn_u_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith('syn_u_results_') and name.endswith('.csv')
)
keylike_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in syn_u_files],
    ignore_index=True,
)

# Relative uniqueness of the LHS: inferred LHS cardinality per inferred tuple.
keylike_results["lhs_relative_uniqueness"] = (
    keylike_results["lhs_cardinality_inferred"] / keylike_results["tuples_inferred"]
)

# Write one table per measure, binned by the relative uniqueness.
data_tables = make_SYN_err_data(keylike_results, 'lhs_relative_uniqueness', bins=11, min_val=0.0, max_val=1.0)
for measure, df in data_tables.items():
    df.to_csv(f'../../paper/figure2_syn_keylike_{measure}.dat', sep='\t', index_label='lhs_uniq')

## Create the SYN$^s$ plot data

Finally, collect the results of SYN$^s$, compute the skewness of the RHS distribution from its inferred alpha and beta parameters, and generate the plotting data for it.

In [None]:
import os

import pandas as pd

# Fail early when the first result file is missing.
if not os.path.exists(os.path.join(results_path, 'syn_s_results_0.csv')):
    raise ValueError('No SYN results found. Execute `create_syn_s.ipynb` first.')

# Read all partial result files and concatenate them once. Sorting the file
# names makes the row order reproducible (os.listdir order is unspecified).
syn_s_files = sorted(
    name
    for name in os.listdir(results_path)
    if name.startswith('syn_s_results_') and name.endswith('.csv')
)
predominant_results = pd.concat(
    [pd.read_csv(os.path.join(results_path, name)) for name in syn_s_files],
    ignore_index=True,
)

# NOTE(review): `sd` is not imported in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. The module that provides
# `beta_skewness` must be imported first — confirm which module that is.
predominant_results["rhs_skew"] = predominant_results[
    ["rhs_dist_alpha_inferred", "rhs_dist_beta_inferred"]
].apply(
    lambda row: sd.beta_skewness(row["rhs_dist_alpha_inferred"], row["rhs_dist_beta_inferred"]),
    axis="columns",
)

# Write one table per measure, binned by the RHS skewness.
data_tables = make_SYN_err_data(predominant_results, 'rhs_skew', bins=11, min_val=0.0, max_val=10.0)
for measure, df in data_tables.items():
    df.to_csv(f'../../paper/figure2_syn_rhsskew_{measure}.dat', sep='\t', index_label='rhs_skew')