# SYN$^s$ Generation

In this notebook, we generate the SYN$^s$ dataset.

## Setup
First, make the project code importable and set the paths to the data and results directories. Furthermore, set some configuration parameters that are used when running on an HPC cluster.

In [None]:
import os
import sys

# for Jupyter notebooks: make the parent directory (the 'code' folder)
# importable so that the project modules can be loaded from this notebook
sys.path.append(os.path.join(os.getcwd(), ".."))

# locations of the input data and of the result files
results_path = "../../results"
data_path = "../../data"

# HPC settings, read from the PBS environment with single-machine fallbacks:
# - workers is used to parallelize measuring candidate FDs using joblib
# - batch_i is used to parallelize measuring datasets on the HPC cluster
workers = int(os.environ.get("PBS_NUM_PPN", 1))
batch_i = int(os.environ.get("PBS_ARRAYID", 0))

# total number of batches that will be run on the HPC
total_batches = 50
# files per batch
batch_size = 1
# this will be doubled: each dataset will be created as an FD and a non-FD
datasets_per_setting = 50

## Define a method to generate SYN$^s$

In [None]:
import copy
import random

from joblib import Parallel, delayed
import pandas as pd
from tqdm import tqdm

from synthetic_data import generator as gen
from synthetic_data import utils as utils


def create_predominant_RHS(n: int, skew_lookup: pd.DataFrame, max_skew: float = 10.0):
    """Generate SYN tables whose RHS skew sweeps from 0 up to ``max_skew``.

    One set of table-level settings (number of tuples, LHS cardinality, LHS
    beta parameters and noise) is drawn at random and shared by every table
    this call generates. The LHS beta parameters are re-drawn until
    ``utils.beta_skewness`` reports a skew of at most 0.5.

    The interval ``(0, max_skew)`` is then split into ``n`` equally wide bins.
    For each bin, an RHS ``(alpha, beta)`` pair is picked at random from the
    entries of ``skew_lookup`` whose ``skew`` lies strictly inside the bin, and
    a fresh RHS cardinality between 5 and half the LHS cardinality is drawn.
    Every resulting setting is generated twice: once with ``fd=True`` and once
    with ``fd=False``.

    Args:
        n: number of skew bins; ``2 * n`` tables are generated.
        skew_lookup: table with a ``skew`` column whose index entries are
            ``(alpha, beta)`` pairs.
        max_skew: upper end of the RHS skew range that is split into bins.

    Returns:
        A list of ``2 * n`` tuples ``(table, settings)``, where ``table`` is the
        result of ``gen.generate_SYN`` and ``settings`` is an independent copy
        of the keyword arguments it was generated with.

    Raises:
        IndexError: if ``skew_lookup`` has no entry inside one of the bins.
    """
    # NOTE: the order of the random draws below is kept stable on purpose, so
    # that seeded runs produce the same sequence of settings.
    tuples = random.randint(100, 10000)
    lhs_alpha = random.randint(58, 100) / 100
    lhs_beta = random.randint(10, 18) / 10
    noise = random.uniform(0.005, 0.02)
    lhs_cardinality = random.randint(int(tuples * 0.20), int(tuples * 0.75))

    # rejection sampling: keep drawing until the LHS distribution is flat enough
    while utils.beta_skewness(lhs_alpha, lhs_beta) > 0.5:
        lhs_alpha = random.randint(58, 100) / 100
        lhs_beta = random.randint(10, 18) / 10

    # the RHS entries are placeholders that are overwritten for every bin
    settings = {
        "tuples": tuples,
        "lhs_cardinality": lhs_cardinality,
        "rhs_cardinality": 0,
        "lhs_dist_alpha": lhs_alpha,
        "lhs_dist_beta": lhs_beta,
        "rhs_dist_alpha": 1.0,
        "rhs_dist_beta": 1.0,
        "noise": noise,
    }

    df_set = []
    for rhs_pred in range(0, n):
        lower_skew = (max_skew * rhs_pred) / n
        upper_skew = (max_skew * rhs_pred + max_skew) / n
        candidates = skew_lookup.query(
            f"skew > {lower_skew} and skew < {upper_skew}"
        ).index
        if len(candidates) == 0:
            # random.choice would raise a context-free IndexError here; keep
            # the exception type but say which bin could not be served
            raise IndexError(
                "skew_lookup has no (alpha, beta) entry with a skew in "
                f"({lower_skew}, {upper_skew})"
            )
        alpha, beta = random.choice(candidates)
        settings["rhs_dist_alpha"] = alpha
        settings["rhs_dist_beta"] = beta
        settings["rhs_cardinality"] = random.randint(
            5, int(settings["lhs_cardinality"] / 2)
        )
        for fd in (True, False):
            settings["fd"] = fd
            # deep-copy the settings: the dict is mutated in later iterations
            df_set.append((gen.generate_SYN(**settings), copy.deepcopy(settings)))
    return df_set

## Generate SYN$^s$

Generate the data using the method defined above. Also, collect and infer the settings used for generation.

In [None]:
from synthetic_data import inferrence

# create_predominant_RHS picks the RHS (alpha, beta) parameters from a lookup
# table that has a "skew" column and is indexed by (alpha, beta). The call
# below previously omitted this required argument, so build the table here.
# NOTE(review): the grid bounds are a reviewer assumption, chosen so that skew
# values between 0 and the default max_skew of 10 are covered (assuming
# utils.beta_skewness is the usual skewness of a Beta distribution) -- TODO
# confirm against the grid that was used for the published dataset.
rhs_alphas = [a / 100 for a in range(2, 101)]
rhs_betas = [b / 10 for b in range(10, 101)]
skew_lookup = pd.DataFrame(
    [
        {"alpha": alpha, "beta": beta, "skew": utils.beta_skewness(alpha, beta)}
        for alpha in rhs_alphas
        for beta in rhs_betas
    ]
).set_index(["alpha", "beta"])

# use the pool configured with n_jobs=workers; calling a fresh Parallel()
# inside the context manager would silently ignore that setting
with Parallel(n_jobs=workers) as parallel:
    predominant_both = parallel(
        delayed(create_predominant_RHS)(datasets_per_setting, skew_lookup)
        for _ in tqdm(range(batch_size))
    )

# flatten the per-file results into three lists that share the same order:
# the generated tables, the settings used, and the settings inferred from them
predominant_dfs = []
predominant_settings = []
predominant_inferred = []
for df_set in tqdm(predominant_both):
    for df, setting in df_set:
        predominant_dfs.append(df)
        predominant_settings.append(setting)
        predominant_inferred.append(inferrence.infer_settings(df))

In [None]:
import os

# write every generated table to <data_path>/syn_s/<batch>_<index>.csv;
# create the target directory first, since to_csv does not create it
syn_s_dir = os.path.join(data_path, "syn_s")
os.makedirs(syn_s_dir, exist_ok=True)
for df_i, predominant_df in enumerate(predominant_dfs):
    predominant_df.to_csv(
        os.path.join(syn_s_dir, f"{batch_i}_{df_i}.csv"), index=False
    )

## Calculate SYN AFD measures

After generating the tables, calculate the AFD measure scores on the generated files.

In [None]:
import pandas as pd
import tqdm

from afd_measures import utils as afd_utils

# measure every generated table; the table name "<batch>_<index>" matches the
# CSV file name and is the key used for merging with the settings below
predominant_results = [
    afd_utils.parallelize_measuring(
        df, f"{batch_i}_{df_i}", "lhs", "rhs", afd_utils.measure_order
    )
    for df_i, df in tqdm.tqdm(
        enumerate(predominant_dfs), total=len(predominant_dfs)
    )
]

# collect the measurements in one table
predominant_results_df = pd.DataFrame(predominant_results)
# merge it with all the settings (generation settings and inferred settings);
# rows are keyed by position, which is the order the tables were generated in
predominant_settings = pd.DataFrame(predominant_settings)
predominant_inferred = pd.DataFrame(predominant_inferred)
predominant_settings["table"] = [
    f"{batch_i}_{row_i}" for row_i in predominant_settings.index
]
predominant_inferred["table"] = [
    f"{batch_i}_{row_i}" for row_i in predominant_inferred.index
]
predominant_results_df = predominant_results_df.merge(
    predominant_settings, on="table", suffixes=("", "_set")
).merge(predominant_inferred, on="table", suffixes=("", "_inferred"))

# store result to a CSV; make sure the results directory exists first
os.makedirs(results_path, exist_ok=True)
predominant_results_df.to_csv(
    os.path.join(results_path, f"syn_s_results_{batch_i}.csv"), index=False
)