# SYN$^u$ Generation

In this notebook, we generate the SYN$^u$ dataset.

## Setup
First, make the project's `code` directory importable and define the data and results paths. Furthermore, set some configuration parameters that are used when running on an HPC cluster.

In [None]:
import os
import sys

# Jupyter notebooks: put the parent directory (the 'code' folder) on the import
# path so that its modules can be imported from this notebook
sys.path.append(os.path.join(os.getcwd(), ".."))

# input and output locations, relative to the notebook's working directory
data_path = "../../data"
results_path = "../../results"

# --- HPC parallelization (PBS environment variables, with local fallbacks) ---
# index of this array job; used to parallelize over batches on the HPC cluster
# and to label the files written by this batch
batch_i = int(os.environ.get("PBS_ARRAYID", 0))
# number of joblib workers available to this job
workers = int(os.environ.get("PBS_NUM_PPN", 1))
# total number of batches that will be run on the HPC
total_batches = 50

# --- workload of a single batch ---
# number of generation runs per batch
batch_size = 1
# this will be doubled: each dataset is created once as an FD and once as a non-FD
datasets_per_setting = 50

## Define a method to generate SYN$^u$

In [None]:
import copy
import random

from joblib import Parallel, delayed
import pandas as pd
from tqdm import tqdm

from synthetic_data import generator as gen
from synthetic_data import utils as utils


def create_key_like_LHS(n: int):
    """Generate ``2 * n`` synthetic tables whose LHS cardinality grows step by step.

    One set of base settings (number of tuples, beta-distribution parameters for
    both sides, noise level) is drawn at random and shared by all tables. The
    distribution parameters of each side are re-drawn until
    ``utils.beta_skewness`` reports a skewness of at most 1.0. For step
    ``k = 1..n`` the LHS cardinality is set to ``int(k / 5 * tuples)``, the RHS
    cardinality is drawn from ``[5, lhs_cardinality // 2]``, and the generator is
    called twice: once with ``fd=True`` and once with ``fd=False``.

    Args:
        n: Number of LHS-cardinality steps.

    Returns:
        A list of ``(table, settings)`` tuples, where ``table`` is whatever
        ``gen.generate_SYN`` returns and ``settings`` is an independent copy of
        the keyword arguments used to generate it.
    """

    def draw_alpha():
        return random.randint(35, 100) / 100

    def draw_beta():
        return random.randint(10, 37) / 10

    # NOTE: the values are evaluated top to bottom, which fixes the order of the
    # random draws; the key order is also the column order of the settings table.
    settings = {
        "tuples": random.randint(100, 10000),
        "lhs_cardinality": 0,
        "rhs_cardinality": 0,
        "lhs_dist_alpha": draw_alpha(),
        "lhs_dist_beta": draw_beta(),
        "rhs_dist_alpha": draw_alpha(),
        "rhs_dist_beta": draw_beta(),
        "noise": random.uniform(0.005, 0.02),
    }

    # rejection sampling: re-draw a side's parameters while it is too skewed
    for side in ("lhs", "rhs"):
        alpha_key = f"{side}_dist_alpha"
        beta_key = f"{side}_dist_beta"
        while utils.beta_skewness(settings[alpha_key], settings[beta_key]) > 1.0:
            settings[alpha_key] = draw_alpha()
            settings[beta_key] = draw_beta()

    generated = []
    for step in range(1, n + 1):
        # the LHS cardinality grows in increments of a fifth of the tuple count
        settings["lhs_cardinality"] = int((step / 5) * settings["tuples"])
        rhs_upper = int(settings["lhs_cardinality"] / 2)
        settings["rhs_cardinality"] = random.randint(5, rhs_upper)
        for is_fd in (True, False):
            settings["fd"] = is_fd
            table = gen.generate_SYN(**settings)
            # snapshot the settings, the dict is mutated again in the next round
            generated.append((table, copy.deepcopy(settings)))
    return generated

## Generate SYN$^u$

Generate the data using the method defined above. Also, collect and infer the settings used for generation.

In [None]:
from synthetic_data import inferrence

# Generate the datasets. The pool opened by the context manager has to be the
# one that is called: a fresh `Parallel()` would use joblib's default `n_jobs`
# and silently ignore the configured number of `workers`.
with Parallel(n_jobs=workers) as parallel:
    keylike_both = parallel(
        delayed(create_key_like_LHS)(datasets_per_setting)
        for _ in tqdm(range(batch_size))
    )

# Flatten the per-run lists of (DataFrame, settings) pairs into three parallel
# lists; position i in each list refers to the same dataset.
keylike_dfs = []
keylike_settings = []
keylike_inferred = []
for df_set in tqdm(keylike_both):
    for df, setting in df_set:
        keylike_dfs.append(df)
        keylike_settings.append(setting)
        # settings inferred back from the generated data itself
        keylike_inferred.append(inferrence.infer_settings(df))

In [None]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists before writing (a no-op if it is already there)
syn_u_path = os.path.join(data_path, "syn_u")
os.makedirs(syn_u_path, exist_ok=True)

# one CSV per dataset, named "<batch index>_<dataset index>.csv"
for df_i, keylike_df in enumerate(keylike_dfs):
    keylike_df.to_csv(os.path.join(syn_u_path, f"{batch_i}_{df_i}.csv"), index=False)

## Calculate SYN$^u$ AFD measures

After generating the tables, calculate the AFD measure scores on the generated files.

In [None]:
import pandas as pd

# import the callable (not the module) so that the name `tqdm` keeps the meaning
# it has in the earlier cells and those cells stay re-runnable
from tqdm import tqdm

from afd_measures import utils as afd_utils

# measure every generated dataset for the candidate FD "lhs" -> "rhs";
# the table name "<batch index>_<dataset index>" matches the CSV file name
keylike_results = []
for df_i, df in enumerate(tqdm(keylike_dfs)):
    keylike_results.append(
        afd_utils.parallelize_measuring(
            df, f"{batch_i}_{df_i}", "lhs", "rhs", afd_utils.measure_order
        )
    )

# collect the measure results in a single table
# NOTE(review): the merges below need a "table" column in the results, presumably
# filled by parallelize_measuring from the table name passed above - confirm.
keylike_results_df = pd.DataFrame(keylike_results)

# turn the generation settings and the inferred settings into tables and label
# each row with the table name of the dataset at the same position
keylike_settings = pd.DataFrame(keylike_settings)
keylike_inferred = pd.DataFrame(keylike_inferred)
keylike_settings["table"] = [f"{batch_i}_{df_i}" for df_i in keylike_settings.index]
keylike_inferred["table"] = [f"{batch_i}_{df_i}" for df_i in keylike_inferred.index]

# merge the results with all the settings; clashing column names from the
# settings get the suffix "_set", those from the inferred settings "_inferred"
keylike_results_df = keylike_results_df.merge(
    keylike_settings, on="table", suffixes=("", "_set")
).merge(keylike_inferred, on="table", suffixes=("", "_inferred"))

# store result to a CSV (create the results folder first if it is missing)
os.makedirs(results_path, exist_ok=True)
keylike_results_df.to_csv(
    os.path.join(results_path, f"syn_u_results_{batch_i}.csv"), index=False
)