# SYN$^e$ Generation

In this notebook, we generate the SYN$^e$ dataset.

## Setup
First, load all the files from the RWD dataset. Furthermore, set some configuration parameters if running on an HPC cluster.

In [None]:
import os
import sys

# Put the parent directory of the current working directory on the import
# path so that the project's modules can be imported from this notebook.
sys.path.append(os.path.join(os.getcwd(), ".."))

# Where generated tables and measurement results are written.
data_path = "../../data"
results_path = "../../results"

# HPC settings, read from the PBS environment with single-machine fallbacks:
#   batch_i - index of this batch; parallelizes measuring datasets on the cluster
#   workers - parallelizes measuring candidate FDs using joblib
batch_i = int(os.environ.get("PBS_ARRAYID", 0))
workers = int(os.environ.get("PBS_NUM_PPN", 1))

# Total number of batches that will be run on the HPC.
total_batches = 50
# Files per batch.
batch_size = 1
# This will be doubled: each dataset is created once as an FD and once as a non-FD.
datasets_per_setting = 50

## Define a method to generate SYN$^e$

In [None]:
import copy
import random

from joblib import Parallel, delayed
import pandas as pd
from tqdm import tqdm

from synthetic_data import generator as gen
from synthetic_data import utils as utils


def create_noisy(n: int, max_noise: float = 0.1):
    """Generate a family of tables that share one base configuration but differ in noise.

    A single base configuration is drawn at random:

    * ``tuples``: integer in [100, 10000].
    * LHS beta-distribution parameters: alpha in [0.58, 1.00], beta in [1.0, 1.8];
      the RHS starts at alpha = beta = 1.0.
    * For each side, while ``utils.beta_skewness`` of its parameters exceeds 1.0,
      both parameters are redrawn (alpha in [0.35, 1.00], beta in [1.0, 3.7]).
    * ``lhs_cardinality``: between 20% and 75% of ``tuples``.
    * ``rhs_cardinality``: between 5 and half of ``lhs_cardinality``.

    The noise range ``[0, max_noise]`` is then split into ``n`` equally wide
    buckets. For every noise type ("copy", "bogus", "typo"), every bucket and
    both ``fd=True`` and ``fd=False``, a noise level is drawn uniformly from
    the bucket and ``gen.generate_SYN`` is called with the resulting settings.

    Args:
        n: Number of noise buckets per noise type.
        max_noise: Upper end of the noise range (0.1, i.e. 10%, by default).

    Returns:
        A list of ``3 * n * 2`` tuples ``(table, settings)``, where ``table`` is
        the return value of ``gen.generate_SYN`` and ``settings`` is an
        independent dict holding the keyword arguments used for that call.
    """

    def too_skewed(side: str) -> bool:
        # True while the side's beta distribution is more skewed than allowed.
        alpha = settings[f"{side}_dist_alpha"]
        beta = settings[f"{side}_dist_beta"]
        return utils.beta_skewness(alpha, beta) > 1.0

    n_tuples = random.randint(100, 10000)
    # The cardinalities are placeholders here; they are filled in below, once
    # the distribution parameters are settled.
    settings = {
        "tuples": n_tuples,
        "lhs_cardinality": 0,
        "rhs_cardinality": 0,
        "lhs_dist_alpha": random.randint(58, 100) / 100,
        "lhs_dist_beta": random.randint(10, 18) / 10,
        "rhs_dist_alpha": 1.0,
        "rhs_dist_beta": 1.0,
    }

    for side in ("lhs", "rhs"):
        while too_skewed(side):
            settings[f"{side}_dist_alpha"] = random.randint(35, 100) / 100
            settings[f"{side}_dist_beta"] = random.randint(10, 37) / 10

    lhs_cardinality = random.randint(int(n_tuples * 0.20), int(n_tuples * 0.75))
    settings["lhs_cardinality"] = lhs_cardinality
    settings["rhs_cardinality"] = random.randint(5, int(lhs_cardinality / 2))

    generated = []
    for noise_type in ("copy", "bogus", "typo"):
        for bucket in range(n):
            # Bounds of the bucket-th slice of [0, max_noise].
            lower = (bucket / n) * max_noise
            upper = ((bucket + 1) / n) * max_noise
            for is_fd in (True, False):
                # A fresh dict per table, so stored settings never alias.
                variant = {
                    **settings,
                    "fd": is_fd,
                    "noise": random.uniform(lower, upper),
                    "n_type": noise_type,
                }
                generated.append((gen.generate_SYN(**variant), variant))
    return generated

## Generate SYN$^e$

Generate the data using the method defined above. Also, collect and infer the settings used for generation.

In [None]:
from synthetic_data import inferrence

# Generate `batch_size` families of tables. The calls go through the pool
# opened by the context manager so that `workers` is honoured; instantiating
# a fresh `Parallel()` here would ignore `n_jobs=workers`.
with Parallel(n_jobs=workers) as parallel:
    noisy_both = parallel(
        delayed(create_noisy)(datasets_per_setting) for _ in tqdm(range(batch_size))
    )

# Flatten the per-family lists of (table, settings) pairs into three parallel
# lists: the tables, the settings used to generate them, and the settings
# inferred back from each generated table.
noisy_dfs = []
noisy_settings = []
noisy_inferred = []
for df_set in tqdm(noisy_both):
    for df, setting in df_set:
        noisy_dfs.append(df)
        noisy_settings.append(setting)
        noisy_inferred.append(inferrence.infer_settings(df))

In [None]:
import os

# Write every generated table to <data_path>/syn_e/<batch>_<index>.csv.
# The target folder is created first so a fresh checkout does not fail on the
# first write; `exist_ok=True` keeps this safe when several batches run at once.
syn_e_dir = os.path.join(data_path, "syn_e")
os.makedirs(syn_e_dir, exist_ok=True)

for df_i, noisy_df in enumerate(noisy_dfs):
    noisy_df.to_csv(os.path.join(syn_e_dir, f"{batch_i}_{df_i}.csv"), index=False)

## Calculate SYN$^e$ AFD measures

After generating the tables, calculate the AFD measure scores on the generated files.

In [None]:
import pandas as pd
import tqdm

from afd_measures import utils as afd_utils

# Measure every generated table. Each table is identified by
# "<batch>_<index>", matching the file name it was stored under, and its
# columns "lhs" and "rhs" are passed as the candidate dependency.
# NOTE: `tqdm` is the module in this cell (the `import tqdm` above rebinds the
# name), hence `tqdm.tqdm`. `total` is given explicitly because `enumerate`
# has no length for the progress bar to read.
noisy_results = [
    afd_utils.parallelize_measuring(
        df, f"{batch_i}_{df_i}", "lhs", "rhs", afd_utils.measure_order
    )
    for df_i, df in tqdm.tqdm(enumerate(noisy_dfs), total=len(noisy_dfs))
]

# Collect the measurement results into a DataFrame.
noisy_results_df = pd.DataFrame(noisy_results)

# Turn the generation settings and the inferred settings into DataFrames and
# label each row "<batch>_<row index>", the same identifier the tables were
# measured under, so that all three frames can be joined on "table".
noisy_settings = pd.DataFrame(noisy_settings)
noisy_inferred = pd.DataFrame(noisy_inferred)
noisy_settings["table"] = [f"{batch_i}_{row_i}" for row_i in noisy_settings.index]
noisy_inferred["table"] = [f"{batch_i}_{row_i}" for row_i in noisy_inferred.index]

# Merge the results with all the settings; clashing column names from the
# settings frames get the suffixes "_set" and "_inferred" respectively.
noisy_results_df = noisy_results_df.merge(
    noisy_settings, on="table", suffixes=("", "_set")
).merge(noisy_inferred, on="table", suffixes=("", "_inferred"))

# Store the result of this batch to a CSV.
noisy_results_df.to_csv(
    os.path.join(results_path, f"syn_e_results_{batch_i}.csv"), index=False
)