# AFD measurements

In this notebook, we apply all AFD measures to the RWD dataset.

## Setup
First, load the files of the RWD dataset that belong to the current batch. Furthermore, set some configuration parameters for running on an HPC cluster.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Input tables, ground-truth file and output directory (relative paths).
data_path = "../../data/rwd"
gt_path = "../../data/ground_truth.csv"
results_path = "../../results"
# batch_i is used to parallelize measuring datasets on the HPC cluster;
# it falls back to batch 7 when PBS_ARRAYID is not set in the environment
batch_i = int(os.getenv("PBS_ARRAYID", 7))
# workers is used to parallelize measuring candidate FDs using joblib;
# it falls back to a single worker when PBS_NUM_PPN is not set
workers = int(os.getenv("PBS_NUM_PPN", 1))
total_batches = 10  # total number of batches that will be run on the HPC

# os.listdir returns entries in arbitrary order. Sort the file names so that
# every batch job enumerates the files identically; otherwise two jobs could
# pick the same table, or a table could be skipped by all of them.
csv_files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))

# Maps file name -> DataFrame for the tables assigned to this batch.
rwd_data = {}
for i, file in enumerate(csv_files):
    # round-robin assignment: file i belongs to batch (i mod total_batches)
    if i % total_batches != batch_i:
        continue
    table_df = pd.read_csv(os.path.join(data_path, file))
    # normalise the column names with the project's helper
    table_df.columns = [afd_utils.clean_colname(c) for c in table_df.columns]
    rwd_data[file] = table_df

## Calculate measures

Once setup is done, calculate the measures for every candidate FD. The resulting dataframe is written to the results directory, one CSV file per batch.

In [None]:
import itertools

import pandas as pd
import tqdm

from afd_measures import utils as afd_utils

# Create the output directory up front so that a missing folder is detected
# before the measurements are computed rather than when they are written.
os.makedirs(results_path, exist_ok=True)

# Columns identifying a candidate FD; also the join keys for the ground truth.
key_columns = ["table", "lhs", "rhs"]

# One candidate FD (lhs -> rhs) per ordered pair of distinct columns of every
# table in this batch. Built as a list so tqdm knows the total.
candidates = [
    (df, table, lhs, rhs, afd_utils.measure_order)
    for table, df in rwd_data.items()
    for lhs, rhs in itertools.permutations(df.columns, 2)
]
rwd_results = [
    afd_utils.parallelize_measuring(*args) for args in tqdm.tqdm(candidates)
]

rwd_results_df = pd.DataFrame(rwd_results)
if not rwd_results:
    # No candidates in this batch (e.g. no table was assigned to it): the frame
    # would have no columns and the merge below would raise a KeyError. Keep
    # the key columns so an empty, well-formed result file is still written.
    rwd_results_df = pd.DataFrame(columns=key_columns)

# filter out the empty candidate FDs
if "empty" in rwd_results_df.columns:
    rwd_results_df = rwd_results_df.query("empty != True").drop(columns=["empty"])

# add the ground truth to the dataframe
ground_truth = pd.read_csv(gt_path)
ground_truth["afd"] = True
rwd_results_df = rwd_results_df.merge(ground_truth, on=key_columns, how="left")
# After the left join "afd" is True for candidates listed in the ground truth
# and missing otherwise. notna() turns that into a proper boolean column
# without relying on fillna's deprecated implicit downcasting.
rwd_results_df["afd"] = rwd_results_df["afd"].notna()
rwd_results_df.to_csv(
    os.path.join(results_path, f"rwd_results_{batch_i}.csv"), index=False
)