# AFD measurements

In this notebook, we apply all AFD measures to the RWD$^e$ dataset.

## Setup
First, load all the files from the RWD$^e$ dataset. Furthermore, set some configuration parameters if running on an HPC cluster.

In [None]:
import os
import sys

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import utils as afd_utils

# Input and output locations, relative to the notebook's working directory.
data_path = "../../data/rwd_e"
gt_path = "../../data/ground_truth.csv"
results_path = "../../results"
# batch_i is used to parallelize measuring datasets on the HPC cluster
batch_i = int(os.getenv("PBS_ARRAYID", 0))
# workers is used to parallelize measuring candidate FDs using joblib
workers = int(os.getenv("PBS_NUM_PPN", 1))
total_batches = 1  # total number of batches that will be run on the HPC

# os.listdir returns entries in arbitrary order. Sort the file names so that
# every batch job enumerates them identically; otherwise the modulo split
# below could assign a file to several batches or to none at all.
csv_files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))

# Load only the tables that belong to this batch (file index modulo
# total_batches equals batch_i), keyed by file name, with cleaned column names.
rwd_e_data = {}
for i, file in enumerate(csv_files):
    if i % total_batches != batch_i:
        continue
    rwd_e_data[file] = pd.read_csv(os.path.join(data_path, file))
    rwd_e_data[file].columns = [
        afd_utils.clean_colname(c) for c in rwd_e_data[file].columns
    ]

## Calculate measures

After the setup is done, calculate the measures. The output dataframe of each batch is written to the results directory.

In [None]:
import itertools

import pandas as pd
import tqdm

from afd_measures import utils as afd_utils

# Build the full list of candidate FDs up front so tqdm knows the total:
# for every loaded table, each ordered pair of distinct columns (lhs, rhs).
candidate_fds = [
    (df, table, lhs, rhs, afd_utils.measure_order)
    for table, df in rwd_e_data.items()
    for lhs, rhs in itertools.permutations(df.columns, 2)
]
# Measure each candidate; the tuple is unpacked into positional arguments.
rwd_e_results = [
    afd_utils.parallelize_measuring(*candidate)
    for candidate in tqdm.tqdm(candidate_fds)
]

# Collect the results in one dataframe. If an "empty" column is present,
# keep only the rows where it is not True and drop the column afterwards.
rwd_e_results_df = pd.DataFrame(rwd_e_results)
has_empty_flag = "empty" in rwd_e_results_df.columns
if has_empty_flag:
    non_empty_results = rwd_e_results_df.query("empty != True")
    rwd_e_results_df = non_empty_results.drop(columns=["empty"])

# add the ground truth to the dataframe
# Every row of the ground-truth file is marked afd=True. The left merge keeps
# all measured candidates; candidates without a ground-truth match get NaN in
# "afd", which is then replaced by False.
ground_truth = pd.read_csv(gt_path)
ground_truth["afd"] = True
rwd_e_results_df = rwd_e_results_df.merge(
    ground_truth, on=["table", "lhs", "rhs"], how="left"
)
rwd_e_results_df["afd"] = rwd_e_results_df["afd"].fillna(False)

# add the polluted fds to the dataframe
fds_to_pollute = pd.read_csv(os.path.join(results_path, "fds_to_pollute.csv"))

# Distinct (table, column) pairs taken from the "rhs" column of the file.
# NOTE(review): the same rhs-derived list is used for both the lhs and the rhs
# flag below - presumably because pollution is applied to the RHS attribute of
# each listed FD; confirm against the pollution step.
polluted_columns = fds_to_pollute[["table", "rhs"]].drop_duplicates()

# For each side of a candidate FD, flag whether its column is in that list.
for side in ("lhs", "rhs"):
    flag = f"{side}_polluted"
    side_flags = polluted_columns.rename(columns={"rhs": side}).assign(
        **{flag: True}
    )
    rwd_e_results_df = rwd_e_results_df.merge(
        side_flags, how="left", on=["table", side]
    )
    # candidates without a match get NaN from the left merge -> False
    rwd_e_results_df[flag] = rwd_e_results_df[flag].fillna(False)

# Flag candidates that are themselves listed as an FD in fds_to_pollute.
fds_to_pollute["fd_polluted"] = True
rwd_e_results_df = rwd_e_results_df.merge(
    fds_to_pollute, how="left", on=["table", "lhs", "rhs"]
)
rwd_e_results_df["fd_polluted"] = rwd_e_results_df["fd_polluted"].fillna(False)

# Write this batch's results to its own CSV file (batch index in the file
# name), without the dataframe index.
batch_results_file = os.path.join(results_path, f"rwd_e_results_{batch_i}.csv")
rwd_e_results_df.to_csv(batch_results_file, index=False)