# Efficient Computation

Measure the computation time each measure needs to score all FD candidates in RWD.

In [None]:
import cProfile
import os
import itertools
import signal
import sys
import time

import pandas as pd

# for Jupyter notebooks: add the path of 'code' to allow importing module
sys.path.append(os.path.join(os.getcwd(), ".."))
from afd_measures import measures as afd_measures
from afd_measures import utils as afd_utils

# Directory that is scanned for the input ".csv" tables.
data_path = "../../data/rwd"
# Directory where per-measure result CSVs and profiler dumps are written
# (and later read back to build the runtime table).
timing_path = "../../timings"

def signal_handler(signal, frame):
    """Save partial results and profiler statistics, then exit.

    Walks outward from the interrupted ``frame`` to the first frame whose
    locals contain ``rwd_results`` and writes that list (together with the
    ``measure`` name found in the same frame) as CSV into ``timing_path``.
    It then continues outward to the first frame whose locals contain
    ``profiler`` and dumps its statistics next to the CSV.

    Args:
        signal: Signal number supplied by the ``signal`` module (unused; the
            name shadows the module inside this function only).
        frame: Stack frame that was executing when the signal arrived.

    Raises:
        SystemExit: Always. Exit code 0 after both files were written,
            exit code 1 if ``rwd_results`` or ``profiler`` cannot be found.
    """
    search_frame = frame
    while search_frame is not None and "rwd_results" not in search_frame.f_locals:
        search_frame = search_frame.f_back
    if search_frame is None:
        sys.stderr.write("Could not find rwd_results.\n")
        sys.exit(1)

    # `measure` is read from the same frame that holds the partial results.
    measure = search_frame.f_locals["measure"]
    rwd_results = search_frame.f_locals["rwd_results"]
    pd.DataFrame(rwd_results).to_csv(
        os.path.join(timing_path, f"timed_results_{measure}.csv")
    )

    # Keep climbing from the results frame towards the caller that owns the
    # profiler object.
    while search_frame is not None and "profiler" not in search_frame.f_locals:
        search_frame = search_frame.f_back
    if search_frame is None:
        # The results CSV has already been written at this point; only the
        # profiler dump is missing.
        sys.stderr.write("Could not find profiler.\n")
        sys.exit(1)

    search_frame.f_locals["profiler"].dump_stats(
        os.path.join(timing_path, f"timed_profile_{measure}.pr")
    )
    sys.stderr.write("Stopped gracefully.\n")
    sys.exit(0)


def run_rwd_for_measure(measure: str) -> None:
    """Score every ordered column pair of every loaded table with one measure.

    For each table in the module-level ``rwd_data`` and each ordered pair
    ``(lhs, rhs)`` of its columns, the two columns are projected and rows with
    missing values are dropped. Empty projections are flagged with
    ``"empty"``; otherwise the candidate is checked for being a trivial and an
    exact FD, and the measure is evaluated (trivial candidates are assigned
    ``1.0`` without calling the measure). All rows are finally written to
    ``timed_results_{measure}.csv`` in ``timing_path``.

    Args:
        measure: Name of a callable in ``afd_measures``; also used as the
            result column name and in the output file name.
    """
    # NOTE: the names `rwd_results` and `measure` must not be renamed --
    # signal_handler looks them up in this frame's locals to save partial
    # results on SIGTERM.
    rwd_results = []
    # Iterate lazily instead of materialising every (df, table, lhs, rhs)
    # tuple up front.
    for table, df in rwd_data.items():
        for lhs, rhs in itertools.permutations(df.columns, 2):
            result = {
                "table": table,
                "lhs": lhs,
                "rhs": rhs,
            }
            # dropna() already returns a new frame, so no extra copy is needed.
            _df = df.loc[:, [lhs, rhs]].dropna()
            if _df.empty:
                result["empty"] = True
            else:
                result["trivial_fd"] = afd_utils.is_trivial_fd(_df, lhs, rhs)
                result["exact_fd"] = afd_utils.is_perfect_fd(_df, lhs, rhs)
                if result["trivial_fd"]:
                    result[measure] = 1.0
                else:
                    # Evaluate on the same NaN-free projection that the
                    # trivial/exact checks above used, not on the full table.
                    result[measure] = getattr(afd_measures, measure)(_df, lhs, rhs)
            rwd_results.append(result)
    pd.DataFrame(rwd_results).to_csv(
        os.path.join(timing_path, f"timed_results_{measure}.csv")
    )


# Load every ".csv" file in data_path into a DataFrame keyed by file name and
# normalise its column names.
rwd_data = {}
for file in os.listdir(data_path):
    if not file.endswith(".csv"):
        continue
    table = pd.read_csv(os.path.join(data_path, file))
    table.columns = [afd_utils.clean_colname(c) for c in table.columns]
    rwd_data[file] = table

# Make sure the output directory exists before starting the runs, so the
# final write cannot fail on a missing directory.
os.makedirs(timing_path, exist_ok=True)

# On SIGTERM, save whatever has been computed so far instead of losing it.
signal.signal(signal.SIGTERM, signal_handler)
for measure in afd_utils.measure_order:
    # One fresh profiler per measure. It must stay bound to the name
    # `profiler` because signal_handler searches the stack for that name.
    # runcall() enables and disables the profiler itself; wrapping it in
    # `with cProfile.Profile()` would enable the same profiler twice.
    profiler = cProfile.Profile()
    sys.stderr.write(f"Starting {measure} now.\n")
    profiler.runcall(run_rwd_for_measure, measure)
    profiler.dump_stats(os.path.join(timing_path, f"timed_profile_{measure}.pr"))

## Table 4 - measure runtimes

In [None]:
# pstats is not imported anywhere else in the notebook; import it here so the
# cell can run on its own.
import pstats

# Collect, for every measure that has a profiler dump, its total profiled
# time and the number of candidates with a non-missing score.
timings = []
for measure in afd_utils.measure_order:
    profile_file = os.path.join(timing_path, f"timed_profile_{measure}.pr")
    if not os.path.exists(profile_file):
        # This measure has not been run (or its profile was not saved).
        continue
    pr = pstats.Stats(profile_file)
    df = pd.read_csv(os.path.join(timing_path, f"timed_results_{measure}.csv"))
    timings.append(
        {
            "measure": measure,
            "seconds": pr.total_tt,
            # count() ignores missing values, i.e. candidates without a score.
            "candidates": df.loc[:, measure].count(),
        }
    )
timings_df = pd.DataFrame(timings)
timings_df