In [97]:
import pandas as pd
import numpy as np
import datetime

In [130]:
import os
from pathlib import Path

import re

In [133]:
# Log-line patterns. Both capture, as group 1, the "YYYY-MM-DD HH:MM:SS"
# timestamp that ends the "[INFO] ConstructLabels..." prefix of a log line.
# Older layout: "[INFO] ConstructLabels<digits>-<digits>-<timestamp>".
pattern = re.compile(r"\[INFO\] ConstructLabels\d+-\d+-(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
# Newer layout: the prefix additionally embeds a
# "_<date>--<time>.<fraction>_<digit>-<digits>-" run identifier before the timestamp.
new_pattern_nd = re.compile(r"\[INFO\] ConstructLabels_\d{4}-\d{2}-\d{2}--\d{2}:\d{2}:\d{2}\.\d+_\d{1}-\d+-(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")

# Captures (size marker, name): the marker is one of "mlreal", "sreal",
# "for d" or " for nd"; the name is the run of word characters right after it.
pattern_name = re.compile(r"(mlreal|sreal|for d| for nd)(\w+)")
# Matches file names of the form "ConstructLabels<digit>.log". The dot is
# escaped so that it only matches a literal "." and not any character.
file_name_pattern = re.compile(r"ConstructLabels\d\.log")

In [134]:
def get_info(lines, pattern_to_use):
    """Summarise one log record as ``[name, size, duration]``.

    Parameters
    ----------
    lines : sequence of str
        The lines of one record. ``lines[0]`` supplies the start timestamp
        and the size/name pair; ``lines[1]`` supplies the end timestamp.
    pattern_to_use : str or compiled pattern
        Regex whose first group captures a ``YYYY-MM-DD HH:MM:SS`` timestamp.

    Returns
    -------
    list
        ``[name, size, str(end - start)]``. A size captured with a ``"for "``
        prefix is reduced to the text following that prefix.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # Run all three regex look-ups before parsing any timestamp.
    began_text = re.search(pattern_to_use, lines[0]).group(1)
    finished_text = re.search(pattern_to_use, lines[1]).group(1)
    size, name = re.search(pattern_name, lines[0]).groups()

    # "for d" / " for nd" markers carry a prefix that is not part of the size.
    size = size.split("for ")[1] if "for " in size else size

    began_at = datetime.datetime.strptime(began_text, timestamp_format)
    finished_at = datetime.datetime.strptime(finished_text, timestamp_format)
    duration = finished_at - began_at
    return [name, size, str(duration)]

In [None]:
# Parse every ConstructLabels log of each dataset category and write one CSV
# of per-record durations per category.
for dtype in ["d", "nd", "mlreal", "sreal"]:
    logs_path = Path(f"../experiment_results/logger_raw/{dtype}/logger")

    time_data = []

    # sorted(): os.listdir order is arbitrary, so sort for a reproducible row order.
    for f in sorted(os.listdir(logs_path)):
        if "ConstructLabels" not in f:
            continue

        # Files named "ConstructLabels<digit>.log" use the older line layout;
        # every other ConstructLabels file uses the newer one.
        pattern_to_use = pattern if file_name_pattern.search(f) else new_pattern_nd

        with open(logs_path / f, mode="r", encoding="utf-8") as fd:
            lines = fd.readlines()

        # Records are read in groups of 4 lines; get_info uses the first two.
        for i in range(0, len(lines), 4):
            record = lines[i:i+4]
            if len(record) < 2:
                # Trailing partial record (e.g. a lone final line): it has no
                # end-timestamp line, so there is nothing to measure.
                continue
            time_data.append(get_info(record, pattern_to_use))

    if not time_data:
        # Without rows there is no "Size" value to name the output file after.
        print(f"No ConstructLabels records found in {logs_path}; skipping.")
        continue

    df = pd.DataFrame(time_data, columns=["Dataset", "Size", "Time"])
    # The loading cell further down selects files by the substring
    # "<size>-time", so the output name must contain it.
    df.to_csv(f"{df['Size'].iloc[0]}-time.csv")

Put all the files resulting from the cells above in the folder that the `base_path` points to.

In [99]:
# Directory whose files are all read as timing CSVs by the loading cell below.
base_path = Path("../experiment_results/time")

In [100]:
# Presumably the number of graphs in each dataset category (TODO confirm).
# These values are not referenced anywhere else in the visible notebook.
sreal_graphs = 56
mlreal_graphs = 59
nd_graphs = 3839
d_graphs = 3840

In [101]:
# Task layout per run: "number_of_tasks" parallel tasks, each using
# "cores_per_task" cores. get_stats reads both keys.
gtrie_sreal_details = {"cores_per_task": 6, "number_of_tasks": 8}
gtrie_mlreal_details = {"cores_per_task": 6, "number_of_tasks": 7}
gtrie_d_details = {
    "cores_per_task": 8,
    "number_of_tasks": 6,
}  # Despite higher count in config, effectively used 6 because of generator partition
gtrie_nd_details = {
    "cores_per_task": 8,
    "number_of_tasks": 7,
}  # Forest Fire and Random Geometric will be normalized to this amount of cores.
# Layout used for the "gnn" rows in the aggregation cell, in place of the
# per-category GTrie layouts above.
# NOTE(review): 5888 looks like an accelerator core count rather than CPU cores — confirm.
model_details = {"number_of_tasks": 1, "cores_per_task": 5888}

In [102]:
# Load every file in base_path into `dfs`, keyed by the first marker found in
# its file name. Order matters: "nd-time" must be tested before "d-time",
# because every name containing the former also contains the latter.
# Files matching no marker are stored under "other" (a later one replaces an
# earlier one).
name_markers = (
    ("mlreal-time", "mlreal"),
    ("sreal-time", "sreal"),
    ("nd-time", "nd"),
    ("d-time", "d"),
)

dfs = {}
for file in sorted(os.listdir(base_path)):
    frame_key = next(
        (key for marker, key in name_markers if marker in file),
        "other",
    )
    dfs[frame_key] = pd.read_csv(base_path / file)

In [103]:
# Rescale the "Time" of the FOREST_FIRE and LRANDOM_GEOMETRIC rows of the "nd"
# frame by round(64 * 0.65) / 8 = 5.25 and round(64 * 0.25) / 8 = 2.
# Presumably those runs used 65% / 25% of a 64-core machine and are being
# normalised to 8 cores per task — TODO confirm.
# NOTE: "Time" is overwritten in place, so running this cell twice scales the
# rows twice; re-run the loading cell first.
nd_frame = dfs["nd"]

mask = nd_frame.loc[:, "Dataset"] == "FOREST_FIRE"
mask_rand_geom = nd_frame.loc[:, "Dataset"] == "LRANDOM_GEOMETRIC"

forest_fire_scale = round(64 * 0.65) / 8
rand_geom_scale = round(64 * 0.25) / 8

forest_fire_times = pd.to_timedelta(nd_frame.loc[mask, "Time"])
nd_frame.loc[mask, "Time"] = forest_fire_times * forest_fire_scale

rand_geom_times = pd.to_timedelta(nd_frame.loc[mask_rand_geom, "Time"])
nd_frame.loc[mask_rand_geom, "Time"] = rand_geom_times * rand_geom_scale

Consider $N$ tasks, where task $i$ takes $X_i$ time, with $i \in \{1 \dots N\}$. Each task uses $K$ cores.

$$
\text{Total Time Used} = \sum_{i=1}^N X_i
$$

$$
\text{Total Core Time} = \sum_{i=1}^N (KX_i)
$$

$$
\text{Average Time per Core} = \frac{1}{KN} \sum_{i=1}^N (KX_i)  = \frac{1}{N} \cdot \text{Total Time Used}
$$

In [104]:
def get_stats(times, details, cut_time=1):
    """Aggregate task durations into elapsed, core and per-task-average time.

    Parameters
    ----------
    times : list-like
        Durations in any form accepted by ``pd.to_timedelta``.
    details : dict
        Must provide ``"cores_per_task"`` and ``"number_of_tasks"``.
    cut_time : int or float, default 1
        Factor applied to the summed duration before anything else is derived.

    Returns
    -------
    tuple
        ``(elapsed, core_time, per_core_average)`` where ``elapsed`` is the
        scaled sum, ``core_time = elapsed * cores_per_task`` and
        ``per_core_average = elapsed / number_of_tasks``.
    """
    elapsed = pd.to_timedelta(times).sum() * cut_time
    core_time = elapsed * details["cores_per_task"]
    per_core_average = elapsed / details["number_of_tasks"]
    return elapsed, core_time, per_core_average

In [117]:
# Build one row of timing statistics per (data type, model type) pair.

# Explicit lookup of the GTrie task layout per data type (replaces eval() on a
# constructed variable name).
gtrie_details_by_type = {
    "mlreal": gtrie_mlreal_details,
    "sreal": gtrie_sreal_details,
    "d": gtrie_d_details,
    "nd": gtrie_nd_details,
}

average_core_times = []
total_core_times = []
elapsed_times = []
data_types = []
model_types = []

for data_type in ["mlreal", "sreal", "d", "nd"]:
    for model_type in ["gtrie", "gnn"]:
        # The d / nd categories are scaled by 0.1 for both model types.
        # NOTE(review): the rationale for 0.1 is not shown here — confirm.
        cut_time = 0.1 if data_type in ("d", "nd") else 1

        if model_type == "gnn":
            other = dfs["other"]
            gnn_times = other.loc[other["Size"] == data_type, "Time"]
            if gnn_times.empty:
                raise ValueError(
                    f"no rows with Size == {data_type!r} in the 'other' timing data"
                )
            # Take the longest run. Convert to timedeltas first: max() on the
            # raw strings would compare them lexicographically
            # (e.g. "9:00:00" > "10:00:00").
            times = [pd.to_timedelta(gnn_times).max()]
            details = model_details
        else:
            times = dfs[data_type]["Time"]
            details = gtrie_details_by_type[data_type]

        elapsed, total_core, average_core = get_stats(times, details, cut_time=cut_time)

        average_core_times.append(average_core)
        total_core_times.append(total_core)
        elapsed_times.append(elapsed)
        data_types.append(data_type)
        model_types.append(model_type)


# Rows are in loop order: for each data type, "gtrie" first and "gnn" second.
df_times = pd.DataFrame(
    {
        "AVG Core Time": average_core_times,
        "Total Core Time": total_core_times,
        "Elapsed": elapsed_times,
        "Data Type": data_types,
        "Model Type": model_types,
    }
)

In [None]:
# Display the per-(data type, model type) timing table.
df_times

Note: You can see that *mlreal* (and *d*, *nd*) with GTrie has an AVG Core Time smaller than the elapsed time. This stems from an inefficient scheduling of the parallel tasks, caused by the very large size of some graphs in this category compared to others; after all, the graphs range from medium to large :). There **was not** a more efficient way of scheduling this with this version of GTrie. We could have used a more complex native parallel GTrie, but that would bring another set of difficulties. Hence, the result is still significant.


For task A and task B:

$$
\text{Speedup} = \frac{\text{Total Time Used}^{(A)}}{\text{Total Time Used}^{(B)}}
$$

$$
\text{Core Efficiency Gain} = \frac{\text{Total Core Time}^{(A)}}{\text{Total Core Time}^{(B)}}
$$

In [122]:
# Rows of this data type, re-indexed from 0 in their original order:
# position 0 is the "gtrie" row and position 1 the "gnn" row, because that is
# the order in which df_times was filled.
mlreal_rows = df_times[df_times["Data Type"] == "mlreal"].reset_index(drop=True)

# Ratio of row 0 to row 1 for elapsed time and for total core time.
speedup_mlreal = mlreal_rows["Elapsed"][0] / mlreal_rows["Elapsed"][1]
core_efficiency_gain_mlreal = (
    mlreal_rows["Total Core Time"][0] / mlreal_rows["Total Core Time"][1]
)

In [123]:
# Rows of this data type, re-indexed from 0 in their original order:
# position 0 is the "gtrie" row and position 1 the "gnn" row, because that is
# the order in which df_times was filled.
sreal_rows = df_times[df_times["Data Type"] == "sreal"].reset_index(drop=True)

# Ratio of row 0 to row 1 for elapsed time and for total core time.
speedup_sreal = sreal_rows["Elapsed"][0] / sreal_rows["Elapsed"][1]
core_efficiency_gain_sreal = (
    sreal_rows["Total Core Time"][0] / sreal_rows["Total Core Time"][1]
)

In [124]:
# Rows of this data type, re-indexed from 0 in their original order:
# position 0 is the "gtrie" row and position 1 the "gnn" row, because that is
# the order in which df_times was filled.
d_rows = df_times[df_times["Data Type"] == "d"].reset_index(drop=True)

# Ratio of row 0 to row 1 for elapsed time and for total core time.
speedup_d = d_rows["Elapsed"][0] / d_rows["Elapsed"][1]
core_efficiency_gain_d = (
    d_rows["Total Core Time"][0] / d_rows["Total Core Time"][1]
)

In [125]:
# Rows of this data type, re-indexed from 0 in their original order:
# position 0 is the "gtrie" row and position 1 the "gnn" row, because that is
# the order in which df_times was filled.
nd_rows = df_times[df_times["Data Type"] == "nd"].reset_index(drop=True)

# Ratio of row 0 to row 1 for elapsed time and for total core time.
speedup_nd = nd_rows["Elapsed"][0] / nd_rows["Elapsed"][1]
core_efficiency_gain_nd = (
    nd_rows["Total Core Time"][0] / nd_rows["Total Core Time"][1]
)

In [None]:
# Display (speedup, core efficiency gain) for the mlreal category.
speedup_mlreal,core_efficiency_gain_mlreal

In [None]:
# Display (speedup, core efficiency gain) for the sreal category.
speedup_sreal,core_efficiency_gain_sreal

In [None]:
# Display (speedup, core efficiency gain) for the d category.
speedup_d,core_efficiency_gain_d

In [None]:
# Display (speedup, core efficiency gain) for the nd category.
speedup_nd,core_efficiency_gain_nd