# Plots

In [None]:
import json
import math
from typing import Type, Dict, Any

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

## Load Data

In [None]:
# Identifier of an algorithm whose results are plotted (e.g. "MR-APDSS").
AlgoId: Type = str
# Results as decoded from a JSON file: one record (a dict of fields) per
# top-level key. JSON object keys are always decoded as ``str`` by
# ``json.load``, so the outer key type is ``str`` rather than ``int``.
ResultDict: Type = Dict[str, Dict[str, Any]]

In [None]:
# Directory (relative to the notebook's working directory) holding the
# JSON result files.
OUT_DIR = "../../out"


def _load_results(file_name: str) -> ResultDict:
    """Return the decoded content of the JSON file ``file_name`` in OUT_DIR.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON text is UTF-8; pass the encoding explicitly instead of relying on
    # the platform default, which is not UTF-8 everywhere.
    with open(f"{OUT_DIR}/{file_name}", "r", encoding="utf-8") as f:
        return json.load(f)


def load_mr_results() -> ResultDict:
    """Load the results stored in ``mr_results.json`` under OUT_DIR."""
    return _load_results("mr_results.json")


def load_seq_results() -> ResultDict:
    """Load the results stored in ``seq_results.json`` under OUT_DIR."""
    return _load_results("seq_results.json")

In [None]:
# Read both result files once; the loaded dicts are combined into
# `all_algo_results` further down.
mr_results: ResultDict = load_mr_results()
seq_results: ResultDict = load_seq_results()

In [None]:
# Algorithm identifiers: used as keys of `all_algo_results` and written into
# the algo-id column that colours the plotted lines.
MR_ID: AlgoId = "MR-APDSS"
SEQ_ID: AlgoId = "SEQ-APDSS"

# Field names of a single result record (they become DataFrame columns).
DATASET_NAME_KEY = "d_name"
N_EXECS_KEY = "n_execs"
THRESHOLD_KEY = "threshold"
TIME_KEY = "time"
# NOTE(review): same value as TIME_KEY — looks like a copy-paste slip; the
# intended record field for similar documents is not visible here. Confirm
# against the result files before using this constant.
SIM_DOCS_KEY = "time"

In [None]:
# Results of every algorithm keyed by its identifier; this is the mapping
# handed to `plot_algo_results`.
all_algo_results = {
    MR_ID: mr_results,
    SEQ_ID: seq_results
}

In [None]:
# Name of the DataFrame column that stores which algorithm produced a row;
# used as the `hue` of the line plots.
ALGO_ID_COL = "algo_id"

def plot_algo_results(results: Dict[AlgoId, ResultDict]) -> plt.Figure:
    """Plot execution time against number of executors, one subplot per dataset.

    Every algorithm in ``results`` becomes one line (hue) in each subplot.
    The entry stored under ``SEQ_ID`` is replicated for every executor count
    found in the other algorithms' results, so it spans the same x range as
    the other lines.

    Args:
        results: mapping from algorithm id to that algorithm's results. It
            must contain ``SEQ_ID`` and at least one other algorithm.

    Returns:
        The figure holding the grid of subplots.

    Raises:
        KeyError: if ``results`` has no ``SEQ_ID`` entry, or a results frame
            lacks a required column.
        ValueError: if ``results`` contains no algorithm other than
            ``SEQ_ID`` (there would be no executor counts to plot against).
    """
    results_df_by_algo: Dict[AlgoId, pd.DataFrame] = {
        algo_id: get_algo_results_df(result)
        for algo_id, result in results.items()
    }

    seq_res_df = results_df_by_algo[SEQ_ID]
    # Only datasets present in the sequential results get a subplot.
    unique_d_names = seq_res_df[DATASET_NAME_KEY].unique()

    # Executor counts come from every non-sequential algorithm.
    parallel_n_execs = [
        res_df[N_EXECS_KEY]
        for algo_id, res_df in results_df_by_algo.items()
        if algo_id != SEQ_ID
    ]
    if not parallel_n_execs:
        raise ValueError(
            f"results must contain at least one algorithm besides {SEQ_ID!r}"
        )
    unique_n_execs_values = pd.concat(parallel_n_execs).unique()

    results_df_by_algo[SEQ_ID] = _replicate_for_n_execs(
        seq_res_df, unique_n_execs_values.tolist()
    )

    # One long frame with an algo-id column. A fresh index avoids duplicate
    # labels coming from the individual frames.
    results_df: pd.DataFrame = pd.concat(
        [
            res_df.assign(**{ALGO_ID_COL: algo_id})
            for algo_id, res_df in results_df_by_algo.items()
        ],
        ignore_index=True,
    )

    results_df_by_dataset: Dict[str, pd.DataFrame] = {
        d_name: results_df[results_df[DATASET_NAME_KEY] == d_name]
        for d_name in unique_d_names
    }

    # Near-square grid: ceil(sqrt(k)) columns and as many rows as needed.
    tot_plots = len(results_df_by_dataset)
    n_cols = max(1, math.ceil(math.sqrt(tot_plots)))
    n_rows = max(1, math.ceil(tot_plots / n_cols))

    # squeeze=False always returns a 2-D array of axes, so 1x1 and 1xN grids
    # need no special casing.
    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(n_cols*8, n_rows*8),
        squeeze=False,
    )
    flat_axes = axes.ravel()  # row-major: subplot i sits at (i // n_cols, i % n_cols)

    for ax, (d_name, df) in zip(flat_axes, results_df_by_dataset.items()):
        sns.lineplot(ax=ax, data=df, x=N_EXECS_KEY, y=TIME_KEY, hue=ALGO_ID_COL, marker="o")
        ax.set_title(f"[{d_name}] Execution Times")
        ax.set_xlabel("#Executors")
        ax.set_ylabel("Time [s]")
        ax.set_yticks(_time_axis_ticks(df[TIME_KEY].max()))

    # Grid cells beyond the last dataset would otherwise render as empty axes.
    for unused_ax in flat_axes[tot_plots:]:
        unused_ax.set_visible(False)

    return fig


def _replicate_for_n_execs(seq_df: pd.DataFrame, n_execs_values: list) -> pd.DataFrame:
    """Return one copy of ``seq_df`` per executor count, with N_EXECS_KEY set to it."""
    return pd.concat(
        [seq_df.assign(**{N_EXECS_KEY: n_execs}) for n_execs in n_execs_values],
        ignore_index=True,
    )


def _time_axis_ticks(max_time: float) -> np.ndarray:
    """Return y-axis tick positions from 0 up to roughly 15% above ``max_time``."""
    # Step is max_time/15 rounded down to a multiple of 50, but never below 20.
    step = max(20, math.ceil((max_time / 15) // 50) * 50)
    return np.arange(0, max_time + math.floor(0.15 * max_time), step)


def get_algo_results_df(result: ResultDict) -> pd.DataFrame:
    """Build a DataFrame with one row per record in ``result``.

    The keys of ``result`` are discarded; only its values (the per-run
    records) are used, in the mapping's iteration order.
    """
    records = [*result.values()]
    return pd.DataFrame.from_records(records)

In [None]:
# Build the per-dataset execution-time figure for all loaded algorithms.
fig = plot_algo_results(results=all_algo_results)

In [None]:
# Save the figure at 200 dpi.
# NOTE(review): the path has no file extension ("results-png"), so matplotlib
# appends its default format extension to the name. This was presumably meant
# to be "results.png" — confirm before renaming, since other files may
# reference the current output name.
fig.savefig("../../images/results-png", dpi=200)