In [None]:
%load_ext autoreload
%autoreload 2

import plotly.express as px
import pandas as pd
import numpy as np
from scipy.stats import (
    bootstrap,
    gmean,
)
from IPython.display import display

from duet.process import *
from duet.constants import *
from common import *

# Load the raw measurements once; the overlap cells below derive their frames from df_raw.
df_raw = load_raw()
# Distinct suite names, used later to draw one set of figures per suite.
unique_suites = df_raw.loc[:, RF.suite].unique()
# Summary statistics as a quick sanity check of what was loaded.
df_raw.describe()

# Overlaps

In [None]:
# Name of the boolean column flagging frames built from preprocessed iterations.
prep_type = "preprocess"

# Variant 1: overlaps computed directly from the raw measurements.
df_raw_overlaps = compute_overlaps(df_raw)
df_raw_overlaps[prep_type] = False
df_raw_overlaps[DF.overlap_rate] = None

# Variant 2: overlaps computed from preprocessed measurements, no overlap rate set.
df_prep = preprocess_data(df_raw)
df_prep_overlaps = compute_overlaps(df_prep)
df_prep_overlaps[prep_type] = True
df_prep_overlaps[DF.overlap_rate] = None

# Each entry pairs an overlap frame with the iteration frame it was computed from.
iter_overlap_pair_dfs = [
    (df_raw_overlaps, df_raw),
    (df_prep_overlaps, df_prep),
]

# Further variants: one preprocess_overlaps() result per ratio
# (np.arange yields roughly 0.1, 0.3, 0.5, 0.7, 0.9).
for overlap_ratio in np.arange(0.1, 1, 0.2):
    df_prep_overlaps_prep = preprocess_overlaps(df_prep_overlaps, p=overlap_ratio)
    df_prep_overlaps_prep[DF.overlap_rate] = overlap_ratio
    df_prep_overlaps_prep[prep_type] = True
    iter_overlap_pair_dfs += [(df_prep_overlaps_prep, df_prep)]

# Stack every overlap variant into one long frame.
df_over = pd.concat([overlaps for overlaps, _ in iter_overlap_pair_dfs])
df_over

In [None]:
def aggregate_size_and_time(df_over, df_iter):
    """Compute iteration/overlap aggregate count and total time.

    Args:
        df_over: Overlap frame. Must contain the BENCHMARK_ENV_COL columns,
            the ``prep_type`` and ``DF.overlap_rate`` columns, and
            ``RF.overlap_time_ns``.
        df_iter: Iteration frame. Must contain the BENCHMARK_ENV_COL columns
            and ``RF.time_ns``.

    Returns:
        DataFrame with one row per overlap group, holding ``overlap_count``,
        ``total_overlap_time``, ``iteration_count`` and
        ``total_iteration_time`` (the latter already halved).
    """
    # String aliases are used instead of np.size / np.sum: pandas deprecates
    # NumPy reducers such as np.sum as agg arguments (FutureWarning), and
    # "size" uses the built-in group size rather than calling np.size per group.
    # NOTE(review): groupby drops NA keys by default, so frames whose
    # DF.overlap_rate is None presumably contribute no rows here - confirm
    # that this is intended.
    df_over_agg = (
        df_over.groupby(BENCHMARK_ENV_COL + [prep_type, DF.overlap_rate])
        .agg(
            overlap_count=(RF.overlap_time_ns, "size"),
            total_overlap_time=(RF.overlap_time_ns, "sum"),
        )
        .reset_index()
    )

    df_iter_agg = df_iter.groupby(BENCHMARK_ENV_COL).agg(
        iteration_count=(RF.time_ns, "size"),
        total_iteration_time=(RF.time_ns, "sum"),
    )
    # Total iteration time is doubled because of duet run thus halve it
    df_iter_agg["total_iteration_time"] = df_iter_agg["total_iteration_time"] / 2

    # df_iter_agg is still indexed by BENCHMARK_ENV_COL; merge resolves the
    # `on` keys against those index levels.
    return df_over_agg.merge(df_iter_agg, on=BENCHMARK_ENV_COL).reset_index(drop=True)


# Aggregate every (overlap frame, iteration frame) pair and stack the results.
df_over_agg = pd.concat(
    [aggregate_size_and_time(*pair) for pair in iter_overlap_pair_dfs]
)
df_over_agg

In [None]:
import plotly.graph_objects as go

# Total overlap time as a fraction of the total iteration time.
df_over_agg["relative_overlap_time"] = df_over_agg["total_overlap_time"].div(
    df_over_agg["total_iteration_time"]
)

# Box plot of the relative overlap time per overlap rate, coloured by suite
# and faceted by DF.env; save_fig_facet_col_env receives the output filename.
fig = save_fig_facet_col_env(
    px.box(
        df_over_agg,
        x=DF.overlap_rate,
        y="relative_overlap_time",
        color=RF.suite,
        facet_col=DF.env,
        hover_data=[RF.benchmark],
    ),
    xaxis_title="Overlap ratio",
    yaxis_title="Relative overlap time",
    legend_title="Suite",
    filename="figures/relative_overlap_time.pdf",
)
fig.show()

In [None]:
# Name of the label column combining the preprocessing flag with the overlap
# rate, e.g. "prep:50" or "raw:None".
prep_rate = "prep_rate"
for df in [df_over_agg, df_over]:
    # Overlap rate as percentage text. Missing rates (None/NaN) become "None"
    # instead of letting round() fail on NaN.
    rate_text = df[DF.overlap_rate].apply(
        lambda x: "None" if pd.isna(x) else str(round(x * 100) if x else x)
    )
    df[prep_rate] = "prep:"
    # Fix: the "raw:" prefix belongs in the label column. It was previously
    # written into the prep_type flag column, which corrupted that boolean
    # column and left raw rows labelled "prep:...".
    df.loc[df[prep_type].eq(False), prep_rate] = "raw:"
    df[prep_rate] = df[prep_rate] + rate_text

# The derived ratios do not depend on the suite, so compute them once here
# instead of on every loop iteration.
# Overlap count relative to the total number of iterations.
df_over_agg["relative_overlap_count"] = (
    df_over_agg["overlap_count"] / df_over_agg["iteration_count"]
)
# Overlap time relative to the total iteration time.
df_over_agg["relative_overlap_time"] = (
    df_over_agg["total_overlap_time"] / df_over_agg["total_iteration_time"]
)


def _show_prep_level_bars(df_suite, y, title):
    """Show a grouped bar chart of column ``y`` per benchmark.

    Bars are coloured by the ``prep_rate`` label column and faceted by
    ``DF.env``; benchmarks are sorted alphabetically on the x axis.
    """
    fig = px.bar(
        df_suite,
        x=RF.benchmark,
        y=y,
        color=prep_rate,
        facet_col=DF.env,
        barmode="group",
        title=title,
    )
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()


for suite in unique_suites:
    # Rows of the aggregate frame belonging to the current suite.
    df_suite_agg = df_over_agg[df_over_agg[RF.suite] == suite]

    # Absolute overlap count by preprocessing level
    _show_prep_level_bars(
        df_suite_agg,
        "overlap_count",
        f"Overlap count by preprocessing levels - {suite}",
    )

    # Relative overlap count to total iterations per preprocessing level
    _show_prep_level_bars(
        df_suite_agg,
        "relative_overlap_count",
        f"Overlap count relative to total iterations by preprocessing levels - {suite}",
    )

    # Relative overlap time to total iteration time per preprocessing level
    _show_prep_level_bars(
        df_suite_agg,
        "relative_overlap_time",
        f"Overlap time relative to total iteration time by preprocessing levels - {suite}",
    )

    # Distribution of individual overlap times, restricted to the "prep:50" level
    fig = px.strip(
        df_over[(df_over[RF.suite] == suite) & df_over[prep_rate].isin(["prep:50"])],
        x=RF.overlap_time_ns,
        y=RF.benchmark,
        color=prep_rate,
        facet_col=DF.env,
        orientation="h",
        title=f"Overlap time: {suite}",
    )
    fig.update_layout(height=1000)
    fig.update_yaxes(categoryorder="category descending")
    fig.show()