In [None]:
%load_ext autoreload
%autoreload 2

import plotly.express as px
import pandas as pd
import numpy as np
from scipy.stats import (
    bootstrap,
    gmean,
)
from IPython.display import display

from duet.process import *
from duet.constants import *
from common import *

# Load the raw measurements. load_raw() is not defined in this notebook; it
# presumably comes from one of the star imports above -- confirm which module.
df_raw = load_raw()
# Distinct values of the suite column; the plotting cell iterates over these.
unique_suites = df_raw[RF.suite].unique()
# Last expression of the cell: summary statistics shown as the cell output.
df_raw.describe()

# Overlaps

In [None]:
# Name of the column that labels which preprocessing level produced a row.
prep_type = "preprocess"

# Level "raw": overlaps computed directly from the unprocessed measurements.
df_raw_overlaps = compute_overlaps(df_raw)
df_raw_overlaps[prep_type] = "raw"

# Level "preprocess data": measurements are preprocessed first, then overlaps
# are computed from the result.
df_prep = preprocess_data(df_raw)
df_prep_overlaps = compute_overlaps(df_prep)
df_prep_overlaps[prep_type] = "preprocess data"

# Each entry pairs an overlap frame with the iteration frame it was derived
# from, so later aggregation can relate overlaps to their iterations.
iter_overlap_pair_dfs = [
    (df_raw_overlaps, df_raw),
    (df_prep_overlaps, df_prep),
]

# Levels "preprocess overlaps N%": the overlaps of the preprocessed data are
# additionally passed through preprocess_overlaps with the given ratio `p`.
for overlap_ratio in (0.5, 0.8, 0.9):
    df_prep_overlaps_prep = preprocess_overlaps(df_prep_overlaps, p=overlap_ratio)
    df_prep_overlaps_prep[prep_type] = f"preprocess overlaps {(overlap_ratio * 100):.0f}%"
    iter_overlap_pair_dfs.append((df_prep_overlaps_prep, df_prep))

# Stack the overlap frames of all preprocessing levels into one long frame.
df_over = pd.concat([overlaps for overlaps, _iters in iter_overlap_pair_dfs])
df_over

In [None]:
def aggregate_size_and_time(df_over, df_iter, prep_type_col):
    """Summarise overlap and iteration counts and total times per group.

    Overlap rows are grouped by ``BENCHMARK_ENV_COL`` plus ``prep_type_col``;
    iteration rows are grouped by ``BENCHMARK_ENV_COL`` only. The two
    summaries are then inner-joined on ``BENCHMARK_ENV_COL``.

    Parameters
    ----------
    df_over
        Overlap rows. Must contain the ``BENCHMARK_ENV_COL`` columns,
        ``prep_type_col`` and ``RF.overlap_time_ns``.
    df_iter
        Iteration rows. Must contain the ``BENCHMARK_ENV_COL`` columns and
        ``RF.time_ns``.
    prep_type_col
        Name of the column in ``df_over`` holding the preprocessing label.

    Returns
    -------
    DataFrame
        The grouping columns followed by ``overlap_count``,
        ``total_overlap_time``, ``iteration_count`` and
        ``total_iteration_time``.
    """
    overlap_keys = BENCHMARK_ENV_COL + [prep_type_col]

    # Row count and summed overlap time for every overlap group.
    overlap_summary = (
        df_over.groupby(overlap_keys)
        .agg(
            overlap_count=(RF.overlap_time_ns, "size"),
            total_overlap_time=(RF.overlap_time_ns, "sum"),
        )
        .reset_index()
    )

    # Row count and summed iteration time for every benchmark/environment.
    iteration_summary = (
        df_iter.groupby(BENCHMARK_ENV_COL)
        .agg(
            iteration_count=(RF.time_ns, "size"),
            total_iteration_time=(RF.time_ns, "sum"),
        )
        .reset_index()
    )

    # Default (inner) merge: only groups present on both sides are kept.
    return overlap_summary.merge(iteration_summary, on=BENCHMARK_ENV_COL)


# Aggregate every (overlaps, iterations) pair separately, then stack the
# per-level summaries into a single frame for plotting.
df_over_agg = pd.concat(
    [aggregate_size_and_time(*pair, prep_type) for pair in iter_overlap_pair_dfs]
)
df_over_agg

In [None]:
# Violin plots draw every point only for suites with fewer rows than this;
# larger suites fall back to showing outliers only.
MAX_VIOLIN_POINTS = 10000
# Fixed figure height (pixels) for the violin plots.
VIOLIN_HEIGHT_PX = 4000

# The derived ratios do not depend on the suite, so compute them once instead
# of on every loop iteration. They are still stored on df_over_agg itself so
# that any later cell reading these columns keeps working.
# Overlap count relative to the number of iterations.
df_over_agg["relative_overlap_count"] = (
    df_over_agg["overlap_count"] / df_over_agg["iteration_count"]
)
# Total overlap time relative to the total iteration time.
df_over_agg["relative_overlap_time"] = (
    df_over_agg["total_overlap_time"] / df_over_agg["total_iteration_time"]
)

# (y column, title prefix) for each grouped bar chart drawn per suite.
OVERLAP_BAR_CHARTS = (
    (
        "overlap_count",
        "Overlap count by preprocessing levels",
    ),
    (
        "relative_overlap_count",
        "Overlap count relative to total iterations by preprocessing levels",
    ),
    (
        "relative_overlap_time",
        "Overlap time relative to total iteration time by preprocessing levels",
    ),
)


def _show_overlap_bar(df_suite_agg, y, title):
    """Show one grouped bar chart of an aggregate column per benchmark.

    Bars are grouped and coloured by the preprocessing-level column
    (``prep_type``), with one facet row per ``DF.env`` value and benchmarks
    sorted ascending on the x axis.

    Parameters
    ----------
    df_suite_agg
        Aggregated overlap rows, already filtered to a single suite.
    y
        Name of the column to plot on the y axis.
    title
        Full figure title.

    Returns
    -------
    The displayed plotly figure.
    """
    fig = px.bar(
        df_suite_agg,
        x=RF.benchmark,
        y=y,
        color=prep_type,
        facet_row=DF.env,
        barmode="group",
        title=title,
    )
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()
    return fig


for suite in unique_suites:
    # Filter once per suite and reuse the slices for every chart below.
    df_suite_agg = df_over_agg[df_over_agg[RF.suite] == suite]
    df_suite_over = df_over[df_over[RF.suite] == suite]

    # Absolute count, count relative to iterations, time relative to
    # iteration time -- each by preprocessing level.
    for y_col, title_prefix in OVERLAP_BAR_CHARTS:
        fig = _show_overlap_bar(df_suite_agg, y_col, f"{title_prefix} - {suite}")

    # Distribution of overlap times per preprocessing level
    fig = px.violin(
        df_suite_over,
        x=RF.overlap_time_ns,
        y=RF.benchmark,
        color=prep_type,
        facet_col=DF.env,
        points="all" if df_suite_over.shape[0] < MAX_VIOLIN_POINTS else "outliers",
        orientation="h",
        title=f"Overlap time: {suite}",
    )
    fig.update_layout(height=VIOLIN_HEIGHT_PX)
    fig.update_yaxes(categoryorder="category descending")
    fig.show()