In [None]:
%load_ext autoreload
%autoreload 2

import plotly.express as px
import pandas as pd
import numpy as np
from scipy.stats import (
    bootstrap,
    gmean,
)
from IPython.display import display

from duet.process import *
from duet.constants import *
from common import *

# Load the raw data and preprocess it; every later cell starts from df_prep.
df_prep = preprocess_data(load_raw())
# Distinct values of the suite column, iterated by the per-suite plotting loops.
unique_suites = df_prep.loc[:, RF.suite].unique()

# CV - Relative Standard Deviation

In [None]:
# Summary statistics of the measured time for every BENCHMARK_ENV_COL group.
# CV (coefficient of variation) is the standard deviation divided by the mean.
df_cv = (
    df_prep.groupby(BENCHMARK_ENV_COL)
    .agg(
        time_count=pd.NamedAgg(column=RF.time_ns, aggfunc=len),
        time_mean=pd.NamedAgg(column=RF.time_ns, aggfunc="mean"),
        time_var=pd.NamedAgg(column=RF.time_ns, aggfunc="var"),
        time_std=pd.NamedAgg(column=RF.time_ns, aggfunc="std"),
    )
    .reset_index()
    .assign(CV=lambda stats: stats["time_std"] / stats["time_mean"])
)

# One grouped bar chart per suite, x categories sorted in ascending order.
for suite in unique_suites:
    fig = px.bar(
        df_cv.loc[df_cv[RF.suite] == suite],
        title=f"Benchmark Time Relative Deviation - {suite}",
        x=RF.benchmark,
        y="CV",
        color=RF.type,
        facet_col=DF.env,
        barmode="group",
    ).update_xaxes(categoryorder="category ascending")
    fig.show()

In [None]:
import plotly.graph_objects as go

# Box plot of CV per suite, coloured by run type and faceted by environment.
# The figure is handed to save_fig_facet_col_env together with the target
# filename and axis/legend titles, and the returned figure is shown.
fig = save_fig_facet_col_env(
    px.box(
        translate(df_cv),
        x=RF.suite,
        y="CV",
        facet_col=DF.env,
        color=RF.type,
        color_discrete_map=colormap,
        category_orders=orders,
        hover_data=[RF.benchmark],
    ),
    filename="figures/cv.pdf",
    xaxis_title="",
    yaxis_title="CV",
    legend_title="",
)
fig.show()

# CI

### Determine best duet `overlap_rate`

In [None]:
# Switch between the full sweep over overlap rates and a single-rate run.
FULL_OVERLAP_SWEEP = True

if FULL_OVERLAP_SWEEP:
    # Build the 0.1 .. 0.9 grid from integers. np.arange with a float step
    # accumulates rounding error (e.g. 0.30000000000000004 instead of 0.3),
    # which makes exact comparisons against literals such as 0.4 unreliable.
    overlap_rates = np.arange(1, 10) / 10
else:
    # NOTE(review): a later cell selects rows with overlap_rate == 0.4, so this
    # single-rate path would leave that selection without matching rates.
    # Confirm whether 0.5 is still the intended value.
    overlap_rates = [0.5]

df_overlap_match = compute_ci(df_prep, overlap_rates)

In [None]:
# Aggregate the per-row CI verdicts into a match ratio for every
# (env, suite, type, overlap_rate) combination. The aggregation reads
# DF.match_ci and DF.err_ci from the result of arbiter_ci_contains_zero.
# Rows whose overlap_rate is null are dropped here, because groupby discards
# null keys by default (dropna=True).
df = arbiter_ci_contains_zero(df_overlap_match)
df = (
    df.groupby(by=[DF.env, RF.suite, RF.type, DF.overlap_rate])
    .agg(
        total_count=(DF.match_ci, "count"),
        match_count=(DF.match_ci, "sum"),
        # String alias rather than np.mean: same result, consistent with the
        # other aggregations, and avoids the pandas FutureWarning raised when
        # a NumPy callable is passed to groupby.agg.
        miss_err=(DF.err_ci, "mean"),
    )
    .reset_index()
)
# Share of rows flagged as a match within each group.
df[DF.match_ratio_ci] = df["match_count"] / df["total_count"]

# Match ratio as a function of the overlap rate, one line per suite,
# faceted by environment.
fig = px.line(
    translate(df),
    x=DF.overlap_rate,
    y=DF.match_ratio_ci,
    color=RF.suite,
    facet_col=DF.env,
    markers=True,
    color_discrete_map=colormap,
    category_orders=orders,
)
fig = save_fig_facet_col_env(
    fig,
    xaxis_title="Minimum overlap ratio",
    yaxis_title="CI test A/A detection",
    legend_title="",
    filename="figures/citest_aa_match_by_overlap.pdf",
)
fig.show()

In [None]:
# Per (env, suite, type): number of u-test verdicts and how many of them are
# matches. `df` is read again by the following cell.
df = (
    arbiter_utest(df_prep)
    .groupby(by=[DF.env, RF.suite, RF.type])
    .agg(
        total_count=pd.NamedAgg(column=DF.match_utest, aggfunc="count"),
        match_count=pd.NamedAgg(column=DF.match_utest, aggfunc="sum"),
    )
    .reset_index()
)
df[DF.match_ratio_utest] = df["match_count"].div(df["total_count"])

# Grouped bars of the match ratio per suite, coloured by run type.
fig = save_fig_facet_col_env(
    px.bar(
        translate(df),
        x=RF.suite,
        y=DF.match_ratio_utest,
        facet_col=DF.env,
        color=RF.type,
        barmode="group",
        category_orders=orders,
        color_discrete_map=colormap,
    ),
    filename="figures/utest_aa_match.pdf",
    xaxis_title="",
    yaxis_title="u-test A/A detection",
    legend_title="",
)
fig.show()

In [None]:
# Mean u-test match ratio for several row selections, printed one per line
# in the order the selections are listed.
row_filters = [
    df[RF.suite] != "speccpu",
    df[RF.suite] == "speccpu",
    (df[RF.suite] == "speccpu") & (df[DF.env] == "bare-metal"),
    (df[RF.type] == "duet") & (df[RF.suite].isin(["dacapo", "scalabench"])),
    df[RF.type] == "syncduet",
    df[RF.type] == "seqn",
    df[RF.type] == "duet",
]
for row_filter in row_filters:
    print(df.loc[row_filter, DF.match_ratio_utest].mean())

In [None]:
# Keep the rows that have no overlap rate plus the rows whose overlap rate
# is exactly 0.4, then display the selection.
rate_column = df_overlap_match[DF.overlap_rate]
df_ci = df_overlap_match.loc[rate_column.isna() | rate_column.isin([0.4])]
df_ci

In [None]:
# CI verdicts for the selected rows, grouped into match ratios.
# `df` is read again by the following cell.
df_pred_ci = arbiter_ci_contains_zero(df_ci)
df = group_predictions(df_pred_ci, utest=False)

# Grouped bars of the CI match ratio per suite, coloured by run type.
fig = save_fig_facet_col_env(
    px.bar(
        translate(df),
        x=RF.suite,
        y=DF.match_ratio_ci,
        color=RF.type,
        facet_col=DF.env,
        barmode="group",
        category_orders=orders,
        color_discrete_map=colormap,
    ),
    filename="figures/citest_aa_match.pdf",
    xaxis_title="",
    yaxis_title="CI test A/A detection",
    legend_title="",
)
fig.show()

In [None]:
# Mean CI match ratio over all groups of the previous cell's `df`.
df.loc[:, DF.match_ratio_ci].mean()

## CI test per benchmark

In [None]:
# Name of a helper column that combines run type and overlap rate; it is used
# as the colour dimension of the per-benchmark plots.
# NOTE(review): `type` shadows the Python builtin. The name is kept because a
# later cell reads this global; rename both together when touching them.
type = "type:pairing"
# df_ci is a boolean-filtered selection of another frame. Adding a column to it
# directly triggers pandas' SettingWithCopyWarning, so take an explicit copy
# first. This also keeps the cell safe to re-run.
df_ci = df_ci.copy()
df_ci[type] = df_ci[RF.type] + ":" + df_ci[DF.overlap_rate].astype(str)

# One scatter per suite: "mid" with "err" as the symmetric error bar,
# faceted by environment. Suites without rows in df_ci are skipped.
for suite in unique_suites:
    suite_mask = df_ci[RF.suite] == suite
    if suite_mask.any():
        fig = px.scatter(
            df_ci[suite_mask],
            x="benchmark",
            y="mid",
            error_y="err",
            color=type,
            facet_col=DF.env,
            title=f"CI - {suite}",
        )
        fig.update_xaxes(categoryorder="category ascending")
        fig.show()

In [None]:
# One grouped bar chart of DF.ci_width per suite. The colour column is the
# name stored in the global `type` defined by an earlier cell.
for suite in unique_suites:
    fig = px.bar(
        df_ci.loc[df_ci[RF.suite] == suite],
        title=f"Relative CI Width comparison - {suite}",
        x=RF.benchmark,
        y=DF.ci_width,
        facet_col=DF.env,
        color=type,
        barmode="group",
    )
    fig.show()

In [None]:
# Box plot of DF.ci_width per suite, coloured by run type and faceted by
# environment; passed to save_fig_facet_col_env with the target filename.
fig = save_fig_facet_col_env(
    px.box(
        translate(df_ci),
        x=RF.suite,
        y=DF.ci_width,
        facet_col=DF.env,
        color=RF.type,
        color_discrete_map=colormap,
        category_orders=orders,
        hover_data=[RF.benchmark],
    ),
    filename="figures/ci_width.pdf",
    xaxis_title="",
    yaxis_title="Relative CI width",
    legend_title="",
)
fig.show()

In [None]:
# Wide table: one row per (suite, benchmark, env) and one column per run type,
# holding DF.ci_width_absolute (pivot_table averages duplicate entries).
df = pd.pivot_table(
    df_ci,
    columns=[RF.type],
    values=DF.ci_width_absolute,
    index=[RF.suite, RF.benchmark, DF.env],
).reset_index()

# Improvement = sequential CI width minus duet CI width; the relative variant
# expresses it as a percentage of the sequential width.
df["sync_improvement"] = df["Sequential"] - df["Synchronous duet"]
df["async_improvement"] = df["Sequential"] - df["Asynchronous duet"]
df["sync_improvement_relative"] = (df["sync_improvement"] / df["Sequential"]) * 100
df["async_improvement_relative"] = (df["async_improvement"] / df["Sequential"]) * 100

# Long format for plotting. var_name is spelled out because the plots below
# colour by a column called "type"; without it melt falls back to the name of
# the columns index left behind by pivot_table, which is easy to break.
id_columns = [RF.suite, RF.benchmark, DF.env]
df_absolute = df.melt(
    id_vars=id_columns,
    value_vars=["sync_improvement", "async_improvement"],
    var_name="type",
    value_name="improvement",
)
df_relative = df.melt(
    id_vars=id_columns,
    value_vars=["sync_improvement_relative", "async_improvement_relative"],
    var_name="type",
    value_name="improvement",
)

# Per suite: the absolute chart first, then the relative one. Titles make the
# two otherwise identical-looking figures distinguishable.
for suite in ["Renaissance", "DaCapo", "Scalabench", "SPEC CPU"]:
    for improvements, scale in ((df_absolute, "Absolute"), (df_relative, "Relative (%)")):
        display(
            px.bar(
                improvements[improvements[RF.suite] == suite],
                x=RF.benchmark,
                y="improvement",
                color="type",
                facet_col=DF.env,
                barmode="group",
                title=f"{scale} CI width improvement over Sequential - {suite}",
            )
        )

# Mean improvement per suite, relative first and absolute second.
display(
    df.groupby(RF.suite).agg(
        sync_relative_mean_improvement=("sync_improvement_relative", "mean"),
        async_relative_mean_improvement=("async_improvement_relative", "mean"),
    )
)
display(
    df.groupby(RF.suite).agg(
        sync_mean_improvement=("sync_improvement", "mean"),
        async_mean_improvement=("async_improvement", "mean"),
    )
)

In [None]:
# Divisor applied to "mid" and "err" before plotting them on an axis labelled
# in seconds.
# NOTE(review): presumably the values are nanoseconds (cf. RF.time_ns) - confirm.
NS_PER_S = 10**9

# Explicit copy: the filtered frame is modified below, and writing into a
# boolean-indexed selection triggers pandas' SettingWithCopyWarning.
df = df_ci[df_ci[RF.suite] == "SPEC CPU"].copy()
df["mid"] = df["mid"] / NS_PER_S
df["err"] = df["err"] / NS_PER_S

# CI midpoint with error bars per benchmark, coloured by run type and faceted
# by environment.
fig = px.scatter(
    translate(df),
    x="benchmark",
    y="mid",
    error_y="err",
    color=RF.type,
    facet_col=DF.env,
    template="plotly_white",
    color_discrete_map=colormap,
    category_orders={**{DF.env: ["bare-metal", "AWS t3.medium"]}, **order_type},
)
fig = save_fig_facet_col_env(
    fig,
    xaxis_title="",
    yaxis_title="CI (s)",
    legend_title="",
    filename="figures/ci_example_benchmark.pdf",
)
fig.show()

# Rows where DF.match_ci is False, followed by their share of a fixed total.
df = arbiter_ci_contains_zero(df)
missed = df[df[DF.match_ci] == False]
display(missed)
# NOTE(review): 16 is a hard-coded denominator - presumably the number of rows
# expected per selection; confirm, or derive it from the data.
display(missed.shape[0] / 16)

---

## Backup

---

In [None]:
# These errors are not relative
# Box plot of the "err" column per suite, coloured by run type and faceted by
# environment; passed to save_fig_facet_col_env with the target filename.
fig = save_fig_facet_col_env(
    px.box(
        df_ci,
        x=RF.suite,
        y="err",
        color=RF.type,
        facet_col=DF.env,
        category_orders=orders,
        color_discrete_map=colormap,
        hover_data=[RF.benchmark],
    ),
    filename="figures/ci_test_error.pdf",
    xaxis_title="",
    yaxis_title="CI test error",
    legend_title="",
)
fig.show()

In [None]:
# u-test verdicts grouped into match ratios (CI part disabled via ci=False).
df_pred_utest = arbiter_utest(df_prep)
df = group_predictions(df_pred_utest, ci=False)
# Plot the u-test ratio: the data comes from arbiter_utest and is grouped with
# ci=False, so the CI ratio column previously referenced here does not belong
# to this frame.
# NOTE(review): assumes group_predictions emits DF.match_ratio_utest when
# utest is left enabled, mirroring the utest=False / DF.match_ratio_ci usage
# in the CI cell - confirm against group_predictions.
px.bar(
    df,
    x=RF.suite,
    y=DF.match_ratio_utest,
    facet_col=DF.env,
    color=RF.type,
    barmode="group",
    title="Correct A/A detection ratio",
)

In [None]:
# Per-run summary statistics of the measured time, grouped by
# ARTIFACT_COL + RUN_ID_COL.
# NOTE(review): df_prep is already the output of preprocess_data, so this
# applies the preprocessing a second time - confirm that this is intended.
df = (
    preprocess_data(df_prep)
    .groupby(ARTIFACT_COL + RUN_ID_COL)
    .agg(
        time_count=pd.NamedAgg(column=RF.time_ns, aggfunc=len),
        time_mean=pd.NamedAgg(column=RF.time_ns, aggfunc="mean"),
        time_var=pd.NamedAgg(column=RF.time_ns, aggfunc="var"),
        time_std=pd.NamedAgg(column=RF.time_ns, aggfunc="std"),
    )
    .reset_index()
    .assign(CV=lambda stats: stats["time_std"] / stats["time_mean"])
)

# One box plot per suite showing the spread of per-run CV for each benchmark,
# with one facet row per environment.
for suite in unique_suites:
    fig = px.box(
        df.loc[df[RF.suite] == suite],
        title=f"Benchmark Time Relative Deviation per run - {suite}",
        x=RF.benchmark,
        y="CV",
        color=RF.type,
        facet_row=DF.env,
    ).update_xaxes(categoryorder="category ascending")
    fig.show()

In [None]:
# Restrict to the "syncduet" rows; abort the cell when there are none.
df_ci_syncduet = df_prep.loc[df_prep[RF.type] == "syncduet"]
if len(df_ci_syncduet) == 0:
    print("No runs")
    raise StopExecution

df_ci_syncduet = compute_ci_pair_speedup(df_ci_syncduet, sample_type="run_means")

# One scatter per suite that has rows: "mid" with "err" as the error bar,
# coloured by environment.
for suite in unique_suites:
    suite_mask = df_ci_syncduet[RF.suite] == suite
    if not suite_mask.any():
        continue
    fig = px.scatter(
        df_ci_syncduet[suite_mask],
        title=f"Syncduet pairwise speedup CI - {suite}",
        x=RF.benchmark,
        y="mid",
        error_y="err",
        color=DF.env,
    )
    fig.show()