# RQ3: Accuracy improvements

In [20]:
%load_ext autoreload
%autoreload 2

import plotly.express as px
import pandas as pd
import numpy as np
from scipy.stats import (
    bootstrap,
    gmean,
)
from IPython.display import display

from duet.process import *
from duet.constants import *
from common import *

# Load the raw measurements and preprocess them once; later cells build
# their frames from `df_prep`.
df_prep = preprocess_data(load_raw())
# Distinct suite labels, used below to draw one figure per suite.
unique_suites = pd.unique(df_prep[RF.suite])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Relative Standard Deviation (CV)

In [21]:
# Timing statistics per BENCHMARK_ENV_COL group. CV is the standard deviation
# of the measured times divided by their mean.
df_cv = (
    df_prep.groupby(BENCHMARK_ENV_COL)
    .agg(
        time_count=(RF.time_ns, len),
        time_mean=(RF.time_ns, "mean"),
        time_var=(RF.time_ns, "var"),
        time_std=(RF.time_ns, "std"),
    )
    .reset_index()
    .assign(CV=lambda stats: stats["time_std"] / stats["time_mean"])
)

# One grouped bar chart per suite, faceted by environment.
for suite in unique_suites:
    suite_rows = df_cv[df_cv[RF.suite] == suite]
    fig = px.bar(
        suite_rows,
        title=f"Benchmark Time Relative Deviation - {suite}",
        x=RF.benchmark,
        y="CV",
        color=RF.type,
        facet_col=DF.env,
        barmode="group",
    )
    # Sort benchmarks alphabetically on the x axis.
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()

In [22]:
# Distribution of the per-benchmark CV for each suite, one facet per
# environment; hovering a point shows the benchmark it belongs to.
px.box(
    data_frame=df_cv,
    x=RF.suite,
    y="CV",
    facet_col=DF.env,
    color=RF.type,
    hover_data=[RF.benchmark],
)

## Confidence intervals computation

### Determine best duet `overlap_rate`

In [23]:
# Nine candidate rates from 0.1 to 0.9 in steps of 0.1 (the stop value 1 is
# excluded); presumably the overlap rates compute_ci evaluates - confirm
# against its signature.
overlap_rates = np.arange(0.1, 1, 0.1)
df_overlap_match = compute_ci(df_prep, overlap_rates)

In [24]:
# For every (environment, suite, type, overlap rate) group: count the rows,
# sum the DF.match_ci flags and average DF.err_ci.
# The aggregations use string aliases ("mean" rather than the callable
# np.mean): pandas >= 2.1 deprecates passing NumPy callables to agg, and the
# alias keeps the current pandas implementation without a FutureWarning.
df = (
    arbiter_ci_contains_zero(df_overlap_match)
    .groupby(by=[DF.env, RF.suite, RF.type, DF.overlap_rate])
    .agg(
        total_count=(DF.match_ci, "count"),
        match_count=(DF.match_ci, "sum"),
        miss_err=(DF.err_ci, "mean"),
    )
    .reset_index()
)
# Fraction of rows in each group whose DF.match_ci flag is set.
df[DF.match_ratio_ci] = df["match_count"] / df["total_count"]

# Match ratio as a function of the overlap rate, one line per suite and one
# facet per environment.
px.line(
    df,
    x=DF.overlap_rate,
    y=DF.match_ratio_ci,
    color=RF.suite,
    facet_col=DF.env,
    markers=True,
)

### Set the minimum overlap ratio to `0.4`

In [25]:
# Keep the rows that have no overlap rate (NaN) together with the rows
# computed for the selected overlap rate of 0.4.
# `.copy()` makes df_ci an independent frame instead of a slice of
# df_overlap_match, so code that later writes columns to df_ci does not hit
# pandas' SettingWithCopyWarning.
# NOTE(review): arbiter_ci_contains_zero looks like it assigns columns on the
# frame it receives - confirm in duet.process.
df_ci = df_overlap_match[
    df_overlap_match[DF.overlap_rate].isnull()
    | df_overlap_match[DF.overlap_rate].isin([0.4])
].copy()
df_ci

Unnamed: 0,index,suite,benchmark,type,environment,lo,hi,err,mid,se,grand_mean,ci_width,minumum_overlap_ratio,err_ci,match_ci
0,0,dacapo,avrora,seqn,AWS t3.medium,-3.533665e+07,1.070543e+07,2.302104e+07,-1.231561e+07,1.151643e+07,4.163510e+09,0.011058,,0.000000e+00,True
1,1,dacapo,avrora,seqn,bare-metal,-2.494884e+06,2.053752e+07,1.151620e+07,9.021317e+06,5.695647e+06,3.084214e+09,0.007468,,0.000000e+00,True
2,2,dacapo,avrora,seqn,shared-vm,-3.611791e+07,5.512028e+07,4.561910e+07,9.501187e+06,2.211577e+07,4.555183e+09,0.020030,,0.000000e+00,True
3,3,dacapo,avrora,syncduet,AWS t3.medium,3.550402e+06,7.366091e+07,3.505525e+07,3.860566e+07,1.653485e+07,5.866218e+09,0.011952,,3.550402e+06,False
4,4,dacapo,avrora,syncduet,bare-metal,-9.708846e+06,6.217225e+06,7.963035e+06,-1.745811e+06,4.053109e+06,3.280338e+09,0.004855,,0.000000e+00,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,190,speccpu,549_fotonik3d_r,duet,bare-metal,-1.147782e+08,-1.071596e+07,5.203113e+07,-6.274709e+07,2.591516e+07,4.160210e+11,0.000250,0.4,-1.147782e+08,False
1066,191,speccpu,554_roms_r,duet,AWS t3.medium,-1.258556e+08,9.285631e+07,1.093559e+08,-1.649963e+07,5.248778e+07,4.842515e+11,0.000452,0.4,0.000000e+00,True
1067,192,speccpu,554_roms_r,duet,bare-metal,-7.746519e+08,1.506833e+08,4.626676e+08,-3.119843e+08,2.343789e+08,2.406419e+11,0.003845,0.4,0.000000e+00,True
1068,193,speccpu,557_xz_r,duet,AWS t3.medium,7.920257e+07,7.376643e+08,3.292308e+08,4.084334e+08,1.584903e+08,6.798125e+11,0.000969,0.4,7.920257e+07,False


### CI test

In [26]:
# Run the CI arbiter on the selected rows, then aggregate its predictions
# with group_predictions (u-test handling switched off).
df_pred_ci = arbiter_ci_contains_zero(df_ci)
df = group_predictions(df_pred_ci, utest=False)

# Match ratio per suite, one bar per type and one facet per environment.
px.bar(
    data_frame=df,
    x=RF.suite,
    y=DF.match_ratio_ci,
    color=RF.type,
    facet_col=DF.env,
    barmode="group",
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# One scatter plot per suite: the "mid" value of each benchmark with "err"
# drawn as the vertical error bar, faceted by environment.
for suite in unique_suites:
    in_suite = df_ci[RF.suite] == suite
    if not in_suite.any():
        # Nothing to draw for suites that have no rows in df_ci.
        continue
    fig = px.scatter(
        df_ci[in_suite],
        title=f"CI - {suite}",
        x="benchmark",
        y="mid",
        error_y="err",
        color=RF.type,
        facet_col=DF.env,
    )
    # Sort benchmarks alphabetically on the x axis.
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()

### Relative CI width

In [28]:
# Distribution of DF.ci_width for each suite, one facet per environment;
# hovering a point shows the benchmark it belongs to.
px.box(
    data_frame=df_ci,
    title="Relative CI width per suite",
    x=RF.suite,
    y=DF.ci_width,
    facet_col=DF.env,
    color=RF.type,
    hover_data=[RF.benchmark],
)

In [30]:
# One grouped bar chart per suite comparing DF.ci_width across types,
# faceted by environment.
for suite in unique_suites:
    fig = px.bar(
        df_ci[df_ci[RF.suite] == suite],
        x=RF.benchmark,
        y=DF.ci_width,
        color=RF.type,
        facet_col=DF.env,
        barmode="group",
        title=f"Relative CI Width comparison - {suite}",
    )
    # Sort benchmarks alphabetically, as the other per-suite figures in this
    # notebook do, so the x axes line up when the charts are compared.
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()

### CI error

In [31]:
# Distribution of the "err" column for each suite, one facet per
# environment; hovering a point shows the benchmark it belongs to.
px.box(
    data_frame=df_ci,
    x=RF.suite,
    y="err",
    color=RF.type,
    facet_col=DF.env,
    hover_data=[RF.benchmark],
)

## Mann-Whitney U test

In [36]:
# Run the u-test arbiter, then for every (environment, suite, type) group
# count the rows and sum the DF.match_utest flags.
df_utest = arbiter_utest(df_prep)
df_utest_grouped = (
    df_utest.groupby(by=[DF.env, RF.suite, RF.type])
    .agg(
        total_count=(DF.match_utest, "count"),
        match_count=(DF.match_utest, "sum"),
    )
    .reset_index()
)
# Fraction of rows in each group whose DF.match_utest flag is set.
df_utest_grouped[DF.match_ratio_utest] = df_utest_grouped["match_count"].div(
    df_utest_grouped["total_count"]
)

# Match ratio per type, one bar per suite and one facet per environment.
px.bar(
    data_frame=df_utest_grouped,
    x=RF.type,
    y=DF.match_ratio_utest,
    facet_col=DF.env,
    color=RF.suite,
    barmode="group",
)

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['environment', 'suite', 'type', 'total_count_ci', 'match_ci', 'err_ci', 'match_ci_%'] but received: match_utest_%

In [None]:
# One bar chart per suite plotting the "mid" column of the u-test results,
# faceted by environment.
for suite in unique_suites:
    in_suite = df_utest[RF.suite] == suite
    if not in_suite.any():
        # Nothing to draw for suites that have no rows in df_utest.
        continue
    fig = px.bar(
        df_utest[in_suite],
        title=f"u-test - {suite}",
        x="benchmark",
        y="mid",
        color=RF.type,
        facet_col=DF.env,
    )
    # Sort benchmarks alphabetically on the x axis.
    fig.update_xaxes(categoryorder="category ascending")
    fig.show()