In [5]:
import pandas as pd

In [6]:
# Load the benchmark results table; the path is relative to the notebook's
# working directory.
dfb = pd.read_csv(filepath_or_buffer="benchmark_dataset.csv")

In [8]:
# CPU usage is typically reported relative to a single core (100% == one core
# fully busy), so dividing by 100 converts it to an average core count,
# e.g. 800% -> roughly 8 cores in use.
dfb["avg_cpu_cores"] = dfb["avg_cpu_percent"].div(100.0)

In [9]:
# Preview the first five rows, including the derived avg_cpu_cores column.
dfb.head(n=5)

Unnamed: 0,complex_id,A_BIN,MW_BIN,num_assemblies,num_pairs,tests_run,avg_time_sec,max_time_sec,avg_rss_mb,max_rss_mb,avg_cpu_percent,est_total_wall_hours,avg_spectrum_sec,max_spectrum_sec,avg_descriptor_sec,max_descriptor_sec,avg_fm_sec,max_fm_sec,avg_cpu_cores
0,PDB-CPX-184429,A1_3-4,MW1_<60,4,6,5,19.43,23.35,341.51,354.7,768.8,0.03,4.0,4.12,0.12,0.14,5.49,6.91,7.688
1,PDB-CPX-124838,A1_3-4,MW2_60-120,4,6,5,33.22,40.55,494.11,519.92,1042.6,0.06,11.27,12.18,0.3,0.3,13.18,20.91,10.426
2,PDB-CPX-118509,A1_3-4,MW3_120-300,4,6,5,66.39,68.83,732.38,755.02,1263.2,0.11,30.76,32.22,0.67,0.69,25.94,28.54,12.632
3,PDB-CPX-177941,A1_3-4,MW4_>300,4,6,5,111.6,155.92,1280.94,1378.84,1257.6,0.19,37.68,53.65,1.52,2.17,62.67,91.66,12.576
4,PDB-CPX-126721,A2_5-8,MW1_<60,5,10,5,14.09,14.88,354.49,384.95,647.2,0.04,2.83,2.86,0.09,0.1,3.16,4.01,6.472


In [11]:
# Headline numbers: the plain (unweighted) mean of each per-row average,
# taken over every row of the benchmark table.
# Maps the summary label -> the dfb column it is averaged from.
summary_sources = {
    "avg_total_time_sec": "avg_time_sec",
    "avg_spectrum_sec": "avg_spectrum_sec",
    "avg_descriptor_sec": "avg_descriptor_sec",
    "avg_fm_sec": "avg_fm_sec",
    "avg_cpu_cores_used": "avg_cpu_cores",
}
overall = {label: dfb[column].mean() for label, column in summary_sources.items()}

# Whatever part of the total run time is not covered by the three timed steps
# (spectrum, descriptor, fm) is reported as "other/overhead".
mean_total_sec = overall["avg_total_time_sec"]
mean_timed_steps_sec = (
    overall["avg_spectrum_sec"] + overall["avg_descriptor_sec"] + overall["avg_fm_sec"]
)
overall["avg_other_overhead_sec"] = mean_total_sec - mean_timed_steps_sec

# Percentage share of each component. These are ratios of the overall means,
# which gives one clean figure for the narrative; averaging the per-row
# ratios instead would generally produce slightly different numbers.
share_sources = (
    ("spectrum_pct_of_total", "avg_spectrum_sec"),
    ("descriptor_pct_of_total", "avg_descriptor_sec"),
    ("fm_pct_of_total", "avg_fm_sec"),
    ("other_pct_of_total", "avg_other_overhead_sec"),
)
for share_label, seconds_label in share_sources:
    overall[share_label] = overall[seconds_label] / mean_total_sec * 100

# Single-row frame, rounded to two decimals for display.
overall_df = pd.DataFrame([overall]).round(2)

overall_df

Unnamed: 0,avg_total_time_sec,avg_spectrum_sec,avg_descriptor_sec,avg_fm_sec,avg_cpu_cores_used,avg_other_overhead_sec,spectrum_pct_of_total,descriptor_pct_of_total,fm_pct_of_total,other_pct_of_total
0,106.91,34.18,0.8,62.63,10.77,9.3,31.97,0.75,58.58,8.7


In [18]:
# Per-row share (%) of the average run time spent in each timed step.
# The new columns are added to dfb itself so they remain available afterwards.
step_share_columns = (
    ("spectrum_pct", "avg_spectrum_sec"),
    ("descriptor_pct", "avg_descriptor_sec"),
    ("fm_pct", "avg_fm_sec"),
)
for pct_column, seconds_column in step_share_columns:
    dfb[pct_column] = dfb[seconds_column] / dfb["avg_time_sec"] * 100

# Remainder of the run time not attributed to any of the three timed steps.
timed_steps_pct = dfb["spectrum_pct"] + dfb["descriptor_pct"] + dfb["fm_pct"]
dfb["other_pct"] = 100 - timed_steps_pct

# Compact view: bin labels, total time, and the spectrum and fm steps only.
report_columns = [
    "A_BIN",
    "MW_BIN",
    "avg_time_sec",
    "avg_spectrum_sec",
    "spectrum_pct",
    "avg_fm_sec",
    "fm_pct",
]
df_pct = dfb[report_columns].round(2)

# Display grouped by molecular-weight bin, then by assembly-count bin
# (df_pct itself is left in its original row order).
df_pct.sort_values(by=["MW_BIN", "A_BIN"])

Unnamed: 0,A_BIN,MW_BIN,avg_time_sec,avg_spectrum_sec,spectrum_pct,avg_fm_sec,fm_pct
0,A1_3-4,MW1_<60,19.43,4.0,20.59,5.49,28.26
4,A2_5-8,MW1_<60,14.09,2.83,20.09,3.16,22.43
8,A3_9-20,MW1_<60,20.04,4.74,23.65,6.68,33.33
12,A4_21-100,MW1_<60,19.76,4.57,23.13,6.62,33.5
16,A5_101+,MW1_<60,21.44,4.96,23.13,7.56,35.26
1,A1_3-4,MW2_60-120,33.22,11.27,33.93,13.18,39.67
5,A2_5-8,MW2_60-120,31.68,10.29,32.48,12.76,40.28
9,A3_9-20,MW2_60-120,28.27,10.66,37.71,8.48,30.0
13,A4_21-100,MW2_60-120,34.35,12.02,34.99,13.31,38.75
17,A5_101+,MW2_60-120,33.54,9.82,29.28,14.7,43.83
