In [2]:
import numpy as np
import json
import matplotlib.pyplot as plt
import os
import pandas as pd

# Aggregated Results
Aggregated results from running all 128 synthetic benchmark problems on Triton with the specifications given by Foldager et al. (2023). We observe that, on average, the results improve slightly.

In [3]:
# Read the aggregated benchmark results for both experiment variants.
# The paths are relative to the notebook's working directory.
df_fixed, df_notfixed = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../results/aggregated_GP-EI-BENCHMARKS-NUMWARNING_FIXED.csv",
        "../results/aggregated_GP-EI-BENCHMARKS-NUMWARNING_NOT_FIXED.csv",
    )
)

In [7]:
# Metric columns that are aggregated for every benchmark problem.
metric_columns = [
    'best_simple_regret_pool',
    'final_cumulative_regret_pool',
    'final_calibration_mse',
    'final_sharpness',
    'final_nmse',
    'final_elpd',
]

print("--- Aggregating results per problem for FIXED data ---")

# Build the per-problem grouping once and derive both statistics from it:
# the group mean and the standard error of that mean.
grouped_fixed = df_fixed.groupby('problem_idx')[metric_columns]
avg_metrics_by_problem_fixed = grouped_fixed.mean()
sem_metrics_by_problem_fixed = grouped_fixed.sem()

print("\n" + "=" * 50 + "\n")

print("--- Aggregating results per problem for NOT FIXED data ---")

# Same aggregation for the NOT FIXED runs.
grouped_notfixed = df_notfixed.groupby('problem_idx')[metric_columns]
avg_metrics_by_problem_notfixed = grouped_notfixed.mean()
sem_metrics_by_problem_notfixed = grouped_notfixed.sem()

# Last expression of the cell: show the first rows of the FIXED per-problem means.
avg_metrics_by_problem_fixed.head()

--- Aggregating results per problem for FIXED data ---


--- Aggregating results per problem for NOT FIXED data ---


Unnamed: 0_level_0,best_simple_regret_pool,final_cumulative_regret_pool,final_calibration_mse,final_sharpness,final_nmse,final_elpd
problem_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.108784,12.941688,0.013562,0.172969,0.106064,-0.282345
1,0.052776,9.530808,0.00556,0.164023,0.038879,0.464513
2,0.074222,9.91654,0.010577,0.184173,0.073857,0.123239
3,0.066955,8.95729,0.005592,0.168261,0.053208,0.384657
4,0.085003,10.501378,0.009532,0.219838,0.114629,-0.553094


In [8]:
# Show the first rows of the per-problem means for the NOT FIXED runs.
avg_metrics_by_problem_notfixed.head()

Unnamed: 0_level_0,best_simple_regret_pool,final_cumulative_regret_pool,final_calibration_mse,final_sharpness,final_nmse,final_elpd
problem_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.108518,12.592874,0.013679,0.166665,0.076892,0.353347
1,0.050775,9.253069,0.006228,0.164521,0.038887,0.448238
2,0.061339,9.237527,0.008753,0.185089,0.073668,0.116829
3,0.068394,10.069988,0.005822,0.170872,0.054515,0.37489
4,0.0941,10.995724,0.010143,0.225724,0.107448,-0.509454


In [9]:
# Metric columns summarised over all runs (pooled across every problem).
metric_columns = [
    'best_simple_regret_pool',
    'final_cumulative_regret_pool',
    'final_calibration_mse',
    'final_sharpness',
    'final_nmse',
    'final_elpd'
]

# Print the pooled mean and standard error of the mean (SEM) of each metric,
# first for the FIXED runs and then for the NOT FIXED runs. One loop handles
# both data sets so the two summaries cannot drift apart.
for block_idx, (label, df_runs) in enumerate(
    [("FIXED", df_fixed), ("NOT FIXED", df_notfixed)]
):
    if block_idx > 0:
        # Visual separator between the two summaries.
        print("\n" + "="*30 + "\n")

    print(f"--- Summary for {label} data (Mean and Standard Error of the Mean) ---")
    for col in metric_columns:
        # Silently skip metrics that are absent from this results file.
        if col not in df_runs.columns:
            continue

        values = df_runs[col]
        mean_val = values.mean()

        # mean() and std() skip missing values by default, so the SEM must be
        # based on the number of non-missing observations. Using len() here
        # would count NaN rows as well, shrinking the reported standard error
        # and overstating the number of runs behind it.
        n_runs = values.count()
        sem_val = values.sem()  # sample std (ddof=1) / sqrt(non-missing count)

        print(f"{col}:")
        print(f"  - Mean:      {mean_val:.4f}")
        print(f"  - Std Error: {sem_val:.4f} (from {n_runs} runs)")

--- Summary for FIXED data (Mean and Standard Error of the Mean) ---
best_simple_regret_pool:
  - Mean:      0.0322
  - Std Error: 0.0010 (from 2559 runs)
final_cumulative_regret_pool:
  - Mean:      12.0259
  - Std Error: 0.3196 (from 2559 runs)
final_calibration_mse:
  - Mean:      0.0180
  - Std Error: 0.0005 (from 2559 runs)
final_sharpness:
  - Mean:      0.3662
  - Std Error: 0.0036 (from 2559 runs)
final_nmse:
  - Mean:      0.3290
  - Std Error: 0.0088 (from 2559 runs)
final_elpd:
  - Mean:      -2.4315
  - Std Error: 0.1819 (from 2559 runs)


--- Summary for NOT FIXED data (Mean and Standard Error of the Mean) ---
best_simple_regret_pool:
  - Mean:      0.0327
  - Std Error: 0.0011 (from 2555 runs)
final_cumulative_regret_pool:
  - Mean:      12.5127
  - Std Error: 0.3355 (from 2555 runs)
final_calibration_mse:
  - Mean:      0.0180
  - Std Error: 0.0005 (from 2555 runs)
final_sharpness:
  - Mean:      0.3649
  - Std Error: 0.0037 (from 2555 runs)
final_nmse:
  - Mean:      0.