In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# --- Configuration: input and output locations -------------------------------
# Ground-truth instance-feature file for the algorithm whose behaviour the LLM predicted.
actual_csv = 'bb_test_sample.csv'
# LLM prediction output that is evaluated against the ground truth.
pred_csv   = './results_max_kp/gemini/Gemini_bb2.csv'
# Destination of the metrics table produced by this notebook.
output_csv = './llm50_results/gemini/bb_eval.csv'


actual_df = pd.read_csv(actual_csv)
pred_df   = pd.read_csv(pred_csv)

# The two files are compared row-by-row by position (no key column is used),
# so a differing row count means the pairs would be misaligned. Fail early
# with a clear message instead of a broadcasting/indexing error later on.
if len(actual_df) != len(pred_df):
    raise ValueError(
        f"Row count mismatch: {actual_csv} has {len(actual_df)} rows but "
        f"{pred_csv} has {len(pred_df)} rows; rows are paired by position."
    )


def _numeric_column(df, column):
    """Return ``df[column]`` as a float NumPy array.

    Values that cannot be parsed as numbers become NaN, so a column that
    pandas loaded as ``object`` (because of stray text) can still be passed
    to ``np.isnan`` and filtered out downstream.
    """
    return pd.to_numeric(df[column], errors='coerce').to_numpy(dtype=float)


y_true_time    = _numeric_column(actual_df, 'solution_time')
y_pred_time    = _numeric_column(pred_df, 'pred_solution_time')

# NOTE(review): the prediction column is named 'pred_memory_kb'; confirm that
# 'peak_memory' in the ground-truth file is recorded in the same unit.
y_true_mem     = _numeric_column(actual_df, 'peak_memory')
y_pred_mem     = _numeric_column(pred_df, 'pred_memory_kb')

y_true_gap     = _numeric_column(actual_df, 'optimality_gap')
y_pred_gap     = _numeric_column(pred_df, 'pred_optimality_gap')

In [None]:
# Score every predicted target against its ground truth and collect one
# summary row (sample count, MAE, MSE, RMSE, R2) per target.
results = []
for name, y_true, y_pred in [
    ("solution_time",  y_true_time, y_pred_time),
    ("peak_memory",    y_true_mem,  y_pred_mem),
    ("optimality_gap", y_true_gap,  y_pred_gap),
]:
    # Boolean mask: keep only positions where neither true nor pred is NaN
    mask = (~np.isnan(y_true)) & (~np.isnan(y_pred))
    y_t, y_p = y_true[mask], y_pred[mask]
    n_valid = int(mask.sum())

    if n_valid == 0:
        # The sklearn metric functions raise ValueError on empty input.
        # Record NaN for this target so the remaining targets are still
        # evaluated and the results file is still written.
        mae = mse = rmse = r2 = np.nan
    else:
        mae  = mean_absolute_error(y_t, y_p)
        mse  = mean_squared_error(y_t, y_p)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_t, y_p)

    results.append({
        'metric': name,
        'n_samples': n_valid,
        'MAE':  mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2':   r2
    })


metrics_df = pd.DataFrame(results)

# DataFrame.to_csv does not create missing directories, so make sure the
# destination folder exists before writing.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
metrics_df.to_csv(output_csv, index=False)
print(f"Saved {output_csv}")