In [1]:
import pandas as pd
import numpy as np

# Paths of the six evaluation-result CSV exports
csv_files = [
    "/kaggle/input/dataset/01_Model_Evaluation_Result_1 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_2 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_3 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_4 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_5 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_6 (1).csv",
]

# Metric columns read from every file
metrics = ["TruthfulQA", "HHEMRate", "Medical", "Legal", "Scientific", "Communication Clarity"]

# Collect scores as data[model][metric] -> list with one float per row seen,
# appended in file order.
data = {}
for csv_path in csv_files:
    run_df = pd.read_csv(csv_path)

    # Whatever the first column is called, treat it as the model identifier.
    first_column = run_df.columns[0]
    if first_column != "Model":
        run_df = run_df.rename(columns={first_column: "Model"})

    for _, run_row in run_df.iterrows():
        # Create the per-metric lists the first time a model is encountered.
        model_scores = data.setdefault(run_row["Model"], {name: [] for name in metrics})
        for name in metrics:
            model_scores[name].append(float(run_row[name]))

# Bootstrap confidence intervals
def bootstrap_ci(data, n_bootstrap=1000, ci=95, seed=42):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Resamples ``data`` with replacement ``n_bootstrap`` times, takes the mean
    of each resample, and returns the central ``ci`` percent of those means.

    Parameters
    ----------
    data : sequence of numbers
        One-dimensional collection of observations.
    n_bootstrap : int, default 1000
        Number of bootstrap resamples to draw.
    ci : float, default 95
        Width of the interval in percent (95 -> 2.5th and 97.5th percentiles).
    seed : int, default 42
        Seed for the private random generator used for resampling.

    Returns
    -------
    (lower, upper) : tuple of float
        Lower and upper percentile bounds of the bootstrap means.
        ``(nan, nan)`` when ``data`` is empty.
    """
    values = np.asarray(data)
    if values.size == 0:
        # No observations: the interval is undefined. Return NaNs directly
        # rather than going through "mean of empty slice" warnings.
        nan = float("nan")
        return nan, nan

    # A private RandomState keeps the resampling reproducible without
    # reseeding NumPy's global generator (which would reset the random stream
    # for all other code in the kernel). RandomState(seed) yields the same
    # stream as np.random.seed(seed), so results match the global-seed version.
    rng = np.random.RandomState(seed)
    n_obs = len(values)

    bootstrap_means = [
        np.mean(rng.choice(values, size=n_obs, replace=True))
        for _ in range(n_bootstrap)
    ]

    tail = (100 - ci) / 2
    lower = np.percentile(bootstrap_means, tail)
    upper = np.percentile(bootstrap_means, 100 - tail)

    return lower, upper

# Report header
banner = "=" * 100
print(banner)
print("BOOTSTRAP CONFIDENCE INTERVALS (95% CI)")
print(banner)

# One record per model (alphabetical), one formatted "[lower, upper]" cell per metric.
results = []
for model_name in sorted(data):
    record = {"Model": model_name}
    for metric_name in metrics:
        ci_low, ci_high = bootstrap_ci(data[model_name][metric_name])
        record[f"{metric_name} [Lower, Upper]"] = f"[{ci_low:.1f}, {ci_high:.1f}]"
    results.append(record)

# Assemble the table once from the collected records
ci_df = pd.DataFrame(results)

# Plain-text rendering followed by the footer note
print(ci_df.to_string(index=False))
print("\n" + banner)
print("Note: Based on 1000 bootstrap samples from 6 evaluation runs")
print(banner)

# Bare last expression so the notebook also shows the rich DataFrame view
ci_df

BOOTSTRAP CONFIDENCE INTERVALS (95% CI)
       Model TruthfulQA [Lower, Upper] HHEMRate [Lower, Upper] Medical [Lower, Upper] Legal [Lower, Upper] Scientific [Lower, Upper] Communication Clarity [Lower, Upper]
deepseek-llm              [52.7, 53.0]              [3.9, 4.8]           [19.4, 20.8]         [18.7, 21.8]              [14.4, 14.8]                           [1.9, 2.9]
    gemma:7b              [50.0, 50.5]              [1.9, 2.2]           [21.0, 23.1]         [11.4, 12.7]              [18.1, 18.5]                           [2.9, 3.9]
   llama3:8b              [56.0, 56.7]              [6.8, 7.1]           [26.9, 30.2]         [26.2, 27.8]              [16.6, 17.1]                           [0.0, 0.6]
  mistral:7b              [53.3, 53.3]              [5.2, 5.6]           [29.3, 32.2]         [27.3, 29.1]              [18.2, 19.2]                           [1.6, 2.4]
  qwen2.5:3b              [51.4, 52.4]              [3.7, 4.6]           [31.6, 33.9]         [30.1, 32.9]              [17.9, 19.4]                           [0.0, 0.0]

Unnamed: 0,Model,"TruthfulQA [Lower, Upper]","HHEMRate [Lower, Upper]","Medical [Lower, Upper]","Legal [Lower, Upper]","Scientific [Lower, Upper]","Communication Clarity [Lower, Upper]"
0,deepseek-llm,"[52.7, 53.0]","[3.9, 4.8]","[19.4, 20.8]","[18.7, 21.8]","[14.4, 14.8]","[1.9, 2.9]"
1,gemma:7b,"[50.0, 50.5]","[1.9, 2.2]","[21.0, 23.1]","[11.4, 12.7]","[18.1, 18.5]","[2.9, 3.9]"
2,llama3:8b,"[56.0, 56.7]","[6.8, 7.1]","[26.9, 30.2]","[26.2, 27.8]","[16.6, 17.1]","[0.0, 0.6]"
3,mistral:7b,"[53.3, 53.3]","[5.2, 5.6]","[29.3, 32.2]","[27.3, 29.1]","[18.2, 19.2]","[1.6, 2.4]"
4,qwen2.5:3b,"[51.4, 52.4]","[3.7, 4.6]","[31.6, 33.9]","[30.1, 32.9]","[17.9, 19.4]","[0.0, 0.0]"
