In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import ttest_rel

csv_files = [
   "/kaggle/input/dataset/01_Model_Evaluation_Result_1 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_2 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_3 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_4 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_5 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_6 (1).csv",
]

metrics = ["TruthfulQA", "HHEMRate", "Medical", "Legal", "Scientific", "Communication Clarity"]
alpha = 0.05

dfs = [pd.read_csv(f) for f in csv_files]

data = {}

for df in dfs:
    # rename first column to "Model" if it's unnamed
    if df.columns[0] != "Model":
        df = df.rename(columns={df.columns[0]: "Model"})
        
    for _, row in df.iterrows():
        model = row["Model"]
        if model not in data:
            data[model] = {m: [] for m in metrics}
        for m in metrics:
            data[model][m].append(float(row[m]))

def cohens_d_paired(x, y):
    diff = np.array(x) - np.array(y)
    return diff.mean() / (diff.std(ddof=1) + 1e-8)

results = []
models = list(data.keys())
pairs = list(combinations(models, 2))
bonf_alpha = alpha / len(pairs)

for m1, m2 in pairs:
    row = {"Model 1": m1, "Model 2": m2}
    sig_metrics = []
    
    for metric in metrics:
        x = np.array(data[m1][metric])
        y = np.array(data[m2][metric])
        _, p_val = ttest_rel(x, y)
        row[f"{metric} p-value"] = round(p_val, 5)
        if p_val < bonf_alpha:
            sig_metrics.append(metric)
    
    d = cohens_d_paired(data[m1]["TruthfulQA"], data[m2]["TruthfulQA"])
    row["Effect Size (Cohen's d, TQ)"] = round(d, 3)
    
    row["Significance (Bonferroni)"] = (
        "Not significant" if len(sig_metrics) == 0
        else "Significant (" + ", ".join(sig_metrics) + ")"
    )
    
    results.append(row)

pairwise_df = pd.DataFrame(results)
pairwise_df


Unnamed: 0,Model 1,Model 2,TruthfulQA p-value,HHEMRate p-value,Medical p-value,Legal p-value,Scientific p-value,Communication Clarity p-value,"Effect Size (Cohen's d, TQ)",Significance (Bonferroni)
0,deepseek-llm,mistral:7b,0.0041,0.03725,0.00038,0.00078,4e-05,0.10801,-2.041,"Significant (TruthfulQA, Medical, Legal, Scien..."
1,deepseek-llm,llama3:8b,4e-05,0.0002,2e-05,0.00129,0.0002,0.00314,-5.446,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
2,deepseek-llm,gemma:7b,0.0,0.00092,0.0613,0.00063,1e-05,0.14146,9.151,"Significant (TruthfulQA, HHEMRate, Legal, Scie..."
3,deepseek-llm,qwen2.5:3b,0.01183,0.34812,0.0,5e-05,0.00072,0.00032,1.577,"Significant (Medical, Legal, Scientific, Commu..."
4,mistral:7b,llama3:8b,4e-05,0.00019,0.24671,0.01028,0.00805,0.00552,-5.496,"Significant (TruthfulQA, HHEMRate)"
5,mistral:7b,gemma:7b,0.0,0.0,0.00065,0.0,0.35923,0.00788,8.484,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
6,mistral:7b,qwen2.5:3b,0.00396,0.00509,0.20745,0.01536,1.0,0.00046,2.058,"Significant (TruthfulQA, Communication Clarity)"
7,llama3:8b,gemma:7b,0.0,0.0,0.00227,0.0,0.00035,7e-05,14.526,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
8,llama3:8b,qwen2.5:3b,1e-05,0.00035,0.002,0.00801,0.00695,0.2204,7.536,"Significant (TruthfulQA, HHEMRate, Medical)"
9,gemma:7b,qwen2.5:3b,0.00013,0.00042,1e-05,0.0,0.38691,0.00014,-4.304,"Significant (TruthfulQA, HHEMRate, Medical, Le..."


In [10]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy. stats import ttest_rel

# ---------------- CONFIG ----------------
csv_files = [
    "/kaggle/input/dataset/01_Model_Evaluation_Result_1 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_2 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_3 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_4 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_5 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_6 (1).csv",
]

metrics = ["TruthfulQA", "HHEMRate", "Medical", "Legal", "Scientific", "Communication Clarity"]
alpha = 0.05

# ---------------- STEP 1: Read CSVs ----------------
dfs = []
for f in csv_files:
    df = pd.read_csv(f)
    df.columns = df.columns.str.strip()  # clean column names
    # rename first column to Model if unnamed
    if df.columns[0] != "Model":
        df = df.rename(columns={df.columns[0]: "Model"})
    dfs.append(df)

# ---------------- STEP 2: Aggregate per-model per-metric arrays ----------------
# data[model][metric] = [run1, run2, ..., run6]
data = {}

for df in dfs:
    for _, row in df.iterrows():
        model = row["Model"]
        if model not in data:
            data[model] = {m: [] for m in metrics}
        for m in metrics:
            data[model][m].append(float(row[m]))

# ---------------- STEP 3: Hedges' g for paired samples ----------------
def hedges_g_paired(x, y):
    x = np.array(x, dtype=float)
    y = np.array(y, dtype=float)
    diff = x - y
    sd = diff.std(ddof=1)
    if sd == 0:
        return 0.0
    d = diff.mean() / sd
    n = len(diff)
    correction = 1 - (3 / (4*n - 1))  # small sample correction
    return d * correction

# ---------------- STEP 4: Pairwise Statistical Tests ----------------
results = []
models = list(data.keys())
pairs = list(combinations(models, 2))
bonf_alpha = alpha / len(pairs)

for m1, m2 in pairs:
    row = {
        "Model 1": m1,
        "Model 2": m2
    }
    
    sig_metrics = []
    
    for metric in metrics:
        x = np.array(data[m1][metric], dtype=float)
        y = np.array(data[m2][metric], dtype=float)
        
        t_stat, p_val = ttest_rel(x, y)
        row[f"{metric} p-value"] = f"{p_val:.2e}"  # scientific notation
        
        if p_val < bonf_alpha:
            sig_metrics.append(metric)
    
    # Hedges' g on TruthfulQA (you can compute per-metric if needed)
    g = hedges_g_paired(data[m1]["TruthfulQA"], data[m2]["TruthfulQA"])
    row["Effect Size (Hedges' g, TQ)"] = round(g, 3)
    
    if len(sig_metrics) == 0:
        row["Significance (Bonferroni)"] = "Not significant"
    else:
        row["Significance (Bonferroni)"] = "Significant (" + ", ".join(sig_metrics) + ")"
    
    results.append(row)

pairwise_df = pd.DataFrame(results)
pairwise_df

Unnamed: 0,Model 1,Model 2,TruthfulQA p-value,HHEMRate p-value,Medical p-value,Legal p-value,Scientific p-value,Communication Clarity p-value,"Effect Size (Hedges' g, TQ)",Significance (Bonferroni)
0,deepseek-llm,mistral:7b,0.0041,0.0373,0.000378,0.000775,3.85e-05,0.108,-1.775,"Significant (TruthfulQA, Medical, Legal, Scien..."
1,deepseek-llm,llama3:8b,4.24e-05,0.000205,2.36e-05,0.00129,0.000196,0.00314,-4.735,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
2,deepseek-llm,gemma:7b,3.28e-06,0.000919,0.0613,0.000632,1.21e-05,0.141,7.957,"Significant (TruthfulQA, HHEMRate, Legal, Scie..."
3,deepseek-llm,qwen2.5:3b,0.0118,0.348,2.2e-06,5.18e-05,0.000717,0.000324,1.372,"Significant (Medical, Legal, Scientific, Commu..."
4,mistral:7b,llama3:8b,4.05e-05,0.000191,0.247,0.0103,0.00805,0.00552,-4.779,"Significant (TruthfulQA, HHEMRate)"
5,mistral:7b,gemma:7b,4.78e-06,1.16e-07,0.000646,7.77e-09,0.359,0.00788,7.377,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
6,mistral:7b,qwen2.5:3b,0.00396,0.00509,0.207,0.0154,1.0,0.000458,1.79,"Significant (TruthfulQA, Communication Clarity)"
7,llama3:8b,gemma:7b,3.3e-07,1.46e-06,0.00227,2.83e-08,0.000347,6.98e-05,12.631,"Significant (TruthfulQA, HHEMRate, Medical, Le..."
8,llama3:8b,qwen2.5:3b,8.58e-06,0.000351,0.002,0.00801,0.00695,0.22,6.553,"Significant (TruthfulQA, HHEMRate, Medical)"
9,gemma:7b,qwen2.5:3b,0.000133,0.000424,1.14e-05,5e-06,0.387,0.000138,-3.742,"Significant (TruthfulQA, HHEMRate, Medical, Le..."


In [13]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import ttest_rel

# ---------------- CONFIG ----------------
csv_files = [
    "/kaggle/input/dataset/01_Model_Evaluation_Result_1 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_2 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_3 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_4 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_5 (1).csv",
    "/kaggle/input/dataset/01_Model_Evaluation_Result_6 (1).csv",
]

metrics = ["TruthfulQA", "HHEMRate", "Medical", "Legal", "Scientific", "Communication Clarity"]
alpha = 0.05

# ---------------- STEP 1: Load Data ----------------
dfs = []
for f in csv_files:
    df = pd.read_csv(f)
    # Fix column names
    if df.columns[0] != "Model":
        df = df.rename(columns={df.columns[0]: "Model"})
    dfs.append(df)

# ---------------- STEP 2: Aggregate Data ----------------
data = {}
for df in dfs:
    for _, row in df.iterrows():
        model = row["Model"]
        if model not in data:
            data[model] = {m: [] for m in metrics}
        for m in metrics:
            data[model][m].append(float(row[m]))

# ---------------- STEP 3: Pairwise Statistical Tests ----------------
results = []
models = list(data.keys())
pairs = list(combinations(models, 2))
bonf_alpha = alpha / len(pairs)

print(f"Bonferroni-corrected α = {bonf_alpha:.6f} ({len(pairs)} comparisons)\n")

for m1, m2 in pairs:
    row = {
        "Model 1": m1,
        "Model 2": m2
    }
    
    sig_metrics = []
    
    # Calculate p-values for each metric
    for metric in metrics:
        x = np.array(data[m1][metric])
        y = np.array(data[m2][metric])
        
        _, p_val = ttest_rel(x, y)
        
        # Format p-value
        if p_val < 0.001:
            row[f"{metric} p-value"] = "<0.001"
        else:
            row[f"{metric} p-value"] = f"{p_val:.3f}"
        
        # Check significance
        if p_val < bonf_alpha:
            sig_metrics.append(metric)
    
    # Significance summary
    if len(sig_metrics) == 0:
        row["Significance (Bonferroni)"] = "Not significant"
    else:
        # Shorten metric names for display
        abbrev = {
            "TruthfulQA": "TQ",
            "HHEMRate": "HHEM", 
            "Medical": "Med",
            "Legal": "Legal",
            "Scientific": "Sci",
            "Communication Clarity": "Comm"
        }
        sig_abbrev = [abbrev.get(m, m) for m in sig_metrics]
        row["Significance (Bonferroni)"] = "Significant (" + ", ".join(sig_abbrev) + ")"
    
    results.append(row)

# ---------------- STEP 4: Create DataFrame ----------------
pairwise_df = pd.DataFrame(results)

# Reorder columns for better readability
col_order = ["Model 1", "Model 2", "TruthfulQA p-value", "Medical p-value", 
             "Significance (Bonferroni)"]

# Add other metric p-values after
other_metrics = ["HHEMRate p-value", "Legal p-value", "Scientific p-value", 
                 "Communication Clarity p-value"]
for col in other_metrics:
    if col in pairwise_df.columns:
        col_order.insert(-1, col)  # Insert before Significance column

pairwise_df = pairwise_df[col_order]

# Display
print("="*80)
print("PAIRWISE STATISTICAL COMPARISONS")
print("="*80)
print(pairwise_df.to_string(index=False))
print("\n" + "="*80)
print(f"Note: Significance level after Bonferroni correction: α = {bonf_alpha:.6f}")
print("="*80)

# Save to CSV (optional)
pairwise_df.to_csv("supplementary_table2.csv", index=False)

pairwise_df

Bonferroni-corrected α = 0.005000 (10 comparisons)

PAIRWISE STATISTICAL COMPARISONS
     Model 1    Model 2 TruthfulQA p-value Medical p-value HHEMRate p-value Legal p-value Scientific p-value Communication Clarity p-value                     Significance (Bonferroni)
deepseek-llm mistral:7b              0.004          <0.001            0.037        <0.001             <0.001                         0.108             Significant (TQ, Med, Legal, Sci)
deepseek-llm  llama3:8b             <0.001          <0.001           <0.001         0.001             <0.001                         0.003 Significant (TQ, HHEM, Med, Legal, Sci, Comm)
deepseek-llm   gemma:7b             <0.001           0.061           <0.001        <0.001             <0.001                         0.141            Significant (TQ, HHEM, Legal, Sci)
deepseek-llm qwen2.5:3b              0.012          <0.001            0.348        <0.001             <0.001                        <0.001           Significant (Med, Legal, S

Unnamed: 0,Model 1,Model 2,TruthfulQA p-value,Medical p-value,HHEMRate p-value,Legal p-value,Scientific p-value,Communication Clarity p-value,Significance (Bonferroni)
0,deepseek-llm,mistral:7b,0.004,<0.001,0.037,<0.001,<0.001,0.108,"Significant (TQ, Med, Legal, Sci)"
1,deepseek-llm,llama3:8b,<0.001,<0.001,<0.001,0.001,<0.001,0.003,"Significant (TQ, HHEM, Med, Legal, Sci, Comm)"
2,deepseek-llm,gemma:7b,<0.001,0.061,<0.001,<0.001,<0.001,0.141,"Significant (TQ, HHEM, Legal, Sci)"
3,deepseek-llm,qwen2.5:3b,0.012,<0.001,0.348,<0.001,<0.001,<0.001,"Significant (Med, Legal, Sci, Comm)"
4,mistral:7b,llama3:8b,<0.001,0.247,<0.001,0.010,0.008,0.006,"Significant (TQ, HHEM)"
5,mistral:7b,gemma:7b,<0.001,<0.001,<0.001,<0.001,0.359,0.008,"Significant (TQ, HHEM, Med, Legal)"
6,mistral:7b,qwen2.5:3b,0.004,0.207,0.005,0.015,1.000,<0.001,"Significant (TQ, Comm)"
7,llama3:8b,gemma:7b,<0.001,0.002,<0.001,<0.001,<0.001,<0.001,"Significant (TQ, HHEM, Med, Legal, Sci, Comm)"
8,llama3:8b,qwen2.5:3b,<0.001,0.002,<0.001,0.008,0.007,0.220,"Significant (TQ, HHEM, Med)"
9,gemma:7b,qwen2.5:3b,<0.001,<0.001,<0.001,<0.001,0.387,<0.001,"Significant (TQ, HHEM, Med, Legal, Comm)"
