## Libraries

In [25]:
import pandas as pd
import json

import numpy as np

from scipy.stats import ttest_rel
from itertools import combinations

## Data Loading

In [17]:
# Read the CSV (handle missing headers)
df = pd.read_csv("CLIMABENCH.csv", delimiter=",", skip_blank_lines=True, dtype=str)

# Convert all comma-style decimals (e.g., "0,81008") to dots (e.g., "0.81008")
df = df.map(lambda x: x.replace(",", ".") if isinstance(x, str) else x)

# Fill missing MODEL names (assuming it's only missing in seed rows)
df["MODEL"].fillna(method="ffill", inplace=True)

# Drop completely empty rows
df.dropna(how="all", inplace=True)

# Extract relevant columns: model, seed, task results
models = df["MODEL"].unique().tolist()
tasks = [col for col in df.columns if col not in ["MODEL", "SEED", "TOTAL AVG"] and "AVG" not in col]

# Convert data into nested dictionary
results = {}

for _, row in df.iterrows():
    model = row["MODEL"]
    seed = row["SEED"]
    
    if model not in results:
        results[model] = {}

    for task in tasks:
        score = row[task]
        avg_col = f"AVG {task}"

        if pd.isna(score):  # Skip missing values
            continue

        score = float(score)  # Convert string to float
        
        if task not in results[model]:
            results[model][task] = {"seeds": [], "average": None}

        results[model][task]["seeds"].append(score)

        # Assign average score (only from rows where it's present)
        if pd.notna(row[avg_col]):
            results[model][task]["average"] = float(row[avg_col])

# Convert to JSON and save
json_output = json.dumps(results, indent=4)

with open("parsed_results.json", "w") as f:
    f.write(json_output)

# print(json_output)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["MODEL"].fillna(method="ffill", inplace=True)
  df["MODEL"].fillna(method="ffill", inplace=True)


In [18]:
results["CliReBERT"]["ClimateInsurance"]

{'seeds': [0.81008, 0.81912, 0.81279, 0.81765, 0.80554], 'average': 0.813036}

In [19]:
for model in results:
    print(model)

CliReBERT
CliSciBERT
SciBERT
SciClimateBERT
ClimateBERT
BERT
RoBERTa
DistilRoBERTa


In [20]:
# **ASSERTION CHECK** - Compare provided average with calculated average
inconsistencies = []

for model, tasks in results.items():
    for task, data in tasks.items():
        if data["average"] is not None and data["seeds"]:
            computed_avg = sum(data["seeds"]) / len(data["seeds"])
            if round(computed_avg, 6) != round(data["average"], 6):  # Allow small rounding differences
                inconsistencies.append({
                    "model": model,
                    "task": task,
                    "provided_average": data["average"],
                    "computed_average": computed_avg,
                    "seeds": data["seeds"]
                })

# Print inconsistencies
if inconsistencies:
    print("❌ Inconsistencies found!")
    for issue in inconsistencies:
        print(f"Model: {issue['model']}, Task: {issue['task']}")
        print(f"  - Provided Average: {issue['provided_average']}")
        print(f"  - Computed Average: {issue['computed_average']}")
        print(f"  - Seeds: {issue['seeds']}")
else:
    print("✅ All averages match the computed values!")

✅ All averages match the computed values!


## Table 1

In [21]:
print(results.keys())
print(results["CliReBERT"].keys())
print(results["CliReBERT"]["CDPCities"].keys())
print(results["CliReBERT"]["CDPCities"]["average"], results["CliReBERT"]["CDPCities"]["seeds"])

dict_keys(['CliReBERT', 'CliSciBERT', 'SciBERT', 'SciClimateBERT', 'ClimateBERT', 'BERT', 'RoBERTa', 'DistilRoBERTa'])
dict_keys(['ClimateInsurance', 'CDPCities', 'ClimateText', 'ClimateStance', 'ClimateEng', 'SciDCC', 'ClimateFEVER'])
dict_keys(['seeds', 'average'])
0.618628 [0.61758, 0.63117, 0.61601, 0.6114, 0.61698]


In [23]:
# --- 1. Flatten the nested 'results' dictionary into a list of records ---
records = []
for model, datasets in results.items():
    for dataset, metrics in datasets.items():
        seeds = metrics['seeds']
        records.append({
            "model": model,
            "dataset": dataset,
            "mean": np.mean(seeds),
            "std": np.std(seeds, ddof=1),
        })

df = pd.DataFrame(records)

# --- 1.5. Rename the specific model (NEW STEP) ---
df['model'] = df['model'].replace({'CliReBERT_clirevocab_10e_91024': 'CliReBERT'})


# --- 2. Create summary table with MultiIndex columns: ('mean', model), ('std', model) ---
summary = df.pivot(index='dataset', columns='model', values=['mean', 'std'])

# --- 3. Scale only the mean values by 100 to convert to percentages ---
summary_scaled = summary.copy()
summary_scaled['mean'] *= 100
summary_scaled['std'] *= 100

# --- 4. Compute total averages (mean and std) for each model ---
model_means_scaled = summary_scaled['mean'].T
total_avg = model_means_scaled.mean(axis=1)
total_std = summary['mean'].T.std(axis=1, ddof=1) * 100

# --- 5. Format "mean ± std" strings for each dataset/model pair ---
formatted = pd.DataFrame({
    model: summary_scaled['mean'][model].map('{:.2f}'.format) + ' ± ' + summary_scaled['std'][model].map('{:.2f}'.format)
    for model in summary_scaled['mean'].columns
}).T

formatted.index.name = "model"

# --- 6. Add total average column ---
formatted['Total Average'] = total_avg.map('{:.2f}'.format) + ' ± ' + total_std.map('{:.2f}'.format)

# --- 6.5. Reorder the models to the desired specification (NEW STEP) ---
model_order = [
    "RoBERTa",
    "DistilRoBERTa",
    "ClimateBERT",
    "SciClimateBERT",
    "BERT",
    "SciBERT",
    "CliSciBERT",
    "CliReBERT"
]
formatted = formatted.reindex(model_order)

# --- 6.6. Reorder the datasets (columns) (NEW STEP) ---
dataset_order = [
    "ClimateInsurance",
    "CDPCities",
    "ClimateText",
    "ClimateStance",
    "ClimateEng",
    "SciDCC",
    "ClimateFEVER"
]

# Build the final column order, keeping 'Total Average' at the end
# This also handles cases where a dataset might not be in the results
final_column_order = [ds for ds in dataset_order if ds in formatted.columns] + ['Total Average']

formatted = formatted[final_column_order]

# --- 7. Display or save ---
# print(formatted)
formatted.to_csv("table1.csv")
formatted

dataset,ClimateInsurance,CDPCities,ClimateText,ClimateStance,ClimateEng,SciDCC,ClimateFEVER,Total Average
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RoBERTa,79.43 ± 2.56,65.51 ± 0.31,68.62 ± 12.80,39.51 ± 9.39,32.33 ± 23.18,52.55 ± 1.02,63.76 ± 1.55,57.39 ± 16.78
DistilRoBERTa,80.21 ± 1.34,63.85 ± 0.72,73.67 ± 3.77,41.12 ± 1.75,35.32 ± 4.99,52.76 ± 0.77,61.53 ± 1.45,58.35 ± 16.39
ClimateBERT,81.95 ± 0.90,64.21 ± 0.17,73.10 ± 1.10,52.43 ± 4.20,64.51 ± 14.48,52.06 ± 0.69,62.30 ± 0.91,64.37 ± 10.67
SciClimateBERT,79.82 ± 0.95,64.63 ± 0.13,75.64 ± 1.47,40.60 ± 2.76,30.07 ± 4.06,52.56 ± 1.79,61.48 ± 1.07,57.83 ± 18.05
BERT,80.35 ± 1.58,64.42 ± 0.57,82.99 ± 2.44,42.58 ± 1.05,43.91 ± 5.43,52.70 ± 1.32,62.78 ± 1.28,61.39 ± 16.19
SciBERT,81.35 ± 0.96,64.17 ± 0.80,74.00 ± 3.59,39.70 ± 8.12,42.11 ± 15.36,51.75 ± 0.81,62.09 ± 1.32,59.31 ± 15.65
CliSciBERT,80.11 ± 0.88,65.47 ± 0.50,74.93 ± 3.35,41.90 ± 3.66,48.64 ± 2.64,51.93 ± 0.60,60.52 ± 0.76,60.50 ± 14.01
CliReBERT,81.30 ± 0.56,61.86 ± 0.74,84.19 ± 1.11,50.23 ± 2.15,66.04 ± 1.73,53.33 ± 0.68,61.17 ± 1.30,65.45 ± 12.99


## Paired t-test

In [26]:
# Significance level
alpha = 0.05

model_names = list(results.keys())
dataset_names = list(results[model_names[0]].keys())

# Store results in a list of dictionaries for easy conversion to a DataFrame
summary_results = []

# Iterate through all unique pairs of models
for model_a, model_b in combinations(model_names, 2):
    print(f"\n{'='*20} Comparing: {model_a} vs. {model_b} {'='*20}")

    for dataset in dataset_names:
        # Get the lists of scores from the 5 seeds for each model
        scores_a = results[model_a][dataset]['seeds']
        scores_b = results[model_b][dataset]['seeds']
        
        # Get the average scores for comparison
        avg_a = results[model_a][dataset]['average']
        avg_b = results[model_b][dataset]['average']

        # Perform the paired t-test
        t_statistic, p_value = ttest_rel(scores_a, scores_b)

        # Determine which model is better if the difference is significant
        winner = "None"
        is_significant = p_value < alpha
        if is_significant:
            winner = model_a if avg_a > avg_b else model_b
        
        # Store for summary table
        summary_results.append({
            "Model A": model_a,
            "Model B": model_b,
            "Dataset": dataset,
            "P-Value": p_value,
            "Significant (p < 0.05)": is_significant,
            "Winner": winner,
            f"{model_a} Avg": avg_a,
            f"{model_b} Avg": avg_b,
        })

        # --- Print detailed interpretation for each test ---
        print(f"\n--- On Dataset: {dataset} ---")
        print(f"  {model_a} average: {avg_a:.4f}")
        print(f"  {model_b} average: {avg_b:.4f}")
        print(f"  P-value: {p_value:.4f}")

        if is_significant:
            print(f"  ✅ Result: The difference is statistically significant.")
            print(f"     Winner: {winner} is significantly better on this dataset.")
        else:
            print(f"  ❌ Result: The difference is NOT statistically significant.")
            print(f"     Conclusion: We cannot claim one model is better than the other on this dataset.")


# --- Create a Summary DataFrame ---
summary_df = pd.DataFrame(summary_results)
print("\n\n" + "="*60)
print("                  STATISTICAL TEST SUMMARY TABLE")
print("="*60)
print(summary_df)



--- On Dataset: ClimateInsurance ---
  CliReBERT average: 0.8130
  CliSciBERT average: 0.8011
  P-value: 0.0664
  ❌ Result: The difference is NOT statistically significant.
     Conclusion: We cannot claim one model is better than the other on this dataset.

--- On Dataset: CDPCities ---
  CliReBERT average: 0.6186
  CliSciBERT average: 0.6547
  P-value: 0.0005
  ✅ Result: The difference is statistically significant.
     Winner: CliSciBERT is significantly better on this dataset.

--- On Dataset: ClimateText ---
  CliReBERT average: 0.8419
  CliSciBERT average: 0.7493
  P-value: 0.0049
  ✅ Result: The difference is statistically significant.
     Winner: CliReBERT is significantly better on this dataset.

--- On Dataset: ClimateStance ---
  CliReBERT average: 0.5023
  CliSciBERT average: 0.4190
  P-value: 0.0049
  ✅ Result: The difference is statistically significant.
     Winner: CliReBERT is significantly better on this dataset.

--- On Dataset: ClimateEng ---
  CliReBERT average:

In [27]:
# --- Analysis with Bonferroni Correction ---

# 1. Count total number of tests
num_model_pairs = len(list(combinations(model_names, 2)))
num_datasets = len(dataset_names)
total_tests = num_model_pairs * num_datasets
original_alpha = 0.05
bonferroni_alpha = original_alpha / total_tests

print(f"\n\n{'='*60}")
print("           ANALYSIS WITH BONFERRONI CORRECTION")
print(f"{'='*60}\n")
print(f"Original alpha: {original_alpha}")
print(f"Total number of tests: {total_tests}")
print(f"Bonferroni-corrected alpha: {bonferroni_alpha:.6f}\n")


# Re-evaluate the summary DataFrame with the new alpha
summary_df['Significant (Bonferroni)'] = summary_df['P-Value'] < bonferroni_alpha
summary_df['Winner (Bonferroni)'] = summary_df.apply(
    lambda row: row['Winner'] if row['Significant (Bonferroni)'] else 'None', 
    axis=1
)

# Display the final, corrected table
print("--- Summary Table with Bonferroni Correction ---")
print(summary_df[[
    "Model A", "Model B", "Dataset", "P-Value", "Significant (Bonferroni)", "Winner (Bonferroni)"
]])



           ANALYSIS WITH BONFERRONI CORRECTION

Original alpha: 0.05
Total number of tests: 196
Bonferroni-corrected alpha: 0.000255

--- Summary Table with Bonferroni Correction ---
       Model A        Model B           Dataset   P-Value  \
0    CliReBERT     CliSciBERT  ClimateInsurance  0.066357   
1    CliReBERT     CliSciBERT         CDPCities  0.000523   
2    CliReBERT     CliSciBERT       ClimateText  0.004932   
3    CliReBERT     CliSciBERT     ClimateStance  0.004913   
4    CliReBERT     CliSciBERT        ClimateEng  0.000148   
..         ...            ...               ...       ...   
191    RoBERTa  DistilRoBERTa       ClimateText  0.452546   
192    RoBERTa  DistilRoBERTa     ClimateStance  0.703328   
193    RoBERTa  DistilRoBERTa        ClimateEng  0.767192   
194    RoBERTa  DistilRoBERTa            SciDCC  0.577573   
195    RoBERTa  DistilRoBERTa      ClimateFEVER  0.009633   

     Significant (Bonferroni) Winner (Bonferroni)  
0                       False 

In [29]:
summary_df

Unnamed: 0,Model A,Model B,Dataset,P-Value,Significant (p < 0.05),Winner,CliReBERT Avg,CliSciBERT Avg,SciBERT Avg,SciClimateBERT Avg,ClimateBERT Avg,BERT Avg,RoBERTa Avg,DistilRoBERTa Avg,Significant (Bonferroni),Winner (Bonferroni)
0,CliReBERT,CliSciBERT,ClimateInsurance,0.066357,False,,0.813036,0.801110,,,,,,,False,
1,CliReBERT,CliSciBERT,CDPCities,0.000523,True,CliSciBERT,0.618628,0.654746,,,,,,,False,
2,CliReBERT,CliSciBERT,ClimateText,0.004932,True,CliReBERT,0.841904,0.749344,,,,,,,False,
3,CliReBERT,CliSciBERT,ClimateStance,0.004913,True,CliReBERT,0.502296,0.419006,,,,,,,False,
4,CliReBERT,CliSciBERT,ClimateEng,0.000148,True,CliReBERT,0.660430,0.486436,,,,,,,True,CliReBERT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,RoBERTa,DistilRoBERTa,ClimateText,0.452546,False,,,,,,,,0.686196,0.736690,False,
192,RoBERTa,DistilRoBERTa,ClimateStance,0.703328,False,,,,,,,,0.395082,0.411152,False,
193,RoBERTa,DistilRoBERTa,ClimateEng,0.767192,False,,,,,,,,0.323328,0.353162,False,
194,RoBERTa,DistilRoBERTa,SciDCC,0.577573,False,,,,,,,,0.525482,0.527618,False,


In [45]:
main_model = 'ClimateBERT'
other_models = [m for m in model_names if m != main_model]

results_summary_detailed = []

for other_model in other_models:
    # Find all comparisons between the two models
    comparison_df = summary_df[
        ((summary_df['Model A'] == main_model) & (summary_df['Model B'] == other_model)) |
        ((summary_df['Model A'] == other_model) & (summary_df['Model B'] == main_model))
    ]
    
    # Start with all results significant at the p < 0.05 level
    significant_comps = comparison_df[comparison_df['Significant (p < 0.05)']].copy()
    
    # If there are no differences even at the lenient level, we can skip
    if significant_comps.empty:
        wins_str = "—"
        losses_str = "—"
    else:
        # **NEW LOGIC HERE**
        # Create a new column with the dataset name, bolded if it passes Bonferroni
        # We use Markdown's **bold** syntax
        significant_comps['formatted_dataset'] = significant_comps.apply(
            lambda row: f"**{row['Dataset']}**" if row['Significant (Bonferroni)'] else row['Dataset'],
            axis=1
        )

        # Separate into wins and losses based on the 'Winner' column, using the new formatted strings
        wins = significant_comps[significant_comps['Winner'] == main_model]['formatted_dataset'].tolist()
        losses = significant_comps[significant_comps['Winner'] == other_model]['formatted_dataset'].tolist()
        
        wins_str = ', '.join(wins) if wins else "—"
        losses_str = ', '.join(losses) if losses else "—"
        
    results_summary_detailed.append({
        'Comparison': f"{main_model} vs. {other_model}",
        'Significantly Better On': wins_str,
        'Significantly Worse On': losses_str
    })

wins_losses_detailed_df = pd.DataFrame(results_summary_detailed)

# print("--- 'Wins and Losses' Summary Table (with two significance levels) ---")
print(wins_losses_detailed_df.to_markdown(index=False))

| Comparison                     | Significantly Better On                     | Significantly Worse On   |
|:-------------------------------|:--------------------------------------------|:-------------------------|
| ClimateBERT vs. CliReBERT      | CDPCities                                   | **ClimateText**          |
| ClimateBERT vs. CliSciBERT     | ClimateInsurance, ClimateStance             | CDPCities                |
| ClimateBERT vs. SciBERT        | ClimateStance, **ClimateEng**               | —                        |
| ClimateBERT vs. SciClimateBERT | ClimateInsurance, ClimateStance, ClimateEng | CDPCities                |
| ClimateBERT vs. BERT           | ClimateInsurance, ClimateStance             | ClimateText              |
| ClimateBERT vs. RoBERTa        | —                                           | CDPCities                |
| ClimateBERT vs. DistilRoBERTa  | ClimateStance, ClimateEng                   | —                        |


In [50]:
import pandas as pd

# --- Step 1: Count wins at both significance levels ---
lenient_wins = summary_df['Winner'].value_counts()
strict_wins = summary_df['Winner (Bonferroni)'].value_counts()

# --- Step 2: Create the summary DataFrame and populate with raw numbers ---
final_summary_df = pd.DataFrame(index=model_names)
final_summary_df.index.name = 'Model'

final_summary_df['lenient_wins_count'] = final_summary_df.index.map(lenient_wins).fillna(0).astype(int)
final_summary_df['strict_wins_count'] = final_summary_df.index.map(strict_wins).fillna(0).astype(int)

# --- Step 3: Calculate the total possible wins and both win rates ---
num_models = len(model_names)
num_datasets = len(dataset_names)
possible_wins_per_model = (num_models - 1) * num_datasets

final_summary_df['lenient_win_rate'] = final_summary_df['lenient_wins_count'] / possible_wins_per_model
final_summary_df['strict_win_rate'] = final_summary_df['strict_wins_count'] / possible_wins_per_model

# --- Step 4: Create the combined string columns for presentation ---
final_summary_df['Significant Wins'] = final_summary_df.apply(
    lambda row: f"{row['lenient_wins_count']} ({row['strict_wins_count']}) / {possible_wins_per_model}",
    axis=1
)

final_summary_df['Win Rate'] = final_summary_df.apply(
    lambda row: f"{row['lenient_win_rate']:.2%} ({row['strict_win_rate']:.2%})",
    axis=1
)

# --- Step 5: Sort by the most important metric (strict wins) and select final columns ---
# THIS IS THE UPDATED LINE:
# Sort by strict wins first, then use lenient wins as a tie-breaker.
final_summary_df = final_summary_df.sort_values(
    by=['strict_wins_count', 'lenient_wins_count'],
    ascending=[False, False]
)

# Select only the beautifully formatted columns for the final display.
display_df = final_summary_df[['Significant Wins', 'Win Rate']]

# --- Display the final table in a clean format ---
# print(f"Total number of models: {num_models}")
# print(f"Total number of datasets: {num_datasets}")
# print(f"Total possible wins per model = ({num_models} - 1) * {num_datasets} = {possible_wins_per_model}\n")

# print("--- Final Model Leaderboard (Multi-level Sort) ---")
print(display_df.to_markdown())

| Model          | Significant Wins   | Win Rate       |
|:---------------|:-------------------|:---------------|
| CliReBERT      | 18.0 (4.0) / 49    | 36.73% (8.16%) |
| ClimateBERT    | 12.0 (1.0) / 49    | 24.49% (2.04%) |
| BERT           | 10.0 (0.0) / 49    | 20.41% (0.00%) |
| RoBERTa        | 9.0 (0.0) / 49     | 18.37% (0.00%) |
| CliSciBERT     | 7.0 (0.0) / 49     | 14.29% (0.00%) |
| SciClimateBERT | 2.0 (0.0) / 49     | 4.08% (0.00%)  |
| DistilRoBERTa  | 2.0 (0.0) / 49     | 4.08% (0.00%)  |
| SciBERT        | 1.0 (0.0) / 49     | 2.04% (0.00%)  |
