## Libraries

In [1]:
import pandas as pd
import json

import numpy as np

from scipy.stats import ttest_rel
from itertools import combinations

from scipy.stats import wilcoxon  

  from pandas.core import (


## Data Loading

In [2]:
# Read the CSV (handle missing headers)
df = pd.read_csv("CLIMABENCH.csv", delimiter=",", skip_blank_lines=True, dtype=str)

# Convert all comma-style decimals (e.g., "0,81008") to dots (e.g., "0.81008")
df = df.map(lambda x: x.replace(",", ".") if isinstance(x, str) else x)

# Fill missing MODEL names (assuming it's only missing in seed rows)
df["MODEL"].fillna(method="ffill", inplace=True)

# Drop completely empty rows
df.dropna(how="all", inplace=True)

# Extract relevant columns: model, seed, task results
models = df["MODEL"].unique().tolist()
tasks = [col for col in df.columns if col not in ["MODEL", "SEED", "TOTAL AVG"] and "AVG" not in col]

# Convert data into nested dictionary
results = {}

for _, row in df.iterrows():
    model = row["MODEL"]
    seed = row["SEED"]
    
    if model not in results:
        results[model] = {}

    for task in tasks:
        score = row[task]
        avg_col = f"AVG {task}"

        if pd.isna(score):  # Skip missing values
            continue

        score = float(score)  # Convert string to float
        
        if task not in results[model]:
            results[model][task] = {"seeds": [], "average": None}

        results[model][task]["seeds"].append(score)

        # Assign average score (only from rows where it's present)
        if pd.notna(row[avg_col]):
            results[model][task]["average"] = float(row[avg_col])

# Convert to JSON and save
json_output = json.dumps(results, indent=4)

with open("parsed_results.json", "w") as f:
    f.write(json_output)

# print(json_output)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["MODEL"].fillna(method="ffill", inplace=True)
  df["MODEL"].fillna(method="ffill", inplace=True)


In [3]:
results["CliReBERT"]["ClimateInsurance"]

{'seeds': [0.81008, 0.81912, 0.81279, 0.81765, 0.80554], 'average': 0.813036}

In [4]:
for model in results:
    print(model)

CliReBERT
CliSciBERT
SciBERT
SciClimateBERT
ClimateBERT
BERT
RoBERTa
DistilRoBERTa


In [5]:
# **ASSERTION CHECK** - Compare provided average with calculated average
inconsistencies = []

for model, tasks in results.items():
    for task, data in tasks.items():
        if data["average"] is not None and data["seeds"]:
            computed_avg = sum(data["seeds"]) / len(data["seeds"])
            if round(computed_avg, 6) != round(data["average"], 6):  # Allow small rounding differences
                inconsistencies.append({
                    "model": model,
                    "task": task,
                    "provided_average": data["average"],
                    "computed_average": computed_avg,
                    "seeds": data["seeds"]
                })

# Print inconsistencies
if inconsistencies:
    print("❌ Inconsistencies found!")
    for issue in inconsistencies:
        print(f"Model: {issue['model']}, Task: {issue['task']}")
        print(f"  - Provided Average: {issue['provided_average']}")
        print(f"  - Computed Average: {issue['computed_average']}")
        print(f"  - Seeds: {issue['seeds']}")
else:
    print("✅ All averages match the computed values!")

✅ All averages match the computed values!


## Table 1

In [6]:
print(results.keys())
print(results["CliReBERT"].keys())
print(results["CliReBERT"]["CDPCities"].keys())
print(results["CliReBERT"]["CDPCities"]["average"], results["CliReBERT"]["CDPCities"]["seeds"])

dict_keys(['CliReBERT', 'CliSciBERT', 'SciBERT', 'SciClimateBERT', 'ClimateBERT', 'BERT', 'RoBERTa', 'DistilRoBERTa'])
dict_keys(['ClimateInsurance', 'CDPCities', 'ClimateText', 'ClimateStance', 'ClimateEng', 'SciDCC', 'ClimateFEVER'])
dict_keys(['seeds', 'average'])
0.618628 [0.61758, 0.63117, 0.61601, 0.6114, 0.61698]


In [7]:
# --- 1. Flatten the nested 'results' dictionary into a list of records ---
records = []
for model, datasets in results.items():
    for dataset, metrics in datasets.items():
        seeds = metrics['seeds']
        records.append({
            "model": model,
            "dataset": dataset,
            "mean": np.mean(seeds),
            "std": np.std(seeds, ddof=1),
        })

df = pd.DataFrame(records)

# --- 1.5. Rename the specific model (NEW STEP) ---
df['model'] = df['model'].replace({'CliReBERT_clirevocab_10e_91024': 'CliReBERT'})


# --- 2. Create summary table with MultiIndex columns: ('mean', model), ('std', model) ---
summary = df.pivot(index='dataset', columns='model', values=['mean', 'std'])

# --- 3. Scale only the mean values by 100 to convert to percentages ---
summary_scaled = summary.copy()
summary_scaled['mean'] *= 100
summary_scaled['std'] *= 100

# --- 4. Compute total averages (mean and std) for each model ---
model_means_scaled = summary_scaled['mean'].T
total_avg = model_means_scaled.mean(axis=1)
total_std = summary['mean'].T.std(axis=1, ddof=1) * 100

# --- 5. Format "mean ± std" strings for each dataset/model pair ---
formatted = pd.DataFrame({
    model: summary_scaled['mean'][model].map('{:.2f}'.format) + ' ± ' + summary_scaled['std'][model].map('{:.2f}'.format)
    for model in summary_scaled['mean'].columns
}).T

formatted.index.name = "model"

# --- 6. Add total average column ---
formatted['Total Average'] = total_avg.map('{:.2f}'.format) + ' ± ' + total_std.map('{:.2f}'.format)

# --- 6.5. Reorder the models to the desired specification (NEW STEP) ---
model_order = [
    "RoBERTa",
    "DistilRoBERTa",
    "ClimateBERT",
    "SciClimateBERT",
    "BERT",
    "SciBERT",
    "CliSciBERT",
    "CliReBERT"
]
formatted = formatted.reindex(model_order)

# --- 6.6. Reorder the datasets (columns) (NEW STEP) ---
dataset_order = [
    "ClimateInsurance",
    "CDPCities",
    "ClimateText",
    "ClimateStance",
    "ClimateEng",
    "SciDCC",
    "ClimateFEVER"
]

# Build the final column order, keeping 'Total Average' at the end
# This also handles cases where a dataset might not be in the results
final_column_order = [ds for ds in dataset_order if ds in formatted.columns] + ['Total Average']

formatted = formatted[final_column_order]

# --- 7. Display or save ---
# print(formatted)
formatted.to_csv("table1.csv")
formatted

dataset,ClimateInsurance,CDPCities,ClimateText,ClimateStance,ClimateEng,SciDCC,ClimateFEVER,Total Average
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RoBERTa,79.43 ± 2.56,65.51 ± 0.31,68.62 ± 12.80,39.51 ± 9.39,32.33 ± 23.18,52.55 ± 1.02,63.76 ± 1.55,57.39 ± 16.78
DistilRoBERTa,80.21 ± 1.34,63.85 ± 0.72,73.67 ± 3.77,41.12 ± 1.75,35.32 ± 4.99,52.76 ± 0.77,61.53 ± 1.45,58.35 ± 16.39
ClimateBERT,81.95 ± 0.90,64.21 ± 0.17,73.10 ± 1.10,52.43 ± 4.20,64.51 ± 14.48,52.06 ± 0.69,62.30 ± 0.91,64.37 ± 10.67
SciClimateBERT,79.82 ± 0.95,64.63 ± 0.13,75.64 ± 1.47,40.60 ± 2.76,30.07 ± 4.06,52.56 ± 1.79,61.48 ± 1.07,57.83 ± 18.05
BERT,80.35 ± 1.58,64.42 ± 0.57,82.99 ± 2.44,42.58 ± 1.05,43.91 ± 5.43,52.70 ± 1.32,62.78 ± 1.28,61.39 ± 16.19
SciBERT,81.35 ± 0.96,64.17 ± 0.80,74.00 ± 3.59,39.70 ± 8.12,42.11 ± 15.36,51.75 ± 0.81,62.09 ± 1.32,59.31 ± 15.65
CliSciBERT,80.11 ± 0.88,65.47 ± 0.50,74.93 ± 3.35,41.90 ± 3.66,48.64 ± 2.64,51.93 ± 0.60,60.52 ± 0.76,60.50 ± 14.01
CliReBERT,81.30 ± 0.56,61.86 ± 0.74,84.19 ± 1.11,50.23 ± 2.15,66.04 ± 1.73,53.33 ± 0.68,61.17 ± 1.30,65.45 ± 12.99


## Paired t-test

In [8]:
# Significance level
alpha = 0.05

model_names = list(results.keys())
dataset_names = list(results[model_names[0]].keys())

# Store results in a list of dictionaries for easy conversion to a DataFrame
summary_results = []

# Iterate through all unique pairs of models
for model_a, model_b in combinations(model_names, 2):
    print(f"\n{'='*20} Comparing: {model_a} vs. {model_b} {'='*20}")

    for dataset in dataset_names:
        # Get the lists of scores from the 5 seeds for each model
        scores_a = results[model_a][dataset]['seeds']
        scores_b = results[model_b][dataset]['seeds']
        
        # Get the average scores for comparison
        avg_a = results[model_a][dataset]['average']
        avg_b = results[model_b][dataset]['average']

        # Perform the paired t-test
        t_statistic, p_value = ttest_rel(scores_a, scores_b)
        print(t_statistic)
        # Determine which model is better if the difference is significant
        winner = "None"
        is_significant = p_value < alpha
        if is_significant:
            winner = model_a if avg_a > avg_b else model_b
        
        # Store for summary table
        summary_results.append({
            "Model A": model_a,
            "Model B": model_b,
            "Dataset": dataset,
            "P-Value": p_value,
            "Significant (p < 0.05)": is_significant,
            "Winner": winner,
            f"{model_a} Avg": avg_a,
            f"{model_b} Avg": avg_b,
        })

        # --- Print detailed interpretation for each test ---
        print(f"\n--- On Dataset: {dataset} ---")
        print(f"  {model_a} average: {avg_a:.4f}")
        print(f"  {model_b} average: {avg_b:.4f}")
        print(f"  P-value: {p_value:.4f}")

        if is_significant:
            print(f"  ✅ Result: The difference is statistically significant.")
            print(f"     Winner: {winner} is significantly better on this dataset.")
        else:
            print(f"  ❌ Result: The difference is NOT statistically significant.")
            print(f"     Conclusion: We cannot claim one model is better than the other on this dataset.")


# --- Create a Summary DataFrame ---
summary_df = pd.DataFrame(summary_results)
print("\n\n" + "="*60)
print("                  STATISTICAL TEST SUMMARY TABLE")
print("="*60)
print(summary_df)


2.505754939658617

--- On Dataset: ClimateInsurance ---
  CliReBERT average: 0.8130
  CliSciBERT average: 0.8011
  P-value: 0.0664
  ❌ Result: The difference is NOT statistically significant.
     Conclusion: We cannot claim one model is better than the other on this dataset.
-10.187539084031885

--- On Dataset: CDPCities ---
  CliReBERT average: 0.6186
  CliSciBERT average: 0.6547
  P-value: 0.0005
  ✅ Result: The difference is statistically significant.
     Winner: CliSciBERT is significantly better on this dataset.
5.618819348581148

--- On Dataset: ClimateText ---
  CliReBERT average: 0.8419
  CliSciBERT average: 0.7493
  P-value: 0.0049
  ✅ Result: The difference is statistically significant.
     Winner: CliReBERT is significantly better on this dataset.
5.6246847124502

--- On Dataset: ClimateStance ---
  CliReBERT average: 0.5023
  CliSciBERT average: 0.4190
  P-value: 0.0049
  ✅ Result: The difference is statistically significant.
     Winner: CliReBERT is significantly bett

In [9]:
# --- Analysis with Bonferroni Correction ---

# 1. Count total number of tests
num_model_pairs = len(list(combinations(model_names, 2)))
num_datasets = len(dataset_names)
total_tests = num_model_pairs * num_datasets
original_alpha = 0.05
bonferroni_alpha = original_alpha / total_tests

print(f"\n\n{'='*60}")
print("           ANALYSIS WITH BONFERRONI CORRECTION")
print(f"{'='*60}\n")
print(f"Original alpha: {original_alpha}")
print(f"Total number of tests: {total_tests}")
print(f"Bonferroni-corrected alpha: {bonferroni_alpha:.6f}\n")


# Re-evaluate the summary DataFrame with the new alpha
summary_df['Significant (Bonferroni)'] = summary_df['P-Value'] < bonferroni_alpha
summary_df['Winner (Bonferroni)'] = summary_df.apply(
    lambda row: row['Winner'] if row['Significant (Bonferroni)'] else 'None', 
    axis=1
)

# Display the final, corrected table
print("--- Summary Table with Bonferroni Correction ---")
print(summary_df[[
    "Model A", "Model B", "Dataset", "P-Value", "Significant (Bonferroni)", "Winner (Bonferroni)"
]])



           ANALYSIS WITH BONFERRONI CORRECTION

Original alpha: 0.05
Total number of tests: 196
Bonferroni-corrected alpha: 0.000255

--- Summary Table with Bonferroni Correction ---
       Model A        Model B           Dataset   P-Value  \
0    CliReBERT     CliSciBERT  ClimateInsurance  0.066357   
1    CliReBERT     CliSciBERT         CDPCities  0.000523   
2    CliReBERT     CliSciBERT       ClimateText  0.004932   
3    CliReBERT     CliSciBERT     ClimateStance  0.004913   
4    CliReBERT     CliSciBERT        ClimateEng  0.000148   
..         ...            ...               ...       ...   
191    RoBERTa  DistilRoBERTa       ClimateText  0.452546   
192    RoBERTa  DistilRoBERTa     ClimateStance  0.703328   
193    RoBERTa  DistilRoBERTa        ClimateEng  0.767192   
194    RoBERTa  DistilRoBERTa            SciDCC  0.577573   
195    RoBERTa  DistilRoBERTa      ClimateFEVER  0.009633   

     Significant (Bonferroni) Winner (Bonferroni)  
0                       False 

In [10]:
summary_df

Unnamed: 0,Model A,Model B,Dataset,P-Value,Significant (p < 0.05),Winner,CliReBERT Avg,CliSciBERT Avg,SciBERT Avg,SciClimateBERT Avg,ClimateBERT Avg,BERT Avg,RoBERTa Avg,DistilRoBERTa Avg,Significant (Bonferroni),Winner (Bonferroni)
0,CliReBERT,CliSciBERT,ClimateInsurance,0.066357,False,,0.813036,0.801110,,,,,,,False,
1,CliReBERT,CliSciBERT,CDPCities,0.000523,True,CliSciBERT,0.618628,0.654746,,,,,,,False,
2,CliReBERT,CliSciBERT,ClimateText,0.004932,True,CliReBERT,0.841904,0.749344,,,,,,,False,
3,CliReBERT,CliSciBERT,ClimateStance,0.004913,True,CliReBERT,0.502296,0.419006,,,,,,,False,
4,CliReBERT,CliSciBERT,ClimateEng,0.000148,True,CliReBERT,0.660430,0.486436,,,,,,,True,CliReBERT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,RoBERTa,DistilRoBERTa,ClimateText,0.452546,False,,,,,,,,0.686196,0.736690,False,
192,RoBERTa,DistilRoBERTa,ClimateStance,0.703328,False,,,,,,,,0.395082,0.411152,False,
193,RoBERTa,DistilRoBERTa,ClimateEng,0.767192,False,,,,,,,,0.323328,0.353162,False,
194,RoBERTa,DistilRoBERTa,SciDCC,0.577573,False,,,,,,,,0.525482,0.527618,False,


In [18]:
main_model = 'CliReBERT'
other_models = [m for m in model_names if m != main_model]

results_summary_detailed = []

for other_model in other_models:
    # Find all comparisons between the two models
    comparison_df = summary_df[
        ((summary_df['Model A'] == main_model) & (summary_df['Model B'] == other_model)) |
        ((summary_df['Model A'] == other_model) & (summary_df['Model B'] == main_model))
    ]
    
    # Start with all results significant at the p < 0.05 level
    significant_comps = comparison_df[comparison_df['Significant (p < 0.05)']].copy()
    
    # If there are no differences even at the lenient level, we can skip
    if significant_comps.empty:
        wins_str = "—"
        losses_str = "—"
    else:
        # **NEW LOGIC HERE**
        # Create a new column with the dataset name, bolded if it passes Bonferroni
        # We use Markdown's **bold** syntax
        significant_comps['formatted_dataset'] = significant_comps.apply(
            lambda row: f"**{row['Dataset']}**" if row['Significant (Bonferroni)'] else row['Dataset'],
            axis=1
        )

        # Separate into wins and losses based on the 'Winner' column, using the new formatted strings
        wins = significant_comps[significant_comps['Winner'] == main_model]['formatted_dataset'].tolist()
        losses = significant_comps[significant_comps['Winner'] == other_model]['formatted_dataset'].tolist()
        
        wins_str = ', '.join(wins) if wins else "—"
        losses_str = ', '.join(losses) if losses else "—"
        
    results_summary_detailed.append({
        'Comparison': f"{main_model} vs. {other_model}",
        'Significantly Better On': wins_str,
        'Significantly Worse On': losses_str
    })

wins_losses_detailed_df = pd.DataFrame(results_summary_detailed)

# print("--- 'Wins and Losses' Summary Table (with two significance levels) ---")
print(wins_losses_detailed_df.to_markdown(index=False))

| Comparison                   | Significantly Better On                                      | Significantly Worse On   |
|:-----------------------------|:-------------------------------------------------------------|:-------------------------|
| CliReBERT vs. CliSciBERT     | ClimateText, ClimateStance, **ClimateEng**, SciDCC           | CDPCities                |
| CliReBERT vs. SciBERT        | ClimateText, ClimateEng, SciDCC                              | CDPCities                |
| CliReBERT vs. SciClimateBERT | ClimateInsurance, ClimateText, ClimateStance, **ClimateEng** | CDPCities                |
| CliReBERT vs. ClimateBERT    | **ClimateText**                                              | CDPCities                |
| CliReBERT vs. BERT           | ClimateStance, ClimateEng                                    | CDPCities                |
| CliReBERT vs. RoBERTa        | ClimateEng                                                   | CDPCities, ClimateFEVER  |
| CliReBERT vs. 

In [12]:
# --- Step 1: Count wins at both significance levels ---
lenient_wins = summary_df['Winner'].value_counts()
strict_wins = summary_df['Winner (Bonferroni)'].value_counts()

# --- Step 2: Create the summary DataFrame and populate with raw numbers ---
final_summary_df = pd.DataFrame(index=model_names)
final_summary_df.index.name = 'Model'

final_summary_df['lenient_wins_count'] = final_summary_df.index.map(lenient_wins).fillna(0).astype(int)
final_summary_df['strict_wins_count'] = final_summary_df.index.map(strict_wins).fillna(0).astype(int)

# --- Step 3: Calculate the total possible wins and both win rates ---
num_models = len(model_names)
num_datasets = len(dataset_names)
possible_wins_per_model = (num_models - 1) * num_datasets

final_summary_df['lenient_win_rate'] = final_summary_df['lenient_wins_count'] / possible_wins_per_model
final_summary_df['strict_win_rate'] = final_summary_df['strict_wins_count'] / possible_wins_per_model

# --- Step 4: Create the combined string columns for presentation ---
final_summary_df['Significant Wins'] = final_summary_df.apply(
    lambda row: f"{row['lenient_wins_count']} ({row['strict_wins_count']}) / {possible_wins_per_model}",
    axis=1
)

final_summary_df['Win Rate'] = final_summary_df.apply(
    lambda row: f"{row['lenient_win_rate']:.2%} ({row['strict_win_rate']:.2%})",
    axis=1
)

# --- Step 5: Sort by the most important metric (strict wins) and select final columns ---
# THIS IS THE UPDATED LINE:
# Sort by strict wins first, then use lenient wins as a tie-breaker.
final_summary_df = final_summary_df.sort_values(
    by=['strict_wins_count', 'lenient_wins_count'],
    ascending=[False, False]
)

# Select only the beautifully formatted columns for the final display.
display_df = final_summary_df[['Significant Wins', 'Win Rate']]

# --- Display the final table in a clean format ---
# print(f"Total number of models: {num_models}")
# print(f"Total number of datasets: {num_datasets}")
# print(f"Total possible wins per model = ({num_models} - 1) * {num_datasets} = {possible_wins_per_model}\n")

# print("--- Final Model Leaderboard (Multi-level Sort) ---")
print(display_df.to_markdown())

| Model          | Significant Wins   | Win Rate       |
|:---------------|:-------------------|:---------------|
| CliReBERT      | 18.0 (4.0) / 49    | 36.73% (8.16%) |
| ClimateBERT    | 12.0 (1.0) / 49    | 24.49% (2.04%) |
| BERT           | 10.0 (0.0) / 49    | 20.41% (0.00%) |
| RoBERTa        | 9.0 (0.0) / 49     | 18.37% (0.00%) |
| CliSciBERT     | 7.0 (0.0) / 49     | 14.29% (0.00%) |
| SciClimateBERT | 2.0 (0.0) / 49     | 4.08% (0.00%)  |
| DistilRoBERTa  | 2.0 (0.0) / 49     | 4.08% (0.00%)  |
| SciBERT        | 1.0 (0.0) / 49     | 2.04% (0.00%)  |


## Wilcoxon signed-rank test

In [73]:
# Significance level and Bonferroni correction remain the same
alpha = 0.1
total_tests = len(list(combinations(model_names, 2))) * len(dataset_names)
bonferroni_alpha = alpha / total_tests

summary_results = []

# Iterate through all unique pairs of models
for model_a, model_b in combinations(model_names, 2):
    for dataset in dataset_names:
        scores_a = results[model_a][dataset]['seeds']
        scores_b = results[model_b][dataset]['seeds']
        
        avg_a = results[model_a][dataset]['average']
        avg_b = results[model_b][dataset]['average']

        if scores_a == scores_b:
             # If scores are identical, p-value is 1.0 and there's no difference.
            stat, p_value = 0, 1.0
        else:
            stat, p_value = wilcoxon(scores_a, scores_b, zero_method='pratt')
        # -----------------------------------------

        winner = "None"
        winner_bonferroni = "None"
        
        is_significant_lenient = p_value < alpha
        if is_significant_lenient:
            winner = model_a if avg_a > avg_b else model_b
            
        is_significant_bonferroni = p_value < bonferroni_alpha
        if is_significant_bonferroni:
            winner_bonferroni = model_a if avg_a > avg_b else model_b
        
        summary_results.append({
            "Model A": model_a,
            "Model B": model_b,
            "Dataset": dataset,
            "P-Value": p_value,
            "Significant (p < 0.05)": is_significant_lenient,
            "Winner": winner,
            "Significant (Bonferroni)": is_significant_bonferroni,
            "Winner (Bonferroni)": winner_bonferroni,
        })

# Create the new summary_df. This is now based on Wilcoxon p-values.
summary_df = pd.DataFrame(summary_results)

print("summary_df successfully created using Wilcoxon signed-rank test.")
summary_df

summary_df successfully created using Wilcoxon signed-rank test.




Unnamed: 0,Model A,Model B,Dataset,P-Value,Significant (p < 0.05),Winner,Significant (Bonferroni),Winner (Bonferroni)
0,CliReBERT,CliSciBERT,ClimateInsurance,0.0625,True,CliReBERT,False,
1,CliReBERT,CliSciBERT,CDPCities,0.0625,True,CliSciBERT,False,
2,CliReBERT,CliSciBERT,ClimateText,0.0625,True,CliReBERT,False,
3,CliReBERT,CliSciBERT,ClimateStance,0.0625,True,CliReBERT,False,
4,CliReBERT,CliSciBERT,ClimateEng,0.0625,True,CliReBERT,False,
...,...,...,...,...,...,...,...,...
191,RoBERTa,DistilRoBERTa,ClimateText,0.6250,False,,False,
192,RoBERTa,DistilRoBERTa,ClimateStance,0.6250,False,,False,
193,RoBERTa,DistilRoBERTa,ClimateEng,0.6250,False,,False,
194,RoBERTa,DistilRoBERTa,SciDCC,0.6250,False,,False,


## CD-Diagram

In [13]:
# Author: Hassan Ismail Fawaz <hassan.ismail-fawaz@uha.fr>
#         Germain Forestier <germain.forestier@uha.fr>
#         Jonathan Weber <jonathan.weber@uha.fr>
#         Lhassane Idoumghar <lhassane.idoumghar@uha.fr>
#         Pierre-Alain Muller <pierre-alain.muller@uha.fr>
# License: GPL3
STAT_SIG = 1
import numpy as np
import pandas as pd
import matplotlib

matplotlib.use('agg')
import matplotlib.pyplot as plt

matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['font.sans-serif'] = 'Arial'

import operator
import math
from scipy.stats import wilcoxon
from scipy.stats import friedmanchisquare
import networkx

# inspired from orange3 https://docs.orange.biolab.si/3/data-mining-library/reference/evaluation.cd.html
def graph_ranks(avranks, names, p_values, cd=None, cdmethod=None, lowv=None, highv=None,
                width=6, textspace=1, reverse=False, filename=None, labels=False, **kwargs):
    """
    Draws a CD graph, which is used to display  the differences in methods'
    performance. See Janez Demsar, Statistical Comparisons of Classifiers over
    Multiple Data Sets, 7(Jan):1--30, 2006.

    Needs matplotlib to work.

    The image is ploted on `plt` imported using
    `import matplotlib.pyplot as plt`.

    Args:
        avranks (list of float): average ranks of methods.
        names (list of str): names of methods.
        cd (float): Critical difference used for statistically significance of
            difference between methods.
        cdmethod (int, optional): the method that is compared with other methods
            If omitted, show pairwise comparison of methods
        lowv (int, optional): the lowest shown rank
        highv (int, optional): the highest shown rank
        width (int, optional): default width in inches (default: 6)
        textspace (int, optional): space on figure sides (in inches) for the
            method names (default: 1)
        reverse (bool, optional):  if set to `True`, the lowest rank is on the
            right (default: `False`)
        filename (str, optional): output file name (with extension). If not
            given, the function does not write a file.
        labels (bool, optional): if set to `True`, the calculated avg rank
        values will be displayed
    """
    try:
        import matplotlib
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_agg import FigureCanvasAgg
    except ImportError:
        raise ImportError("Function graph_ranks requires matplotlib.")

    width = float(width)
    textspace = float(textspace)

    def nth(l, n):
        """
        Returns only nth elemnt in a list.
        """
        n = lloc(l, n)
        return [a[n] for a in l]

    def lloc(l, n):
        """
        List location in list of list structure.
        Enable the use of negative locations:
        -1 is the last element, -2 second last...
        """
        if n < 0:
            return len(l[0]) + n
        else:
            return n

    def mxrange(lr):
        """
        Multiple xranges. Can be used to traverse matrices.
        This function is very slow due to unknown number of
        parameters.

        >>> mxrange([3,5])
        [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]

        >>> mxrange([[3,5,1],[9,0,-3]])
        [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]

        """
        if not len(lr):
            yield ()
        else:
            # it can work with single numbers
            index = lr[0]
            if isinstance(index, int):
                index = [index]
            for a in range(*index):
                for b in mxrange(lr[1:]):
                    yield tuple([a] + list(b))

    def print_figure(fig, *args, **kwargs):
        canvas = FigureCanvasAgg(fig)
        canvas.print_figure(*args, **kwargs)

    sums = avranks

    nnames = names
    ssums = sums

    if lowv is None:
        lowv = min(1, int(math.floor(min(ssums))))
    if highv is None:
        highv = max(len(avranks), int(math.ceil(max(ssums))))

    cline = 0.4

    k = len(sums)

    lines = None

    linesblank = 0
    scalewidth = width - 2 * textspace

    def rankpos(rank):
        if not reverse:
            a = rank - lowv
        else:
            a = highv - rank
        return textspace + scalewidth / (highv - lowv) * a

    distanceh = 0.25

    cline += distanceh

    # calculate height needed height of an image
    minnotsignificant = max(2 * 0.2, linesblank)
    height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant

    fig = plt.figure(figsize=(width, height))
    fig.set_facecolor('white')
    ax = fig.add_axes([0, 0, 1, 1])  # reverse y axis
    ax.set_axis_off()

    hf = 1. / height  # height factor
    wf = 1. / width

    def hfl(l):
        return [a * hf for a in l]

    def wfl(l):
        return [a * wf for a in l]

    # Upper left corner is (0,0).
    ax.plot([0, 1], [0, 1], c="w")
    ax.set_xlim(0, 1)
    ax.set_ylim(1, 0)

    def line(l, color='k', **kwargs):
        """
        Input is a list of pairs of points.
        """
        ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)

    def text(x, y, s, *args, **kwargs):
        ax.text(wf * x, hf * y, s, *args, **kwargs)

    line([(textspace, cline), (width - textspace, cline)], linewidth=2)

    bigtick = 0.3
    smalltick = 0.15
    linewidth = 2.0
    linewidth_sign = 4.0

    tick = None
    for a in list(np.arange(lowv, highv, 0.5)) + [highv]:
        tick = smalltick
        if a == int(a):
            tick = bigtick
        line([(rankpos(a), cline - tick / 2),
              (rankpos(a), cline)],
             linewidth=2)

    for a in range(lowv, highv + 1):
        text(rankpos(a), cline - tick / 2 - 0.05, str(a),
             ha="center", va="bottom", size=16)

    k = len(ssums)

    def filter_names(name):
        return name

    space_between_names = 0.24

    for i in range(math.ceil(k / 2)):
        chei = cline + minnotsignificant + i * space_between_names
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace - 0.1, chei)],
             linewidth=linewidth)
        if labels:
            text(textspace + 0.3, chei - 0.075, format(ssums[i], '.4f'), ha="right", va="center", size=10)
        text(textspace - 0.2, chei, filter_names(nnames[i]), ha="right", va="center", size=16)

    for i in range(math.ceil(k / 2), k):
        chei = cline + minnotsignificant + (k - i - 1) * space_between_names
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace + scalewidth + 0.1, chei)],
             linewidth=linewidth)
        if labels:
            text(textspace + scalewidth - 0.3, chei - 0.075, format(ssums[i], '.4f'), ha="left", va="center", size=10)
        text(textspace + scalewidth + 0.2, chei, filter_names(nnames[i]),
             ha="left", va="center", size=16)

    # no-significance lines
    def draw_lines(lines, side=0.05, height=0.1):
        start = cline + 0.2

        for l, r in lines:
            line([(rankpos(ssums[l]) - side, start),
                  (rankpos(ssums[r]) + side, start)],
                 linewidth=linewidth_sign)
            start += height
            print('drawing: ', l, r)

    # draw_lines(lines)
    start = cline + 0.2
    side = -0.02
    height = 0.1

    # draw no significant lines
    # get the cliques
    cliques = form_cliques(p_values, nnames)
    i = 1
    achieved_half = False
    print(nnames)
    for clq in cliques:
        if len(clq) == 1:
            continue
        print(clq)
        min_idx = np.array(clq).min()
        max_idx = np.array(clq).max()
        if min_idx >= len(nnames) / 2 and achieved_half == False:
            start = cline + 0.25
            achieved_half = True
        line([(rankpos(ssums[min_idx]) - side, start),
              (rankpos(ssums[max_idx]) + side, start)],
             linewidth=linewidth_sign)
        start += height


def form_cliques(p_values, nnames):
    """
    This method forms the cliques
    """
    # first form the numpy matrix data
    m = len(nnames)
    g_data = np.zeros((m, m), dtype=np.int64)
    for p in p_values:
        if p[3] == False:
            i = np.where(nnames == p[0])[0][0]
            j = np.where(nnames == p[1])[0][0]
            min_i = min(i, j)
            max_j = max(i, j)
            g_data[min_i, max_j] = 1

    g = networkx.Graph(g_data)
    return networkx.find_cliques(g)


def draw_cd_diagram(df_perf=None, alpha=0.05, title=None, labels=False):
    """
    Draws the critical difference diagram given the list of pairwise classifiers that are
    significant or not
    """
    p_values, average_ranks, _ = wilcoxon_holm(df_perf=df_perf, alpha=alpha)

    print(average_ranks)

    for p in p_values:
        print(p)


    graph_ranks(average_ranks.values, average_ranks.keys(), p_values,
                cd=None, reverse=True, width=9, textspace=1.5, labels=labels)

    font = {'family': 'serif',
        'color':  'black',
        'weight': 'normal',
        'size': 10,
        }
    sets_list = df_perf["dataset_name"].unique().tolist()
    sets_list.sort
    sets = "_".join(sets_list)
    if title:
        # plt.title(title+"-"+sets,fontdict=font, y=0.9, x=0.5)
        plt.title("Critical Difference Diagram", size=18)
    # sets = df_perf.unique()
    
    plt.savefig(f'./ALL/{len(sets_list)}_{STAT_SIG}_cd_diagram_{sets}.png',bbox_inches='tight')

def wilcoxon_holm(alpha=0.05, df_perf=None):
    """
    Applies the wilcoxon signed rank test between each pair of algorithm and then use Holm
    to reject the null's hypothesis
    """
    global STAT_SIG
    STAT_SIG = 1
    print(pd.unique(df_perf['model_name']))
    # count the number of tested datasets per classifier
    df_counts = pd.DataFrame({'count': df_perf.groupby(
        ['model_name']).size()}).reset_index()
    # get the maximum number of tested datasets
    max_nb_datasets = df_counts['count'].max()
    # get the list of classifiers who have been tested on nb_max_datasets
    classifiers = list(df_counts.loc[df_counts['count'] == max_nb_datasets]
                       ['model_name'])
    # test the null hypothesis using friedman before doing a post-hoc analysis
    friedman_p_value = friedmanchisquare(*(
        np.array(df_perf.loc[df_perf['model_name'] == c]['avg_macro_f1'])
        for c in classifiers))[1]
    if friedman_p_value >= alpha:
        # then the null hypothesis over the entire classifiers cannot be rejected
        STAT_SIG = 0
        print('the null hypothesis over the entire classifiers cannot be rejected')
        # exit()
    
    # get the number of classifiers
    m = len(classifiers)
    # init array that contains the p-values calculated by the Wilcoxon signed rank test
    p_values = []
    # loop through the algorithms to compare pairwise
    for i in range(m - 1):
        # get the name of classifier one
        classifier_1 = classifiers[i]
        # get the performance of classifier one
        perf_1 = np.array(df_perf.loc[df_perf['model_name'] == classifier_1]['avg_macro_f1']
                          , dtype=np.float64)
        for j in range(i + 1, m):
            # get the name of the second classifier
            classifier_2 = classifiers[j]
            # get the performance of classifier one
            perf_2 = np.array(df_perf.loc[df_perf['model_name'] == classifier_2]
                              ['avg_macro_f1'], dtype=np.float64)
            # calculate the p_value
            p_value = wilcoxon(perf_1, perf_2, zero_method='pratt')[1]
            # appen to the list
            p_values.append((classifier_1, classifier_2, p_value, False))
    # get the number of hypothesis
    k = len(p_values)
    # sort the list in acsending manner of p-value
    p_values.sort(key=operator.itemgetter(2))

    # loop through the hypothesis
    for i in range(k):
        # correct alpha with holm
        new_alpha = float(alpha / (k - i))
        # test if significant after holm's correction of alpha
        if p_values[i][2] <= new_alpha:
            p_values[i] = (p_values[i][0], p_values[i][1], p_values[i][2], True)
        else:
            # stop
            break
    # compute the average ranks to be returned (useful for drawing the cd diagram)
    # sort the dataframe of performances
    sorted_df_perf = df_perf.loc[df_perf['model_name'].isin(classifiers)]. \
        sort_values(['model_name', 'dataset_name'])
    # get the rank data
    rank_data = np.array(sorted_df_perf['avg_macro_f1']).reshape(m, max_nb_datasets)

    # create the data frame containg the accuracies
    df_ranks = pd.DataFrame(data=rank_data, index=np.sort(classifiers), columns=
    np.unique(sorted_df_perf['dataset_name']))

    # number of wins
    dfff = df_ranks.rank(ascending=False)
    print(dfff[dfff == 1.0].sum(axis=1))

    # average the ranks
    average_ranks = df_ranks.rank(ascending=False).mean(axis=1).sort_values(ascending=False)
    # return the p-values and the average ranks
    return p_values, average_ranks, max_nb_datasets


df_perf_ = pd.read_csv('CLIMABENCH_SUMMARY.csv', index_col=False)

# DistilRoBERTas ["ClimateBERT", "SciClimateBERT", "DistilRoBERTa"]
# BERTs ["CliReBERT", "CliSciBERT", "SciBERT", "BERT"]
# RoBERTa ["RoBERTa"]
# Top 3 
# All ["CliReBERT", "CliSciBERT", "SciBERT", "BERT", "ClimateBERT", "SciClimateBERT", "DistilRoBERTa", "RoBERTa"]
df_perf = df_perf_[df_perf_.model_name.isin(["CliReBERT", "CliSciBERT", "SciBERT", "BERT", "ClimateBERT", "SciClimateBERT", "DistilRoBERTa", "RoBERTa"])]

# print(df_perf.columns)

def combs(a):
    if len(a) == 0:
        return [[]]
    cs = []
    for c in combs(a[1:]):
        cs += [c, c+[a[0]]]
    return cs

from tqdm import tqdm

## For al combinations
for df_comb in tqdm(combs(df_perf["dataset_name"][:7].to_list())):
    df_perf_sub = df_perf[df_perf["dataset_name"].isin(df_comb)]
    if df_perf_sub.empty:
        continue
    else:
        draw_cd_diagram(df_perf=df_perf_sub, title='avg_macro_f1', labels=True)

# # Single Image

draw_cd_diagram(df_perf=df_perf, title='avg_macro_f1', labels=True)

  2%|▏         | 2/128 [00:00<00:13,  9.64it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliReBERT         8.0
DistilRoBERTa     7.0
SciBERT           6.0
ClimateBERT       5.0
BERT              4.0
SciClimateBERT    3.0
CliSciBERT        2.0
RoBERTa           1.0
dtype: float64
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERTa', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, False)
('BERT', 'SciClimateBERT', 1.0, False)
('CliReBERT', 'CliSciBERT', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, False)
('CliReBERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'SciBERT', 1.0, False)
('Cli

  2%|▏         | 3/128 [00:00<00:19,  6.48it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.5
SciBERT           5.5
SciClimateBERT    5.5
CliReBERT         4.5
BERT              4.0
RoBERTa           4.0
ClimateBERT       3.5
CliSciBERT        2.5
dtype: float64
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('RoBERTa', 'SciClimateBERT', 0.5, False)
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'Climat

  3%|▎         | 4/128 [00:00<00:21,  5.66it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        8.0
CliReBERT         7.0
SciClimateBERT    6.0
DistilRoBERTa     5.0
SciBERT           4.0
ClimateBERT       3.0
BERT              2.0
RoBERTa           1.0
dtype: float64
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERTa', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, False)
('BERT', 'SciClimateBERT', 1.0, False)
('CliReBERT', 'CliSciBERT', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, False)
('CliReBERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'SciBERT', 1.0, False)
('Cli

  4%|▍         | 5/128 [00:00<00:23,  5.25it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliReBERT         7.5
DistilRoBERTa     6.0
CliSciBERT        5.0
SciBERT           5.0
SciClimateBERT    4.5
ClimateBERT       4.0
BERT              3.0
RoBERTa           1.0
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5,

  5%|▍         | 6/128 [00:01<00:24,  4.99it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    7.0
CliSciBERT        5.5
DistilRoBERTa     5.5
SciBERT           4.5
CliReBERT         4.0
RoBERTa           4.0
BERT              3.0
ClimateBERT       2.5
dtype: float64
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('ClimateBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, False)
('RoBERTa', 'SciClimateBERT', 0.5, Fals

  5%|▌         | 7/128 [00:01<00:26,  4.62it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.000000
SciClimateBERT    5.666667
CliReBERT         5.333333
SciBERT           5.000000
CliSciBERT        4.333333
BERT              3.333333
ClimateBERT       3.333333
RoBERTa           3.000000
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('RoBERTa', 'SciClimateBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciClimateBERT', 0.5, Fa

  6%|▋         | 8/128 [00:01<00:27,  4.43it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           8.0
SciClimateBERT    7.0
CliSciBERT        6.0
DistilRoBERTa     5.0
BERT              4.0
CliReBERT         3.0
SciBERT           2.0
ClimateBERT       1.0
dtype: float64
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERTa', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, False)
('BERT', 'SciClimateBERT', 1.0, False)
('CliReBERT', 'CliSciBERT', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, False)
('CliReBERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'SciBERT', 1.0, False)
('Cli

  7%|▋         | 9/128 [00:01<00:26,  4.47it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.0
CliReBERT         5.5
SciClimateBERT    5.0
RoBERTa           4.5
BERT              4.0
CliSciBERT        4.0
SciBERT           4.0
ClimateBERT       3.0
dtype: float64
('BERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, F

  8%|▊         | 10/128 [00:02<00:25,  4.58it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.5
SciClimateBERT    7.5
DistilRoBERTa     5.5
CliSciBERT        4.5
BERT              4.0
SciBERT           3.5
CliReBERT         2.0
ClimateBERT       1.5
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('CliSciBERT', 'SciClimate

  9%|▊         | 11/128 [00:03<00:53,  2.17it/s]

Index(['RoBERTa', 'SciClimateBERT', 'DistilRoBERTa', 'CliSciBERT', 'BERT',
       'SciBERT', 'CliReBERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.000000
SciClimateBERT    6.000000
RoBERTa           5.333333
SciBERT           4.333333
BERT              4.000000
CliReBERT         4.000000
CliSciBERT        3.666667
ClimateBERT       2.666667
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'SciClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.5,

  9%|▉         | 12/128 [00:03<00:45,  2.55it/s]

Index(['DistilRoBERTa', 'SciClimateBERT', 'RoBERTa', 'SciBERT', 'BERT',
       'CliReBERT', 'CliSciBERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        7.0
SciClimateBERT    6.5
CliReBERT         5.0
DistilRoBERTa     5.0
RoBERTa           4.5
BERT              3.0
SciBERT           3.0
ClimateBERT       2.0
dtype: float64
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSci

 10%|█         | 13/128 [00:03<00:38,  2.97it/s]

Index(['CliSciBERT', 'SciClimateBERT', 'CliReBERT', 'DistilRoBERTa', 'RoBERTa',
       'BERT', 'SciBERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliReBERT         6.000000
DistilRoBERTa     5.666667
CliSciBERT        5.333333
SciClimateBERT    5.333333
SciBERT           4.000000
BERT              3.333333
RoBERTa           3.333333
ClimateBERT       3.000000
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'ClimateBERT', 0.25, False)
('CliReBERT', 'SciBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, 

 11%|█         | 14/128 [00:03<00:34,  3.31it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    7.000000
CliSciBERT        5.666667
DistilRoBERTa     5.333333
RoBERTa           5.333333
CliReBERT         3.666667
SciBERT           3.666667
BERT              3.333333
ClimateBERT       2.000000
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciClimateBERT', 0.25, False)
('SciBERT', 'SciClimat

 12%|█▏        | 15/128 [00:03<00:31,  3.55it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    6.00
DistilRoBERTa     5.75
CliReBERT         4.75
CliSciBERT        4.75
RoBERTa           4.25
SciBERT           4.25
BERT              3.50
ClimateBERT       2.75
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('DistilRoBERTa', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('RoBERTa', 'SciClimateBERT', 0.25, False)
('SciBERT', 'SciClimateBERT', 0.25, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'SciBER

 12%|█▎        | 16/128 [00:04<00:30,  3.72it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           8.0
SciBERT           7.0
SciClimateBERT    6.0
DistilRoBERTa     5.0
CliSciBERT        4.0
BERT              3.0
CliReBERT         2.0
ClimateBERT       1.0
dtype: float64
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERTa', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, False)
('BERT', 'SciClimateBERT', 1.0, False)
('CliReBERT', 'CliSciBERT', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, False)
('CliReBERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'SciBERT', 1.0, False)
('Cli

 13%|█▎        | 17/128 [00:04<00:28,  3.96it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.5
DistilRoBERTa     6.0
CliReBERT         5.0
RoBERTa           4.5
SciClimateBERT    4.5
BERT              3.5
CliSciBERT        3.0
ClimateBERT       3.0
dtype: float64
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('SciBERT', 'SciClimateBERT', 0.5, False)
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'Climat

 14%|█▍        | 18/128 [00:04<00:26,  4.14it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.5
SciClimateBERT    7.0
SciBERT           6.0
DistilRoBERTa     5.5
BERT              3.5
CliSciBERT        3.5
CliReBERT         1.5
ClimateBERT       1.5
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, Fa

 15%|█▍        | 19/128 [00:04<00:26,  4.11it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.000000
SciBERT           6.000000
SciClimateBERT    5.666667
RoBERTa           5.333333
BERT              3.666667
CliReBERT         3.666667
CliSciBERT        3.000000
ClimateBERT       2.666667
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'SciBERT', 0.25, False)
('CliSciBERT', 'SciClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'RoBERT

 16%|█▌        | 20/128 [00:05<00:25,  4.28it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.0
SciClimateBERT    6.0
SciBERT           5.5
DistilRoBERTa     5.0
CliReBERT         4.5
RoBERTa           4.5
BERT              2.5
ClimateBERT       2.0
dtype: float64
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('ClimateBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, False)
('BE

 17%|█▋        | 22/128 [00:05<00:23,  4.56it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliReBERT         5.666667
DistilRoBERTa     5.666667
SciBERT           5.666667
SciClimateBERT    5.000000
CliSciBERT        4.666667
RoBERTa           3.333333
BERT              3.000000
ClimateBERT       3.000000
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('CliReBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('ClimateBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'RoBERTa', 0.5, False)
('RoBERTa', 'SciBERT', 0.5, False)
('BERT

 19%|█▉        | 24/128 [00:05<00:21,  4.78it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.75
SciClimateBERT    5.75
SciBERT           5.50
CliReBERT         4.50
CliSciBERT        4.25
RoBERTa           4.25
BERT              3.25
ClimateBERT       2.75
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('CliReBERT', 'ClimateBERT', 0.375, False)
('CliSciBERT', 'DistilRoBERTa', 0.375, False)
('CliSciBERT', 'Sc

 20%|█▉        | 25/128 [00:06<00:20,  5.13it/s]

BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           8.0
SciClimateBERT    6.5
CliSciBERT        5.0
DistilRoBERTa     5.0
SciBERT           4.5
BERT              3.5
CliReBERT         2.5
ClimateBERT       1.0
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'RoBERTa', 0.

 21%|██        | 27/128 [00:06<00:18,  5.51it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.666667
SciClimateBERT    7.000000
DistilRoBERTa     5.333333
SciBERT           4.666667
CliSciBERT        4.333333
BERT              3.666667
CliReBERT         2.000000
ClimateBERT       1.333333
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'ClimateBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('CliSciBERT', 'SciClimateBERT', 0.25, Fal

 22%|██▏       | 28/128 [00:06<00:17,  5.65it/s]

Index(['RoBERTa', 'SciClimateBERT', 'DistilRoBERTa', 'SciBERT', 'BERT',
       'CliSciBERT', 'CliReBERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    6.333333
CliSciBERT        6.000000
RoBERTa           5.666667
DistilRoBERTa     5.000000
SciBERT           4.333333
CliReBERT         4.000000
BERT              3.000000
ClimateBERT       1.666667
dtype: float64
('BERT', 'CliSciBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'ClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)

 23%|██▎       | 30/128 [00:06<00:16,  5.96it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         0.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.50
SciClimateBERT    5.50
CliReBERT         5.00
CliSciBERT        5.00
SciBERT           4.75
RoBERTa           4.50
BERT              3.25
ClimateBERT       2.50
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'ClimateBERT', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.625, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'SciBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.6

 25%|██▌       | 32/128 [00:07<00:14,  6.56it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    6.0
DistilRoBERTa     5.6
RoBERTa           5.0
SciBERT           4.8
CliSciBERT        4.6
CliReBERT         4.2
BERT              3.4
ClimateBERT       2.4
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('ClimateBERT', 'DistilRoBERTa', 0.0625, False)
('ClimateBERT', 'SciBERT', 0.0625, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliSciBERT', 'ClimateBERT', 0.125, False)
('ClimateBERT', 'SciClimateBERT', 0.125, False)
('BERT', 'ClimateBERT', 0.3125, False)
('BERT', 'SciBERT', 0.3125, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('CliReBERT', 'ClimateBERT', 0.3125, False)
('CliSciBERT', 'SciClima

 27%|██▋       | 34/128 [00:07<00:14,  6.48it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.5
ClimateBERT       6.0
SciBERT           5.5
CliReBERT         4.5
RoBERTa           4.5
BERT              3.0
CliSciBERT        3.0
SciClimateBERT    3.0
dtype: float64
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'SciBERT', 0.5, False)
('ClimateBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, False)
('SciBERT', 'SciClimateBERT', 0.5, False)
('BERT', 'CliReBERT', 1.0, False)
('BERT',

 27%|██▋       | 35/128 [00:07<00:14,  6.29it/s]

Index(['RoBERTa', 'DistilRoBERTa', 'SciClimateBERT', 'SciBERT', 'ClimateBERT',
       'CliSciBERT', 'BERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.333333
RoBERTa           5.333333
SciBERT           5.333333
ClimateBERT       4.666667
SciClimateBERT    4.666667
BERT              3.333333
CliReBERT         3.333333
CliSciBERT        3.000000
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', '

 29%|██▉       | 37/128 [00:07<00:14,  6.25it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.0
DistilRoBERTa     5.5
ClimateBERT       5.0
RoBERTa           4.5
SciBERT           4.5
SciClimateBERT    4.5
CliReBERT         4.0
BERT              2.0
dtype: float64
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, 

 30%|██▉       | 38/128 [00:08<00:14,  6.22it/s]

Index(['DistilRoBERTa', 'CliReBERT', 'ClimateBERT', 'SciBERT', 'CliSciBERT',
       'SciClimateBERT', 'RoBERTa', 'BERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.666667
SciClimateBERT    5.666667
RoBERTa           5.333333
CliSciBERT        5.000000
SciBERT           4.666667
ClimateBERT       4.000000
CliReBERT         3.000000
BERT              2.666667
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('BERT', 'RoBERTa', 0.5, False)
('CliReBERT',

 31%|███▏      | 40/128 [00:08<00:14,  6.27it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.00
SciBERT           5.00
SciClimateBERT    5.00
CliReBERT         4.25
CliSciBERT        4.25
ClimateBERT       4.25
RoBERTa           4.25
BERT              3.00
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('DistilRoBERTa', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('BERT', 'RoBERTa', 0.625, False)
('CliReBERT', 'DistilRoBERTa', 0.625, False)
('CliReBERT', 'RoBERTa', 0.625, False)
('CliReBERT', 'SciBERT', 0.625, 

 33%|███▎      | 42/128 [00:08<00:13,  6.18it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.000000
RoBERTa           5.666667
ClimateBERT       4.333333
SciBERT           4.333333
SciClimateBERT    4.333333
CliReBERT         4.000000
CliSciBERT        4.000000
BERT              3.333333
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('ClimateBERT', 'RoBERTa', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, False)
('RoBERTa', 'SciBERT', 0.5, False)
('BERT', 'CliSciBERT', 0.75, False)
('BERT', 'ClimateBERT', 0.75, False)
('BERT', 'Ro

 34%|███▎      | 43/128 [00:08<00:13,  6.23it/s]

Index(['RoBERTa', 'SciClimateBERT', 'DistilRoBERTa', 'CliSciBERT', 'SciBERT',
       'BERT', 'ClimateBERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.00
RoBERTa           6.00
SciClimateBERT    5.25
SciBERT           4.50
CliSciBERT        3.75
ClimateBERT       3.75
BERT              3.50
CliReBERT         3.25
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('DistilRoBERTa', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('ClimateBERT', 'RoBERTa', 0.25, False)
('RoBERTa', 'SciBERT', 0.

 35%|███▌      | 45/128 [00:09<00:13,  6.15it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.000000
RoBERTa           5.666667
DistilRoBERTa     5.333333
SciClimateBERT    5.333333
CliReBERT         3.666667
ClimateBERT       3.666667
SciBERT           3.666667
BERT              2.666667
dtype: float64
('BERT', 'CliSciBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('DistilRoBERTa', 'SciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'SciBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, Fal

 36%|███▌      | 46/128 [00:09<00:13,  5.88it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.75
CliSciBERT        5.00
CliReBERT         4.75
SciClimateBERT    4.75
RoBERTa           4.50
SciBERT           4.25
ClimateBERT       4.00
BERT              3.00
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('DistilRoBERTa', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('BERT', 'CliReBERT', 0.625, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'SciBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.625, False)
('CliSciBERT', 'DistilRoBERTa', 0.625, False)
('ClimateBERT', 'RoBERTa', 0.625, False)
('DistilRoBERTa', 'SciClimateBERT', 0.625, False)
('RoBERTa', 'SciBERT', 0.625, False)
('CliReBERT', 'ClimateBERT', 0.875, False)
('CliReBERT', 'DistilRoBERTa', 0.875

 38%|███▊      | 48/128 [00:09<00:13,  5.82it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.8
SciClimateBERT    5.4
RoBERTa           5.0
CliSciBERT        4.6
SciBERT           4.4
CliReBERT         4.0
ClimateBERT       3.6
BERT              3.2
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('DistilRoBERTa', 'SciBERT', 0.0625, False)
('BERT', 'SciClimateBERT', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.1875, False)
('BERT', 'SciBERT', 0.3125, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('CliSciBERT', 'DistilRoBERTa', 0.3125, False)
('ClimateBERT', 'RoBERTa', 0.3125, False)
('RoBERTa', 'SciBERT', 0.3125, False)
('CliReBERT', 'DistilRoBERTa', 0.4375, False)
('CliReBERT', 'SciClimat

 38%|███▊      | 49/128 [00:09<00:13,  5.89it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           8.0
SciBERT           6.0
DistilRoBERTa     5.5
SciClimateBERT    4.5
CliSciBERT        4.0
ClimateBERT       4.0
BERT              2.5
CliReBERT         1.5
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('CliSciBERT', 'SciBERT', 0.5, False)
('ClimateBERT', 'RoBERTa', 0.5, False)
('DistilRoBERTa', 'RoBERTa', 0.5, False)
('RoBER

 40%|███▉      | 51/128 [00:10<00:13,  5.82it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.666667
DistilRoBERTa     5.666667
SciBERT           5.666667
SciClimateBERT    5.666667
CliSciBERT        3.666667
ClimateBERT       3.333333
BERT              3.000000
CliReBERT         1.333333
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciBERT', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('CliS

 41%|████      | 52/128 [00:10<00:13,  5.84it/s]

BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     6.00
RoBERTa           6.00
SciBERT           5.75
SciClimateBERT    5.00
ClimateBERT       3.75
BERT              3.25
CliSciBERT        3.25
CliReBERT         3.00
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('CliSciBERT', 'DistilRoBERTa', 0.125, False)
('CliSciBERT', 'SciBERT', 0.125, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciBERT', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('CliSciBERT', 'SciClimateBERT', 0.25, False)
('ClimateBERT', 'RoBERTa', 0.25, False)
('BERT', 'CliReBERT', 0.375, False)
('ClimateB

 41%|████▏     | 53/128 [00:10<00:12,  5.79it/s]

Index(['RoBERTa', 'CliSciBERT', 'DistilRoBERTa', 'SciBERT', 'SciClimateBERT',
       'ClimateBERT', 'CliReBERT', 'BERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.75
SciBERT           5.50
CliReBERT         4.50
CliSciBERT        4.50
RoBERTa           4.50
SciClimateBERT    4.50
ClimateBERT       4.00
BERT              2.75
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('CliSciBERT', 'DistilRoBERTa', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 

 43%|████▎     | 55/128 [00:10<00:12,  5.99it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.00
SciClimateBERT    5.75
DistilRoBERTa     5.50
SciBERT           5.25
CliSciBERT        4.75
ClimateBERT       3.25
BERT              2.75
CliReBERT         2.75
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('BERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciBERT', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'RoBERTa', 0.25, False)


 45%|████▍     | 57/128 [00:11<00:11,  6.07it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           8.000000
DistilRoBERTa     5.333333
SciClimateBERT    5.333333
CliSciBERT        4.666667
SciBERT           4.666667
BERT              3.000000
ClimateBERT       3.000000
CliReBERT         2.000000
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('ClimateBERT', 'Ro

 45%|████▌     | 58/128 [00:11<00:11,  6.12it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.25
DistilRoBERTa     5.75
SciBERT           5.00
SciClimateBERT    4.75
CliSciBERT        4.00
CliReBERT         3.50
ClimateBERT       3.50
BERT              3.25
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25, False)
('ClimateBERT', 'RoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.375, False)
('BERT', 'SciBERT', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('CliReBERT', 'DistilRoBERTa', 0.375, False)
('CliReBERT', 'RoBERTa', 0.375, False)
('CliReBERT', 'SciClimateBERT', 0.375, False)
('CliSciBERT', 'SciBERT', 0.375, False)
('CliSciBERT', 'SciClimateBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('RoBERTa', 'SciBERT', 0.375, Fa

 47%|████▋     | 60/128 [00:11<00:10,  6.36it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.4
DistilRoBERTa     5.8
SciClimateBERT    5.4
SciBERT           5.0
CliSciBERT        3.8
BERT              3.4
ClimateBERT       3.2
CliReBERT         3.0
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliSciBERT', 'DistilRoBERTa', 0.125, False)
('CliSciBERT', 'RoBERTa', 0.125, False)
('ClimateBERT', 'RoBERTa', 0.125, False)
('BERT', 'RoBERTa', 0.1875, False)
('BERT', 'SciBERT', 0.1875, False)
('CliReBERT', 'CliSciBERT', 0.1875, False)
('CliReBERT', 'DistilRoBERTa', 0.1875, False)
('CliReBERT', 'RoBERTa', 0.1875, False)
('CliReBERT', 'SciClimateBERT', 0.1875, F

 48%|████▊     | 61/128 [00:11<00:10,  6.35it/s]

Index(['RoBERTa', 'CliSciBERT', 'SciClimateBERT', 'DistilRoBERTa', 'SciBERT',
       'CliReBERT', 'ClimateBERT', 'BERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.6
RoBERTa           5.2
SciClimateBERT    5.0
CliSciBERT        4.8
SciBERT           4.8
CliReBERT         4.2
ClimateBERT       3.4
BERT              3.0
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.1875, False)
('BERT', 'CliSciBERT', 0.3125, False)
('BERT', 'SciBERT', 0.3125, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('ClimateBERT', 'RoBERTa', 0.3125,

 49%|████▉     | 63/128 [00:12<00:11,  5.75it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.4
SciClimateBERT    6.0
DistilRoBERTa     5.4
CliSciBERT        5.0
SciBERT           4.6
BERT              3.0
CliReBERT         2.8
ClimateBERT       2.8
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.0625, False)
('CliReBERT', 'CliSciBERT', 0.0625, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('ClimateBERT', 'DistilRoBERTa', 0.125, False)
('ClimateBERT', 'RoBERTa', 0.125, False)
('BERT', 'RoBERTa', 0.1875, False)
('BERT', 'SciBERT', 0.1875, False)
('CliReBERT', 'RoBERTa', 0.1875, False)
('CliSciBERT', 'ClimateBERT', 0.18

 50%|█████     | 64/128 [00:12<00:11,  5.38it/s]

BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.666667
RoBERTa           5.500000
SciClimateBERT    5.500000
SciBERT           4.833333
CliSciBERT        4.500000
CliReBERT         3.666667
BERT              3.166667
ClimateBERT       3.166667
dtype: float64
('BERT', 'DistilRoBERTa', 0.03125, False)
('BERT', 'SciClimateBERT', 0.0625, False)
('ClimateBERT', 'DistilRoBERTa', 0.09375, False)
('BERT', 'SciBERT', 0.15625, False)
('CliReBERT', 'CliSciBERT', 0.15625, False)
('ClimateBERT', 'RoBERTa', 0.15625, False)
('CliReBERT', 'DistilRoBERTa', 0.21875, False)
('CliReBERT', 'SciClimateBERT', 0.21875, False)
('CliSciBERT', 'ClimateBERT', 0.21875, False)
('CliSciBERT', 'DistilRoBERTa', 0.21875, False)
('ClimateBERT', 'SciBERT', 0.21875, False)
('BERT', 'RoBERTa', 0.3125, False)
('CliReBERT', 'RoBERTa', 0.3125, False)
('CliSciBERT', 

 51%|█████     | 65/128 [00:12<00:11,  5.43it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           8.0
CliSciBERT        7.0
ClimateBERT       6.0
RoBERTa           5.0
SciClimateBERT    4.0
BERT              3.0
DistilRoBERTa     2.0
CliReBERT         1.0
dtype: float64
('BERT', 'CliReBERT', 1.0, False)
('BERT', 'CliSciBERT', 1.0, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERTa', 1.0, False)
('BERT', 'RoBERTa', 1.0, False)
('BERT', 'SciBERT', 1.0, False)
('BERT', 'SciClimateBERT', 1.0, False)
('CliReBERT', 'CliSciBERT', 1.0, False)
('CliReBERT', 'ClimateBERT', 1.0, False)
('CliReBERT', 'DistilRoBERTa', 1.0, False)
('CliReBERT', 'RoBERTa', 1.0, False)
('CliReBERT', 'SciBERT', 1.0, False)
('CliReBERT', 'SciClimateBERT', 1.0, False)
('CliSciBERT', 'ClimateBERT', 1.0, False)
('CliSciBERT', 'DistilRoBERTa', 1.0, False)
('CliSciBERT', 'RoBERTa', 1.0, False)
('CliS

 52%|█████▏    | 67/128 [00:12<00:10,  5.88it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.5
RoBERTa           6.0
SciClimateBERT    6.0
CliSciBERT        5.0
ClimateBERT       4.0
DistilRoBERTa     4.0
BERT              3.5
CliReBERT         1.0
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciBERT', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'SciBERT', 0.

 54%|█████▍    | 69/128 [00:13<00:09,  6.09it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        7.5
SciBERT           6.0
SciClimateBERT    5.0
ClimateBERT       4.5
CliReBERT         4.0
DistilRoBERTa     3.5
RoBERTa           3.0
BERT              2.5
dtype: float64
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'ClimateBERT', 0.5, False)
('BERT', 'SciBERT', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'RoBERTa', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('ClimateBERT', 'RoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'S

 55%|█████▍    | 70/128 [00:13<00:09,  5.87it/s]

Index(['SciBERT', 'CliSciBERT', 'CliReBERT', 'ClimateBERT', 'DistilRoBERTa',
       'SciClimateBERT', 'BERT', 'RoBERTa'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.000000
SciClimateBERT    6.000000
SciBERT           5.666667
DistilRoBERTa     4.333333
RoBERTa           4.333333
ClimateBERT       3.666667
BERT              3.000000
CliReBERT         3.000000
dtype: float64
('BERT', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'SciClimateBERT', 0.25, F

 56%|█████▋    | 72/128 [00:13<00:09,  5.89it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           5.75
SciClimateBERT    5.25
CliSciBERT        5.00
DistilRoBERTa     5.00
CliReBERT         4.25
ClimateBERT       4.00
RoBERTa           3.50
BERT              3.25
dtype: float64
('BERT', 'SciBERT', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('RoBERTa', 'SciClimateBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliSciBERT', 'ClimateBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.625, False)
('CliSciBERT', 'DistilRoBERTa', 0.625, False)
('CliSciBERT', 'SciBERT', 0.625, False)
('ClimateBERT', 'SciClim

 57%|█████▋    | 73/128 [00:14<00:09,  5.75it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.5
RoBERTa           6.5
SciClimateBERT    5.5
SciBERT           5.0
BERT              3.5
ClimateBERT       3.5
DistilRoBERTa     3.5
CliReBERT         2.0
dtype: float64
('BERT', 'CliReBERT', 0.5, False)
('BERT', 'CliSciBERT', 0.5, False)
('BERT', 'RoBERTa', 0.5, False)
('BERT', 'SciClimateBERT', 0.5, False)
('CliReBERT', 'CliSciBERT', 0.5, False)
('CliReBERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'RoBERTa', 0.5, False)
('CliReBERT', 'SciClimateBERT', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'SciBERT', 0.5, False)
('DistilRoBERTa', 'RoBERTa', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, False)
('RoBERTa', 'SciClimateBERT', 0.5, False)
('BERT', 'ClimateBERT', 1.0, False)
('BERT', 'DistilRoBERT

 59%|█████▊    | 75/128 [00:14<00:09,  5.41it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.666667
SciClimateBERT    6.333333
CliSciBERT        5.333333
SciBERT           5.000000
DistilRoBERTa     4.333333
BERT              3.666667
ClimateBERT       3.000000
CliReBERT         1.666667
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERTa', 'RoBERTa', 0.25, False)
('Distil

 59%|█████▉    | 76/128 [00:14<00:09,  5.70it/s]

BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    5.50
RoBERTa           5.25
SciBERT           5.25
DistilRoBERTa     5.00
CliSciBERT        4.50
BERT              3.75
ClimateBERT       3.50
CliReBERT         3.25
dtype: float64
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliSciBERT', 'ClimateBERT', 0.375, False)
('CliSciBERT', 'SciBERT', 0.375, False)
('CliSciBERT', 'SciClimateBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('DistilRoBERTa', 'SciBERT', 0.375, False)
('BERT', 'CliReBERT', 0.625, False)
('BERT', 'CliSciBERT', 0.625, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'RoBERTa', 0.625, False)
('BERT', 'SciBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.625, False)
('CliReBERT', 'DistilRoBERTa', 0.625, False)
('

 61%|██████    | 78/128 [00:14<00:08,  6.14it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        5.75
SciBERT           5.00
SciClimateBERT    5.00
CliReBERT         4.75
DistilRoBERTa     4.75
ClimateBERT       3.75
RoBERTa           3.75
BERT              3.25
dtype: float64
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliSciBERT', 'ClimateBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('BERT', 'CliReBERT', 0.625, False)
('BERT', 'CliSciBERT', 0.625, False)
('BERT', 'RoBERTa', 0.625, False)
('CliReBERT', 'ClimateBERT', 0.625, False)
('CliReBERT', 'RoBERTa', 0.625, False)
('CliReBERT', 'SciBERT', 0.625, False)
(

 62%|██████▎   | 80/128 [00:15<00:07,  6.36it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    5.6
CliSciBERT        5.2
DistilRoBERTa     5.0
SciBERT           5.0
RoBERTa           4.4
CliReBERT         4.0
BERT              3.4
ClimateBERT       3.4
dtype: float64
('ClimateBERT', 'SciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('CliSciBERT', 'ClimateBERT', 0.1875, False)
('ClimateBERT', 'DistilRoBERTa', 0.1875, False)
('ClimateBERT', 'SciClimateBERT', 0.3125, False)
('DistilRoBERTa', 'SciBERT', 0.3125, False)
('RoBERTa', 'SciClimateBERT', 0.3125, False)
('BERT', 'SciBERT', 0.4375, False)
('CliReBERT', 'CliSciBERT', 0.4375, False)
('DistilRoBERTa'

 63%|██████▎   | 81/128 [00:15<00:07,  6.49it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           7.000000
DistilRoBERTa     4.666667
RoBERTa           4.666667
CliSciBERT        4.333333
SciClimateBERT    4.333333
ClimateBERT       4.000000
CliReBERT         3.666667
BERT              3.333333
dtype: float64
('BERT', 'SciBERT', 0.25, False)
('CliSciBERT', 'SciBERT', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('SciBERT', 'SciClimateBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('CliReBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'SciClimateBERT', 0.5, False)
('DistilRoBERTa', 'SciBERT', 0.5, False)
('RoBERTa', 'SciBERT', 0.5, False)
('BERT', 'CliReBERT', 0.75, False)
('BERT', 'RoBERTa',

 64%|██████▍   | 82/128 [00:16<00:21,  2.12it/s]

Index(['SciBERT', 'DistilRoBERTa', 'RoBERTa', 'CliSciBERT', 'SciClimateBERT',
       'ClimateBERT', 'CliReBERT', 'BERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.666667
SciBERT           6.666667
SciClimateBERT    6.000000
CliSciBERT        4.666667
DistilRoBERTa     4.333333
BERT              3.333333
ClimateBERT       3.000000
CliReBERT         1.333333
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 

 66%|██████▌   | 84/128 [00:16<00:13,  3.21it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.50
RoBERTa           5.25
SciClimateBERT    5.25
DistilRoBERTa     5.00
CliSciBERT        4.00
BERT              3.50
ClimateBERT       3.50
CliReBERT         3.00
dtype: float64
('BERT', 'SciBERT', 0.125, False)
('CliSciBERT', 'SciBERT', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'SciClimateBERT', 0.25, False)
('BERT', 'CliReBERT', 0.375, False)
('BERT', 'RoBERTa', 0.375, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('CliReBERT', 'DistilRoBERTa', 0.375, False)
('CliReBERT', 'RoBERTa', 0.375, False)
('Cl

 67%|██████▋   | 86/128 [00:17<00:09,  4.20it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.25
CliSciBERT        5.25
DistilRoBERTa     4.75
SciClimateBERT    4.75
CliReBERT         4.50
ClimateBERT       3.75
RoBERTa           3.75
BERT              3.00
dtype: float64
('BERT', 'SciBERT', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('RoBERTa', 'SciBERT', 0.25, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliReBERT', 'ClimateBERT', 0.375, False)
('CliSciBERT', 'ClimateBERT', 0.375, False)
('ClimateBERT', 'DistilRoBERTa', 0.375, False)
('SciBERT', 'SciClimateBERT', 0.375, False)
('BERT', 'CliSciBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.625, 

 68%|██████▊   | 87/128 [00:17<00:08,  4.58it/s]

Index(['SciBERT', 'SciClimateBERT', 'CliSciBERT', 'RoBERTa', 'DistilRoBERTa',
       'BERT', 'ClimateBERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.0
SciClimateBERT    5.4
DistilRoBERTa     5.0
CliSciBERT        4.8
RoBERTa           4.4
CliReBERT         3.8
ClimateBERT       3.4
BERT              3.2
dtype: float64
('BERT', 'SciBERT', 0.0625, False)
('ClimateBERT', 'SciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('CliSciBERT', 'ClimateBERT', 0.1875, False)
('ClimateBERT', 'DistilRoBERTa', 0.1875, False)
('CliReBERT', 'CliSciBERT', 

 70%|██████▉   | 89/128 [00:17<00:07,  5.40it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.000000
CliSciBERT        5.666667
SciBERT           5.666667
SciClimateBERT    5.666667
DistilRoBERTa     4.000000
BERT              3.333333
ClimateBERT       2.666667
CliReBERT         2.000000
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'ClimateBERT', 0.25, False)
('ClimateBERT', 'SciBERT', 0.25, False)
('DistilRoBERT

 70%|███████   | 90/128 [00:17<00:06,  5.68it/s]

Index(['SciBERT', 'RoBERTa', 'SciClimateBERT', 'CliSciBERT', 'DistilRoBERTa',
       'BERT', 'CliReBERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.00
SciClimateBERT    6.25
SciBERT           5.50
CliSciBERT        5.00
DistilRoBERTa     4.50
BERT              3.50
ClimateBERT       2.50
CliReBERT         1.75
dtype: float64
('BERT', 'CliReBERT', 0.125, False)
('BERT', 'RoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'RoBERTa', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('CliSciBERT', 'ClimateBERT', 0.125, False)
('ClimateBER

 72%|███████▏  | 92/128 [00:18<00:06,  5.71it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           5.8
SciBERT           5.6
SciClimateBERT    5.6
DistilRoBERTa     5.0
CliSciBERT        4.4
BERT              3.6
CliReBERT         3.0
ClimateBERT       3.0
dtype: float64
('ClimateBERT', 'SciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('CliSciBERT', 'ClimateBERT', 0.1875, False)
('CliSciBERT', 'SciBERT', 0.1875, False)
('CliSciBERT', 'SciClimateBERT', 0.1875, False)
('ClimateBERT', 'DistilRoBERTa', 0.1875, False)
('BERT', 'CliReBERT', 0.3125, False)
('BERT', 'ClimateBERT', 0.3125, False)
('BERT', 'RoBERTa', 0.3125, False)
('BERT', 'SciBERT', 0.3125, Fals

 73%|███████▎  | 93/128 [00:18<00:06,  5.40it/s]

BERT              0.0
CliReBERT         1.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        6.25
SciClimateBERT    5.75
RoBERTa           5.50
SciBERT           5.25
DistilRoBERTa     4.25
CliReBERT         3.25
BERT              3.00
ClimateBERT       2.75
dtype: float64
('BERT', 'CliSciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('CliSciBERT', 'ClimateBERT', 0.125, False)
('ClimateBERT', 'SciBERT', 0.125, False)
('DistilRoBERTa', 'SciClimateBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('ClimateBERT', 'DistilRoBERTa', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.375, False)
('BERT', 'CliReBERT', 0.625, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'RoBE

 73%|███████▎  | 94/128 [00:18<00:06,  5.46it/s]

Index(['CliSciBERT', 'SciBERT', 'SciClimateBERT', 'DistilRoBERTa', 'RoBERTa',
       'CliReBERT', 'BERT', 'ClimateBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    6.2
RoBERTa           5.8
CliSciBERT        5.6
SciBERT           5.2
DistilRoBERTa     4.6
BERT              3.2
CliReBERT         2.8
ClimateBERT       2.6
dtype: float64
('BERT', 'SciClimateBERT', 0.0625, False)
('CliReBERT', 'CliSciBERT', 0.0625, False)
('CliSciBERT', 'ClimateBERT', 0.0625, False)
('ClimateBERT', 'SciBERT', 0.0625, False)
('DistilRoBERTa', 'SciClimateBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'Di

 75%|███████▌  | 96/128 [00:18<00:05,  5.95it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciClimateBERT    5.666667
SciBERT           5.333333
CliSciBERT        5.000000
DistilRoBERTa     5.000000
RoBERTa           5.000000
CliReBERT         3.666667
BERT              3.333333
ClimateBERT       3.000000
dtype: float64
('ClimateBERT', 'SciBERT', 0.03125, False)
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.09375, False)
('CliSciBERT', 'ClimateBERT', 0.09375, False)
('ClimateBERT', 'DistilRoBERTa', 0.09375, False)
('ClimateBERT', 'SciClimateBERT', 0.15625, False)
('BERT', 'SciBERT', 0.21875, False)
('CliReBERT', 'CliSciBERT', 0.21875, False)
('CliReBERT', 'DistilRoBERTa', 0.3125, False)
('CliReBERT

 77%|███████▋  | 98/128 [00:19<00:04,  6.36it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.333333
ClimateBERT       6.000000
DistilRoBERTa     5.000000
RoBERTa           4.666667
CliSciBERT        4.333333
CliReBERT         3.333333
SciClimateBERT    3.333333
BERT              3.000000
dtype: float64
('BERT', 'ClimateBERT', 0.25, False)
('BERT', 'SciBERT', 0.25, False)
('CliSciBERT', 'SciBERT', 0.25, False)
('ClimateBERT', 'SciClimateBERT', 0.25, False)
('SciBERT', 'SciClimateBERT', 0.25, False)
('BERT', 'DistilRoBERTa', 0.5, False)
('CliSciBERT', 'ClimateBERT', 0.5, False)
('CliSciBERT', 'DistilRoBERTa', 0.5, False)
('ClimateBERT', 'DistilRoBERTa', 0.5, False)
('DistilRoBERTa', 'SciClimateBERT', 0.5, 

 78%|███████▊  | 100/128 [00:19<00:04,  6.63it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.00
DistilRoBERTa     5.25
RoBERTa           5.25
ClimateBERT       5.00
SciClimateBERT    4.50
CliSciBERT        4.00
BERT              3.25
CliReBERT         2.75
dtype: float64
('BERT', 'SciBERT', 0.125, False)
('CliSciBERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.375, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.375, False)
('CliReBERT', 'DistilRoBERTa', 0.375, False)
('CliReBERT', 'RoBERTa', 0.375, False)
('CliReBERT', 'SciBERT', 0.375, False)
('CliReBERT', 'SciClimateBERT', 0.375, Fals

 80%|███████▉  | 102/128 [00:19<00:03,  6.63it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           5.75
CliSciBERT        5.25
ClimateBERT       5.25
DistilRoBERTa     5.00
CliReBERT         4.25
SciClimateBERT    4.00
RoBERTa           3.75
BERT              2.75
dtype: float64
('BERT', 'ClimateBERT', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'CliSciBERT', 0.375, False)
('BERT', 'SciClimateBERT', 0.375, False)
('SciBERT', 'SciClimateBERT', 0.375, False)
('BERT', 'CliReBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.625, False)
('CliSciBERT', 'DistilRoBERTa', 0.625, False)
('CliSciBERT', 'SciClimateBERT', 0.625, False)
('ClimateBERT', 'SciClimateBERT', 0.6

 80%|████████  | 103/128 [00:19<00:03,  6.41it/s]

Index(['CliSciBERT', 'SciBERT', 'RoBERTa', 'SciClimateBERT', 'DistilRoBERTa',
       'ClimateBERT', 'BERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       0.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           5.6
DistilRoBERTa     5.2
CliSciBERT        4.8
SciClimateBERT    4.8
ClimateBERT       4.6
RoBERTa           4.4
CliReBERT         3.6
BERT              3.0
dtype: float64
('BERT', 'SciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('CliSciBERT', 'DistilRoBERTa', 0.3125, False)
('CliReBERT', 'DistilRoBERTa', 0.4375, False)
('CliReBERT', 'SciBERT', 0.

 82%|████████▏ | 105/128 [00:20<00:03,  5.97it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.000000
CliSciBERT        5.666667
SciBERT           5.000000
ClimateBERT       4.666667
SciClimateBERT    4.666667
DistilRoBERTa     4.333333
BERT              3.000000
CliReBERT         1.666667
dtype: float64
('BERT', 'CliReBERT', 0.25, False)
('BERT', 'CliSciBERT', 0.25, False)
('BERT', 'RoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.25, False)
('CliReBERT', 'CliSciBERT', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'RoBERTa', 0.25, False)
('CliReBERT', 'SciClimateBERT', 0.25, False)
('DistilRoBERTa', 'RoBERTa', 0.25, False)
('RoBERTa', 'SciClimateBERT', 0.25, False)
('BERT', 'Di

 83%|████████▎ | 106/128 [00:20<00:03,  6.05it/s]

BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           5.50
SciBERT           5.25
DistilRoBERTa     5.00
CliSciBERT        4.75
ClimateBERT       4.75
SciClimateBERT    4.25
BERT              3.25
CliReBERT         3.25
dtype: float64
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'SciClimateBERT', 0.375, False)
('BERT', 'CliSciBERT', 0.625, False)
('BERT', 'ClimateBERT', 0.625, False)
('BERT', 'RoBERTa', 0.625, False)
('BERT', 'SciBERT', 0.625, False)
('CliReBERT', 'CliSciBERT', 0.625, False)
('CliReBERT', 'DistilRoBERTa', 0.625, False)
('CliReBERT', 'RoBERTa', 0.625, False)
('CliReBERT', 'SciClimateBERT', 0.625, False)
('CliSciBERT', 'DistilRoBERTa', 0.625, False)
('CliSciBERT', 'RoBERTa', 0.625, False)
('CliSciBERT', 'SciBERT', 0.625, False)
('ClimateBERT', 'RoBERTa', 0.625, False)
('ClimateBERT', 'SciClimateBERT', 0.625, False)


 84%|████████▍ | 108/128 [00:20<00:03,  6.26it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           5.8
DistilRoBERTa     5.2
SciBERT           5.2
SciClimateBERT    5.0
CliSciBERT        4.4
ClimateBERT       4.2
BERT              3.4
CliReBERT         2.8
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('BERT', 'RoBERTa', 0.3125, False)
('BERT', 'SciBERT', 0.3125, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('CliReBERT', 'DistilRoBERTa', 0.3125, False)
('CliReBERT', 'RoBERTa', 0.3125, False)
('CliReBERT', 'SciClimateBERT', 0.3125, False)
('CliSciBERT', 'DistilRoBERTa', 0.3125, False)
('CliSciBERT', 'RoBERTa', 0.3125, False)
('CliSciBERT', 'SciBERT', 0.3125,

 86%|████████▌ | 110/128 [00:21<00:02,  6.69it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
CliSciBERT        5.4
DistilRoBERTa     5.0
SciBERT           5.0
RoBERTa           4.6
SciClimateBERT    4.6
ClimateBERT       4.4
CliReBERT         4.0
BERT              3.0
dtype: float64
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('BERT', 'CliSciBERT', 0.3125, False)
('BERT', 'ClimateBERT', 0.4375, False)
('BERT', 'SciBERT', 0.4375, False)
('CliReBERT', 'CliSciBERT', 0.4375, False)
('DistilRoBERTa', 'SciBERT', 0.4375, False)
('CliReBERT', 'DistilRoBERTa', 0.625, False)
('CliReBERT', 'SciClimateBERT', 0.625, False)
('CliSciBERT', 'SciClimateBERT', 0.625, False)
('ClimateBERT', 'DistilRoBERTa

 88%|████████▊ | 112/128 [00:21<00:02,  6.62it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
DistilRoBERTa     5.166667
SciClimateBERT    5.166667
CliSciBERT        5.000000
RoBERTa           5.000000
SciBERT           5.000000
ClimateBERT       4.000000
CliReBERT         3.500000
BERT              3.166667
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.09375, False)
('BERT', 'SciBERT', 0.21875, False)
('CliReBERT', 'CliSciBERT', 0.21875, False)
('DistilRoBERTa', 'SciBERT', 0.21875, False)
('CliReBERT', 'DistilRoBERTa', 0.3125, False)
('CliReBERT', 'SciClimateBERT', 0.3125, False)
('ClimateBERT', 'DistilRoBERTa', 0.3125, False)
('ClimateBERT', 'SciBERT', 0.3125, False)
('CliSciBERT', 'C

 88%|████████▊ | 113/128 [00:21<00:02,  6.53it/s]

Index(['RoBERTa', 'SciBERT', 'CliSciBERT', 'ClimateBERT', 'DistilRoBERTa',
       'SciClimateBERT', 'BERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.50
RoBERTa           5.50
DistilRoBERTa     5.00
ClimateBERT       4.75
CliSciBERT        4.25
SciClimateBERT    4.00
BERT              3.00
CliReBERT         3.00
dtype: float64
('BERT', 'SciBERT', 0.125, False)
('CliSciBERT', 'SciBERT', 0.125, False)
('SciBERT', 'SciClimateBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('BERT', 'RoBERTa', 0.375, False)
('BERT', 'SciClimateBERT', 0.375, False)
('CliReBERT', 'CliSciBERT', 0.375, False

 90%|████████▉ | 115/128 [00:21<00:02,  6.31it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.00
SciBERT           6.25
SciClimateBERT    5.25
DistilRoBERTa     4.75
CliSciBERT        4.50
ClimateBERT       4.00
BERT              3.00
CliReBERT         1.25
dtype: float64
('BERT', 'CliReBERT', 0.125, False)
('BERT', 'RoBERTa', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'RoBERTa', 0.125, False)
('CliReBERT', 'SciBERT', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('CliSciBERT', 'SciBERT', 0.125, False)
('DistilRoBERTa', 'RoBERTa', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('CliSciBERT', 'RoBERTa', 0.25

 91%|█████████ | 116/128 [00:21<00:01,  6.13it/s]

Index(['SciBERT', 'RoBERTa', 'DistilRoBERTa', 'SciClimateBERT', 'ClimateBERT',
       'CliSciBERT', 'BERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.00
CliSciBERT        5.75
RoBERTa           5.50
SciClimateBERT    4.75
DistilRoBERTa     4.50
ClimateBERT       4.25
CliReBERT         2.75
BERT              2.50
dtype: float64
('BERT', 'CliSciBERT', 0.125, False)
('BERT', 'SciBERT', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'DistilRoBERTa', 0.25, False)
('CliReBERT', 'SciBERT', 0.25, Fals

 92%|█████████▏| 118/128 [00:22<00:01,  5.75it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
SciBERT           6.0
CliSciBERT        5.0
DistilRoBERTa     5.0
RoBERTa           4.6
ClimateBERT       4.4
SciClimateBERT    4.4
CliReBERT         3.8
BERT              2.8
dtype: float64
('BERT', 'SciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.1875, False)
('SciBERT', 'SciClimateBERT', 0.1875, False)
('BERT', 'CliSciBERT', 0.3125, False)
('CliReBERT', 'CliSciBERT', 0.3125, False)
('BERT', 'ClimateBERT', 0.4375, False)
('CliReBERT', 'DistilRoBERTa', 0.4375, False)
('CliReBERT', 'SciBERT', 0.4375, False)
('CliReBERT', 'SciClimateBERT', 0.4375, False)
('CliSciBERT', 'SciBERT', 0.4375, 

 93%|█████████▎| 119/128 [00:22<00:01,  5.96it/s]

BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       1.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           5.8
SciBERT           5.8
SciClimateBERT    5.4
CliSciBERT        5.2
DistilRoBERTa     4.8
ClimateBERT       3.8
BERT              2.8
CliReBERT         2.4
dtype: float64
('BERT', 'SciBERT', 0.0625, False)
('BERT', 'SciClimateBERT', 0.0625, False)
('CliReBERT', 'CliSciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'SciBERT', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('BERT', 'RoBERTa', 0.1875, False)
('CliReBERT', 'RoBERTa', 0.1875, False)
('BERT', 'CliReBERT', 0.3125, False)
('CliSciBERT', 'ClimateBERT', 0.3125, False)
('CliSciBERT', 'SciBERT', 0.3125, False)
('ClimateBERT', 'DistilRoBERTa', 0.3125, False)
('ClimateBERT', 'RoBERTa', 0.3125, False)
('ClimateBERT', 'SciBERT', 0.3125, False

 95%|█████████▍| 121/128 [00:22<00:01,  5.99it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.25
SciBERT           5.50
CliSciBERT        5.25
SciClimateBERT    5.00
DistilRoBERTa     4.50
ClimateBERT       3.75
BERT              3.00
CliReBERT         1.75
dtype: float64
('BERT', 'CliReBERT', 0.125, False)
('BERT', 'CliSciBERT', 0.125, False)
('BERT', 'RoBERTa', 0.125, False)
('BERT', 'SciClimateBERT', 0.125, False)
('CliReBERT', 'CliSciBERT', 0.125, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'RoBERTa', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('DistilRoBERTa', 'RoBERTa', 0.125, False)
('RoBERTa', 'SciClimateBERT', 0.125, False)
('BERT', 'DistilRoBERTa', 0.25, Fa

 95%|█████████▌| 122/128 [00:22<00:00,  6.10it/s]

Index(['RoBERTa', 'SciBERT', 'DistilRoBERTa', 'CliSciBERT', 'SciClimateBERT',
       'ClimateBERT', 'BERT', 'CliReBERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           0.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           7.2
SciClimateBERT    5.6
SciBERT           5.4
CliSciBERT        4.8
DistilRoBERTa     4.8
ClimateBERT       3.4
BERT              3.2
CliReBERT         1.6
dtype: float64
('BERT', 'CliReBERT', 0.0625, False)
('BERT', 'RoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.0625, False)
('CliReBERT', 'CliSciBERT', 0.0625, False)
('CliReBERT', 'DistilRoBERTa', 0.0625, False)
('CliReBERT', 'RoBERTa', 0.0625, False)
('CliReBERT', 'SciClimateBERT', 0.0625, False)
('DistilRoBERTa', 'RoBERTa', 0.0625, False)
('BERT', 'Dis

 97%|█████████▋| 124/128 [00:23<00:00,  5.52it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.166667
SciBERT           5.500000
DistilRoBERTa     5.166667
SciClimateBERT    5.166667
CliSciBERT        4.333333
ClimateBERT       3.666667
BERT              3.333333
CliReBERT         2.666667
dtype: float64
('BERT', 'DistilRoBERTa', 0.0625, False)
('BERT', 'SciClimateBERT', 0.09375, False)
('BERT', 'RoBERTa', 0.15625, False)
('BERT', 'SciBERT', 0.15625, False)
('CliReBERT', 'CliSciBERT', 0.15625, False)
('CliReBERT', 'DistilRoBERTa', 0.15625, False)
('CliReBERT', 'RoBERTa', 0.15625, False)
('CliReBERT', 'SciClimateBERT', 0.15625, False)
('CliSciBERT', 'RoBERTa', 0.15625, False)
('CliSciBERT', 'SciBERT', 0.156

 98%|█████████▊| 125/128 [00:23<00:00,  5.66it/s]

BERT              0.0
CliReBERT         2.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.0
CliSciBERT        5.8
SciBERT           5.2
SciClimateBERT    5.2
DistilRoBERTa     4.6
ClimateBERT       3.6
BERT              2.8
CliReBERT         2.8
dtype: float64
('BERT', 'CliSciBERT', 0.0625, False)
('BERT', 'SciClimateBERT', 0.0625, False)
('CliReBERT', 'CliSciBERT', 0.0625, False)
('BERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'DistilRoBERTa', 0.125, False)
('CliReBERT', 'SciClimateBERT', 0.125, False)
('BERT', 'RoBERTa', 0.3125, False)
('BERT', 'SciBERT', 0.3125, False)
('CliReBERT', 'RoBERTa', 0.3125, False)
('CliReBERT', 'SciBERT', 0.3125, False)
('CliSciBERT', 'ClimateBERT', 0.3125, False)
('ClimateBERT', 'DistilRoBERTa', 0.3125, False)
('ClimateBERT', 'RoBERTa', 0.3125, False)
('BERT', 'CliReBERT', 0.4375, False)
('ClimateBERT', 'SciBERT', 0.4375, False)


 99%|█████████▉| 127/128 [00:23<00:00,  5.92it/s]

['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           1.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           6.166667
SciClimateBERT    5.666667
CliSciBERT        5.333333
SciBERT           5.166667
DistilRoBERTa     4.833333
ClimateBERT       3.333333
BERT              3.000000
CliReBERT         2.500000
dtype: float64
('BERT', 'SciClimateBERT', 0.03125, False)
('CliReBERT', 'CliSciBERT', 0.03125, False)
('BERT', 'DistilRoBERTa', 0.0625, False)
('CliReBERT', 'DistilRoBERTa', 0.0625, False)
('CliReBERT', 'SciClimateBERT', 0.0625, False)
('BERT', 'RoBERTa', 0.15625, False)
('BERT', 'SciBERT', 0.15625, False)
('CliReBERT', 'RoBERTa', 0.15625, False)
('CliReBERT', 'SciBERT', 0.15625, False)
('CliSciBERT', 'ClimateBERT', 0.15

100%|██████████| 128/128 [00:24<00:00,  5.33it/s]


Index(['RoBERTa', 'SciBERT', 'SciClimateBERT', 'DistilRoBERTa', 'CliSciBERT',
       'ClimateBERT', 'CliReBERT', 'BERT'],
      dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7]
['BERT' 'ClimateBERT' 'CliReBERT' 'CliSciBERT' 'DistilRoBERTa' 'RoBERTa'
 'SciBERT' 'SciClimateBERT']
the null hypothesis over the entire classifiers cannot be rejected
BERT              0.0
CliReBERT         3.0
CliSciBERT        0.0
ClimateBERT       2.0
DistilRoBERTa     0.0
RoBERTa           2.0
SciBERT           0.0
SciClimateBERT    0.0
dtype: float64
RoBERTa           5.428571
SciBERT           5.285714
SciClimateBERT    5.285714
DistilRoBERTa     5.142857
CliSciBERT        4.857143
ClimateBERT       3.571429
CliReBERT         3.285714
BERT              3.142857
dtype: float64
('BERT', 'DistilRoBERTa', 0.03125, False)
('BERT', 'SciClimateBERT', 0.046875, False)
('BERT', 'SciBERT', 0.109375, False)
('CliReBERT', 'CliSciBERT', 0.109375, False)
('CliReBERT', 'DistilRoBERTa', 0.15625, False)
('CliReBERT', 'SciClimate