In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'Author'
__email__ = 'Email'

# SemEval 2026 Task 5 - Ensemble

In [2]:
# dependency
# built-in
import json
import os
import sys
from pathlib import Path
from itertools import combinations

# third-party
import random
import pandas as pd
import numpy as np
from scipy import stats
from xgboost import XGBRanker, XGBRegressor
from scipy.stats import spearmanr
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# local - add src/eval to path for importing evaluation functions
sys.path.insert(0, str(Path('../src/eval').resolve()))
# Import evaluation functions from src/eval/scoring.py
import scoring

%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

Importing...
Starting Scoring script...


# Init

In [23]:
# helper
def set_seed(seed=42):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)

def load_predictions(filepath):
    """Load predictions from a JSONL file into a dictionary."""
    predictions = {}
    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            predictions[data['id']] = data['prediction']
    return predictions

def convert_json_to_submittable_jsonl(input_json_path, output_jsonl_path):
    """
    Convert from detailed JSON format to submittable JSONL format.
    
    Input format (JSON array):
        [{"id": "0", "homonym": "...", "model_score": 5, ...}, ...]
    
    Output format (JSONL):
        {"id": "0", "prediction": 5}
        {"id": "1", "prediction": 2}
        ...
    
    Args:
        input_json_path: Path to input JSON file (e.g., chatgpt_v1.json)
        output_jsonl_path: Path to output JSONL file (e.g., chatgpt_v1_submission.jsonl)
    """
    # Read the JSON array
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    # Write to JSONL format
    with open(output_jsonl_path, 'w') as f:
        for item in data:
            json_obj = {
                "id": item["id"],
                "prediction": int(item["model_score"])
            }
            f.write(json.dumps(json_obj) + '\n')

    print(f"Converted {len(data)} predictions")
    print(f"Saved to: {output_jsonl_path}")

# Evaluation functions following the structure of src/eval/scoring.py
def evaluate_predictions_array(y_pred, y_true_labels):
    """
    Evaluate predictions using the same logic as scoring.py
    
    Args:
        y_pred: array of predictions
        y_true_labels: list of gold label lists (5 ratings each)
    
    Returns:
        dict with spearman and accuracy scores
    """
    # Build prediction and gold lists (same structure as scoring.py)
    pred_list = list(y_pred)
    gold_list = [scoring.get_average(labels) for labels in y_true_labels]
    
    # Calculate Spearman correlation (same as scoring.py)
    corr, p_value = spearmanr(pred_list, gold_list)
    
    # Calculate accuracy within SD (same logic as scoring.py)
    correct_guesses = 0
    wrong_guesses = 0
    
    for pred, labels in zip(pred_list, y_true_labels):
        if scoring.is_within_standard_deviation(pred, labels):
            correct_guesses += 1
        else:
            wrong_guesses += 1
    
    accuracy = correct_guesses / (correct_guesses + wrong_guesses)
    
    return {
        'spearman': corr,
        'p_value': p_value,
        'accuracy': accuracy,
        'correct': correct_guesses,
        'total': correct_guesses + wrong_guesses
    }

In [18]:
# init
set_seed(0)

## Data

In [26]:
# import glob
# from pathlib import Path

# base_path = '../res/results/test/*.json'

# json_files = glob.glob(base_path)
# for json_file in json_files:
#     json_path = Path(json_file)
#     output_path = json_file.replace('json', 'jsonl')
#     convert_json_to_submittable_jsonl(json_file, output_path)

In [27]:
# Load gold labels (solution file)
SOLUTION_FILE = Path("../res/data/dev_solution.jsonl")

gold_labels = {}
with open(SOLUTION_FILE, 'r') as f:
    for line in f:
        data = json.loads(line.strip())
        gold_labels[data['id']] = data['label']

print(f"Loaded {len(gold_labels)} gold labels")
print(f"\nGold labels are lists of 5 human ratings (1-5 scale)")
print(f"\nExample gold labels:")
for i in range(5):
    sample_id = str(i)
    print(f"  ID {sample_id}: {gold_labels[sample_id]} (avg={np.mean(gold_labels[sample_id]):.2f}, std={np.std(gold_labels[sample_id], ddof=1):.2f})")

Loaded 588 gold labels

Gold labels are lists of 5 human ratings (1-5 scale)

Example gold labels:
  ID 0: [4, 5, 3, 1, 5] (avg=3.60, std=1.67)
  ID 1: [3, 3, 4, 4, 4] (avg=3.60, std=0.55)
  ID 2: [5, 5, 2, 3, 4] (avg=3.80, std=1.30)
  ID 3: [4, 5, 4, 3, 5] (avg=4.20, std=0.84)
  ID 4: [1, 5, 4, 4, 1] (avg=3.00, std=1.87)


## System

In [28]:
# Path to individual system outputs
RESULTS_DIR = Path("../res/results/dev/")

# Get all jsonl files
prediction_files = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files)} prediction files:")
for f in prediction_files:
    print(f"  - {f.name}")

Found 11 prediction files:
  - chatgpt_v1.jsonl
  - david_v1.jsonl
  - david_v2.jsonl
  - david_v3.jsonl
  - deepseek.jsonl
  - korean_v1.jsonl
  - korean_v2.jsonl
  - qwen_v1.jsonl
  - qwen_v2.jsonl
  - urdu_v2.jsonl
  - urdu_v3.jsonl


In [29]:
# Load all predictions
all_predictions = {}
for pred_file in prediction_files:
    system_name = pred_file.stem  # filename without extension
    all_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_predictions[system_name])} predictions from {system_name}")

# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions = pd.DataFrame(all_predictions)
df_predictions.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions.shape}")
print(f"Number of samples: {len(df_predictions)}")
print(f"Number of systems: {len(df_predictions.columns)}")
print(f"\nSystems: {list(df_predictions.columns)}")
print(f"\nSample data types:")
print(df_predictions.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions.head(10)

Loaded 588 predictions from chatgpt_v1
Loaded 588 predictions from david_v1
Loaded 588 predictions from david_v2
Loaded 588 predictions from david_v3
Loaded 588 predictions from deepseek
Loaded 588 predictions from korean_v1
Loaded 588 predictions from korean_v2
Loaded 588 predictions from qwen_v1
Loaded 588 predictions from qwen_v2
Loaded 588 predictions from urdu_v2
Loaded 588 predictions from urdu_v3

Predictions DataFrame shape: (588, 11)
Number of samples: 588
Number of systems: 11

Systems: ['chatgpt_v1', 'david_v1', 'david_v2', 'david_v3', 'deepseek', 'korean_v1', 'korean_v2', 'qwen_v1', 'qwen_v2', 'urdu_v2', 'urdu_v3']

Sample data types:
chatgpt_v1      int64
david_v1        int64
david_v2      float64
david_v3        int64
deepseek        int64
korean_v1     float64
korean_v2     float64
qwen_v1         int64
qwen_v2         int64
urdu_v2         int64
urdu_v3         int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,chatgpt_v1,david_v1,david_v2,david_v3,deepseek,korean_v1,korean_v2,qwen_v1,qwen_v2,urdu_v2,urdu_v3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,5,4,4.06,1,4,4.360252,4.530886,5,4,4,4
1,2,4,1.94,5,2,2.586742,2.27588,1,2,2,3
2,3,5,3.9976,1,3,2.647276,3.40803,3,3,3,4
3,2,5,2.0024,5,3,3.392001,3.437395,3,3,4,4
4,3,5,3.548,5,3,2.816402,3.001934,5,3,4,4
5,2,4,2.452,5,3,3.373284,3.141558,2,3,3,4
6,5,4,4.5992,1,4,4.360252,4.341353,1,4,4,2
7,1,1,1.4008,1,2,1.942182,1.74466,1,2,3,2
8,1,3,2.8696,1,4,1.942182,2.118062,5,4,2,2
9,3,3,3.1304,5,2,3.45069,3.896887,1,2,5,4


In [30]:
# Create a combined DataFrame with predictions and gold labels
df_gold = pd.DataFrame({
    'gold_labels': gold_labels,
    'gold_avg': {k: np.mean(v) for k, v in gold_labels.items()},
    'gold_std': {k: np.std(v, ddof=1) for k, v in gold_labels.items()}
})

# Combine predictions with gold labels
df_combined = df_predictions.join(df_gold)

print(f"Combined DataFrame shape: {df_combined.shape}")
print(f"\nColumns: {list(df_combined.columns)}")
print(f"\nFirst 10 rows:")
df_combined.head(10)

Combined DataFrame shape: (588, 14)

Columns: ['chatgpt_v1', 'david_v1', 'david_v2', 'david_v3', 'deepseek', 'korean_v1', 'korean_v2', 'qwen_v1', 'qwen_v2', 'urdu_v2', 'urdu_v3', 'gold_labels', 'gold_avg', 'gold_std']

First 10 rows:


Unnamed: 0_level_0,chatgpt_v1,david_v1,david_v2,david_v3,deepseek,korean_v1,korean_v2,qwen_v1,qwen_v2,urdu_v2,urdu_v3,gold_labels,gold_avg,gold_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,5,4,4.06,1,4,4.360252,4.530886,5,4,4,4,"[4, 5, 3, 1, 5]",3.6,1.67332
1,2,4,1.94,5,2,2.586742,2.27588,1,2,2,3,"[3, 3, 4, 4, 4]",3.6,0.547723
2,3,5,3.9976,1,3,2.647276,3.40803,3,3,3,4,"[5, 5, 2, 3, 4]",3.8,1.30384
3,2,5,2.0024,5,3,3.392001,3.437395,3,3,4,4,"[4, 5, 4, 3, 5]",4.2,0.83666
4,3,5,3.548,5,3,2.816402,3.001934,5,3,4,4,"[1, 5, 4, 4, 1]",3.0,1.870829
5,2,4,2.452,5,3,3.373284,3.141558,2,3,3,4,"[4, 3, 4, 1, 3]",3.0,1.224745
6,5,4,4.5992,1,4,4.360252,4.341353,1,4,4,2,"[4, 4, 5, 5, 5]",4.6,0.547723
7,1,1,1.4008,1,2,1.942182,1.74466,1,2,3,2,"[1, 1, 1, 2, 2, 1]",1.333333,0.516398
8,1,3,2.8696,1,4,1.942182,2.118062,5,4,2,2,"[4, 1, 1, 2, 3]",2.2,1.30384
9,3,3,3.1304,5,2,3.45069,3.896887,1,2,5,4,"[4, 2, 5, 4, 4]",3.8,1.095445


# Split Data

In [31]:
# Use stratification based on binned gold_avg to ensure balanced distribution
df_combined['gold_bin'] = pd.cut(df_combined['gold_avg'], bins=5, labels=False)

train_df, test_df = train_test_split(
    df_combined, 
    test_size=0.3, 
    random_state=42,
    stratify=df_combined['gold_bin']
)

# Drop the temporary binning column
train_df = train_df.drop('gold_bin', axis=1)
test_df = test_df.drop('gold_bin', axis=1)

test_gold_labels = test_df['gold_labels'].tolist()

print(f"Total samples: {len(df_combined)}")
print(f"Training samples: {len(train_df)} ({len(train_df)/len(df_combined)*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/len(df_combined)*100:.1f}%)")

print(f"\nTraining set gold_avg distribution:")
print(f"  Mean: {train_df['gold_avg'].mean():.3f}")
print(f"  Std: {train_df['gold_avg'].std():.3f}")
print(f"  Min: {train_df['gold_avg'].min():.3f}")
print(f"  Max: {train_df['gold_avg'].max():.3f}")

print(f"\nTest set gold_avg distribution:")
print(f"  Mean: {test_df['gold_avg'].mean():.3f}")
print(f"  Std: {test_df['gold_avg'].std():.3f}")
print(f"  Min: {test_df['gold_avg'].min():.3f}")
print(f"  Max: {test_df['gold_avg'].max():.3f}")

print(f"\nTraining set sample IDs range: {train_df.index.min()} to {train_df.index.max()}")
print(f"Test set sample IDs range: {test_df.index.min()} to {test_df.index.max()}")

Total samples: 588
Training samples: 411 (69.9%)
Test samples: 177 (30.1%)

Training set gold_avg distribution:
  Mean: 3.109
  Std: 1.191
  Min: 1.000
  Max: 5.000

Test set gold_avg distribution:
  Mean: 3.140
  Std: 1.178
  Min: 1.000
  Max: 5.000

Training set sample IDs range: 0 to 97
Test set sample IDs range: 1 to 99


In [32]:
def prepare_data_splits(train_df, test_df, model_names):
    """
    Prepare training and test sets with specified models.

    Args:
        train_df: Training dataframe with all model predictions
        test_df: Test dataframe with all model predictions
        model_names: List of model column names to use as features

    Returns:
        X_train, y_train, X_test, y_test
    """
    X_train = train_df[model_names]
    y_train = train_df['gold_avg']

    X_test = test_df[model_names]
    y_test = test_df['gold_avg']

    # print(f"X_train shape: {X_train.shape}")
    # print(f"y_train shape: {y_train.shape}")
    # print(f"X_test shape: {X_test.shape}")
    # print(f"y_test shape: {y_test.shape}")

    # print(f"\nSelected models: {model_names}")
    # print(f"\nFirst few training samples:")
    # display(pd.concat([X_train.head(), y_train.head()], axis=1))

    return X_train, y_train, X_test, y_test

# System Evaluation

In [42]:
# Evaluate each system on test set
print("=" * 80)
print("Individual System Performance on Test Set (30%)")
print("=" * 80)
print(f"{'System':<15} {'Spearman':>12} {'Accuracy':>12} {'Correct/Total':>15}")
print("-" * 80)

system_results = {}
test_gold_labels = test_df['gold_labels'].tolist()


all_systems = [s for s in list(df_combined.columns) if not s.startswith('gold_')]
X_train, y_train, X_test, y_test = prepare_data_splits(train_df, test_df, all_systems)

for system in all_systems:
# for system in ['david_v1', 'david_v2', 'korean']:
    # Get predictions from X_test
    y_pred = X_test[system].values
    
    # Evaluate using official logic
    scores = evaluate_predictions_array(y_pred, test_gold_labels)
    system_results[system] = scores
    
# sort systems by average of spearman and accuracy
sorted_systems = sorted(system_results.items(), key=lambda x: (x[1]['spearman'] + x[1]['accuracy'])/2, reverse=True)
for system, scores in sorted_systems:
    print(f"{system:<15} {scores['spearman']:>12.4%} {scores['accuracy']:>12.4%} "
          f"{scores['correct']:>7}/{scores['total']:<7}")
print("=" * 80)

Individual System Performance on Test Set (30%)
System              Spearman     Accuracy   Correct/Total
--------------------------------------------------------------------------------
korean_v1           82.7033%     90.3955%     160/177    
korean_v2           83.0998%     89.8305%     159/177    
qwen_v2             70.8330%     80.7910%     143/177    
chatgpt_v1          74.3332%     73.4463%     130/177    
deepseek            65.3009%     76.2712%     135/177    
urdu_v2             60.1281%     70.6215%     125/177    
david_v2            60.5211%     68.3616%     121/177    
qwen_v1             63.0829%     64.4068%     114/177    
urdu_v3             41.7752%     69.4915%     123/177    
david_v1            30.8245%     61.0169%     108/177    
david_v3            20.8727%     36.1582%      64/177    


# Ensemble

## Mean

In [None]:
# Simple Mean Ensemble: y_pred = (1/N) * sum(y_i)
y_pred_mean = X_test.mean(axis=1).values

# Evaluate float version
scores_mean_float = evaluate_predictions_array(y_pred_mean, test_gold_labels)
print(f"Mean (float) - Spearman: {scores_mean_float['spearman']:.4%}, Accuracy: {scores_mean_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_mean_int = y_pred_mean.round().clip(1, 5).astype(int)
scores_mean_int = evaluate_predictions_array(y_pred_mean_int, test_gold_labels)
print(f"Mean (int)   - Spearman: {scores_mean_int['spearman']:.4%}, Accuracy: {scores_mean_int['accuracy']:.4%}")

## Weighted Mean

In [None]:
# Weighted Mean: weights learned from train set (70%) based on Spearman performance
train_gold_labels = train_df['gold_labels'].tolist()

# Calculate weights based on train set performance (Spearman correlation)
weights = []
for system in ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']:
# for system in ['david_v1', 'david_v2', 'korean']:
    y_pred_train = X_train[system].values
    scores_train = evaluate_predictions_array(y_pred_train, train_gold_labels)
    weights.append(scores_train['spearman'])

# Convert to numpy array and normalize to sum to 1
weights = np.array(weights)
weights = weights / weights.sum()

# Apply weighted average on test set
y_pred_weighted = (X_test.values * weights).sum(axis=1)

# Evaluate float version
scores_weighted_float = evaluate_predictions_array(y_pred_weighted, test_gold_labels)
print(f"Weighted Mean (float) - Spearman: {scores_weighted_float['spearman']:.4%}, Accuracy: {scores_weighted_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_weighted_int = y_pred_weighted.round().clip(1, 5).astype(int)
scores_weighted_int = evaluate_predictions_array(y_pred_weighted_int, test_gold_labels)
print(f"Weighted Mean (int)   - Spearman: {scores_weighted_int['spearman']:.4%}, Accuracy: {scores_weighted_int['accuracy']:.4%}")

## Majority Voting

In [None]:
# Majority Voting: most common rounded prediction across systems
# Round predictions to integers (1-5 range)
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Get mode (most frequent value) for each row
y_pred_majority = stats.mode(X_test_rounded, axis=1, keepdims=False)[0]

# Evaluate
scores_majority = evaluate_predictions_array(y_pred_majority, test_gold_labels)
print(f"Majority Voting - Spearman: {scores_majority['spearman']:.4%}, Accuracy: {scores_majority['accuracy']:.4%}")

## Weighted Majority Voting

In [None]:
# Weighted Majority Voting: weighted voting using performance-based weights
# Round test predictions to integers
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Use weights learned earlier (from Weighted Mean)
y_pred_weighted_vote = []
for idx in range(len(X_test_rounded)):
    # Get predictions for this sample
    votes = X_test_rounded.iloc[idx].values
    
    # Count weighted votes for each class (1-5)
    vote_counts = {}
    for vote, weight in zip(votes, weights):
        vote_counts[vote] = vote_counts.get(vote, 0) + weight
    
    # Select class with highest weighted vote
    winner = max(vote_counts.items(), key=lambda x: x[1])[0]
    y_pred_weighted_vote.append(winner)

y_pred_weighted_vote = np.array(y_pred_weighted_vote)

# Evaluate
scores_weighted_vote = evaluate_predictions_array(y_pred_weighted_vote, test_gold_labels)
print(f"Weighted Majority Voting - Spearman: {scores_weighted_vote['spearman']:.4%}, Accuracy: {scores_weighted_vote['accuracy']:.4%}")

## Linear Stacking (Ridge Regression)

In [43]:
def train_eval_ridge(X_train, y_train, X_test, test_gold_labels, alpha=1.0):
    """
    Train and evaluate Ridge Regression ensemble.

    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        test_gold_labels: Test gold labels (list of 5 ratings each)
        alpha: Ridge regularization parameter

    Returns:
        scores_float, scores_int, ridge_model
    """
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)

    # Predict on test set
    y_pred_ridge = ridge.predict(X_test)

    # Evaluate float version
    scores_float = evaluate_predictions_array(y_pred_ridge, test_gold_labels)

    # Evaluate integer version
    y_pred_int = y_pred_ridge.round().clip(1, 5).astype(int)
    scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float, scores_int, ridge


def grid_search_ridge(X_train, y_train, X_test, test_gold_labels, alphas=[0.1, 1.0, 10.0], verbose=True):
    """
    Grid search over Ridge alpha values.

    Args:
        X_train, y_train, X_test, test_gold_labels: Data splits
        alphas: List of alpha values to try
        verbose: Whether to print detailed progress

    Returns:
        best_alpha, best_scores_float, best_scores_int, best_model
    """
    best_avg_score = 0
    best_alpha = None
    best_scores_float = None
    best_scores_int = None
    best_model = None

    for alpha in alphas:
        scores_float, scores_int, model = train_eval_ridge(
            X_train, y_train, X_test, test_gold_labels, alpha=alpha
        )

        avg_score = (scores_float['spearman'] + scores_float['accuracy']) / 2

        if verbose:
            print(f"   Î±={alpha:5.1f} â†’ Spearman: {scores_float['spearman']:.2%}, "
                  f"Accuracy: {scores_float['accuracy']:.2%}, "
                  f"Avg: {avg_score:.2%}")

        if avg_score > best_avg_score:
            best_avg_score = avg_score
            best_alpha = alpha
            best_scores_float = scores_float
            best_scores_int = scores_int
            best_model = model

    if verbose:
        print(f"\n   âœ“ Best: Î±={best_alpha} with average score {best_avg_score:.2%}")

    return best_alpha, best_scores_float, best_scores_int, best_model

In [47]:
def test_model_combination(combination_name, model_names, train_df, test_df, test_gold_labels, alphas=[0.1, 1.0, 10.0]):
    """
    Test a specific model combination with Ridge regression.

    Args:
        combination_name: Name of the combination (for display)
        model_names: List of model column names to use
        train_df, test_df: Data splits
        test_gold_labels: Gold labels for test set
        alphas: Alpha values to try

    Returns:
        best_alpha, best_scores_float, best_scores_int, best_model, X_train, X_test
    """
    # Prepare data
    X_train = train_df[model_names]
    y_train = train_df['gold_avg']
    X_test = test_df[model_names]
    y_test = test_df['gold_avg']

    # Run grid search
    best_alpha, best_scores_float, best_scores_int, best_model = grid_search_ridge(
        X_train, y_train, X_test, test_gold_labels, alphas=alphas, verbose=False
    )

    return best_alpha, best_scores_float, best_scores_int, best_model, X_train, X_test

def compare_all_combinations(train_df, test_df, test_gold_labels, model_combinations, alphas=[0.1, 1.0, 10.0]):
    """
    Compare all model combinations and summarize results.

    Args:
        train_df, test_df: Data splits
        test_gold_labels: Gold labels for test set
        model_combinations: Dict of {name: [model_list]}
        alphas: Alpha values to try

    Returns:
        results_summary: DataFrame with comparison results
    """
    results = []

    for combo_name, model_names in model_combinations.items():
        best_alpha, scores_float, scores_int, model, X_train, X_test = test_model_combination(
            combo_name, model_names, train_df, test_df, test_gold_labels, alphas
        )

        results.append({
            'combination': combo_name,
            'num_models': len(model_names),
            'models': ', '.join(model_names),
            'best_alpha': best_alpha,
            'spearman': scores_float['spearman'],
            'accuracy': scores_float['accuracy'],
            'avg_score': (scores_float['spearman'] + scores_float['accuracy']) / 2,
        })

    # Create summary DataFrame sorted by average score
    summary_df = pd.DataFrame(results)
    summary_df = summary_df.sort_values('avg_score', ascending=False)
    
    # Display table ranked by average
    print("\nðŸ“‹ Results (Ranked by Average Score):")
    display_df = summary_df[['combination', 'num_models', 'avg_score', 'spearman', 'accuracy', 'best_alpha']].copy()
    display_df.columns = ['Combination', '#Models', 'Avg Score', 'Spearman', 'Accuracy', 'Best Î±']
    
    # Format percentages
    for col in ['Avg Score', 'Spearman', 'Accuracy']:
        display_df[col] = display_df[col].apply(lambda x: f"{x:.2%}")
    
    display(display_df.reset_index(drop=True))

    return summary_df

In [48]:
combinations_to_compare = {}

for size in range(2, len(all_systems) + 1):  # Start from 2 to skip single models
    for combo in combinations(all_systems, size):
        # Use shortened names for cleaner output
        combo_name = '+'.join(combo)
        combinations_to_compare[combo_name] = list(combo)

summary = compare_all_combinations(
    train_df,
    test_df,
    test_gold_labels,
    combinations_to_compare,
    alphas=[0.1, 1.0, 10.0]
)


ðŸ“‹ Results (Ranked by Average Score):


Unnamed: 0,Combination,#Models,Avg Score,Spearman,Accuracy,Best Î±
0,david_v1+david_v2+korean_v1+korean_v2+qwen_v1+...,7,89.62%,84.33%,94.92%,10.0
1,david_v1+david_v2+korean_v1+korean_v2+qwen_v1+...,8,89.60%,84.28%,94.92%,10.0
2,chatgpt_v1+david_v1+david_v2+korean_v1+korean_...,8,89.60%,84.28%,94.92%,10.0
3,chatgpt_v1+david_v1+david_v2+korean_v1+korean_...,9,89.59%,84.27%,94.92%,10.0
4,chatgpt_v1+david_v1+david_v2+korean_v1+korean_...,6,89.52%,84.12%,94.92%,10.0
...,...,...,...,...,...,...
2031,david_v1+urdu_v2,2,64.38%,59.26%,69.49%,0.1
2032,david_v3+urdu_v3,2,59.85%,47.94%,71.75%,10.0
2033,david_v1+david_v3+urdu_v3,3,59.21%,47.24%,71.19%,0.1
2034,david_v1+urdu_v3,2,57.15%,46.50%,67.80%,10.0


In [49]:
print("\nâœ… Ensemble evaluation complete.")
print("\nSummary of all model combinations evaluated:")
# row 1
print(summary.to_string(index=False))


âœ… Ensemble evaluation complete.

Summary of all model combinations evaluated:
                                                                                       combination  num_models                                                                                                       models  best_alpha  spearman  accuracy  avg_score
                                     david_v1+david_v2+korean_v1+korean_v2+qwen_v1+qwen_v2+urdu_v2           7                                          david_v1, david_v2, korean_v1, korean_v2, qwen_v1, qwen_v2, urdu_v2        10.0  0.843280  0.949153   0.896216
                             david_v1+david_v2+korean_v1+korean_v2+qwen_v1+qwen_v2+urdu_v2+urdu_v3           8                                 david_v1, david_v2, korean_v1, korean_v2, qwen_v1, qwen_v2, urdu_v2, urdu_v3        10.0  0.842788  0.949153   0.895970
                          chatgpt_v1+david_v1+david_v2+korean_v1+korean_v2+qwen_v1+qwen_v2+urdu_v2           8                    

In [None]:
# Example 2: Compare all predefined combinations
# Uncomment and run to compare all combinations:

# summary = compare_all_combinations(
#     train_df,
#     test_df,
#     test_gold_labels,
#     MODEL_COMBINATIONS
# )

In [None]:
# Example 1: Test a single combination
# Uncomment and run to test just one combination:

# test_model_combination(
#     'best_5',
#     MODEL_COMBINATIONS['best_5'],
#     train_df,
#     test_df,
#     test_gold_labels
# )

In [None]:
# # Inspect Ridge Weights
# coef = ridge.coef_
# for name, w in zip(X_train.columns, coef):
#     print(f"{name:20s} {w:+.4f}")

In [50]:
# david_v1+david_v2+korean_v1+korean_v2+qwen_v1+qwen_v2+urdu_v2
best_models = [ 'david_v1', 'david_v2', 'korean_v1', 'korean_v2', 'qwen_v1', 'qwen_v2', 'urdu_v2' ]
X_train, y_train, X_test, y_test = prepare_data_splits(
    train_df,
    test_df,
    best_models
)

## XGBoost Regressor

In [51]:
def train_eval_xgb(
    X_train,
    y_train,
    X_test,
    test_gold_labels,
    *,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,     # L1
    reg_lambda=10.0,   # L2
    random_state=42,
):
    xgb = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective="reg:squarederror",
        random_state=random_state,
        n_jobs=-1,
    )

    # Fit with early stopping (VERY IMPORTANT)
    xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train)],
        verbose=False,
    )

    # Predict
    y_pred = xgb.predict(X_test)

    # Float eval
    scores_float = evaluate_predictions_array(y_pred, test_gold_labels)

    # Int eval
    y_pred_int = y_pred.round().clip(1, 5).astype(int)
    scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float, scores_int


In [52]:
def grid_search_xgboost(X_train, y_train, X_test, test_gold_labels, param_grid=None):
    """
    Grid search over XGBoost hyperparameters.
    
    Args:
        X_train, y_train: Training data
        X_test, test_gold_labels: Test data
        param_grid: Dictionary of parameters to search. If None, uses default grid.
    
    Returns:
        best_params, best_scores_float, best_scores_int, best_model, results_df
    """
    from xgboost import XGBRegressor
    
    # Default parameter grid
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 500],
            'max_depth': [1, 2, 3],
            'learning_rate': [0.01, 0.05, 0.1],
            'min_child_weight': [10, 50, 100],
            'subsample': [0.7, 0.8, 0.9],
            'colsample_bytree': [0.7, 0.8, 0.9],
            'reg_alpha': [0.1, 1.0, 10.0],
            'reg_lambda': [1.0, 10.0, 50.0],
        }
    
    results = []
    best_avg_score = 0
    best_params = None
    best_scores_float = None
    best_scores_int = None
    best_model = None
    
    # Generate all combinations
    import itertools
    keys = param_grid.keys()
    values = param_grid.values()
    
    total_combinations = 1
    for v in values:
        total_combinations *= len(v)
    
    print(f"Testing {total_combinations} parameter combinations...")
    
    for i, combination in enumerate(itertools.product(*values), 1):
        params = dict(zip(keys, combination))
        
        # Train XGBoost with these parameters
        xgb = XGBRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            min_child_weight=params['min_child_weight'],
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            objective="reg:squarederror",
            random_state=0,
            n_jobs=-1,
        )
        
        xgb.fit(X_train, y_train, verbose=False)
        
        # Predict and evaluate
        y_pred = xgb.predict(X_test)
        scores_float = evaluate_predictions_array(y_pred, test_gold_labels)
        
        y_pred_int = y_pred.round().clip(1, 5).astype(int)
        scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)
        
        avg_score = (scores_float['spearman'] + scores_float['accuracy']) / 2
        
        # Store results
        results.append({
            **params,
            'spearman': scores_float['spearman'],
            'accuracy': scores_float['accuracy'],
            'avg_score': avg_score,
        })
        
        # Track best
        if avg_score > best_avg_score:
            best_avg_score = avg_score
            best_params = params
            best_scores_float = scores_float
            best_scores_int = scores_int
            best_model = xgb
        
        # Progress update every 10%
        if i % max(1, total_combinations // 10) == 0:
            print(f"  Progress: {i}/{total_combinations} ({i/total_combinations*100:.0f}%)")
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('avg_score', ascending=False)
    
    print(f"\nâœ… Best parameters found:")
    for key, val in best_params.items():
        print(f"   {key:20s} = {val}")
    print(f"\n   Avg Score: {best_avg_score:.2%}")
    print(f"   Spearman:  {best_scores_float['spearman']:.2%}")
    print(f"   Accuracy:  {best_scores_float['accuracy']:.2%}")
    
    return best_params, best_scores_float, best_scores_int, best_model, results_df


In [70]:
custom_grid = {
    'n_estimators': [500],
    'max_depth': [1],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [30, 40, 50],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'reg_alpha': [1.0, 10.0],
    'reg_lambda': [1.0, 10.0],
}

best_params, scores_float, scores_int, best_model, results_df = grid_search_xgboost(
    X_train, y_train, X_test, test_gold_labels, param_grid=custom_grid
)

# View top 10 parameter combinations
print("\nðŸ“‹ Top 10 Parameter Combinations:")
display(results_df.head(10))

Testing 144 parameter combinations...
  Progress: 14/144 (10%)
  Progress: 28/144 (19%)
  Progress: 42/144 (29%)
  Progress: 56/144 (39%)
  Progress: 70/144 (49%)
  Progress: 84/144 (58%)
  Progress: 98/144 (68%)
  Progress: 112/144 (78%)
  Progress: 126/144 (88%)
  Progress: 140/144 (97%)

âœ… Best parameters found:
   n_estimators         = 500
   max_depth            = 1
   learning_rate        = 0.1
   min_child_weight     = 50
   subsample            = 0.8
   colsample_bytree     = 0.8
   reg_alpha            = 10.0
   reg_lambda           = 1.0

   Avg Score: 89.25%
   Spearman:  83.59%
   Accuracy:  94.92%

ðŸ“‹ Top 10 Parameter Combinations:


Unnamed: 0,n_estimators,max_depth,learning_rate,min_child_weight,subsample,colsample_bytree,reg_alpha,reg_lambda,spearman,accuracy,avg_score
142,500,1,0.1,50,0.8,0.8,10.0,1.0,0.835891,0.949153,0.892522
138,500,1,0.1,50,0.8,0.7,10.0,1.0,0.835454,0.949153,0.892303
106,500,1,0.1,30,0.8,0.7,10.0,1.0,0.835336,0.949153,0.892244
63,500,1,0.05,30,0.8,0.8,10.0,10.0,0.835233,0.949153,0.892193
79,500,1,0.05,40,0.8,0.8,10.0,10.0,0.835213,0.949153,0.892183
122,500,1,0.1,40,0.8,0.7,10.0,1.0,0.83507,0.949153,0.892111
62,500,1,0.05,30,0.8,0.8,10.0,1.0,0.835001,0.949153,0.892077
95,500,1,0.05,50,0.8,0.8,10.0,10.0,0.834996,0.949153,0.892074
78,500,1,0.05,40,0.8,0.8,10.0,1.0,0.834941,0.949153,0.892047
111,500,1,0.1,30,0.8,0.8,10.0,10.0,0.834726,0.949153,0.891939


In [75]:
# XGBoost with competition-validated settings
xgb = XGBRegressor(
    n_estimators=500,          # matched to lr=0.1
    max_depth=1,               # CRITICAL: best Spearman
    learning_rate=0.05,
    min_child_weight=30,       # strong regularization
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=10.0,             # L1 regularization
    reg_lambda=10.0,           # L2 regularization
    objective="reg:squarederror",
    random_state=0,
    n_jobs=-1,
)

# Train on training set
xgb.fit(X_train, y_train)

# Predict on test set
y_pred_xgb = xgb.predict(X_test)

# Evaluate float version
scores_xgb_float = evaluate_predictions_array(y_pred_xgb, test_gold_labels)
print(
    f"XGBoost depth=1 (float) - "
    f"Average: {(scores_xgb_float['spearman'] + scores_xgb_float['accuracy'])/2:.4%}, "
    f"Spearman: {scores_xgb_float['spearman']:.4%}, "
    f"Accuracy: {scores_xgb_float['accuracy']:.4%}"
)

# Evaluate integer version
y_pred_xgb_int = y_pred_xgb.round().clip(1, 5).astype(int)
scores_xgb_int = evaluate_predictions_array(y_pred_xgb_int, test_gold_labels)
print(
    f"XGBoost depth=1 (int)   - "
    f"Average: {(scores_xgb_int['spearman'] + scores_xgb_int['accuracy'])/2:.4%}, "
    f"Spearman: {scores_xgb_int['spearman']:.4%}, "
    f"Accuracy: {scores_xgb_int['accuracy']:.4%}"
)


XGBoost depth=1 (float) - Average: 89.0946%, Spearman: 83.2740%, Accuracy: 94.9153%
XGBoost depth=1 (int)   - Average: 79.6153%, Spearman: 76.7448%, Accuracy: 82.4859%


In [150]:
# Inspect XGBoost Feature Importances
importance = xgb.feature_importances_
print("XGBoost Feature Importances:")
for name, imp in zip(X_train.columns, importance):
    print(f"{name:20s} {imp:.4f}")
    
# Visualize as percentages
print("\nNormalized Contributions (%):")
total_importance = importance.sum()
for name, imp in zip(X_train.columns, importance):
    print(f"{name:20s} {imp/total_importance:>7.2%}")

XGBoost Feature Importances:
david_v2             0.0884
korean_v1            0.1413
korean_v2            0.7069
urdu_v2              0.0493
qwen_v2              0.0140

Normalized Contributions (%):
david_v2               8.84%
korean_v1             14.13%
korean_v2             70.69%
urdu_v2                4.93%
qwen_v2                1.40%


## XGBoost Ranker

In [141]:
# Helper function to scale predictions back to 1-5 range
def scale_to_range(arr, low=1, upp=5):
    """Scale array values to be within the specified range [low, upp]."""
    arr_min, arr_max = arr.min(), arr.max()
    if arr_max == arr_min:
        return np.full_like(arr, (low + upp) / 2)
    arr_scaled = (arr - arr_min) / (arr_max - arr_min) * (upp - low) + low
    return arr_scaled

In [142]:
def train_eval_xgb_ranker(
    X_train,
    y_train,
    X_test,
    test_gold_labels,
    group_size=6,
    *,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,     # L1
    reg_lambda=10.0,   # L2
    random_state=42,
):
    """
    Train and evaluate XGBRanker.
    
    Args:
        group_size: Number of samples per group for ranking
                   (e.g., 6 means every 6 consecutive samples form a ranking group)
    """
    # Create group information for ranking
    # Each group contains group_size samples to be ranked together
    num_groups_train = len(X_train) // group_size
    group_train = [group_size] * num_groups_train
    
    # Initialize XGBRanker
    xgb_ranker = XGBRanker(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective="rank:pairwise",  # Pairwise ranking objective
        random_state=random_state,
        n_jobs=-1,
    )

    # Fit with group information
    xgb_ranker.fit(
        X_train[:num_groups_train * group_size],  # Use only complete groups
        y_train[:num_groups_train * group_size],
        group=group_train,
        verbose=False,
    )

    # Predict - returns ranking scores
    y_pred_scores = xgb_ranker.predict(X_test)
    
    # Scale scores back to 1-5 range
    y_pred = scale_to_range(y_pred_scores, low=1, upp=5)

    # Float eval
    scores_float = evaluate_predictions_array(y_pred, test_gold_labels)

    # Int eval
    y_pred_int = y_pred.round().clip(1, 5).astype(int)
    scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float, scores_int, xgb_ranker

In [143]:
# Grid search over XGBoost Ranker hyperparameters
print("=" * 80)
print("XGBoost Ranker Hyperparameter Search")
print("=" * 80)

configs_ranker = [
    dict(learning_rate=0.1,  n_estimators=100, group_size=6),
    dict(learning_rate=0.05, n_estimators=200, group_size=6),
    dict(learning_rate=0.02, n_estimators=400, group_size=6),
]

for depth in [1, 2, 3]:
    print(f"\nDepth={depth}")
    print("-" * 80)
    for cfg in configs_ranker:
        scores_float, scores_int, _ = train_eval_xgb_ranker(
            X_train,
            y_train,
            X_test,
            test_gold_labels,
            max_depth=depth,
            **cfg
        )

        print(
            f"  lr={cfg['learning_rate']:4.2f} n={cfg['n_estimators']:3d} group={cfg['group_size']} | "
            f"Spearman={scores_float['spearman']:.4%} Acc={scores_float['accuracy']:.4%}"
        )

XGBoost Ranker Hyperparameter Search

Depth=1
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=6 | Spearman=83.3431% Acc=92.0904%
  lr=0.05 n=200 group=6 | Spearman=83.5367% Acc=93.7853%
  lr=0.02 n=400 group=6 | Spearman=83.7295% Acc=92.6554%

Depth=2
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=6 | Spearman=82.9971% Acc=88.7006%
  lr=0.05 n=200 group=6 | Spearman=82.9917% Acc=90.9605%
  lr=0.02 n=400 group=6 | Spearman=83.0810% Acc=89.8305%

Depth=3
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=6 | Spearman=82.6533% Acc=90.3955%
  lr=0.05 n=200 group=6 | Spearman=83.0643% Acc=89.2655%
  lr=0.02 n=400 group=6 | Spearman=82.8186% Acc=89.2655%


In [144]:
# Train final XGBoost Ranker with best settings
print("=" * 80)
print("Final XGBoost Ranker Model")
print("=" * 80)

xgb_ranker = XGBRanker(
    n_estimators=400,
    max_depth=1,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=10.0,
    objective="rank:pairwise",
    n_jobs=-1,
)

# Create groups (samples per group)
group_size = 6
num_groups_train = len(X_train) // group_size
group_train = [group_size] * num_groups_train

# Train on complete groups only
X_train_grouped = X_train[:num_groups_train * group_size]
y_train_grouped = y_train[:num_groups_train * group_size]

xgb_ranker.fit(X_train_grouped, y_train_grouped, group=group_train)

# Predict on test set
y_pred_ranker_scores = xgb_ranker.predict(X_test)
y_pred_ranker = scale_to_range(y_pred_ranker_scores, low=1, upp=5)

# Evaluate float version
scores_ranker_float = evaluate_predictions_array(y_pred_ranker, test_gold_labels)
print(f"XGBoost Ranker (float) - Average: {(scores_ranker_float['spearman'] + scores_ranker_float['accuracy'])/2:.4%}, "
      f"Spearman: {scores_ranker_float['spearman']:.4%}, Accuracy: {scores_ranker_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_ranker_int = y_pred_ranker.round().clip(1, 5).astype(int)
scores_ranker_int = evaluate_predictions_array(y_pred_ranker_int, test_gold_labels)
print(f"XGBoost Ranker (int)   - Average: {(scores_ranker_int['spearman'] + scores_ranker_int['accuracy'])/2:.4%}, "
      f"Spearman: {scores_ranker_int['spearman']:.4%}, Accuracy: {scores_ranker_int['accuracy']:.4%}")

Final XGBoost Ranker Model
XGBoost Ranker (float) - Average: 88.6203%, Spearman: 83.4553%, Accuracy: 93.7853%
XGBoost Ranker (int)   - Average: 83.7981%, Spearman: 81.1555%, Accuracy: 86.4407%


In [145]:
# Inspect XGBoost Ranker Feature Importances
importance_ranker = xgb_ranker.feature_importances_
print("XGBoost Ranker Feature Importances:")
for name, imp in zip(X_train.columns, importance_ranker):
    print(f"{name:20s} {imp:.4f}")
    
# Visualize as percentages
print("\nNormalized Contributions (%):")
total_importance_ranker = importance_ranker.sum()
for name, imp in zip(X_train.columns, importance_ranker):
    print(f"{name:20s} {imp/total_importance_ranker:>7.2%}")

XGBoost Ranker Feature Importances:
david_v2             0.1766
korean_v1            0.2209
korean_v2            0.2235
urdu_v2              0.1518
qwen_v2              0.2271

Normalized Contributions (%):
david_v2              17.66%
korean_v1             22.09%
korean_v2             22.35%
urdu_v2               15.18%
qwen_v2               22.71%
