In [74]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'Author'
__email__ = 'Email'

# SemEval 2026 Task 5 - Ensemble

In [75]:
# dependency
# built-in
import json
import os
import sys
from pathlib import Path

# third-party
import random
import pandas as pd
import numpy as np
from scipy import stats
from xgboost import XGBRanker, XGBRegressor
from scipy.stats import spearmanr
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# local - add src/eval to path for importing evaluation functions
sys.path.insert(0, str(Path('../src/eval').resolve()))
# Import evaluation functions from src/eval/scoring.py
import scoring

%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Init

In [76]:
# helper
def set_seed(seed=42):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)

def load_predictions(filepath):
    """Load predictions from a JSONL file into a dictionary."""
    predictions = {}
    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            predictions[data['id']] = data['prediction']
    return predictions

# Evaluation functions following the structure of src/eval/scoring.py
def evaluate_predictions_array(y_pred, y_true_labels):
    """
    Evaluate predictions using the same logic as scoring.py
    
    Args:
        y_pred: array of predictions
        y_true_labels: list of gold label lists (5 ratings each)
    
    Returns:
        dict with spearman and accuracy scores
    """
    # Build prediction and gold lists (same structure as scoring.py)
    pred_list = list(y_pred)
    gold_list = [scoring.get_average(labels) for labels in y_true_labels]
    
    # Calculate Spearman correlation (same as scoring.py)
    corr, p_value = spearmanr(pred_list, gold_list)
    
    # Calculate accuracy within SD (same logic as scoring.py)
    correct_guesses = 0
    wrong_guesses = 0
    
    for pred, labels in zip(pred_list, y_true_labels):
        if scoring.is_within_standard_deviation(pred, labels):
            correct_guesses += 1
        else:
            wrong_guesses += 1
    
    accuracy = correct_guesses / (correct_guesses + wrong_guesses)
    
    return {
        'spearman': corr,
        'p_value': p_value,
        'accuracy': accuracy,
        'correct': correct_guesses,
        'total': correct_guesses + wrong_guesses
    }

In [77]:
# init
set_seed(0)

## Data

In [78]:
# Load gold labels (solution file)
SOLUTION_FILE = Path("../res/data/dev_solution.jsonl")

gold_labels = {}
with open(SOLUTION_FILE, 'r') as f:
    for line in f:
        data = json.loads(line.strip())
        gold_labels[data['id']] = data['label']

print(f"Loaded {len(gold_labels)} gold labels")
print(f"\nGold labels are lists of 5 human ratings (1-5 scale)")
print(f"\nExample gold labels:")
for i in range(5):
    sample_id = str(i)
    print(f"  ID {sample_id}: {gold_labels[sample_id]} (avg={np.mean(gold_labels[sample_id]):.2f}, std={np.std(gold_labels[sample_id], ddof=1):.2f})")

Loaded 588 gold labels

Gold labels are lists of 5 human ratings (1-5 scale)

Example gold labels:
  ID 0: [4, 5, 3, 1, 5] (avg=3.60, std=1.67)
  ID 1: [3, 3, 4, 4, 4] (avg=3.60, std=0.55)
  ID 2: [5, 5, 2, 3, 4] (avg=3.80, std=1.30)
  ID 3: [4, 5, 4, 3, 5] (avg=4.20, std=0.84)
  ID 4: [1, 5, 4, 4, 1] (avg=3.00, std=1.87)


## System

In [79]:
# Path to individual system outputs
RESULTS_DIR = Path("../res/results/dev/")

# Get all jsonl files
prediction_files = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files)} prediction files:")
for f in prediction_files:
    print(f"  - {f.name}")

Found 7 prediction files:
  - chatgpt.jsonl
  - david_v1.jsonl
  - david_v2.jsonl
  - korean.jsonl
  - urdu_v1.jsonl
  - urdu_v2.jsonl
  - urdu_v3.jsonl


In [80]:
# Load all predictions
all_predictions = {}
for pred_file in prediction_files:
    system_name = pred_file.stem  # filename without extension
    all_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_predictions[system_name])} predictions from {system_name}")

# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions = pd.DataFrame(all_predictions)
df_predictions.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions.shape}")
print(f"Number of samples: {len(df_predictions)}")
print(f"Number of systems: {len(df_predictions.columns)}")
print(f"\nSystems: {list(df_predictions.columns)}")
print(f"\nSample data types:")
print(df_predictions.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions.head(10)

Loaded 588 predictions from chatgpt
Loaded 588 predictions from david_v1
Loaded 588 predictions from david_v2
Loaded 588 predictions from korean
Loaded 588 predictions from urdu_v1
Loaded 588 predictions from urdu_v2
Loaded 588 predictions from urdu_v3

Predictions DataFrame shape: (588, 7)
Number of samples: 588
Number of systems: 7

Systems: ['chatgpt', 'david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'urdu_v3']

Sample data types:
chatgpt       int64
david_v1      int64
david_v2    float64
korean      float64
urdu_v1       int64
urdu_v2       int64
urdu_v3       int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,chatgpt,david_v1,david_v2,korean,urdu_v1,urdu_v2,urdu_v3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,5,4,4.06,4.360252,4,4,4
1,2,4,1.94,2.586742,3,2,3
2,3,5,3.9976,2.647276,4,3,4
3,2,5,2.0024,3.392001,4,4,4
4,3,5,3.548,2.816402,4,4,4
5,2,4,2.452,3.373284,4,3,4
6,5,4,4.5992,4.360252,2,4,2
7,1,1,1.4008,1.942182,2,3,2
8,1,3,2.8696,1.942182,2,2,2
9,3,3,3.1304,3.45069,4,5,4


In [81]:
# Create a combined DataFrame with predictions and gold labels
df_gold = pd.DataFrame({
    'gold_labels': gold_labels,
    'gold_avg': {k: np.mean(v) for k, v in gold_labels.items()},
    'gold_std': {k: np.std(v, ddof=1) for k, v in gold_labels.items()}
})

# Combine predictions with gold labels
df_combined = df_predictions.join(df_gold)

print(f"Combined DataFrame shape: {df_combined.shape}")
print(f"\nColumns: {list(df_combined.columns)}")
print(f"\nFirst 10 rows:")
df_combined.head(10)

Combined DataFrame shape: (588, 10)

Columns: ['chatgpt', 'david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'urdu_v3', 'gold_labels', 'gold_avg', 'gold_std']

First 10 rows:


Unnamed: 0_level_0,chatgpt,david_v1,david_v2,korean,urdu_v1,urdu_v2,urdu_v3,gold_labels,gold_avg,gold_std
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5,4,4.06,4.360252,4,4,4,"[4, 5, 3, 1, 5]",3.6,1.67332
1,2,4,1.94,2.586742,3,2,3,"[3, 3, 4, 4, 4]",3.6,0.547723
2,3,5,3.9976,2.647276,4,3,4,"[5, 5, 2, 3, 4]",3.8,1.30384
3,2,5,2.0024,3.392001,4,4,4,"[4, 5, 4, 3, 5]",4.2,0.83666
4,3,5,3.548,2.816402,4,4,4,"[1, 5, 4, 4, 1]",3.0,1.870829
5,2,4,2.452,3.373284,4,3,4,"[4, 3, 4, 1, 3]",3.0,1.224745
6,5,4,4.5992,4.360252,2,4,2,"[4, 4, 5, 5, 5]",4.6,0.547723
7,1,1,1.4008,1.942182,2,3,2,"[1, 1, 1, 2, 2, 1]",1.333333,0.516398
8,1,3,2.8696,1.942182,2,2,2,"[4, 1, 1, 2, 3]",2.2,1.30384
9,3,3,3.1304,3.45069,4,5,4,"[4, 2, 5, 4, 4]",3.8,1.095445


# Split Data

In [82]:
# Use stratification based on binned gold_avg to ensure balanced distribution
df_combined['gold_bin'] = pd.cut(df_combined['gold_avg'], bins=5, labels=False)

train_df, test_df = train_test_split(
    df_combined, 
    test_size=0.3, 
    random_state=42,
    stratify=df_combined['gold_bin']
)

# Drop the temporary binning column
train_df = train_df.drop('gold_bin', axis=1)
test_df = test_df.drop('gold_bin', axis=1)

print(f"Total samples: {len(df_combined)}")
print(f"Training samples: {len(train_df)} ({len(train_df)/len(df_combined)*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/len(df_combined)*100:.1f}%)")

print(f"\nTraining set gold_avg distribution:")
print(f"  Mean: {train_df['gold_avg'].mean():.3f}")
print(f"  Std: {train_df['gold_avg'].std():.3f}")
print(f"  Min: {train_df['gold_avg'].min():.3f}")
print(f"  Max: {train_df['gold_avg'].max():.3f}")

print(f"\nTest set gold_avg distribution:")
print(f"  Mean: {test_df['gold_avg'].mean():.3f}")
print(f"  Std: {test_df['gold_avg'].std():.3f}")
print(f"  Min: {test_df['gold_avg'].min():.3f}")
print(f"  Max: {test_df['gold_avg'].max():.3f}")

print(f"\nTraining set sample IDs range: {train_df.index.min()} to {train_df.index.max()}")
print(f"Test set sample IDs range: {test_df.index.min()} to {test_df.index.max()}")

Total samples: 588
Training samples: 411 (69.9%)
Test samples: 177 (30.1%)

Training set gold_avg distribution:
  Mean: 3.109
  Std: 1.191
  Min: 1.000
  Max: 5.000

Test set gold_avg distribution:
  Mean: 3.140
  Std: 1.178
  Min: 1.000
  Max: 5.000

Training set sample IDs range: 0 to 97
Test set sample IDs range: 1 to 99


In [83]:
# Prepare training and test sets
# X = system predictions (features), y = gold average (target)

X_train = train_df[['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']]
# X_train = train_df[['david_v1', 'david_v2', 'korean']]
y_train = train_df['gold_avg']

X_test = test_df[['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']]
# X_test = test_df[['david_v1', 'david_v2', 'korean']]
y_test = test_df['gold_avg']

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

print(f"\nFirst few training samples:")
pd.concat([X_train.head(), y_train.head()], axis=1)

X_train shape: (411, 5)
y_train shape: (411,)
X_test shape: (177, 5)
y_test shape: (177,)

First few training samples:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v1,urdu_v2,gold_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
267,2,1.008,1.942182,4,2,1.2
294,5,4.9928,4.360252,4,5,4.4
65,4,2.25,2.630037,2,4,1.8
191,5,4.2396,4.397248,4,5,4.0
391,2,1.0592,2.008239,2,3,1.4


# System Evaluation

In [85]:
# Evaluate each system on test set
print("=" * 80)
print("Individual System Performance on Test Set (30%)")
print("=" * 80)
print(f"{'System':<15} {'Spearman':>12} {'Accuracy':>12} {'Correct/Total':>15}")
print("-" * 80)

system_results = {}
test_gold_labels = test_df['gold_labels'].tolist()

for system in ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']:
# for system in ['david_v1', 'david_v2', 'korean']:
    # Get predictions from X_test
    y_pred = X_test[system].values
    
    # Evaluate using official logic
    scores = evaluate_predictions_array(y_pred, test_gold_labels)
    system_results[system] = scores
    
    print(f"{system:<15} {scores['spearman']:>12.4%} {scores['accuracy']:>12.4%} "
          f"{scores['correct']:>7}/{scores['total']:<7}")

print("=" * 80)

Individual System Performance on Test Set (30%)
System              Spearman     Accuracy   Correct/Total
--------------------------------------------------------------------------------
david_v1            30.8245%     61.0169%     108/177    
david_v2            60.5211%     68.3616%     121/177    
korean              82.7033%     90.3955%     160/177    
urdu_v1             43.0128%     69.4915%     123/177    
urdu_v2             60.1281%     70.6215%     125/177    


# Ensemble

## Mean

In [66]:
# Simple Mean Ensemble: y_pred = (1/N) * sum(y_i)
y_pred_mean = X_test.mean(axis=1).values

# Evaluate float version
scores_mean_float = evaluate_predictions_array(y_pred_mean, test_gold_labels)
print(f"Mean (float) - Spearman: {scores_mean_float['spearman']:.4%}, Accuracy: {scores_mean_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_mean_int = y_pred_mean.round().clip(1, 5).astype(int)
scores_mean_int = evaluate_predictions_array(y_pred_mean_int, test_gold_labels)
print(f"Mean (int)   - Spearman: {scores_mean_int['spearman']:.4%}, Accuracy: {scores_mean_int['accuracy']:.4%}")

Mean (float) - Spearman: 77.0697%, Accuracy: 73.4463%
Mean (int)   - Spearman: 66.4454%, Accuracy: 67.7966%


## Weighted Mean

In [67]:
# Weighted Mean: weights learned from train set (70%) based on Spearman performance
train_gold_labels = train_df['gold_labels'].tolist()

# Calculate weights based on train set performance (Spearman correlation)
weights = []
for system in ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']:
# for system in ['david_v1', 'david_v2', 'korean']:
    y_pred_train = X_train[system].values
    scores_train = evaluate_predictions_array(y_pred_train, train_gold_labels)
    weights.append(scores_train['spearman'])

# Convert to numpy array and normalize to sum to 1
weights = np.array(weights)
weights = weights / weights.sum()

# Apply weighted average on test set
y_pred_weighted = (X_test.values * weights).sum(axis=1)

# Evaluate float version
scores_weighted_float = evaluate_predictions_array(y_pred_weighted, test_gold_labels)
print(f"Weighted Mean (float) - Spearman: {scores_weighted_float['spearman']:.4%}, Accuracy: {scores_weighted_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_weighted_int = y_pred_weighted.round().clip(1, 5).astype(int)
scores_weighted_int = evaluate_predictions_array(y_pred_weighted_int, test_gold_labels)
print(f"Weighted Mean (int)   - Spearman: {scores_weighted_int['spearman']:.4%}, Accuracy: {scores_weighted_int['accuracy']:.4%}")

Weighted Mean (float) - Spearman: 80.5767%, Accuracy: 71.7514%
Weighted Mean (int)   - Spearman: 71.1430%, Accuracy: 67.2316%


## Majority Voting

In [68]:
# Majority Voting: most common rounded prediction across systems
# Round predictions to integers (1-5 range)
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Get mode (most frequent value) for each row
y_pred_majority = stats.mode(X_test_rounded, axis=1, keepdims=False)[0]

# Evaluate
scores_majority = evaluate_predictions_array(y_pred_majority, test_gold_labels)
print(f"Majority Voting - Spearman: {scores_majority['spearman']:.4%}, Accuracy: {scores_majority['accuracy']:.4%}")

Majority Voting - Spearman: 59.0033%, Accuracy: 74.5763%


## Weighted Majority Voting

In [69]:
# Weighted Majority Voting: weighted voting using performance-based weights
# Round test predictions to integers
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Use weights learned earlier (from Weighted Mean)
y_pred_weighted_vote = []
for idx in range(len(X_test_rounded)):
    # Get predictions for this sample
    votes = X_test_rounded.iloc[idx].values
    
    # Count weighted votes for each class (1-5)
    vote_counts = {}
    for vote, weight in zip(votes, weights):
        vote_counts[vote] = vote_counts.get(vote, 0) + weight
    
    # Select class with highest weighted vote
    winner = max(vote_counts.items(), key=lambda x: x[1])[0]
    y_pred_weighted_vote.append(winner)

y_pred_weighted_vote = np.array(y_pred_weighted_vote)

# Evaluate
scores_weighted_vote = evaluate_predictions_array(y_pred_weighted_vote, test_gold_labels)
print(f"Weighted Majority Voting - Spearman: {scores_weighted_vote['spearman']:.4%}, Accuracy: {scores_weighted_vote['accuracy']:.4%}")

Weighted Majority Voting - Spearman: 69.6016%, Accuracy: 78.5311%


## Linear Stacking (Ridge Regression)

In [86]:
# Linear Stacking: learn weights via Ridge Regression on train set
for alpha in [0.1, 1.0, 10.0]:
    print("=" * 80)
    print(f"Ridge Regression Stacking (alpha={alpha})")
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train, y_train)

    # Predict on test set
    y_pred_ridge = ridge.predict(X_test)

    # Evaluate float version
    scores_ridge_float = evaluate_predictions_array(y_pred_ridge, test_gold_labels)
    print(f"Ridge (float) - Spearman: {scores_ridge_float['spearman']:.4%}, Accuracy: {scores_ridge_float['accuracy']:.4%}")

    # Evaluate integer version
    y_pred_ridge_int = y_pred_ridge.round().clip(1, 5).astype(int)
    scores_ridge_int = evaluate_predictions_array(y_pred_ridge_int, test_gold_labels)
    print(f"Ridge (int)   - Spearman: {scores_ridge_int['spearman']:.4%}, Accuracy: {scores_ridge_int['accuracy']:.4%}")

Ridge Regression Stacking (alpha=0.1)
Ridge (float) - Spearman: 84.5721%, Accuracy: 91.5254%
Ridge (int)   - Spearman: 81.5111%, Accuracy: 84.7458%
Ridge Regression Stacking (alpha=1.0)
Ridge (float) - Spearman: 84.5721%, Accuracy: 91.5254%
Ridge (int)   - Spearman: 81.5111%, Accuracy: 84.7458%
Ridge Regression Stacking (alpha=10.0)
Ridge (float) - Spearman: 84.5721%, Accuracy: 91.5254%
Ridge (int)   - Spearman: 81.5111%, Accuracy: 84.7458%


In [87]:
# Inspect Ridge Weights
coef = ridge.coef_
for name, w in zip(X_train.columns, coef):
    print(f"{name:20s} {w:+.4f}")

david_v1             +0.1391
david_v2             +0.1424
korean               +0.8260
urdu_v1              +0.0404
urdu_v2              +0.0683


## XGBoost Regressor

In [88]:
def train_eval_xgb(
    X_train,
    y_train,
    X_test,
    test_gold_labels,
    *,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,     # L1
    reg_lambda=10.0,   # L2
    random_state=42,
):
    xgb = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective="reg:squarederror",
        random_state=random_state,
        n_jobs=-1,
    )

    # Fit with early stopping (VERY IMPORTANT)
    xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train)],
        verbose=False,
    )

    # Predict
    y_pred = xgb.predict(X_test)

    # Float eval
    scores_float = evaluate_predictions_array(y_pred, test_gold_labels)

    # Int eval
    y_pred_int = y_pred.round().clip(1, 5).astype(int)
    scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float, scores_int


In [89]:
depths = [1, 2, 3]

configs = [
    dict(learning_rate=0.1,  n_estimators=100),
    dict(learning_rate=0.05, n_estimators=200),
    dict(learning_rate=0.02, n_estimators=400),
]

regularizers = [
    dict(reg_alpha=0.0, reg_lambda=1.0),
    dict(reg_alpha=1.0, reg_lambda=10.0),
    dict(reg_alpha=10.0, reg_lambda=50.0),
]

for depth in [1, 2, 3]:
    for cfg in configs:
        scores_float, scores_int = train_eval_xgb(
            X_train,
            y_train,
            X_test,
            test_gold_labels,
            max_depth=depth,
            **cfg
        )

        print(
            f"depth={depth} lr={cfg['learning_rate']} n={cfg['n_estimators']} | "
            f"Spearman={scores_float['spearman']:.4%} "
            f"Acc={scores_float['accuracy']:.4%}"
        )


depth=1 lr=0.1 n=100 | Spearman=83.3860% Acc=93.2203%
depth=1 lr=0.05 n=200 | Spearman=83.1523% Acc=92.6554%
depth=1 lr=0.02 n=400 | Spearman=83.1028% Acc=93.7853%
depth=2 lr=0.1 n=100 | Spearman=82.9433% Acc=91.5254%
depth=2 lr=0.05 n=200 | Spearman=82.8727% Acc=91.5254%
depth=2 lr=0.02 n=400 | Spearman=83.0162% Acc=92.6554%
depth=3 lr=0.1 n=100 | Spearman=82.0758% Acc=92.0904%
depth=3 lr=0.05 n=200 | Spearman=82.2284% Acc=91.5254%
depth=3 lr=0.02 n=400 | Spearman=82.5424% Acc=91.5254%


In [90]:
# XGBoost with competition-validated settings
xgb = XGBRegressor(
    n_estimators=100,          # matched to lr=0.1
    max_depth=1,               # CRITICAL: best Spearman
    learning_rate=0.1,
    min_child_weight=50,       # strong regularization
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,             # L1 regularization
    reg_lambda=10.0,           # L2 regularization
    objective="reg:squarederror",
    random_state=0,
    n_jobs=-1,
)

# Train on training set
xgb.fit(X_train, y_train)

# Predict on test set
y_pred_xgb = xgb.predict(X_test)

# Evaluate float version
scores_xgb_float = evaluate_predictions_array(y_pred_xgb, test_gold_labels)
print(
    f"XGBoost depth=1 (float) - "
    f"Spearman: {scores_xgb_float['spearman']:.4%}, "
    f"Accuracy: {scores_xgb_float['accuracy']:.4%}"
)

# Evaluate integer version
y_pred_xgb_int = y_pred_xgb.round().clip(1, 5).astype(int)
scores_xgb_int = evaluate_predictions_array(y_pred_xgb_int, test_gold_labels)
print(
    f"XGBoost depth=1 (int)   - "
    f"Spearman: {scores_xgb_int['spearman']:.4%}, "
    f"Accuracy: {scores_xgb_int['accuracy']:.4%}"
)


XGBoost depth=1 (float) - Spearman: 83.1761%, Accuracy: 94.3503%
XGBoost depth=1 (int)   - Spearman: 80.3693%, Accuracy: 85.8757%


In [91]:
# Inspect XGBoost Feature Importances
importance = xgb.feature_importances_
print("XGBoost Feature Importances:")
for name, imp in zip(X_train.columns, importance):
    print(f"{name:20s} {imp:.4f}")
    
# Visualize as percentages
print("\nNormalized Contributions (%):")
total_importance = importance.sum()
for name, imp in zip(X_train.columns, importance):
    print(f"{name:20s} {imp/total_importance:>7.2%}")

XGBoost Feature Importances:
david_v1             0.0657
david_v2             0.2308
korean               0.6123
urdu_v1              0.0251
urdu_v2              0.0661

Normalized Contributions (%):
david_v1               6.57%
david_v2              23.08%
korean                61.23%
urdu_v1                2.51%
urdu_v2                6.61%


## XGBoost Ranker

In [92]:
# Helper function to scale predictions back to 1-5 range
def scale_to_range(arr, low=1, upp=5):
    """Scale array values to be within the specified range [low, upp]."""
    arr_min, arr_max = arr.min(), arr.max()
    if arr_max == arr_min:
        return np.full_like(arr, (low + upp) / 2)
    arr_scaled = (arr - arr_min) / (arr_max - arr_min) * (upp - low) + low
    return arr_scaled

In [93]:
def train_eval_xgb_ranker(
    X_train,
    y_train,
    X_test,
    test_gold_labels,
    group_size=6,
    *,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,     # L1
    reg_lambda=10.0,   # L2
    random_state=42,
):
    """
    Train and evaluate XGBRanker.
    
    Args:
        group_size: Number of samples per group for ranking
                   (e.g., 6 means every 6 consecutive samples form a ranking group)
    """
    # Create group information for ranking
    # Each group contains group_size samples to be ranked together
    num_groups_train = len(X_train) // group_size
    group_train = [group_size] * num_groups_train
    
    # Initialize XGBRanker
    xgb_ranker = XGBRanker(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective="rank:pairwise",  # Pairwise ranking objective
        random_state=random_state,
        n_jobs=-1,
    )

    # Fit with group information
    xgb_ranker.fit(
        X_train[:num_groups_train * group_size],  # Use only complete groups
        y_train[:num_groups_train * group_size],
        group=group_train,
        verbose=False,
    )

    # Predict - returns ranking scores
    y_pred_scores = xgb_ranker.predict(X_test)
    
    # Scale scores back to 1-5 range
    y_pred = scale_to_range(y_pred_scores, low=1, upp=5)

    # Float eval
    scores_float = evaluate_predictions_array(y_pred, test_gold_labels)

    # Int eval
    y_pred_int = y_pred.round().clip(1, 5).astype(int)
    scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float, scores_int, xgb_ranker

In [105]:
# Grid search over XGBoost Ranker hyperparameters
print("=" * 80)
print("XGBoost Ranker Hyperparameter Search")
print("=" * 80)

configs_ranker = [
    dict(learning_rate=0.1,  n_estimators=100, group_size=9),
    dict(learning_rate=0.05, n_estimators=200, group_size=9),
    dict(learning_rate=0.02, n_estimators=400, group_size=9),
]

for depth in [1, 2, 3]:
    print(f"\nDepth={depth}")
    print("-" * 80)
    for cfg in configs_ranker:
        scores_float, scores_int, _ = train_eval_xgb_ranker(
            X_train,
            y_train,
            X_test,
            test_gold_labels,
            max_depth=depth,
            **cfg
        )

        print(
            f"  lr={cfg['learning_rate']:4.2f} n={cfg['n_estimators']:3d} group={cfg['group_size']} | "
            f"Spearman={scores_float['spearman']:.4%} Acc={scores_float['accuracy']:.4%}"
        )

XGBoost Ranker Hyperparameter Search

Depth=1
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=9 | Spearman=83.2733% Acc=93.2203%
  lr=0.05 n=200 group=9 | Spearman=82.8201% Acc=92.6554%
  lr=0.02 n=400 group=9 | Spearman=82.7442% Acc=92.6554%

Depth=2
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=9 | Spearman=83.6578% Acc=91.5254%
  lr=0.05 n=200 group=9 | Spearman=83.6267% Acc=92.6554%
  lr=0.02 n=400 group=9 | Spearman=83.5746% Acc=91.5254%

Depth=3
--------------------------------------------------------------------------------
  lr=0.10 n=100 group=9 | Spearman=83.0706% Acc=90.9605%
  lr=0.05 n=200 group=9 | Spearman=83.1426% Acc=92.0904%
  lr=0.02 n=400 group=9 | Spearman=83.5755% Acc=92.0904%


In [110]:
# Train final XGBoost Ranker with best settings
print("=" * 80)
print("Final XGBoost Ranker Model")
print("=" * 80)

xgb_ranker = XGBRanker(
    n_estimators=100,
    max_depth=1,
    learning_rate=0.1,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=10.0,
    objective="rank:pairwise",
    n_jobs=-1,
)

# Create groups (samples per group)
group_size = 9
num_groups_train = len(X_train) // group_size
group_train = [group_size] * num_groups_train

# Train on complete groups only
X_train_grouped = X_train[:num_groups_train * group_size]
y_train_grouped = y_train[:num_groups_train * group_size]

xgb_ranker.fit(X_train_grouped, y_train_grouped, group=group_train)

# Predict on test set
y_pred_ranker_scores = xgb_ranker.predict(X_test)
y_pred_ranker = scale_to_range(y_pred_ranker_scores, low=1, upp=5)

# Evaluate float version
scores_ranker_float = evaluate_predictions_array(y_pred_ranker, test_gold_labels)
print(f"XGBoost Ranker (float) - Spearman: {scores_ranker_float['spearman']:.4%}, Accuracy: {scores_ranker_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_ranker_int = y_pred_ranker.round().clip(1, 5).astype(int)
scores_ranker_int = evaluate_predictions_array(y_pred_ranker_int, test_gold_labels)
print(f"XGBoost Ranker (int)   - Spearman: {scores_ranker_int['spearman']:.4%}, Accuracy: {scores_ranker_int['accuracy']:.4%}")

Final XGBoost Ranker Model
XGBoost Ranker (float) - Spearman: 82.7531%, Accuracy: 94.3503%
XGBoost Ranker (int)   - Spearman: 79.8188%, Accuracy: 85.3107%


In [72]:
# Inspect XGBoost Ranker Feature Importances
importance_ranker = xgb_ranker.feature_importances_
print("XGBoost Ranker Feature Importances:")
for name, imp in zip(X_train.columns, importance_ranker):
    print(f"{name:20s} {imp:.4f}")
    
# Visualize as percentages
print("\nNormalized Contributions (%):")
total_importance_ranker = importance_ranker.sum()
for name, imp in zip(X_train.columns, importance_ranker):
    print(f"{name:20s} {imp/total_importance_ranker:>7.2%}")

XGBoost Ranker Feature Importances:
david_v1             0.1607
david_v2             0.1645
korean               0.2235
urdu_v1              0.0629
urdu_v2              0.1322
chatgpt              0.2561

Normalized Contributions (%):
david_v1              16.07%
david_v2              16.45%
korean                22.35%
urdu_v1                6.29%
urdu_v2               13.22%
chatgpt               25.61%
