In [549]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'Author'
__email__ = 'Email'

# SemEval 2026 Task 5 - Ensemble

In [550]:
# dependency
# built-in
import json
import os
import sys
from pathlib import Path

# third-party
import random
import pandas as pd
import numpy as np
from scipy import stats
from xgboost import XGBRanker
from scipy.stats import spearmanr
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# local - add src/eval to path for importing evaluation functions
sys.path.insert(0, str(Path('../src/eval').resolve()))
# Import evaluation functions from src/eval/scoring.py
import scoring

%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Input

In [551]:
# helper
def set_seed(seed=42):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)

def load_predictions(filepath):
    """Load predictions from a JSONL file into a dictionary."""
    predictions = {}
    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            predictions[data['id']] = data['prediction']
    return predictions

In [552]:
# init
set_seed(0)

## Data

In [553]:
# Load gold labels (solution file)
SOLUTION_FILE = Path("../res/data/dev_solution.jsonl")

gold_labels = {}
with open(SOLUTION_FILE, 'r') as f:
    for line in f:
        data = json.loads(line.strip())
        gold_labels[data['id']] = data['label']

print(f"Loaded {len(gold_labels)} gold labels")
print(f"\nGold labels are lists of 5 human ratings (1-5 scale)")
print(f"\nExample gold labels:")
for i in range(5):
    sample_id = str(i)
    print(f"  ID {sample_id}: {gold_labels[sample_id]} (avg={np.mean(gold_labels[sample_id]):.2f}, std={np.std(gold_labels[sample_id], ddof=1):.2f})")

Loaded 588 gold labels

Gold labels are lists of 5 human ratings (1-5 scale)

Example gold labels:
  ID 0: [4, 5, 3, 1, 5] (avg=3.60, std=1.67)
  ID 1: [3, 3, 4, 4, 4] (avg=3.60, std=0.55)
  ID 2: [5, 5, 2, 3, 4] (avg=3.80, std=1.30)
  ID 3: [4, 5, 4, 3, 5] (avg=4.20, std=0.84)
  ID 4: [1, 5, 4, 4, 1] (avg=3.00, std=1.87)


## System

In [554]:
# Path to individual system outputs
RESULTS_DIR = Path("./dev")

# Get all jsonl files
prediction_files = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files)} prediction files:")
for f in prediction_files:
    print(f"  - {f.name}")

Found 6 prediction files:
  - david_v1.jsonl
  - david_v2.jsonl
  - korean.jsonl
  - urdu_v1.jsonl
  - urdu_v2.jsonl
  - urdu_v3.jsonl


In [555]:
# Load all predictions
all_predictions = {}
for pred_file in prediction_files:
    system_name = pred_file.stem  # filename without extension
    all_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_predictions[system_name])} predictions from {system_name}")

# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions = pd.DataFrame(all_predictions)
df_predictions.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions.shape}")
print(f"Number of samples: {len(df_predictions)}")
print(f"Number of systems: {len(df_predictions.columns)}")
print(f"\nSystems: {list(df_predictions.columns)}")
print(f"\nSample data types:")
print(df_predictions.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions.head(10)

Loaded 588 predictions from david_v1
Loaded 588 predictions from david_v2
Loaded 588 predictions from korean
Loaded 588 predictions from urdu_v1
Loaded 588 predictions from urdu_v2
Loaded 588 predictions from urdu_v3

Predictions DataFrame shape: (588, 6)
Number of samples: 588
Number of systems: 6

Systems: ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'urdu_v3']

Sample data types:
david_v1      int64
david_v2    float64
korean      float64
urdu_v1       int64
urdu_v2       int64
urdu_v3       int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v1,urdu_v2,urdu_v3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4,4.06,4.360252,4,4,4
1,4,1.94,2.586742,3,2,3
2,5,3.9976,2.647276,4,3,4
3,5,2.0024,3.392001,4,4,4
4,5,3.548,2.816402,4,4,4
5,4,2.452,3.373284,4,3,4
6,4,4.5992,4.360252,2,4,2
7,1,1.4008,1.942182,2,3,2
8,3,2.8696,1.942182,2,2,2
9,3,3.1304,3.45069,4,5,4


In [556]:
# Create a combined DataFrame with predictions and gold labels
df_gold = pd.DataFrame({
    'gold_labels': gold_labels,
    'gold_avg': {k: np.mean(v) for k, v in gold_labels.items()},
    'gold_std': {k: np.std(v, ddof=1) for k, v in gold_labels.items()}
})

# Combine predictions with gold labels
df_combined = df_predictions.join(df_gold)


df_combined = df_combined.copy()
df_combined['group_id'] = np.arange(len(df_combined)) // 6

df_combined['unique_key'] = df_combined.index
print(f"Combined DataFrame shape: {df_combined.shape}")
print(f"\nColumns: {list(df_combined.columns)}")
print(f"\nFirst 10 rows:")
df_combined.head(10)

Combined DataFrame shape: (588, 11)

Columns: ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'urdu_v3', 'gold_labels', 'gold_avg', 'gold_std', 'group_id', 'unique_key']

First 10 rows:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v1,urdu_v2,urdu_v3,gold_labels,gold_avg,gold_std,group_id,unique_key
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,4,4.06,4.360252,4,4,4,"[4, 5, 3, 1, 5]",3.6,1.67332,0,0
1,4,1.94,2.586742,3,2,3,"[3, 3, 4, 4, 4]",3.6,0.547723,0,1
2,5,3.9976,2.647276,4,3,4,"[5, 5, 2, 3, 4]",3.8,1.30384,0,2
3,5,2.0024,3.392001,4,4,4,"[4, 5, 4, 3, 5]",4.2,0.83666,0,3
4,5,3.548,2.816402,4,4,4,"[1, 5, 4, 4, 1]",3.0,1.870829,0,4
5,4,2.452,3.373284,4,3,4,"[4, 3, 4, 1, 3]",3.0,1.224745,0,5
6,4,4.5992,4.360252,2,4,2,"[4, 4, 5, 5, 5]",4.6,0.547723,1,6
7,1,1.4008,1.942182,2,3,2,"[1, 1, 1, 2, 2, 1]",1.333333,0.516398,1,7
8,3,2.8696,1.942182,2,2,2,"[4, 1, 1, 2, 3]",2.2,1.30384,1,8
9,3,3.1304,3.45069,4,5,4,"[4, 2, 5, 4, 4]",3.8,1.095445,1,9


# Split Data

In [557]:
# Use stratification based on binned gold_avg to ensure balanced distribution
df_combined['gold_bin'] = pd.cut(df_combined['gold_avg'], bins=5, labels=False)

group_labels = df_combined.groupby('group_id')['gold_bin'].mean().astype(int)
group_labels = group_labels.to_dict()

group_ids = df_combined['group_id'].unique()


group_labels_for_split = [group_labels[g] for g in group_ids]


train_groups, test_groups = train_test_split(
    group_ids, 
    test_size=0.3, 
    random_state=42,
    stratify=group_labels_for_split
)

train_df = df_combined[df_combined['group_id'].isin(train_groups)].reset_index(drop=True)
test_df  = df_combined[df_combined['group_id'].isin(test_groups)].reset_index(drop=True)

print(f"Total samples: {len(df_combined)}")
print(f"Training samples: {len(train_df)} ({len(train_df)/len(df_combined)*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/len(df_combined)*100:.1f}%)")

print(f"\nTraining set gold_avg distribution:")
print(f"  Mean: {train_df['gold_avg'].mean():.3f}")
print(f"  Std: {train_df['gold_avg'].std():.3f}")
print(f"  Min: {train_df['gold_avg'].min():.3f}")
print(f"  Max: {train_df['gold_avg'].max():.3f}")

print(f"\nTest set gold_avg distribution:")
print(f"  Mean: {test_df['gold_avg'].mean():.3f}")
print(f"  Std: {test_df['gold_avg'].std():.3f}")
print(f"  Min: {test_df['gold_avg'].min():.3f}")
print(f"  Max: {test_df['gold_avg'].max():.3f}")

print(f"\nTraining set sample IDs range: {train_df.index.min()} to {train_df.index.max()}")
print(f"Test set sample IDs range: {test_df.index.min()} to {test_df.index.max()}")

Total samples: 588
Training samples: 408 (69.4%)
Test samples: 180 (30.6%)

Training set gold_avg distribution:
  Mean: 3.121
  Std: 1.185
  Min: 1.000
  Max: 5.000

Test set gold_avg distribution:
  Mean: 3.112
  Std: 1.194
  Min: 1.000
  Max: 5.000

Training set sample IDs range: 0 to 407
Test set sample IDs range: 0 to 179


In [558]:
# Prepare training and test sets
# X = system predictions (features), y = gold average (target)

SYSTEMS = ['david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'urdu_v3']

OBJECTIVE_FUNCTION = "rank:pairwise"

X_train = train_df[SYSTEMS.copy()]
# X_train = train_df[['david_v1', 'david_v2', 'korean']]
y_train = train_df['gold_avg']

X_test = test_df[SYSTEMS.copy()]
# X_test = test_df[['david_v1', 'david_v2', 'korean']]
y_test = test_df['gold_avg']

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

print(f"\nFirst few training samples:")
pd.concat([X_train.head(5), y_train.head(5)], axis=1)

X_train shape: (408, 5)
y_train shape: (408,)
X_test shape: (180, 5)
y_test shape: (180,)

First few training samples:


Unnamed: 0,david_v2,korean,urdu_v1,urdu_v2,urdu_v3,gold_avg
0,4.06,4.360252,4,4,4,3.6
1,1.94,2.586742,3,2,3,3.6
2,3.9976,2.647276,4,3,4,3.8
3,2.0024,3.392001,4,4,4,4.2
4,3.548,2.816402,4,4,4,3.0


# System Evaluation

In [559]:
# Evaluation functions following the structure of src/eval/scoring.py
def evaluate_predictions_array(y_pred, y_true_labels):
    """
    Evaluate predictions using the same logic as scoring.py
    
    Args:
        y_pred: array of predictions
        y_true_labels: list of gold label lists (5 ratings each)
    
    Returns:
        dict with spearman and accuracy scores
    """
    # Build prediction and gold lists (same structure as scoring.py)
    pred_list = list(y_pred)
    gold_list = [scoring.get_average(labels) for labels in y_true_labels]
    
    # Calculate Spearman correlation (same as scoring.py)
    corr, p_value = spearmanr(pred_list, gold_list)
    
    # Calculate accuracy within SD (same logic as scoring.py)
    correct_guesses = 0
    wrong_guesses = 0
    
    for pred, labels in zip(pred_list, y_true_labels):
        if scoring.is_within_standard_deviation(pred, labels):
            correct_guesses += 1
        else:
            wrong_guesses += 1
    
    accuracy = correct_guesses / (correct_guesses + wrong_guesses)
    
    return {
        'spearman': corr,
        'p_value': p_value,
        'accuracy': accuracy,
        'correct': correct_guesses,
        'total': correct_guesses + wrong_guesses
    }

In [560]:
# Evaluate each system on test set
print("=" * 80)
print("Individual System Performance on Test Set (30%)")
print("=" * 80)
print(f"{'System':<15} {'Spearman':>12} {'Accuracy':>12} {'Correct/Total':>15}")
print("-" * 80)

system_results = {}
test_gold_labels = test_df['gold_labels'].tolist()

for system in SYSTEMS.copy():
# for system in ['david_v1', 'david_v2', 'korean']:
    # Get predictions from X_test
    y_pred = X_test[system].values
    
    # Evaluate using official logic
    scores = evaluate_predictions_array(y_pred, test_gold_labels)
    system_results[system] = scores
    
    print(f"{system:<15} {scores['spearman']:>12.4%} {scores['accuracy']:>12.4%} "
          f"{scores['correct']:>7}/{scores['total']:<7}")

print("=" * 80)

Individual System Performance on Test Set (30%)
System              Spearman     Accuracy   Correct/Total
--------------------------------------------------------------------------------
david_v2            71.2139%     75.0000%     135/180    
korean              83.3728%     91.6667%     165/180    
urdu_v1             47.8694%     73.8889%     133/180    
urdu_v2             54.2010%     68.3333%     123/180    
urdu_v3             46.9711%     72.7778%     131/180    


# Ensemble

## Mean

In [561]:
# Simple Mean Ensemble: y_pred = (1/N) * sum(y_i)
y_pred_mean = X_test.mean(axis=1).values

# Evaluate float version
scores_mean_float = evaluate_predictions_array(y_pred_mean, test_gold_labels)
# print(f"Mean (float) - Spearman: {scores_mean_float['spearman']:.4%}, Accuracy: {scores_mean_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_mean_int = y_pred_mean.round().clip(1, 5).astype(int)
scores_mean_int = evaluate_predictions_array(y_pred_mean_int, test_gold_labels)
# print(f"Mean (int)   - Spearman: {scores_mean_int['spearman']:.4%}, Accuracy: {scores_mean_int['accuracy']:.4%}")

## Weighted Mean

In [562]:
# Weighted Mean: weights learned from train set (70%) based on Spearman performance
train_gold_labels = train_df['gold_labels'].tolist()

# Calculate weights based on train set performance (Spearman correlation)
weights = []
for system in SYSTEMS.copy():
# for system in ['david_v1', 'david_v2', 'korean']:
    y_pred_train = X_train[system].values
    scores_train = evaluate_predictions_array(y_pred_train, train_gold_labels)
    weights.append(scores_train['spearman'])

# Convert to numpy array and normalize to sum to 1
weights = np.array(weights)
weights = weights / weights.sum()

# Apply weighted average on test set
y_pred_weighted = (X_test.values * weights).sum(axis=1)

# Evaluate float version
scores_weighted_float = evaluate_predictions_array(y_pred_weighted, test_gold_labels)
# print(f"Weighted Mean (float) - Spearman: {scores_weighted_float['spearman']:.4%}, Accuracy: {scores_weighted_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_weighted_int = y_pred_weighted.round().clip(1, 5).astype(int)
scores_weighted_int = evaluate_predictions_array(y_pred_weighted_int, test_gold_labels)
# print(f"Weighted Mean (int)   - Spearman: {scores_weighted_int['spearman']:.4%}, Accuracy: {scores_weighted_int['accuracy']:.4%}")

## Majority Voting

In [563]:
# Majority Voting: most common rounded prediction across systems
# Round predictions to integers (1-5 range)
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Get mode (most frequent value) for each row
y_pred_majority = stats.mode(X_test_rounded, axis=1, keepdims=False)[0]

# Evaluate
scores_majority = evaluate_predictions_array(y_pred_majority, test_gold_labels)
# print(f"Majority Voting - Spearman: {scores_majority['spearman']:.4%}, Accuracy: {scores_majority['accuracy']:.4%}")

## Weighted Majority Voting

In [564]:
# Weighted Majority Voting: weighted voting using performance-based weights
# Round test predictions to integers
X_test_rounded = X_test.round().clip(1, 5).astype(int)

# Use weights learned earlier (from Weighted Mean)
y_pred_weighted_vote = []
for idx in range(len(X_test_rounded)):
    # Get predictions for this sample
    votes = X_test_rounded.iloc[idx].values
    
    # Count weighted votes for each class (1-5)
    vote_counts = {}
    for vote, weight in zip(votes, weights):
        vote_counts[vote] = vote_counts.get(vote, 0) + weight
    
    # Select class with highest weighted vote
    winner = max(vote_counts.items(), key=lambda x: x[1])[0]
    y_pred_weighted_vote.append(winner)

y_pred_weighted_vote = np.array(y_pred_weighted_vote)

# Evaluate
scores_weighted_vote = evaluate_predictions_array(y_pred_weighted_vote, test_gold_labels)
# print(f"Weighted Majority Voting - Spearman: {scores_weighted_vote['spearman']:.4%}, Accuracy: {scores_weighted_vote['accuracy']:.4%}")

## Linear Stacking (Ridge Regression)

In [565]:
# Linear Stacking: learn weights via Ridge Regression on train set
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Predict on test set
y_pred_ridge = ridge.predict(X_test)

# Evaluate float version
scores_ridge_float = evaluate_predictions_array(y_pred_ridge, test_gold_labels)
print(f"Ridge (float) - Spearman: {scores_ridge_float['spearman']:.4%}, Accuracy: {scores_ridge_float['accuracy']:.4%}")

# Evaluate integer version
y_pred_ridge_int = y_pred_ridge.round().clip(1, 5).astype(int)
scores_ridge_int = evaluate_predictions_array(y_pred_ridge_int, test_gold_labels)
# print(f"Ridge (int)   - Spearman: {scores_ridge_int['spearman']:.4%}, Accuracy: {scores_ridge_int['accuracy']:.4%}")

Ridge (float) - Spearman: 85.2726%, Accuracy: 93.3333%


In [566]:
# Inspect Ridge Weights
coef = ridge.coef_
for name, w in zip(X_train.columns, coef):
    print(f"{name:20s} {w:+.4f}")

david_v2             +0.1343
korean               +0.9156
urdu_v1              +0.1221
urdu_v2              +0.0779
urdu_v3              -0.0763


In [567]:
# Path to individual system outputs
RESULTS_DIR = Path("./test")

# Get all jsonl files
prediction_files_test = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files_test)} prediction files:")
for f in prediction_files_test:
    print(f"  - {f.name}")

Found 5 prediction files:
  - david_v1.jsonl
  - david_v2.jsonl
  - korean.jsonl
  - urdu_v2.jsonl
  - urdu_v3.jsonl


In [568]:
# Load all predictions
all_test_predictions = {}
for pred_file in prediction_files_test:
    system_name = pred_file.stem  # filename without extension
    all_test_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_test_predictions[system_name])} predictions from {system_name}")
# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions_test = pd.DataFrame(all_test_predictions)
df_predictions_test.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions_test.shape}")
print(f"Number of samples: {len(df_predictions_test)}")
print(f"Number of systems: {len(df_predictions_test.columns)}")
print(f"\nSystems: {list(df_predictions_test.columns)}")
print(f"\nSample data types:")
print(df_predictions_test.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions_test.head(10)

Loaded 930 predictions from david_v1
Loaded 930 predictions from david_v2
Loaded 930 predictions from korean
Loaded 930 predictions from urdu_v2
Loaded 930 predictions from urdu_v3

Predictions DataFrame shape: (930, 5)
Number of samples: 930
Number of systems: 5

Systems: ['david_v1', 'david_v2', 'korean', 'urdu_v2', 'urdu_v3']

Sample data types:
david_v1      int64
david_v2    float64
korean      float64
urdu_v2       int64
urdu_v3       int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v2,urdu_v3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4,4.878,4.360252,5,4
1,3,1.122,1.942182,4,3
2,4,4.6828,4.360252,4,4
3,4,1.3172,1.942182,3,3
4,5,4.756,4.397248,4,4
5,3,1.244,2.05128,4,3
6,5,2.4116,1.942182,3,3
7,3,3.5884,4.0336,3,4
8,2,2.742,1.942182,3,3
9,3,3.258,4.0336,2,4


In [569]:
str_for_part_of_filename = '_'.join(SYSTEMS)
addr_saving = f'ensemble_{str_for_part_of_filename}_ridge_testset_predictions.jsonl'

X_test_set_real = df_predictions_test[SYSTEMS.copy()]

y_pred_ridge_test_set = ridge.predict(X_test_set_real)

print(len(y_pred_ridge_test_set))

a = 0

with open(addr_saving, 'w', encoding='utf-8') as f:
    for val in y_pred_ridge_test_set:
        f.write('{"id": "' + str(a) + '", "prediction": ' + str(val) + '}\n')
        a += 1

KeyError: "['urdu_v1'] not in index"

In [None]:
def scale_to_these_extremes(arr, low, upp):
    """Scale array values to be within the specified extremes."""
    arr_scaled = (arr - arr.min()) / (arr.max() - arr.min()) * (upp - low) + low
    return arr_scaled

## XGBoost

In [None]:
def train_eval_xgb(
    X_train,
    y_train,
    X_test,
    test_gold_labels,
    low,
    upp,
    *,
    n_estimators=500,
    max_depth=2,
    learning_rate=0.05,
    min_child_weight=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,     # L1
    reg_lambda=10.0,   # L2
    random_state=42,
):
    
    group_train_big = [len(X_train)]
    
    group_train_2 = [2] * (len(y_train) // 2)

   
    group_train_6 = [6] * (len(y_train) // 6)
    

    group_train_12 = [12] * (len(y_train) // 12)

    xgb = XGBRanker(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    reg_alpha=reg_alpha,
    reg_lambda=reg_lambda,
    objective=OBJECTIVE_FUNCTION,
    random_state=random_state,
    n_jobs=-1,
)

    xgb.fit(
    X_train,
    y_train,
    group=group_train_6,
    verbose=False,
)

    # Predict
    y_pred = scale_to_these_extremes(xgb.predict(X_test), low, upp)

    # Float eval
    scores_float = evaluate_predictions_array(y_pred, test_gold_labels)

    # Int eval
    # y_pred_int = y_pred.round().clip(1, 5).astype(int)
    # scores_int = evaluate_predictions_array(y_pred_int, test_gold_labels)

    return scores_float


In [None]:
depths = [1, 2, 3]

configs = [
    dict(learning_rate=0.1,  n_estimators=200, low=1, upp=5),
    dict(learning_rate=0.05, n_estimators=400, low=1, upp=5),
    dict(learning_rate=0.02, n_estimators=800, low=1, upp=5),
]


regularizers = [
    dict(reg_alpha=0.0, reg_lambda=1.0),
    dict(reg_alpha=1.0, reg_lambda=10.0),
    dict(reg_alpha=10.0, reg_lambda=50.0),
]

for depth in [1, 2, 3]:
    print()
    for cfg in configs:
        scores_float = train_eval_xgb(
            X_train,
            y_train,
            X_test,
            test_gold_labels,
            max_depth=depth,
            **cfg
        )

        print(
            f"depth={depth} lr={cfg['learning_rate']} n={cfg['n_estimators']} | "
            f"Spearman={scores_float['spearman']:.4%} "
            f"Acc={scores_float['accuracy']:.4%}"
        )



depth=1 lr=0.1 n=200 | Spearman=84.1200% Acc=88.8889%


depth=1 lr=0.05 n=400 | Spearman=84.3027% Acc=87.7778%
depth=1 lr=0.02 n=800 | Spearman=83.9374% Acc=86.6667%

depth=2 lr=0.1 n=200 | Spearman=82.6734% Acc=86.1111%
depth=2 lr=0.05 n=400 | Spearman=83.0838% Acc=86.6667%
depth=2 lr=0.02 n=800 | Spearman=83.2871% Acc=87.7778%

depth=3 lr=0.1 n=200 | Spearman=81.5956% Acc=85.5556%
depth=3 lr=0.05 n=400 | Spearman=81.4504% Acc=85.0000%
depth=3 lr=0.02 n=800 | Spearman=81.5551% Acc=85.5556%


In [570]:
# XGBoost with competition-validated settings

if OBJECTIVE_FUNCTION.startswith("rank:") :
 xgb = XGBRanker(
    n_estimators=400,          # matched to lr=0.1
    max_depth=1,               # CRITICAL: best Spearman
    learning_rate=0.05,
    min_child_weight=50,       # strong regularization
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,             # L1 regularization
    reg_lambda=10.0,           # L2 regularization
    objective=OBJECTIVE_FUNCTION,
    random_state=0,
    n_jobs=-1,
)
else:
 assert False



group_test_6 = [6] * (len(y_train) // 6)
    
    
# Train on training set
xgb.fit(X_train, y_train, group=group_test_6)

# Predict on test set
y_pred_xgb = scale_to_these_extremes(xgb.predict(X_test), 1, 5)

# Evaluate float version
scores_xgb_float = evaluate_predictions_array(y_pred_xgb, test_gold_labels)
print(
    f"XGBoost depth=1 (float) - "
    f"Spearman: {scores_xgb_float['spearman']:.4%}, "
    f"Accuracy: {scores_xgb_float['accuracy']:.4%}"
)

# Evaluate integer version
# y_pred_xgb_int = y_pred_xgb.round().clip(1, 5).astype(int)
# scores_xgb_int = evaluate_predictions_array(y_pred_xgb_int, test_gold_labels)
# print(
#     f"XGBoost depth=1 (int)   - "
#     f"Spearman: {scores_xgb_int['spearman']:.4%}, "
#     f"Accuracy: {scores_xgb_int['accuracy']:.4%}"
# )


XGBoost depth=1 (float) - Spearman: 85.3826%, Accuracy: 89.4444%
