In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'Author'
__email__ = 'Email'

# SemEval 2026 Task 5 - Ensemble

In [2]:
# dependency
# built-in
import json
import os
import sys
from pathlib import Path

# third-party
import random
import pandas as pd
import numpy as np
from scipy import stats
from xgboost import XGBRegressor
from scipy.stats import spearmanr
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# local - add src/eval to path for importing evaluation functions
sys.path.insert(0, str(Path('../src/eval').resolve()))
# Import evaluation functions from src/eval/scoring.py
import scoring

%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

# Input

In [3]:
# helper
def set_seed(seed=42):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)

def load_predictions(filepath):
    """Load predictions from a JSONL file into a dictionary."""
    predictions = {}
    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            predictions[data['id']] = data['prediction']
    return predictions

In [4]:
# init
set_seed(0)

## Data

In [5]:
# Load gold labels (solution file)
SOLUTION_FILE = Path("../res/data/dev_solution.jsonl")

gold_labels = {}
with open(SOLUTION_FILE, 'r') as f:
    for line in f:
        data = json.loads(line.strip())
        gold_labels[data['id']] = data['label']

print(f"Loaded {len(gold_labels)} gold labels")
print(f"\nGold labels are lists of 5 human ratings (1-5 scale)")
print(f"\nExample gold labels:")
for i in range(5):
    sample_id = str(i)
    print(f"  ID {sample_id}: {gold_labels[sample_id]} (avg={np.mean(gold_labels[sample_id]):.2f}, std={np.std(gold_labels[sample_id], ddof=1):.2f})")

Loaded 588 gold labels

Gold labels are lists of 5 human ratings (1-5 scale)

Example gold labels:
  ID 0: [4, 5, 3, 1, 5] (avg=3.60, std=1.67)
  ID 1: [3, 3, 4, 4, 4] (avg=3.60, std=0.55)
  ID 2: [5, 5, 2, 3, 4] (avg=3.80, std=1.30)
  ID 3: [4, 5, 4, 3, 5] (avg=4.20, std=0.84)
  ID 4: [1, 5, 4, 4, 1] (avg=3.00, std=1.87)


## System

In [6]:
# Path to individual system outputs
RESULTS_DIR = Path("./dev")

# Get all jsonl files
prediction_files = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files)} prediction files:")
for f in prediction_files:
    print(f"  - {f.name}")

Found 5 prediction files:
  - david_v1.jsonl
  - david_v2.jsonl
  - korean.jsonl
  - urdu_v1.jsonl
  - urdu_v2.jsonl


In [7]:
# Load all predictions
all_predictions = {}
for pred_file in prediction_files:
    system_name = pred_file.stem  # filename without extension
    all_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_predictions[system_name])} predictions from {system_name}")

# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions = pd.DataFrame(all_predictions)
df_predictions.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions.shape}")
print(f"Number of samples: {len(df_predictions)}")
print(f"Number of systems: {len(df_predictions.columns)}")
print(f"\nSystems: {list(df_predictions.columns)}")
print(f"\nSample data types:")
print(df_predictions.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions.head(10)

Loaded 588 predictions from david_v1
Loaded 588 predictions from david_v2
Loaded 588 predictions from korean
Loaded 588 predictions from urdu_v1
Loaded 588 predictions from urdu_v2

Predictions DataFrame shape: (588, 5)
Number of samples: 588
Number of systems: 5

Systems: ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2']

Sample data types:
david_v1      int64
david_v2    float64
korean      float64
urdu_v1       int64
urdu_v2       int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v1,urdu_v2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4,4.06,4.360252,4,4
1,4,1.94,2.586742,3,2
2,5,3.9976,2.647276,4,3
3,5,2.0024,3.392001,4,4
4,5,3.548,2.816402,4,4
5,4,2.452,3.373284,4,3
6,4,4.5992,4.360252,2,4
7,1,1.4008,1.942182,2,3
8,3,2.8696,1.942182,2,2
9,3,3.1304,3.45069,4,5


In [8]:
# Create a combined DataFrame with predictions and gold labels
df_gold = pd.DataFrame({
    'gold_labels': gold_labels,
    'gold_avg': {k: np.mean(v) for k, v in gold_labels.items()},
    'gold_std': {k: np.std(v, ddof=1) for k, v in gold_labels.items()}
})

# Combine predictions with gold labels
df_combined = df_predictions.join(df_gold)
df_combined['unique_key'] = df_combined.index
print(f"Combined DataFrame shape: {df_combined.shape}")
print(f"\nColumns: {list(df_combined.columns)}")
print(f"\nFirst 10 rows:")
df_combined.head(10)

Combined DataFrame shape: (588, 9)

Columns: ['david_v1', 'david_v2', 'korean', 'urdu_v1', 'urdu_v2', 'gold_labels', 'gold_avg', 'gold_std', 'unique_key']

First 10 rows:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v1,urdu_v2,gold_labels,gold_avg,gold_std,unique_key
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,4,4.06,4.360252,4,4,"[4, 5, 3, 1, 5]",3.6,1.67332,0
1,4,1.94,2.586742,3,2,"[3, 3, 4, 4, 4]",3.6,0.547723,1
2,5,3.9976,2.647276,4,3,"[5, 5, 2, 3, 4]",3.8,1.30384,2
3,5,2.0024,3.392001,4,4,"[4, 5, 4, 3, 5]",4.2,0.83666,3
4,5,3.548,2.816402,4,4,"[1, 5, 4, 4, 1]",3.0,1.870829,4
5,4,2.452,3.373284,4,3,"[4, 3, 4, 1, 3]",3.0,1.224745,5
6,4,4.5992,4.360252,2,4,"[4, 4, 5, 5, 5]",4.6,0.547723,6
7,1,1.4008,1.942182,2,3,"[1, 1, 1, 2, 2, 1]",1.333333,0.516398,7
8,3,2.8696,1.942182,2,2,"[4, 1, 1, 2, 3]",2.2,1.30384,8
9,3,3.1304,3.45069,4,5,"[4, 2, 5, 4, 4]",3.8,1.095445,9


# Split Data

In [9]:
# Use stratification based on binned gold_avg to ensure balanced distribution
df_combined['gold_bin'] = pd.cut(df_combined['gold_avg'], bins=5, labels=False)

# Drop the temporary binning column
train_df = df_combined.drop('gold_bin', axis=1)

print(f"Total samples: {len(df_combined)}")
print(f"Training samples: {len(train_df)} ({len(train_df)/len(df_combined)*100:.1f}%)")

print(f"\nTraining set gold_avg distribution:")
print(f"  Mean: {train_df['gold_avg'].mean():.3f}")
print(f"  Std: {train_df['gold_avg'].std():.3f}")
print(f"  Min: {train_df['gold_avg'].min():.3f}")
print(f"  Max: {train_df['gold_avg'].max():.3f}")



print(f"\nTraining set sample IDs range: {train_df.index.min()} to {train_df.index.max()}")


Total samples: 588
Training samples: 588 (100.0%)

Training set gold_avg distribution:
  Mean: 3.118
  Std: 1.186
  Min: 1.000
  Max: 5.000

Training set sample IDs range: 0 to 99


In [10]:
# Prepare training and test sets
# X = system predictions (features), y = gold average (target)

SYSTEMS = ['david_v1', 'david_v2', 'korean', 'urdu_v2']

X_train = train_df[SYSTEMS.copy()]
# X_train = train_df[['david_v1', 'david_v2', 'korean']]
y_train = train_df['gold_avg']


print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


print(f"\nFirst few training samples:")
pd.concat([X_train.head(), y_train.head()], axis=1)

X_train shape: (588, 4)
y_train shape: (588,)

First few training samples:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v2,gold_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4,4.06,4.360252,4,3.6
1,4,1.94,2.586742,2,3.6
2,5,3.9976,2.647276,3,3.8
3,5,2.0024,3.392001,4,4.2
4,5,3.548,2.816402,4,3.0


# System Evaluation

In [11]:
# Evaluation functions following the structure of src/eval/scoring.py
def evaluate_predictions_array(y_pred, y_true_labels):
    """
    Evaluate predictions using the same logic as scoring.py
    
    Args:
        y_pred: array of predictions
        y_true_labels: list of gold label lists (5 ratings each)
    
    Returns:
        dict with spearman and accuracy scores
    """
    # Build prediction and gold lists (same structure as scoring.py)
    pred_list = list(y_pred)
    gold_list = [scoring.get_average(labels) for labels in y_true_labels]
    
    # Calculate Spearman correlation (same as scoring.py)
    corr, p_value = spearmanr(pred_list, gold_list)
    
    # Calculate accuracy within SD (same logic as scoring.py)
    correct_guesses = 0
    wrong_guesses = 0
    
    for pred, labels in zip(pred_list, y_true_labels):
        if scoring.is_within_standard_deviation(pred, labels):
            correct_guesses += 1
        else:
            wrong_guesses += 1
    
    accuracy = correct_guesses / (correct_guesses + wrong_guesses)
    
    return {
        'spearman': corr,
        'p_value': p_value,
        'accuracy': accuracy,
        'correct': correct_guesses,
        'total': correct_guesses + wrong_guesses
    }

# Ensemble

## Linear Stacking (Ridge Regression)

In [12]:
# Linear Stacking: learn weights via Ridge Regression on train set
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)



0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [13]:
# Inspect Ridge Weights
coef = ridge.coef_
for name, w in zip(X_train.columns, coef):
    print(f"{name:20s} {w:+.4f}")

david_v1             +0.1144
david_v2             +0.1228
korean               +0.8750
urdu_v2              +0.0937


In [14]:
# Path to individual system outputs
RESULTS_DIR = Path("./test")

# Get all jsonl files
prediction_files_test = sorted(RESULTS_DIR.glob("*.jsonl"))
print(f"Found {len(prediction_files_test)} prediction files:")
for f in prediction_files_test:
    print(f"  - {f.name}")

Found 4 prediction files:
  - david_v1.jsonl
  - david_v2.jsonl
  - korean.jsonl
  - urdu_v2.jsonl


In [15]:
# Load all predictions
all_test_predictions = {}
for pred_file in prediction_files_test:
    system_name = pred_file.stem  # filename without extension
    all_test_predictions[system_name] = load_predictions(pred_file)
    print(f"Loaded {len(all_test_predictions[system_name])} predictions from {system_name}")
# Convert to DataFrame for easier manipulation
# Each row is a sample, each column is a system's prediction
df_predictions_test = pd.DataFrame(all_test_predictions)
df_predictions_test.index.name = 'id'

print(f"\nPredictions DataFrame shape: {df_predictions_test.shape}")
print(f"Number of samples: {len(df_predictions_test)}")
print(f"Number of systems: {len(df_predictions_test.columns)}")
print(f"\nSystems: {list(df_predictions_test.columns)}")
print(f"\nSample data types:")
print(df_predictions_test.dtypes)
print(f"\nFirst 10 predictions:")
df_predictions_test.head(10)

Loaded 930 predictions from david_v1
Loaded 930 predictions from david_v2
Loaded 930 predictions from korean
Loaded 930 predictions from urdu_v2

Predictions DataFrame shape: (930, 4)
Number of samples: 930
Number of systems: 4

Systems: ['david_v1', 'david_v2', 'korean', 'urdu_v2']

Sample data types:
david_v1      int64
david_v2    float64
korean      float64
urdu_v2       int64
dtype: object

First 10 predictions:


Unnamed: 0_level_0,david_v1,david_v2,korean,urdu_v2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4,4.878,4.360252,5
1,3,1.122,1.942182,4
2,4,4.6828,4.360252,4
3,4,1.3172,1.942182,3
4,5,4.756,4.397248,4
5,3,1.244,2.05128,4
6,5,2.4116,1.942182,3
7,3,3.5884,4.0336,3
8,2,2.742,1.942182,3
9,3,3.258,4.0336,2


In [16]:
str_for_part_of_filename = '_'.join(SYSTEMS)
addr_saving = f'ensemble_{str_for_part_of_filename}_ridge_testset_predictions.jsonl'

X_test_set_real = df_predictions_test[SYSTEMS.copy()]

y_pred_ridge_test_set = ridge.predict(X_test_set_real)

print(len(y_pred_ridge_test_set))

a = 0

with open(addr_saving, 'w', encoding='utf-8') as f:
    for val in y_pred_ridge_test_set:
        f.write('{"id": "' + str(a) + '", "prediction": ' + str(val) + '}\n')
        a += 1

930
