# SNCF Transilien: Fast Ensemble for p0q0 (MAE)

This notebook trains a fast ensemble to predict p0q0 (difference between theoretical and realized waiting time) and exports `y_test.csv`. It prioritizes speed and competitive MAE using simple, leakage-safe features and an efficient blend of models.

In [2]:
# Imports and robust data_dir resolution
import os
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

# Reproducibility
RANDOM_STATE = 42

# LightGBM availability (optional fast model)
try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

# --- Robust path detection ---
# Prefer forward-slash UNC to avoid backslash escaping issues on Windows
candidate_dirs = [
    Path('//wsl.localhost/Ubuntu-24.04/home/utharushan/ChallengeData/SNCF-Transilien-challenge'),
    Path.cwd(),
]

required_files = {'x_train_final.csv', 'x_test_final.csv'}

chosen: Path | None = None
for cand in candidate_dirs:
    try:
        if cand.exists() and cand.is_dir():
            present = {p.name for p in cand.iterdir() if p.is_file()}
            if required_files.issubset(present):
                chosen = cand
                break
    except OSError:
        # Inaccessible path (e.g., network/unmounted), skip
        continue

if chosen is None:
    # If not all required files found, still try to pick a dir that at least exists
    for cand in candidate_dirs:
        if cand.exists() and cand.is_dir():
            chosen = cand
            break

if chosen is None:
    raise FileNotFoundError(
        'Aucun répertoire de données valide trouvé. Vérifiez le chemin des fichiers CSV. '
        'Essayé: ' + ' | '.join(str(p) for p in candidate_dirs)
    )

# Ensure the two required files exist here
missing = [fname for fname in required_files if not (chosen / fname).exists()]
if missing:
    raise FileNotFoundError(
        f"Fichiers manquants dans {chosen}: {missing}.\n"
        "Assurez-vous que 'x_train_final.csv' et 'x_test_final.csv' sont bien présents."
    )

# Select y_train file by pattern
y_candidates = sorted((p for p in chosen.glob('y_train*.csv')), key=lambda p: p.name)
if not y_candidates:
    raise FileNotFoundError(
        f"Aucun fichier y_train* trouvé dans {chosen}. Placez le fichier de vérité terrain dans ce dossier."
    )

# Export variables used later
data_dir = str(chosen)
train_path = os.path.join(data_dir, 'x_train_final.csv')
test_path = os.path.join(data_dir, 'x_test_final.csv')
y_train_path = str(y_candidates[0])

print(f"Using data_dir: {data_dir}")
print(f"Detected y_train file: {Path(y_train_path).name}")

Using data_dir: \\wsl.localhost\Ubuntu-24.04\home\utharushan\ChallengeData\SNCF-Transilien-challenge
Detected y_train file: y_train_final_j5KGWWK.csv


In [3]:
# 2) Load Data (train/test) from CSV
with Timer() as t:
    X = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    y = pd.read_csv(y_train_path)
print(f"Loaded data in {t.dt:.2f}s: X={X.shape}, X_test={X_test.shape}, y={y.shape}")

# Align and clean columns
# Remove potential unnamed index columns
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]

# y column may be named p0q0 or similar; ensure it's named 'p0q0'
if y.shape[1] == 1:
    y.columns = ['p0q0']
else:
    # If there's an index + a single target col, take the last
    y = y.iloc[:, [-1]]
    y.columns = ['p0q0']

# Coerce dtypes
for col in ['train', 'gare']:
    if col in X.columns:
        X[col] = X[col].astype(str)
        if col in X_test.columns:
            X_test[col] = X_test[col].astype(str)

if 'date' in X.columns:
    X['date'] = pd.to_datetime(X['date'])
if 'date' in X_test.columns:
    X_test['date'] = pd.to_datetime(X_test['date'])

if 'arret' in X.columns:
    X['arret'] = X['arret'].astype(int)
if 'arret' in X_test.columns:
    X_test['arret'] = X_test['arret'].astype(int)

lag_cols = ['p2q0','p3q0','p4q0','p0q2','p0q3','p0q4']
for c in lag_cols:
    if c in X.columns:
        X[c] = X[c].astype(float)
    if c in X_test.columns:
        X_test[c] = X_test[c].astype(float)

# Ensure train/test columns match except target
feature_cols = [c for c in X.columns if c != 'p0q0']
print("Train columns:", feature_cols)
print("Test columns:", list(X_test.columns))

Loaded data in 0.81s: X=(667264, 12), X_test=(20657, 11), y=(667264, 2)
Train columns: ['train', 'gare', 'date', 'arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4']
Test columns: ['Unnamed: 0', 'train', 'gare', 'date', 'arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4']


In [4]:
# 3) Basic Data Checks and Sanity Validations
print(X.head(3))
print(X_test.head(3))
print("y head:\n", y.head())

print("Missing in X:\n", X.isna().sum())
print("Missing in X_test:\n", X_test.isna().sum())

# Sanity: ensure target length matches X rows
assert len(y) == len(X), (len(y), len(X))

# Unique counts
for col in ['train','gare']:
    if col in X.columns:
        print(f"Unique {col}: train={X[col].nunique()} test={X_test[col].nunique()}")

if 'date' in X.columns:
    print("Date range train:", X['date'].min(), X['date'].max())
    print("Date range test:", X_test['date'].min(), X_test['date'].max())

print('Target stats:', y['p0q0'].describe())

    train gare       date  arret  p2q0  p3q0  p4q0  p0q2  p0q3  p0q4
0  VBXNMF  KYF 2023-04-03      8   0.0   0.0   1.0  -3.0  -1.0  -2.0
1  VBXNMF  JLR 2023-04-03      9   0.0   0.0   0.0   1.0   0.0   1.0
2  VBXNMF  EOH 2023-04-03     10  -1.0   0.0   0.0  -1.0   0.0   0.0
   Unnamed: 0   train gare       date  arret  p2q0  p3q0  p4q0  p0q2  p0q3  p0q4
0           0  ZPQEKP  VXY 2023-11-13     12   0.0   0.0  -2.0  -4.0  -2.0  -4.0
1           1  KIQSRA  VXY 2023-11-13     12   0.0   0.0  -1.0   1.0  -1.0   0.0
2           2  QQJYYT  VXY 2023-11-13     12   0.0   1.0  -1.0   1.0  -1.0   1.0
y head:
    p0q0
0  -1.0
1  -1.0
2  -1.0
3   1.0
4   3.0
Missing in X:
 train    0
gare     0
date     0
arret    0
p2q0     0
p3q0     0
p4q0     0
p0q2     0
p0q3     0
p0q4     0
dtype: int64
Missing in X_test:
 Unnamed: 0    0
train         0
gare          0
date          0
arret         0
p2q0          0
p3q0          0
p4q0          0
p0q2          0
p0q3          0
p0q4          0
dtype: in

In [5]:
# 4) Feature Engineering (dates, simple aggregates)
with Timer() as t:
    def add_date_features(df):
        if 'date' not in df.columns:
            return df
        df = df.copy()
        df['year'] = df['date'].dt.year.astype(int)
        df['month'] = df['date'].dt.month.astype(int)
        df['day'] = df['date'].dt.day.astype(int)
        df['weekday'] = df['date'].dt.weekday.astype(int)
        df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
        df['dayofyear'] = df['date'].dt.dayofyear.astype(int)
        return df

    X = add_date_features(X)
    X_test = add_date_features(X_test)

    # Simple, leakage-safe aggregates from existing lag features
    agg_keys = ['gare'] if 'gare' in X.columns else []
    agg_features = {}
    for c in lag_cols:
        if c in X.columns:
            agg_features[c] = ['mean','median','std']

    if agg_keys and agg_features:
        agg_df = (X.groupby(agg_keys)[list(agg_features.keys())]
                    .agg(agg_features))
        # flatten columns
        agg_df.columns = [f"{a}_{b}" for a,b in agg_df.columns]
        agg_df = agg_df.reset_index()
        # fillna with global means to speedup join handling
        agg_df = agg_df.fillna(agg_df.mean(numeric_only=True))

        X = X.merge(agg_df, on=agg_keys, how='left')
        X_test = X_test.merge(agg_df, on=agg_keys, how='left')

        # For unseen keys in test, fill with global means from train
        for col in agg_df.columns:
            if col in X_test.columns:
                if X_test[col].isna().any():
                    fill_val = X[col].mean() if col in X.columns else 0.0
                    X_test[col] = X_test[col].fillna(fill_val)

print(f"Feature engineering done in {t.dt:.2f}s")

Feature engineering done in 0.44s


In [6]:
# 5) Build Preprocessing Pipeline (ColumnTransformer)
from sklearn.pipeline import Pipeline

all_cols = [c for c in X.columns]

categorical_cols = [c for c in ['train','gare'] if c in X.columns]
drop_cols = []
if 'date' in X.columns:
    # drop raw date to avoid leakage through ordering; engineered features kept
    drop_cols.append('date')

numeric_cols = [c for c in all_cols if c not in set(categorical_cols + drop_cols)]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder='drop'
)

print("Numeric cols (", len(numeric_cols), "):", numeric_cols[:20], '...')
print("Categorical cols (", len(categorical_cols), "):", categorical_cols)


Numeric cols ( 31 ): ['arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4', 'year', 'month', 'day', 'weekday', 'weekofyear', 'dayofyear', 'p2q0_mean', 'p2q0_median', 'p2q0_std', 'p3q0_mean', 'p3q0_median', 'p3q0_std', 'p4q0_mean'] ...
Categorical cols ( 2 ): ['train', 'gare']


In [7]:
# 6) Train/Validation Split (date-aware)
# If multiple dates, use the latest date as validation; else k-fold fallback
if 'date' in X.columns and X['date'].nunique() > 1:
    last_date = X['date'].max()
    train_mask = X['date'] < last_date
    valid_mask = X['date'] == last_date
    X_tr, X_va = X.loc[train_mask].copy(), X.loc[valid_mask].copy()
    y_tr, y_va = y.loc[train_mask, 'p0q0'].values, y.loc[valid_mask, 'p0q0'].values
    split_desc = f"time-based split by date, valid_date={last_date.date()}"
else:
    X_tr, X_va, y_tr, y_va = train_test_split(X, y['p0q0'].values, test_size=0.1, random_state=RANDOM_STATE)
    split_desc = "random 90/10 split (no multiple dates)"

# Optional downsampling of training rows for speed if extremely large
MAX_TRAIN_ROWS = 250_000  # cap to ensure quick training while keeping good signal
if len(X_tr) > MAX_TRAIN_ROWS:
    idx = np.random.RandomState(RANDOM_STATE).choice(len(X_tr), size=MAX_TRAIN_ROWS, replace=False)
    X_tr = X_tr.iloc[idx]
    y_tr = y_tr[idx]
    print(f"Downsampled training split to {len(X_tr):,} rows for speed")

print("Split:", split_desc)

with Timer() as t:
    Xt_tr = preprocessor.fit_transform(X_tr)
    Xt_va = preprocessor.transform(X_va)
    Xt_full = preprocessor.fit_transform(X)  # for refit later
    Xt_test = preprocessor.transform(X_test)
print(f"Preprocessing fit+transform in {t.dt:.2f}s; Shapes: tr={Xt_tr.shape}, va={Xt_va.shape}, full={Xt_full.shape}, test={Xt_test.shape}")

Downsampled training split to 250,000 rows for speed
Split: time-based split by date, valid_date=2023-11-10
Preprocessing fit+transform in 2.80s; Shapes: tr=(250000, 33), va=(7026, 33), full=(667264, 33), test=(20657, 33)
Preprocessing fit+transform in 2.80s; Shapes: tr=(250000, 33), va=(7026, 33), full=(667264, 33), test=(20657, 33)


In [8]:
# 7) Define Fast Base Models (HGB, RF, ET, Ridge, optional LGBM)
models = []
# HistGradientBoosting - strong and fast
models.append((
    'hgb',
    HistGradientBoostingRegressor(
        loss='absolute_error',
        learning_rate=0.06,
        max_depth=6,
        max_iter=200,  # keep tight for speed
        early_stopping=True,
        random_state=RANDOM_STATE
    )
))
# Random Forest - solid baseline
models.append((
    'rf',
    RandomForestRegressor(
        n_estimators=200,   # reduced for speed
        max_depth=12,       # reduced depth
        n_jobs=-1,
        bootstrap=False,
        random_state=RANDOM_STATE
    )
))
# Extra Trees - diverse ensemble member
models.append((
    'et',
    ExtraTreesRegressor(
        n_estimators=200,   # reduced for speed
        max_depth=14,       # moderate depth
        n_jobs=-1,
        bootstrap=False,
        random_state=RANDOM_STATE
    )
))
# Ridge - linear anchor
models.append((
    'ridge',
    Ridge(alpha=2.0, random_state=RANDOM_STATE)
))

if HAS_LGBM:
    models.append((
        'lgbm',
        lgb.LGBMRegressor(
            n_estimators=1500,  # rely on early stopping
            learning_rate=0.03,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.2,
            random_state=RANDOM_STATE,
            objective='l1'
        )
    ))

print("Models:", [name for name,_ in models])

Models: ['hgb', 'rf', 'et', 'ridge', 'lgbm']


In [9]:
# 8) Fit each model and get validation predictions
val_preds = {}
fit_models = {}

with Timer() as t:
    for name, model in models:
        with Timer() as tm:
            if name == 'lgbm' and HAS_LGBM:
                # small early stopping split
                model.fit(
                    Xt_tr, y_tr,
                    eval_set=[(Xt_va, y_va)],
                    eval_metric='l1',
                    callbacks=[lgb.early_stopping(50, verbose=False)]
                )
            else:
                model.fit(Xt_tr, y_tr)
            pred = model.predict(Xt_va)
        mae = mean_absolute_error(y_va, pred)
        val_preds[name] = pred
        fit_models[name] = model
        print(f"Model {name} fit in {tm.dt:.2f}s, valid MAE={mae:.4f}")

print(f"All models trained in {t.dt:.2f}s")

Model hgb fit in 5.33s, valid MAE=1.4205
Model rf fit in 45.82s, valid MAE=1.7267
Model rf fit in 45.82s, valid MAE=1.7267
Model et fit in 27.22s, valid MAE=1.5084
Model ridge fit in 0.08s, valid MAE=1.5388
Model et fit in 27.22s, valid MAE=1.5084
Model ridge fit in 0.08s, valid MAE=1.5388
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1907
[LightGBM] [Info] Number of data points in the train set: 250000, number of used features: 32
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1907
[LightGBM] [Info] Number of data points in the train set: 250000, number of used features: 32
Model lgbm fit in 13.78s, valid MAE=1.3514
All models trained in 92.23s
Model lgbm fit in 13.78s, valid MAE=1.3514
All models tra

In [10]:
# 9) Blend weights via small simplex grid search
from itertools import product

model_names = list(val_preds.keys())
P = np.vstack([val_preds[m] for m in model_names]).T  # shape (n, M)

best_mae = float('inf')
best_w = None
best_combo = None

steps = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for w in product(steps, repeat=len(model_names)):
    if abs(sum(w) - 1.0) > 1e-9:
        continue
    y_blend = (P * np.array(w)).sum(axis=1)
    mae = mean_absolute_error(y_va, y_blend)
    if mae < best_mae:
        best_mae = mae
        best_w = np.array(w)
        best_combo = dict(zip(model_names, w))

print("Best ensemble MAE=", round(best_mae, 4))
print("Selected weights:", best_combo)

# MAE formula for reporting
print("MAE definition: MAE = (1/n) * sum(|y_i - yhat_i|)")

Best ensemble MAE= 1.3514
Selected weights: {'hgb': 0.0, 'rf': 0.0, 'et': 0.0, 'ridge': 0.0, 'lgbm': 1.0}
MAE definition: MAE = (1/n) * sum(|y_i - yhat_i|)


In [11]:
# 10) Retrain base models on full train
fit_models_full = {}
with Timer() as t:
    for name, model in models:
        with Timer() as tm:
            if name == 'lgbm' and HAS_LGBM:
                model.set_params(n_estimators=2000, learning_rate=0.025)
                model.fit(
                    Xt_full, y['p0q0'].values,
                    eval_set=[(Xt_va, y_va)],
                    eval_metric='l1',
                    callbacks=[lgb.early_stopping(50, verbose=False)]
                )
            else:
                model.fit(Xt_full, y['p0q0'].values)
        fit_models_full[name] = model
        print(f"Refit {name} in {tm.dt:.2f}s")
print(f"Refit all models in {t.dt:.2f}s")

Refit hgb in 8.45s
Refit rf in 221.90s
Refit rf in 221.90s
Refit et in 174.70s
Refit ridge in 0.16s
Refit et in 174.70s
Refit ridge in 0.16s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 667264, number of used features: 32
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1950
[LightGBM] [Info] Number of data points in the train set: 667264, number of used features: 32
Refit lgbm in 70.48s
Refit all models in 475.69s
Refit lgbm in 70.48s
Refit all models in 475.69s


In [12]:
# 11) Inference on Test, Post-process
with Timer() as t:
    preds_test_members = {}
    for name, model in fit_models_full.items():
        preds_test_members[name] = model.predict(Xt_test)
    # Blend
    order = list(fit_models_full.keys())
    w = np.array([best_combo.get(m, 0.0) for m in order])
    w = w / (w.sum() if w.sum() > 0 else 1.0)
    Ptest = np.vstack([preds_test_members[m] for m in order]).T
    y_test_pred = (Ptest * w).sum(axis=1)

    # Round to integer and optional clip
    y_min, y_max = int(np.floor(y['p0q0'].min())), int(np.ceil(y['p0q0'].max()))
    do_clip = True
    y_submit = np.rint(y_test_pred)
    if do_clip:
        y_submit = np.clip(y_submit, y_min, y_max)

print(f"Inference + post-process in {t.dt:.2f}s. Pred range before rounding: [{y_test_pred.min():.2f}, {y_test_pred.max():.2f}] -> after: [{y_submit.min():.0f}, {y_submit.max():.0f}] with clip to [{y_min},{y_max}]")

Inference + post-process in 1.19s. Pred range before rounding: [-3.00, 3.02] -> after: [-3, 3] with clip to [-160,15]


In [15]:
# 12) Write Submission File y_test.csv
from pathlib import Path

out_path = Path(data_dir) / 'y_test.csv'
sub = pd.DataFrame({'p0q0': y_submit.astype(int)})
sub.to_csv(out_path, index=True, index_label='id')
print(f"Wrote submission to: {out_path}")
print(sub.head())

Wrote submission to: \\wsl.localhost\Ubuntu-24.04\home\utharushan\ChallengeData\SNCF-Transilien-challenge\y_test.csv
   p0q0
0     0
1     0
2     0
3     0
4     0


In [14]:
# 13) Assertions and runtime reporting
assert len(sub) == len(X_test), (len(sub), len(X_test))
assert np.allclose(sub['p0q0'].values, np.rint(sub['p0q0']).values), "Submission must be integers after rounding"

# Print per-model MAE and ensemble
for name, pred in val_preds.items():
    print(f"Valid MAE {name}: {mean_absolute_error(y_va, pred):.4f}")
print(f"Ensemble MAE: {best_mae:.4f}")
print("Weights:", best_combo)

print("Done.")

Valid MAE hgb: 1.4205
Valid MAE rf: 1.7267
Valid MAE et: 1.5084
Valid MAE ridge: 1.5388
Valid MAE lgbm: 1.3514
Ensemble MAE: 1.3514
Weights: {'hgb': 0.0, 'rf': 0.0, 'et': 0.0, 'ridge': 0.0, 'lgbm': 1.0}
Done.
