# SNCF Transilien: Fast Ensemble for p0q0 (MAE)

This notebook trains a fast ensemble to predict p0q0 (difference between theoretical and realized waiting time) and exports `y_test.csv`. It prioritizes speed and competitive MAE using simple, leakage-safe features and an efficient blend of models.

In [2]:
# Imports and robust data_dir resolution
import os
from pathlib import Path
import warnings

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

# Reproducibility
RANDOM_STATE = 42

# LightGBM availability (optional fast model)
try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

# --- Robust path detection ---
# Prefer forward-slash UNC to avoid backslash escaping issues on Windows
candidate_dirs = [
    Path('//wsl.localhost/Ubuntu-24.04/home/utharushan/ChallengeData/SNCF-Transilien-challenge'),
    Path.cwd(),
]

required_files = {'x_train_final.csv', 'x_test_final.csv'}

chosen: Path | None = None
for cand in candidate_dirs:
    try:
        if cand.exists() and cand.is_dir():
            present = {p.name for p in cand.iterdir() if p.is_file()}
            if required_files.issubset(present):
                chosen = cand
                break
    except OSError:
        # Inaccessible path (e.g., network/unmounted), skip
        continue

if chosen is None:
    # If not all required files found, still try to pick a dir that at least exists
    for cand in candidate_dirs:
        if cand.exists() and cand.is_dir():
            chosen = cand
            break

if chosen is None:
    raise FileNotFoundError(
        'Aucun répertoire de données valide trouvé. Vérifiez le chemin des fichiers CSV. '
        'Essayé: ' + ' | '.join(str(p) for p in candidate_dirs)
    )

# Ensure the two required files exist here
missing = [fname for fname in required_files if not (chosen / fname).exists()]
if missing:
    raise FileNotFoundError(
        f"Fichiers manquants dans {chosen}: {missing}.\n"
        "Assurez-vous que 'x_train_final.csv' et 'x_test_final.csv' sont bien présents."
    )

# Select y_train file by pattern
y_candidates = sorted((p for p in chosen.glob('y_train*.csv')), key=lambda p: p.name)
if not y_candidates:
    raise FileNotFoundError(
        f"Aucun fichier y_train* trouvé dans {chosen}. Placez le fichier de vérité terrain dans ce dossier."
    )

# Export variables used later
data_dir = str(chosen)
train_path = os.path.join(data_dir, 'x_train_final.csv')
test_path = os.path.join(data_dir, 'x_test_final.csv')
y_train_path = str(y_candidates[0])

print(f"Using data_dir: {data_dir}")
print(f"Detected y_train file: {Path(y_train_path).name}")

Using data_dir: \\wsl.localhost\Ubuntu-24.04\home\utharushan\ChallengeData\SNCF-Transilien-challenge
Detected y_train file: y_train_final_j5KGWWK.csv


In [3]:
# 2) Load Data (train/test) from CSV
with Timer() as t:
    X = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    y = pd.read_csv(y_train_path)
print(f"Loaded data in {t.dt:.2f}s: X={X.shape}, X_test={X_test.shape}, y={y.shape}")

# Align and clean columns
# Remove potential unnamed index columns
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]

# y column may be named p0q0 or similar; ensure it's named 'p0q0'
if y.shape[1] == 1:
    y.columns = ['p0q0']
else:
    # If there's an index + a single target col, take the last
    y = y.iloc[:, [-1]]
    y.columns = ['p0q0']

# Coerce dtypes
for col in ['train', 'gare']:
    if col in X.columns:
        X[col] = X[col].astype(str)
        if col in X_test.columns:
            X_test[col] = X_test[col].astype(str)

if 'date' in X.columns:
    X['date'] = pd.to_datetime(X['date'])
if 'date' in X_test.columns:
    X_test['date'] = pd.to_datetime(X_test['date'])

if 'arret' in X.columns:
    X['arret'] = X['arret'].astype(int)
if 'arret' in X_test.columns:
    X_test['arret'] = X_test['arret'].astype(int)

lag_cols = ['p2q0','p3q0','p4q0','p0q2','p0q3','p0q4']
for c in lag_cols:
    if c in X.columns:
        X[c] = X[c].astype(float)
    if c in X_test.columns:
        X_test[c] = X_test[c].astype(float)

# Ensure train/test columns match except target
feature_cols = [c for c in X.columns if c != 'p0q0']
print("Train columns:", feature_cols)
print("Test columns:", list(X_test.columns))

Loaded data in 0.81s: X=(667264, 12), X_test=(20657, 11), y=(667264, 2)
Train columns: ['train', 'gare', 'date', 'arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4']
Test columns: ['Unnamed: 0', 'train', 'gare', 'date', 'arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4']


In [4]:
# 3) Basic Data Checks and Sanity Validations
print(X.head(3))
print(X_test.head(3))
print("y head:\n", y.head())

print("Missing in X:\n", X.isna().sum())
print("Missing in X_test:\n", X_test.isna().sum())

# Sanity: ensure target length matches X rows
assert len(y) == len(X), (len(y), len(X))

# Unique counts
for col in ['train','gare']:
    if col in X.columns:
        print(f"Unique {col}: train={X[col].nunique()} test={X_test[col].nunique()}")

if 'date' in X.columns:
    print("Date range train:", X['date'].min(), X['date'].max())
    print("Date range test:", X_test['date'].min(), X_test['date'].max())

print('Target stats:', y['p0q0'].describe())

    train gare       date  arret  p2q0  p3q0  p4q0  p0q2  p0q3  p0q4
0  VBXNMF  KYF 2023-04-03      8   0.0   0.0   1.0  -3.0  -1.0  -2.0
1  VBXNMF  JLR 2023-04-03      9   0.0   0.0   0.0   1.0   0.0   1.0
2  VBXNMF  EOH 2023-04-03     10  -1.0   0.0   0.0  -1.0   0.0   0.0
   Unnamed: 0   train gare       date  arret  p2q0  p3q0  p4q0  p0q2  p0q3  p0q4
0           0  ZPQEKP  VXY 2023-11-13     12   0.0   0.0  -2.0  -4.0  -2.0  -4.0
1           1  KIQSRA  VXY 2023-11-13     12   0.0   0.0  -1.0   1.0  -1.0   0.0
2           2  QQJYYT  VXY 2023-11-13     12   0.0   1.0  -1.0   1.0  -1.0   1.0
y head:
    p0q0
0  -1.0
1  -1.0
2  -1.0
3   1.0
4   3.0
Missing in X:
 train    0
gare     0
date     0
arret    0
p2q0     0
p3q0     0
p4q0     0
p0q2     0
p0q3     0
p0q4     0
dtype: int64
Missing in X_test:
 Unnamed: 0    0
train         0
gare          0
date          0
arret         0
p2q0          0
p3q0          0
p4q0          0
p0q2          0
p0q3          0
p0q4          0
dtype: in

In [16]:
# 3) Feature engineering helpers and application
from contextlib import contextmanager
import time

@contextmanager
def Timer(msg: str):
    t0 = time.time()
    yield
    print(f"{msg} in {time.time()-t0:.2f}s")

def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d['year'] = d['date'].dt.year.astype('int16')
    d['month'] = d['date'].dt.month.astype('int8')
    d['day'] = d['date'].dt.day.astype('int8')
    d['weekday'] = d['date'].dt.weekday.astype('int8')
    d['weekofyear'] = d['date'].dt.isocalendar().week.astype('int16')
    d['dayofyear'] = d['date'].dt.dayofyear.astype('int16')
    # Cyclical encodings for seasonality
    for col, mod in [('month', 12), ('weekday', 7), ('weekofyear', 53)]:
        ang = 2 * np.pi * (d[col].astype(float) / mod)
        d[f'{col}_sin'] = np.sin(ang).astype('float32')
        d[f'{col}_cos'] = np.cos(ang).astype('float32')
    return d

def add_gare_aggregates(train_df: pd.DataFrame, test_df: pd.DataFrame, keys=('gare',)):
    # Leakage-safe aggregates computed only from train
    cols = ['p2q0','p3q0','p4q0','p0q2','p0q3','p0q4']
    aggs = {}
    for c in cols:
        aggs[c] = ['mean','median','std','min','max']
    agg_df = train_df.groupby(list(keys))[cols].agg(aggs)
    agg_df.columns = ['_'.join([c, stat]) for c, stat in agg_df.columns]
    agg_df = agg_df.reset_index()
    for df in (train_df, test_df):
        df = df.merge(agg_df, on=list(keys), how='left')
    return train_df.merge(agg_df, on=list(keys), how='left'), test_df.merge(agg_df, on=list(keys), how='left'), agg_df

def add_lag_interactions(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    lags = ['p2q0','p3q0','p4q0','p0q2','p0q3','p0q4']
    # Absolute values and signs
    for c in lags:
        d[f'{c}_abs'] = d[c].abs().astype('float32')
        d[f'{c}_sign'] = np.sign(d[c]).astype('int8')
    # Pairwise sums and differences (limited to a few to avoid explosion)
    pairs = [('p2q0','p3q0'), ('p3q0','p4q0'), ('p0q2','p0q3'), ('p0q3','p0q4')]
    for a,b in pairs:
        d[f'{a}_plus_{b}'] = (d[a] + d[b]).astype('float32')
        d[f'{a}_minus_{b}'] = (d[a] - d[b]).astype('float32')
    return d

with Timer('Feature engineering'):
    X_fe = add_date_features(X)
    X_test_fe = add_date_features(X_test)
    X_fe, X_test_fe, agg_df = add_gare_aggregates(X_fe, X_test_fe, keys=('gare',))
    X_fe = add_lag_interactions(X_fe)
    X_test_fe = add_lag_interactions(X_test_fe)

# Build feature/target matrices
all_cols = [c for c in X_fe.columns if c not in ['date']]
feature_cols = [c for c in all_cols if c != 'p0q0']
lag_cols = ['p2q0','p3q0','p4q0','p0q2','p0q3','p0q4']

categorical_cols = ['train','gare']
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

print(f"Numeric cols ( {len(numeric_cols)} ): {numeric_cols[:20]} ...")
print(f"Categorical cols ( {len(categorical_cols)} ): {categorical_cols}")

Feature engineering in 2.65s
Numeric cols ( 87 ): ['arret', 'p2q0', 'p3q0', 'p4q0', 'p0q2', 'p0q3', 'p0q4', 'year', 'month', 'day', 'weekday', 'weekofyear', 'dayofyear', 'p2q0_mean_x', 'p2q0_median_x', 'p2q0_std_x', 'p3q0_mean_x', 'p3q0_median_x', 'p3q0_std_x', 'p4q0_mean_x'] ...
Categorical cols ( 2 ): ['train', 'gare']


In [17]:
# 4) Preprocessing and date-aware split
from sklearn.pipeline import Pipeline

# Update preprocessing with new features
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop'
)

# Use the last day as validation; also prepare for CV later
last_date = X_fe['date'].max()
valid_mask = X_fe['date'] == last_date
train_mask = ~valid_mask

X_tr, X_va = X_fe.loc[train_mask, feature_cols], X_fe.loc[valid_mask, feature_cols]
y_tr, y_va = y.loc[train_mask, 'p0q0'].values, y.loc[valid_mask, 'p0q0'].values

# No downsampling now; we go full training for stronger models
with Timer('Preprocessing fit+transform'):
    Xt_tr = preprocessor.fit_transform(X_tr)
    Xt_va = preprocessor.transform(X_va)
    Xt_full = preprocessor.fit_transform(X_fe[feature_cols])
    Xt_test = preprocessor.transform(X_test_fe[feature_cols])

print(
    f"Split: time-based split by date, valid_date={last_date.date()}\n"
    f"Shapes: tr={Xt_tr.shape}, va={Xt_va.shape}, full={Xt_full.shape}, test={Xt_test.shape}"
)

Preprocessing fit+transform in 14.18s
Split: time-based split by date, valid_date=2023-11-10
Shapes: tr=(660238, 89), va=(7026, 89), full=(667264, 89), test=(20657, 89)


In [7]:
# 6) Train/Validation Split (date-aware)
# If multiple dates, use the latest date as validation; else k-fold fallback
if 'date' in X.columns and X['date'].nunique() > 1:
    last_date = X['date'].max()
    train_mask = X['date'] < last_date
    valid_mask = X['date'] == last_date
    X_tr, X_va = X.loc[train_mask].copy(), X.loc[valid_mask].copy()
    y_tr, y_va = y.loc[train_mask, 'p0q0'].values, y.loc[valid_mask, 'p0q0'].values
    split_desc = f"time-based split by date, valid_date={last_date.date()}"
else:
    X_tr, X_va, y_tr, y_va = train_test_split(X, y['p0q0'].values, test_size=0.1, random_state=RANDOM_STATE)
    split_desc = "random 90/10 split (no multiple dates)"

# Optional downsampling of training rows for speed if extremely large
MAX_TRAIN_ROWS = 250_000  # cap to ensure quick training while keeping good signal
if len(X_tr) > MAX_TRAIN_ROWS:
    idx = np.random.RandomState(RANDOM_STATE).choice(len(X_tr), size=MAX_TRAIN_ROWS, replace=False)
    X_tr = X_tr.iloc[idx]
    y_tr = y_tr[idx]
    print(f"Downsampled training split to {len(X_tr):,} rows for speed")

print("Split:", split_desc)

with Timer() as t:
    Xt_tr = preprocessor.fit_transform(X_tr)
    Xt_va = preprocessor.transform(X_va)
    Xt_full = preprocessor.fit_transform(X)  # for refit later
    Xt_test = preprocessor.transform(X_test)
print(f"Preprocessing fit+transform in {t.dt:.2f}s; Shapes: tr={Xt_tr.shape}, va={Xt_va.shape}, full={Xt_full.shape}, test={Xt_test.shape}")

Downsampled training split to 250,000 rows for speed
Split: time-based split by date, valid_date=2023-11-10
Preprocessing fit+transform in 2.80s; Shapes: tr=(250000, 33), va=(7026, 33), full=(667264, 33), test=(20657, 33)
Preprocessing fit+transform in 2.80s; Shapes: tr=(250000, 33), va=(7026, 33), full=(667264, 33), test=(20657, 33)


In [18]:
# 6) Define models (stronger configs) and containers
models = []
# HistGradientBoosting tuned for MAE
models.append(('hgb', HistGradientBoostingRegressor(
    loss='absolute_error', learning_rate=0.05, max_depth=7, max_iter=400,
    l2_regularization=0.0, early_stopping=True, random_state=RANDOM_STATE
)))
# RandomForest, deeper with more trees
models.append(('rf', RandomForestRegressor(
    n_estimators=600, max_depth=18, min_samples_leaf=1, n_jobs=-1,
    bootstrap=True, random_state=RANDOM_STATE
)))
# ExtraTrees, strong
models.append(('et', ExtraTreesRegressor(
    n_estimators=600, max_depth=22, min_samples_leaf=1, n_jobs=-1,
    bootstrap=False, random_state=RANDOM_STATE
)))
# Ridge as a linear baseline on engineered features
models.append(('ridge', Ridge(alpha=1.0, random_state=RANDOM_STATE)))

# Optional LightGBM
if HAS_LGBM:
    model = lgb.LGBMRegressor(
        objective='mae',
        learning_rate=0.05,
        n_estimators=3000,
        max_depth=-1,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
    models.append(('lgbm', model))

model_names = [name for name, _ in models]
print('Models:', model_names)

Models: ['hgb', 'rf', 'et', 'ridge', 'lgbm']


In [21]:
# 7) Time-based CV (lighter): only HGB + LGBM, 2 folds, OOF + test averaging
from sklearn.model_selection import TimeSeriesSplit

# Keep CV light: only strongest, fastest models
cv_model_names = [name for name, _ in models if name in ('hgb', 'lgbm') and (name != 'lgbm' or HAS_LGBM)]

n_splits = 2
cv = TimeSeriesSplit(n_splits=n_splits)

val_preds = {name: np.zeros_like(y_va, dtype=float) for name in cv_model_names}
P = {name: np.zeros(Xt_full.shape[0], dtype=float) for name in cv_model_names}
Ptest = {name: np.zeros(Xt_test.shape[0], dtype=float) for name in cv_model_names}

# Prepare global categorical categories to ensure matching across train/valid/test (for LGBM)
cat_categories = {}
for c in categorical_cols:
    cats = pd.Categorical(pd.concat([
        X_fe[c].astype('string'),
        X_test_fe[c].astype('string')
    ], ignore_index=True)).categories
    cat_categories[c] = cats

def set_cats(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    for c in categorical_cols:
        d[c] = pd.Categorical(d[c].astype('string'), categories=cat_categories[c], ordered=False)
    return d

# Fit on full matrix, but we’ll create folds using training rows indices
train_idx = np.where(train_mask.values)[0]
valid_idx = np.where(valid_mask.values)[0]

with Timer(f'{n_splits}-fold time CV (HGB+LGBM)'):
    for fold, (tr_idx_rel, va_idx_rel) in enumerate(cv.split(Xt_tr), 1):
        tr_idx = train_idx[tr_idx_rel]
        va_idx = train_idx[va_idx_rel]
        Xtr, Xva = Xt_full[tr_idx], Xt_full[va_idx]
        ytr, yva = y['p0q0'].values[tr_idx], y['p0q0'].values[va_idx]
        for name, model in models:
            if name not in cv_model_names:
                continue
            if name == 'lgbm' and HAS_LGBM:
                # Fit LightGBM on raw engineered pandas with consistent categories
                train_pd = set_cats(X_fe.iloc[tr_idx][feature_cols])
                valid_pd = set_cats(X_fe.iloc[va_idx][feature_cols])
                test_pd = set_cats(X_test_fe[feature_cols])
                model.set_params(n_estimators=2000)  # keep CV fast
                model.fit(
                    train_pd, ytr,
                    eval_set=[(valid_pd, yva)],
                    eval_metric='l1',
                    categorical_feature=categorical_cols,
                    callbacks=[lgb.early_stopping(50, verbose=False)]
                )
                pv = model.predict(valid_pd)
                pt = model.predict(test_pd)
            else:
                # Sklearn path using preprocessed arrays
                model.fit(Xtr, ytr)
                pv = model.predict(Xva)
                pt = model.predict(Xt_test)
            P[name][va_idx] = pv
            Ptest[name] += pt / n_splits

# Validation slice predictions (last-date indices)
for name in cv_model_names:
    val_preds[name] = P[name][valid_idx]
    print(f"Model {name} val MAE: {mean_absolute_error(y_va, val_preds[name]):.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15632
[LightGBM] [Info] Number of data points in the train set: 220080, number of used features: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047828 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15632
[LightGBM] [Info] Number of data points in the train set: 220080, number of used features: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26711
[LightGBM] [Info] Number of data points in the train set: 440159, number of used features: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084127 seconds.
You can set `force_col_

In [22]:
# 8) Blend OOF/validation predictions: simplex grid + meta stacking (optional)
from itertools import product

# Restrict to models actually validated
order = list(val_preds.keys())
mae = lambda w: mean_absolute_error(y_va, sum(w[i]*val_preds[name] for i, name in enumerate(order)))

best_w = None
best_mae = 1e9
step = 0.1

# Simplex grid (sum to 1)
for w in product(np.arange(0, 1+1e-9, step), repeat=len(order)):
    if abs(sum(w) - 1.0) > 1e-6:
        continue
    m = mae(w)
    if m < best_mae:
        best_mae = m
        best_w = np.array(w)

print('Best ensemble MAE=', round(best_mae, 4))
print('Selected weights:', {name: float(best_w[i]) for i, name in enumerate(order)})
print('MAE definition: MAE = (1/n) * sum(|y_i - yhat_i|)')

# Optional: simple meta-learner on validation slice
try:
    from sklearn.linear_model import HuberRegressor
    meta = HuberRegressor(alpha=0.0005)
    Z_va = np.column_stack([val_preds[name] for name in order])
    meta.fit(Z_va, y_va)
    meta_pred = meta.predict(Z_va)
    meta_mae = mean_absolute_error(y_va, meta_pred)
    print(f'Meta-learner (Huber) MAE on val: {meta_mae:.4f}')
    use_meta = meta_mae < best_mae
except Exception:
    use_meta = False


Best ensemble MAE= 1.5915
Selected weights: {'hgb': 0.0, 'lgbm': 1.0}
MAE definition: MAE = (1/n) * sum(|y_i - yhat_i|)
Meta-learner (Huber) MAE on val: 1.6300


In [24]:
# 9) Refit contributing models on full data and predict test (aligned with selection)
fit_models_full = {}
preds_test_members = {}

if use_meta:
    Z_test = np.column_stack([Ptest[name] for name in order])
    meta_full = HuberRegressor(alpha=0.0005)
    meta_full.fit(Z_va, y_va)
    y_test_pred = meta_full.predict(Z_test)
    preds_test_members = {name: Ptest[name] for name in order}
else:
    contrib = [name for i,name in enumerate(order) if best_w[i] > 0]
    for name, model in models:
        if name not in contrib:
            continue
        if name == 'lgbm' and HAS_LGBM:
            train_pd = set_cats(X_fe[feature_cols].copy())
            model.fit(train_pd, y['p0q0'].values, categorical_feature=categorical_cols)
            test_pd = set_cats(X_test_fe[feature_cols].copy())
            preds_test_members[name] = model.predict(test_pd)
        elif name == 'hgb':
            model.fit(Xt_full, y['p0q0'].values)
            preds_test_members[name] = model.predict(Xt_test)
    y_test_pred = np.zeros(Xt_test.shape[0], dtype=float)
    for i, name in enumerate(order):
        if best_w[i] > 0:
            y_test_pred += best_w[i] * preds_test_members[name]

print('Contributing members:', [k for k in preds_test_members.keys()])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35442
[LightGBM] [Info] Number of data points in the train set: 667264, number of used features: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35442
[LightGBM] [Info] Number of data points in the train set: 667264, number of used features: 88
Contributing members: ['lgbm']
Contributing members: ['lgbm']


In [25]:
# 10) Post-processing and prepare submission
# Round to nearest int; clip to observed training range
y_min, y_max = int(np.floor(y['p0q0'].min())), int(np.ceil(y['p0q0'].max()))
y_submit = np.rint(y_test_pred).astype(int)
do_clip = True
if do_clip:
    y_submit = np.clip(y_submit, y_min, y_max)

print(f"Pred range before rounding: [{y_test_pred.min():.2f}, {y_test_pred.max():.2f}] -> after: [{y_submit.min()}, {y_submit.max()}] with clip to [{y_min},{y_max}]")

Pred range before rounding: [-4.08, 2.98] -> after: [-4, 3] with clip to [-160,15]


In [26]:
# 12) Write Submission File y_test.csv
from pathlib import Path

out_path = Path(data_dir) / 'y_test.csv'
sub = pd.DataFrame({'p0q0': y_submit.astype(int)})
sub.to_csv(out_path, index=True, index_label='id')
print(f"Wrote submission to: {out_path}")
print(sub.head())

Wrote submission to: \\wsl.localhost\Ubuntu-24.04\home\utharushan\ChallengeData\SNCF-Transilien-challenge\y_test.csv
   p0q0
0     0
1     0
2     0
3     0
4     0


In [None]:
# 13) Logs
for name in model_names:
    if name in val_preds:
        print(f"Valid MAE {name}: {mean_absolute_error(y_va, val_preds[name]):.4f}")
print(f"Ensemble MAE: {best_mae:.4f}")
print(f"Weights: {{name: float(best_w[i]) for i, name in enumerate(model_names)}}")
print('Meta used:', use_meta)
print('Done.')

Valid MAE hgb: 1.4205
Valid MAE rf: 1.7267
Valid MAE et: 1.5084
Valid MAE ridge: 1.5388
Valid MAE lgbm: 1.3514
Ensemble MAE: 1.3514
Weights: {'hgb': 0.0, 'rf': 0.0, 'et': 0.0, 'ridge': 0.0, 'lgbm': 1.0}
Done.
