# Shinkansen Passenger Satisfaction — V2

**Goal:** Beat 0.9584855 accuracy (1st place). Current best: 0.9555643 (3rd place).

**Strategy:** Optuna-tuned LightGBM + XGBoost + CatBoost → stacking ensemble → threshold optimization → seed averaging.

In [1]:
# Cell 1: Imports & Configuration

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import stats

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

DATA_DIR = '.'

# --- GPU Configuration ---
# Set USE_GPU = False if you don't have a CUDA-capable GPU.
# LightGBM stays on CPU (pip build lacks CUDA support).
# XGBoost and CatBoost use GPU when available.
USE_GPU = True

XGB_DEVICE = {'device': 'cuda'} if USE_GPU else {}
CB_DEVICE = {'task_type': 'GPU'} if USE_GPU else {}

print(f'GPU enabled: {USE_GPU}')
print('Imports OK')

GPU enabled: True
Imports OK


In [2]:
# Cell 2: Data Loading & Merging

travel_train = pd.read_csv(f'{DATA_DIR}/Traveldata_train_(1).csv')
survey_train = pd.read_csv(f'{DATA_DIR}/Surveydata_train_(1).csv')
travel_test = pd.read_csv(f'{DATA_DIR}/Traveldata_test_(1).csv')
survey_test = pd.read_csv(f'{DATA_DIR}/Surveydata_test_(1).csv')

train = travel_train.merge(survey_train, on='ID', how='inner')
test = travel_test.merge(survey_test, on='ID', how='inner')

target_col = 'Overall_Experience'
y_train = train[target_col].values
test_ids = test['ID'].values

print(f'Train: {train.shape}, Test: {test.shape}')
print(f'Target distribution: {pd.Series(y_train).value_counts(normalize=True).to_dict()}')
print(f'\nMissing values (train):')
missing = train.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False))

Train: (94379, 25), Test: (35602, 24)
Target distribution: {1: 0.5466576251072802, 0: 0.4533423748927198}

Missing values (train):
Type_Travel                9226
Customer_Type              8951
Arrival_Time_Convenient    8930
Catering                   8741
Onboard_Service            7601
Arrival_Delay_in_Mins       357
Baggage_Handling            142
Online_Support               91
Legroom                      90
Gender                       77
CheckIn_Service              77
Ease_of_Online_Booking       73
Seat_Comfort                 61
Departure_Delay_in_Mins      57
Age                          33
Platform_Location            30
Onboard_Wifi_Service         30
Onboard_Entertainment        18
Cleanliness                   6
Online_Boarding               6
dtype: int64


In [3]:
# Cell 3: Encoding

# Ordinal survey columns (same scale)
survey_cols = [
    'Seat_Comfort', 'Arrival_Time_Convenient', 'Catering',
    'Onboard_Wifi_Service', 'Onboard_Entertainment', 'Online_Support',
    'Ease_of_Online_Booking', 'Onboard_Service', 'Legroom',
    'Baggage_Handling', 'CheckIn_Service', 'Cleanliness', 'Online_Boarding'
]
ordinal_map = {
    'Extremely Poor': 0, 'Poor': 1, 'Needs Improvement': 2,
    'Acceptable': 3, 'Good': 4, 'Excellent': 5
}

# Platform_Location has its own scale
platform_map = {
    'Very Inconvenient': 0, 'Inconvenient': 1, 'Needs Improvement': 2,
    'Manageable': 3, 'Convenient': 4, 'Very Convenient': 5
}

# Nominal categoricals
nominal_cols = ['Gender', 'Customer_Type', 'Type_Travel', 'Travel_Class', 'Seat_Class']

def encode_data(df):
    df = df.copy()
    for col in survey_cols:
        df[col] = df[col].map(ordinal_map)
    df['Platform_Location'] = df['Platform_Location'].map(platform_map)
    return df

train_enc = encode_data(train)
test_enc = encode_data(test)

# Label encode nominal categoricals (fit on combined to handle unseen labels)
label_encoders = {}
for col in nominal_cols:
    le = LabelEncoder()
    combined = pd.concat([train_enc[col], test_enc[col]], axis=0).astype(str)
    le.fit(combined)
    train_enc[col] = le.transform(train_enc[col].astype(str))
    test_enc[col] = le.transform(test_enc[col].astype(str))
    label_encoders[col] = le

# Restore NaN where original was NaN (LabelEncoder converts 'nan' to an int)
for col in nominal_cols:
    train_enc.loc[train[col].isna(), col] = np.nan
    test_enc.loc[test[col].isna(), col] = np.nan

print('Encoding done')

Encoding done


In [4]:
# Cell 4: Enhanced Feature Engineering

all_survey_cols = survey_cols + ['Platform_Location']  # all 14 ordinal columns

def engineer_features(df):
    df = df.copy()
    
    survey_data = df[all_survey_cols]
    
    # --- V1 features ---
    df['survey_mean'] = survey_data.mean(axis=1)
    df['survey_std'] = survey_data.std(axis=1)
    df['survey_min'] = survey_data.min(axis=1)
    df['survey_max'] = survey_data.max(axis=1)
    df['survey_range'] = df['survey_max'] - df['survey_min']
    df['high_ratings_count'] = (survey_data >= 4).sum(axis=1)
    df['low_ratings_count'] = (survey_data <= 1).sum(axis=1)
    df['total_delay'] = df['Departure_Delay_in_Mins'].fillna(0) + df['Arrival_Delay_in_Mins'].fillna(0)
    df['delay_diff'] = df['Arrival_Delay_in_Mins'].fillna(0) - df['Departure_Delay_in_Mins'].fillna(0)
    df['has_delay'] = ((df['Departure_Delay_in_Mins'].fillna(0) > 0) | (df['Arrival_Delay_in_Mins'].fillna(0) > 0)).astype(int)
    
    # --- V2 new features ---
    df['survey_median'] = survey_data.median(axis=1)
    df['survey_skew'] = survey_data.apply(lambda row: row.dropna().skew() if row.dropna().shape[0] >= 3 else 0, axis=1)
    df['mid_ratings_count'] = ((survey_data == 2) | (survey_data == 3)).sum(axis=1)
    
    # Delay-to-distance ratio
    df['delay_to_distance'] = df['total_delay'] / (df['Travel_Distance'] + 1)
    
    # Age quantile bins
    df['age_bin'] = pd.qcut(df['Age'], q=10, labels=False, duplicates='drop')
    
    # Missing value indicators (for columns with >1% missing)
    for col in df.columns:
        if df[col].isna().mean() > 0.01:
            df[f'is_missing_{col}'] = df[col].isna().astype(int)
    
    # Interaction features
    df['TypeTravel_x_TravelClass'] = df['Type_Travel'].fillna(-1).astype(int) * 10 + df['Travel_Class'].fillna(-1).astype(int)
    df['CustType_x_SeatClass'] = df['Customer_Type'].fillna(-1).astype(int) * 10 + df['Seat_Class'].fillna(-1).astype(int)
    
    # Survey sum (total satisfaction score)
    df['survey_sum'] = survey_data.sum(axis=1)
    
    return df

train_fe = engineer_features(train_enc)
test_fe = engineer_features(test_enc)

# Drop non-feature columns
drop_cols = ['ID', target_col]
feature_cols = [c for c in train_fe.columns if c not in drop_cols]

X_train = train_fe[feature_cols].values.astype(np.float32)
X_test = test_fe[feature_cols].values.astype(np.float32)

print(f'Features: {len(feature_cols)}')
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')
print(f'\nNew feature columns:')
v2_features = ['survey_median', 'survey_skew', 'mid_ratings_count', 'delay_to_distance',
               'age_bin', 'TypeTravel_x_TravelClass', 'CustType_x_SeatClass', 'survey_sum']
print([c for c in feature_cols if any(c.startswith(v) for v in v2_features + ['is_missing_'])])

Features: 46
X_train: (94379, 46), X_test: (35602, 46)

New feature columns:
['survey_median', 'survey_skew', 'mid_ratings_count', 'delay_to_distance', 'age_bin', 'is_missing_Customer_Type', 'is_missing_Type_Travel', 'is_missing_Arrival_Time_Convenient', 'is_missing_Catering', 'is_missing_Onboard_Service', 'TypeTravel_x_TravelClass', 'CustType_x_SeatClass', 'survey_sum']


In [5]:
# Cell 5: Baseline — Reproduce V1 LightGBM

v1_params = {
    'n_estimators': 2000,
    'learning_rate': 0.03,
    'max_depth': 7,
    'num_leaves': 63,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_samples': 30,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': SEED,
    'verbose': -1,
    'n_jobs': -1
}

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
baseline_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    model = lgb.LGBMClassifier(**v1_params)
    model.fit(X_tr, y_tr)
    
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    baseline_scores.append(acc)
    print(f'Fold {fold+1}: {acc:.6f}')

print(f'\nV1 Baseline CV: {np.mean(baseline_scores):.6f} ± {np.std(baseline_scores):.6f}')

Fold 1: 0.953751
Fold 2: 0.956082
Fold 3: 0.957512
Fold 4: 0.956453
Fold 5: 0.955550

V1 Baseline CV: 0.955869 ± 0.001239


In [6]:
# Cell 6: Optuna Hyperparameter Tuning — LightGBM

def lgb_objective(trial):
    params = {
        'n_estimators': 5000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 127),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'random_state': SEED,
        'verbose': -1,
        'n_jobs': -1
    }
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)]
        )
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))
    
    return np.mean(scores)

study_lgb = optuna.create_study(direction='maximize', study_name='lgb')
study_lgb.optimize(lgb_objective, n_trials=100, show_progress_bar=True)

print(f'\nBest LightGBM CV: {study_lgb.best_value:.6f}')
print(f'Best params: {study_lgb.best_params}')

best_lgb_params = study_lgb.best_params
best_lgb_params.update({'n_estimators': 5000, 'random_state': SEED, 'verbose': -1, 'n_jobs': -1})

Best trial: 52. Best value: 0.957385: 100%|██████████| 100/100 [55:24<00:00, 33.24s/it]


Best LightGBM CV: 0.957385
Best params: {'learning_rate': 0.02006130404098335, 'max_depth': 10, 'num_leaves': 82, 'subsample': 0.6363020221407356, 'colsample_bytree': 0.6163799680541543, 'min_child_samples': 14, 'reg_alpha': 0.4959859942324878, 'reg_lambda': 0.028694549312225034}





In [7]:
# Cell 7: Optuna Hyperparameter Tuning — XGBoost (GPU-accelerated)

def xgb_objective(trial):
    params = {
        'n_estimators': 5000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'random_state': SEED,
        'eval_metric': 'error',
        'verbosity': 0,
        'n_jobs': -1,
        'tree_method': 'hist',
        **XGB_DEVICE,  # GPU: adds device='cuda' when USE_GPU=True
    }
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        model = xgb.XGBClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))
    
    return np.mean(scores)

study_xgb = optuna.create_study(direction='maximize', study_name='xgb')
study_xgb.optimize(xgb_objective, n_trials=100, show_progress_bar=True)

print(f'\nBest XGBoost CV: {study_xgb.best_value:.6f}')
print(f'Best params: {study_xgb.best_params}')

best_xgb_params = study_xgb.best_params
best_xgb_params.update({'n_estimators': 5000, 'random_state': SEED, 'eval_metric': 'error', 'verbosity': 0, 'n_jobs': -1, 'tree_method': 'hist', **XGB_DEVICE})

Best trial: 81. Best value: 0.957321: 100%|██████████| 100/100 [59:09<00:00, 35.49s/it]


Best XGBoost CV: 0.957321
Best params: {'learning_rate': 0.02629765461385586, 'max_depth': 10, 'subsample': 0.933787944562583, 'colsample_bytree': 0.8418629840543196, 'min_child_weight': 2, 'reg_alpha': 0.015500510498064968, 'reg_lambda': 0.0024044203758951436, 'gamma': 0.38915388203943474}





In [8]:
# Cell 8: Optuna Hyperparameter Tuning — CatBoost (GPU-accelerated)

def cb_objective(trial):
    params = {
        'iterations': 5000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 5.0),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10.0, log=True),
        'random_seed': SEED,
        'verbose': 0,
        'allow_writing_files': False,
        'eval_metric': 'Accuracy',
        'early_stopping_rounds': 100,
        **CB_DEVICE,  # GPU: adds task_type='GPU' when USE_GPU=True
    }
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        model = CatBoostClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=(X_val, y_val),
            verbose=0
        )
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))
    
    return np.mean(scores)

study_cb = optuna.create_study(direction='maximize', study_name='catboost')
study_cb.optimize(cb_objective, n_trials=75, show_progress_bar=True)

print(f'\nBest CatBoost CV: {study_cb.best_value:.6f}')
print(f'Best params: {study_cb.best_params}')

best_cb_params = study_cb.best_params
best_cb_params.update({'iterations': 5000, 'random_seed': SEED, 'verbose': 0, 'allow_writing_files': False, 'eval_metric': 'Accuracy', 'early_stopping_rounds': 100, **CB_DEVICE})

Best trial: 31. Best value: 0.956993: 100%|██████████| 75/75 [26:24<00:00, 21.13s/it]


Best CatBoost CV: 0.956993
Best params: {'learning_rate': 0.06037233501239291, 'depth': 8, 'l2_leaf_reg': 0.24566928171583854, 'border_count': 68, 'bagging_temperature': 0.3576537029657124, 'random_strength': 0.14575253908437555}





In [9]:
# Cell 9: Stacking Ensemble
#
# Generate out-of-fold predictions from each tuned model,
# then train a logistic regression meta-learner on top.

def get_oof_predictions(model_class, params, X, y, X_test, n_folds=N_FOLDS, fit_kwargs=None):
    """Returns (oof_probs, test_probs_avg) for a given model."""
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    oof_probs = np.zeros(len(X))
    test_probs = np.zeros(len(X_test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        model = model_class(**params)
        if fit_kwargs:
            kwargs = {}
            if 'lgb' in fit_kwargs:
                kwargs = {
                    'eval_set': [(X_val, y_val)],
                    'callbacks': [lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)]
                }
            elif 'xgb' in fit_kwargs:
                kwargs = {
                    'eval_set': [(X_val, y_val)],
                    'verbose': False
                }
            elif 'cb' in fit_kwargs:
                kwargs = {
                    'eval_set': (X_val, y_val),
                    'verbose': 0
                }
            model.fit(X_tr, y_tr, **kwargs)
        else:
            model.fit(X_tr, y_tr)
        
        oof_probs[val_idx] = model.predict_proba(X_val)[:, 1]
        test_probs += model.predict_proba(X_test)[:, 1] / n_folds
        
        acc = accuracy_score(y_val, (oof_probs[val_idx] > 0.5).astype(int))
        print(f'  Fold {fold+1}: {acc:.6f}')
    
    return oof_probs, test_probs

# --- LightGBM OOF ---
print('LightGBM OOF:')
oof_lgb, test_lgb = get_oof_predictions(
    lgb.LGBMClassifier, best_lgb_params, X_train, y_train, X_test, fit_kwargs='lgb'
)
print(f'  LGB OOF accuracy: {accuracy_score(y_train, (oof_lgb > 0.5).astype(int)):.6f}\n')

# --- XGBoost OOF ---
print('XGBoost OOF:')
oof_xgb, test_xgb = get_oof_predictions(
    xgb.XGBClassifier, best_xgb_params, X_train, y_train, X_test, fit_kwargs='xgb'
)
print(f'  XGB OOF accuracy: {accuracy_score(y_train, (oof_xgb > 0.5).astype(int)):.6f}\n')

# --- CatBoost OOF ---
print('CatBoost OOF:')
oof_cb, test_cb = get_oof_predictions(
    CatBoostClassifier, best_cb_params, X_train, y_train, X_test, fit_kwargs='cb'
)
print(f'  CB OOF accuracy: {accuracy_score(y_train, (oof_cb > 0.5).astype(int)):.6f}\n')

# --- Meta-learner: Logistic Regression ---
oof_stack = np.column_stack([oof_lgb, oof_xgb, oof_cb])
test_stack = np.column_stack([test_lgb, test_xgb, test_cb])

meta = LogisticRegression(C=1.0, random_state=SEED, max_iter=1000)
meta.fit(oof_stack, y_train)

oof_meta_probs = meta.predict_proba(oof_stack)[:, 1]
meta_acc = accuracy_score(y_train, meta.predict(oof_stack))
print(f'Stacked ensemble OOF accuracy: {meta_acc:.6f}')
print(f'Meta-learner coefficients: {meta.coef_[0]}')

# --- Simple weighted average comparison ---
# Optimize weights via grid search
best_wavg_acc = 0
best_weights = None
for w1 in np.arange(0.1, 0.8, 0.05):
    for w2 in np.arange(0.1, 0.8, 0.05):
        w3 = 1 - w1 - w2
        if w3 < 0.05:
            continue
        avg_probs = w1 * oof_lgb + w2 * oof_xgb + w3 * oof_cb
        acc = accuracy_score(y_train, (avg_probs > 0.5).astype(int))
        if acc > best_wavg_acc:
            best_wavg_acc = acc
            best_weights = (w1, w2, w3)

print(f'\nBest weighted average OOF accuracy: {best_wavg_acc:.6f}')
print(f'Best weights (LGB, XGB, CB): ({best_weights[0]:.2f}, {best_weights[1]:.2f}, {best_weights[2]:.2f})')

LightGBM OOF:
  Fold 1: 0.954651
  Fold 2: 0.957618
  Fold 3: 0.959260
  Fold 4: 0.958095
  Fold 5: 0.957298
  LGB OOF accuracy: 0.957385

XGBoost OOF:
  Fold 1: 0.955181
  Fold 2: 0.957671
  Fold 3: 0.959207
  Fold 4: 0.957936
  Fold 5: 0.956609
  XGB OOF accuracy: 0.957321

CatBoost OOF:
  Fold 1: 0.954016
  Fold 2: 0.957989
  Fold 3: 0.958042
  Fold 4: 0.958042
  Fold 5: 0.956874
  CB OOF accuracy: 0.956993

Stacked ensemble OOF accuracy: 0.957353
Meta-learner coefficients: [2.8281161  1.81549566 4.08693858]

Best weighted average OOF accuracy: 0.957819
Best weights (LGB, XGB, CB): (0.25, 0.40, 0.35)


In [10]:
# Cell 10: Threshold Optimization
#
# The default threshold is 0.5. With mild class imbalance (54.7/45.3),
# a slightly different threshold may improve accuracy.

# Use whichever OOF probabilities performed better: meta-learner or weighted average
use_meta = meta_acc >= best_wavg_acc
if use_meta:
    oof_final = oof_meta_probs
    print('Using meta-learner (logistic regression) probabilities')
else:
    oof_final = best_weights[0] * oof_lgb + best_weights[1] * oof_xgb + best_weights[2] * oof_cb
    print(f'Using weighted average probabilities ({best_weights})')

# Sweep thresholds
thresholds = np.arange(0.40, 0.61, 0.005)
threshold_accs = []
for t in thresholds:
    acc = accuracy_score(y_train, (oof_final > t).astype(int))
    threshold_accs.append(acc)

best_threshold_idx = np.argmax(threshold_accs)
best_threshold = thresholds[best_threshold_idx]
best_threshold_acc = threshold_accs[best_threshold_idx]

print(f'\nDefault (0.5) accuracy:  {accuracy_score(y_train, (oof_final > 0.5).astype(int)):.6f}')
print(f'Best threshold: {best_threshold:.3f} → accuracy: {best_threshold_acc:.6f}')

# Show top thresholds
sorted_idx = np.argsort(threshold_accs)[::-1][:5]
print('\nTop 5 thresholds:')
for i in sorted_idx:
    print(f'  {thresholds[i]:.3f} → {threshold_accs[i]:.6f}')

Using weighted average probabilities ((np.float64(0.25000000000000006), np.float64(0.40000000000000013), np.float64(0.34999999999999987)))

Default (0.5) accuracy:  0.957819
Best threshold: 0.520 → accuracy: 0.957830

Top 5 thresholds:
  0.530 → 0.957830
  0.520 → 0.957830
  0.500 → 0.957819
  0.525 → 0.957819
  0.505 → 0.957819


In [11]:
# Cell 11: Seed Averaging (Variance Reduction)
#
# Re-train the full pipeline with multiple seeds and average test predictions.

seeds = [42, 123, 456, 789, 2024]
all_test_probs = []

for seed in seeds:
    print(f'\n--- Seed {seed} ---')
    
    # Update seeds in params (GPU settings are inherited from best_*_params)
    lgb_params_s = {**best_lgb_params, 'random_state': seed}
    xgb_params_s = {**best_xgb_params, 'random_state': seed}
    cb_params_s = {**best_cb_params, 'random_seed': seed}
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    
    # LGB (CPU — no CUDA support in this build)
    oof_l = np.zeros(len(X_train))
    test_l = np.zeros(len(X_test))
    for train_idx, val_idx in skf.split(X_train, y_train):
        m = lgb.LGBMClassifier(**lgb_params_s)
        m.fit(X_train[train_idx], y_train[train_idx],
              eval_set=[(X_train[val_idx], y_train[val_idx])],
              callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)])
        oof_l[val_idx] = m.predict_proba(X_train[val_idx])[:, 1]
        test_l += m.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # XGB (GPU when USE_GPU=True)
    oof_x = np.zeros(len(X_train))
    test_x = np.zeros(len(X_test))
    for train_idx, val_idx in skf.split(X_train, y_train):
        m = xgb.XGBClassifier(**xgb_params_s)
        m.fit(X_train[train_idx], y_train[train_idx],
              eval_set=[(X_train[val_idx], y_train[val_idx])],
              verbose=False)
        oof_x[val_idx] = m.predict_proba(X_train[val_idx])[:, 1]
        test_x += m.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # CatBoost (GPU when USE_GPU=True)
    oof_c = np.zeros(len(X_train))
    test_c = np.zeros(len(X_test))
    for train_idx, val_idx in skf.split(X_train, y_train):
        m = CatBoostClassifier(**cb_params_s)
        m.fit(X_train[train_idx], y_train[train_idx],
              eval_set=(X_train[val_idx], y_train[val_idx]),
              verbose=0)
        oof_c[val_idx] = m.predict_proba(X_train[val_idx])[:, 1]
        test_c += m.predict_proba(X_test)[:, 1] / N_FOLDS
    
    # Combine this seed's predictions using the method chosen in Cell 10
    if use_meta:
        # Re-fit meta-learner on this seed's OOF
        oof_s = np.column_stack([oof_l, oof_x, oof_c])
        test_s = np.column_stack([test_l, test_x, test_c])
        meta_s = LogisticRegression(C=1.0, random_state=seed, max_iter=1000)
        meta_s.fit(oof_s, y_train)
        seed_test_probs = meta_s.predict_proba(test_s)[:, 1]
        seed_oof_acc = accuracy_score(y_train, meta_s.predict(oof_s))
    else:
        seed_test_probs = best_weights[0] * test_l + best_weights[1] * test_x + best_weights[2] * test_c
        oof_avg = best_weights[0] * oof_l + best_weights[1] * oof_x + best_weights[2] * oof_c
        seed_oof_acc = accuracy_score(y_train, (oof_avg > best_threshold).astype(int))
    
    print(f'  Seed {seed} OOF accuracy: {seed_oof_acc:.6f}')
    all_test_probs.append(seed_test_probs)

# Average across all seeds
final_test_probs = np.mean(all_test_probs, axis=0)
print(f'\nSeed averaging complete. {len(seeds)} seeds averaged.')


--- Seed 42 ---
  Seed 42 OOF accuracy: 0.957830

--- Seed 123 ---
  Seed 123 OOF accuracy: 0.957554

--- Seed 456 ---
  Seed 456 OOF accuracy: 0.957544

--- Seed 789 ---
  Seed 789 OOF accuracy: 0.957554

--- Seed 2024 ---
  Seed 2024 OOF accuracy: 0.957130

Seed averaging complete. 5 seeds averaged.


In [12]:
# Cell 12: Generate Submission

final_preds = (final_test_probs > best_threshold).astype(int)

submission = pd.DataFrame({
    'ID': test_ids,
    'Overall_Experience': final_preds
})

submission.to_csv(f'{DATA_DIR}/submission_2.csv', index=False)

print(f'Submission saved: submission_2.csv')
print(f'Shape: {submission.shape}')
print(f'Prediction distribution:\n{submission["Overall_Experience"].value_counts()}')
print(f'\nThreshold used: {best_threshold:.3f}')
print(f'\nFirst 5 rows:')
submission.head()

Submission saved: submission_2.csv
Shape: (35602, 2)
Prediction distribution:
Overall_Experience
1    19117
0    16485
Name: count, dtype: int64

Threshold used: 0.520

First 5 rows:


Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1
