# Hyperparameter Optimization with Optuna

**Goal:** Improve CV score above 0.84 through Bayesian optimization

**Current best:** Voting Ensemble CV 0.8372, LB 0.7727
**Target:** CV > 0.84 (need +0.0028 improvement)

**Approach:**
1. Optimize XGBoost hyperparameters with Optuna
2. Optimize RandomForest hyperparameters
3. Optimize GradientBoosting hyperparameters
4. Combine optimized models in voting ensemble

In [1]:
import pandas as pd
import numpy as np
import re
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (891, 12)
Test shape: (418, 11)


In [2]:
# Feature engineering (same as voting ensemble exp_001)
def extract_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

def process_features(df):
    df = df.copy()
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 
                                        'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['FamilySize_Bin'] = pd.cut(df['FamilySize'], bins=[0, 1, 4, 11], labels=[0, 1, 2]).astype(int)
    
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')
    
    df['Embarked'] = df['Embarked'].fillna('S')
    
    if df['Fare'].isna().any():
        df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))
    
    return df

train_processed = process_features(train)
test_processed = process_features(test)

# Age imputation
def impute_age(train_df, test_df):
    train_df = train_df.copy()
    test_df = test_df.copy()
    age_medians = train_df.groupby(['Pclass', 'Sex', 'Title'])['Age'].median()
    
    def get_median_age(row, medians, fallback_median):
        if pd.isna(row['Age']):
            try:
                return medians.loc[(row['Pclass'], row['Sex'], row['Title'])]
            except KeyError:
                return fallback_median
        return row['Age']
    
    fallback = train_df['Age'].median()
    train_df['Age'] = train_df.apply(lambda x: get_median_age(x, age_medians, fallback), axis=1)
    test_df['Age'] = test_df.apply(lambda x: get_median_age(x, age_medians, fallback), axis=1)
    return train_df, test_df

train_processed, test_processed = impute_age(train_processed, test_processed)

# Age_Bin
train_processed['Age_Bin'] = pd.cut(train_processed['Age'], bins=[0, 16, 32, 48, 100], labels=[0, 1, 2, 3]).astype(int)
test_processed['Age_Bin'] = pd.cut(test_processed['Age'], bins=[0, 16, 32, 48, 100], labels=[0, 1, 2, 3]).astype(int)

print("Features processed.")

Features processed.


In [3]:
# Encode categorical features
def encode_features(train_df, test_df):
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    train_df['Sex'] = train_df['Sex'].map({'female': 0, 'male': 1})
    test_df['Sex'] = test_df['Sex'].map({'female': 0, 'male': 1})
    
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    train_df['Embarked'] = train_df['Embarked'].map(embarked_map)
    test_df['Embarked'] = test_df['Embarked'].map(embarked_map)
    
    title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
    train_df['Title'] = train_df['Title'].map(title_map)
    test_df['Title'] = test_df['Title'].map(title_map)
    
    deck_map = {'U': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
    train_df['Deck'] = train_df['Deck'].map(deck_map)
    test_df['Deck'] = test_df['Deck'].map(deck_map)
    
    return train_df, test_df

train_encoded, test_encoded = encode_features(train_processed, test_processed)

# Feature set (same as voting ensemble)
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 
            'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 
            'Deck', 'FamilySize_Bin', 'Age_Bin']

X = train_encoded[features].values
y = train_encoded['Survived'].values
X_test = test_encoded[features].values

print(f"Feature matrix: {X.shape}")
print(f"Features: {features}")

Feature matrix: (891, 14)
Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Deck', 'FamilySize_Bin', 'Age_Bin']


In [4]:
# Optuna optimization for XGBoost
print("="*60)
print("OPTIMIZING XGBOOST")
print("="*60)

def xgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2.0),
    }
    
    model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    return scores.mean()

xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=50, show_progress_bar=False)

print(f"\nBest XGBoost CV: {xgb_study.best_value:.4f}")
print(f"Best params: {xgb_study.best_params}")

OPTIMIZING XGBOOST



Best XGBoost CV: 0.8372
Best params: {'n_estimators': 116, 'max_depth': 5, 'learning_rate': 0.03149834299080512, 'subsample': 0.7547458558957117, 'colsample_bytree': 0.7726130043755545, 'min_child_weight': 4, 'gamma': 0.1987168339648069, 'reg_alpha': 0.48095482073340695, 'reg_lambda': 0.6462179360882688}


In [5]:
# Optuna optimization for RandomForest
print("\n" + "="*60)
print("OPTIMIZING RANDOM FOREST")
print("="*60)

def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    
    model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    return scores.mean()

rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(rf_objective, n_trials=50, show_progress_bar=False)

print(f"\nBest RandomForest CV: {rf_study.best_value:.4f}")
print(f"Best params: {rf_study.best_params}")


OPTIMIZING RANDOM FOREST



Best RandomForest CV: 0.8406
Best params: {'n_estimators': 185, 'max_depth': 8, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': None}


In [6]:
# Optuna optimization for GradientBoosting
print("\n" + "="*60)
print("OPTIMIZING GRADIENT BOOSTING")
print("="*60)

def gb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }
    
    model = GradientBoostingClassifier(**params, random_state=42)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    return scores.mean()

gb_study = optuna.create_study(direction='maximize')
gb_study.optimize(gb_objective, n_trials=50, show_progress_bar=False)

print(f"\nBest GradientBoosting CV: {gb_study.best_value:.4f}")
print(f"Best params: {gb_study.best_params}")


OPTIMIZING GRADIENT BOOSTING



Best GradientBoosting CV: 0.8417
Best params: {'n_estimators': 106, 'max_depth': 8, 'learning_rate': 0.03179473190235844, 'min_samples_split': 15, 'min_samples_leaf': 10, 'subsample': 0.962115034882979}


In [7]:
# Summary of optimized models
print("\n" + "="*60)
print("OPTIMIZATION SUMMARY")
print("="*60)

print(f"\nOptimized single model CV scores:")
print(f"  XGBoost:          {xgb_study.best_value:.4f}")
print(f"  RandomForest:     {rf_study.best_value:.4f}")
print(f"  GradientBoosting: {gb_study.best_value:.4f}")
print(f"\nPrevious voting ensemble: 0.8372")
print(f"Best optimized single model: {max(xgb_study.best_value, rf_study.best_value, gb_study.best_value):.4f}")


OPTIMIZATION SUMMARY

Optimized single model CV scores:
  XGBoost:          0.8372
  RandomForest:     0.8406
  GradientBoosting: 0.8417

Previous voting ensemble: 0.8372
Best optimized single model: 0.8417


In [8]:
# Create optimized voting ensemble
print("\n" + "="*60)
print("OPTIMIZED VOTING ENSEMBLE")
print("="*60)

# Create models with optimized hyperparameters
optimized_xgb = XGBClassifier(**xgb_study.best_params, random_state=42, 
                              use_label_encoder=False, eval_metric='logloss')
optimized_rf = RandomForestClassifier(**rf_study.best_params, random_state=42, n_jobs=-1)
optimized_gb = GradientBoostingClassifier(**gb_study.best_params, random_state=42)

# Keep other models with original params (they weren't the bottleneck)
models_optimized = [
    ('lr', LogisticRegression(C=0.1, max_iter=1000, random_state=42)),
    ('rf', optimized_rf),
    ('gb', optimized_gb),
    ('et', ExtraTreesClassifier(n_estimators=200, max_depth=6, min_samples_leaf=4, random_state=42)),
    ('ada', AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)),
    ('svc', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)),
    ('xgb', optimized_xgb)
]

print("Optimized ensemble models:")
for name, model in models_optimized:
    print(f"  - {name}")


OPTIMIZED VOTING ENSEMBLE
Optimized ensemble models:
  - lr
  - rf
  - gb
  - et
  - ada
  - svc
  - xgb


In [9]:
# 10-fold CV for optimized ensemble
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_scores_opt = []
oof_predictions_opt = np.zeros(len(X))
test_predictions_opt = np.zeros(len(X_test))

individual_scores_opt = {name: [] for name, _ in models_optimized}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    fold_probs = np.zeros((len(X_val), 2))
    test_fold_probs = np.zeros((len(X_test), 2))
    
    for name, model in models_optimized:
        from sklearn.base import clone
        m = clone(model)
        
        if name in ['lr', 'svc']:
            m.fit(X_train_scaled, y_train)
            val_prob = m.predict_proba(X_val_scaled)
            test_prob = m.predict_proba(X_test_scaled)
            val_pred = m.predict(X_val_scaled)
        else:
            m.fit(X_train, y_train)
            val_prob = m.predict_proba(X_val)
            test_prob = m.predict_proba(X_test)
            val_pred = m.predict(X_val)
        
        fold_probs += val_prob
        test_fold_probs += test_prob
        individual_scores_opt[name].append(accuracy_score(y_val, val_pred))
    
    fold_probs /= len(models_optimized)
    test_fold_probs /= len(models_optimized)
    
    val_pred_ensemble = (fold_probs[:, 1] >= 0.5).astype(int)
    oof_predictions_opt[val_idx] = val_pred_ensemble
    test_predictions_opt += test_fold_probs[:, 1] / kfold.n_splits
    
    fold_acc = accuracy_score(y_val, val_pred_ensemble)
    cv_scores_opt.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

print(f"\n{'='*50}")
print(f"Optimized Ensemble CV: {np.mean(cv_scores_opt):.4f} (+/- {np.std(cv_scores_opt):.4f})")
print(f"Previous Voting Ensemble: 0.8372 (+/- 0.0239)")
print(f"Change: {np.mean(cv_scores_opt) - 0.8372:+.4f}")

Fold 1: Accuracy = 0.8778


Fold 2: Accuracy = 0.8090


Fold 3: Accuracy = 0.8315


Fold 4: Accuracy = 0.8652


Fold 5: Accuracy = 0.8090


Fold 6: Accuracy = 0.8539


Fold 7: Accuracy = 0.8652


Fold 8: Accuracy = 0.7978


Fold 9: Accuracy = 0.8427


Fold 10: Accuracy = 0.8090

Optimized Ensemble CV: 0.8361 (+/- 0.0274)
Previous Voting Ensemble: 0.8372 (+/- 0.0239)
Change: -0.0011


In [10]:
# Individual model performance
print("\nIndividual Model Performance (Optimized):")
print("="*50)
for name, scores in individual_scores_opt.items():
    print(f"{name:5s}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

print(f"\nEnsemble: {np.mean(cv_scores_opt):.4f} (+/- {np.std(cv_scores_opt):.4f})")


Individual Model Performance (Optimized):
lr   : 0.8136 (+/- 0.0329)
rf   : 0.8406 (+/- 0.0294)
gb   : 0.8417 (+/- 0.0235)
et   : 0.8316 (+/- 0.0216)
ada  : 0.8193 (+/- 0.0279)
svc  : 0.8282 (+/- 0.0249)
xgb  : 0.8372 (+/- 0.0313)

Ensemble: 0.8361 (+/- 0.0274)


In [11]:
# Generate submission
print("\n" + "="*60)
print("GENERATING SUBMISSION")
print("="*60)

test_pred_binary = (test_predictions_opt >= 0.5).astype(int)

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_pred_binary
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved to /home/submission/submission.csv")
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
print(f"\nComparison:")
print(f"  Original Voting: 255 died, 163 survived")
print(f"  Optimized:       {(test_pred_binary == 0).sum()} died, {(test_pred_binary == 1).sum()} survived")


GENERATING SUBMISSION
Submission saved to /home/submission/submission.csv

Prediction distribution:
Survived
0    262
1    156
Name: count, dtype: int64

Comparison:
  Original Voting: 255 died, 163 survived
  Optimized:       262 died, 156 survived


In [None]:
# Final summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)

print(f"\nCV Score Comparison:")
print(f"  Baseline XGBoost (exp_000):     0.8316")
print(f"  Voting Ensemble (exp_001):      0.8372")
print(f"  Stacking (exp_002):             0.8293")
print(f"  Adversarial+Regularized (exp_003): 0.8327")
print(f"  Optuna Optimized (exp_004):     {np.mean(cv_scores_opt):.4f}")

print(f"\nBest CV: {max(0.8372, np.mean(cv_scores_opt)):.4f}")
print(f"\nExpected LB (using calibration LB = 2.55*CV - 1.37):")
print(f"  Optimized: {2.55 * np.mean(cv_scores_opt) - 1.37:.4f}")
print(f"  Previous best LB: 0.7727")