# Experiment 004: CatBoost-Only with Optuna Tuning

Following the evaluator's insights:
- CatBoost (0.81836) outperforms the ensemble (0.81353) by 0.48 percentage points
- CatBoost has the lowest fold variance (std=0.00431)
- Current params are conservative, so tune them with Optuna for a potential 0.2-0.5% gain

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load data: read the train and test CSV splits and report their shapes
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)
print(f"Train: {train.shape}, Test: {test.shape}")

In [None]:
# Feature engineering (same as exp_002/003)
def advanced_feature_engineering(df):
    """Return a copy of ``df`` with identifier-derived columns added.

    Added columns:
      * ``Group`` / ``PassengerNum`` -- the two ``'_'``-separated parts of
        ``PassengerId``, each converted to ``int``.
      * ``Deck`` / ``CabinNum`` / ``Side`` -- the three ``'/'``-separated
        parts of ``Cabin`` (``CabinNum`` converted to ``int``); NaN wherever
        ``Cabin`` is missing.
      * ``Surname`` -- the last whitespace-separated token of ``Name``; NaN
        wherever ``Name`` is missing.

    The input frame itself is left unmodified.
    """
    def _id_part(position):
        # Build an extractor for one half of the PassengerId string.
        return lambda passenger_id: int(passenger_id.split('_')[position])

    def _cabin_part(position, cast=None):
        # Build an extractor for one '/'-separated Cabin field, passing
        # missing cabins through as NaN.
        def _extract(cabin):
            if pd.isna(cabin):
                return np.nan
            piece = cabin.split('/')[position]
            return piece if cast is None else cast(piece)
        return _extract

    def _surname(full_name):
        return np.nan if pd.isna(full_name) else full_name.split()[-1]

    enriched = df.copy()
    passenger_ids = enriched['PassengerId']
    cabins = enriched['Cabin']

    enriched['Group'] = passenger_ids.apply(_id_part(0))
    enriched['PassengerNum'] = passenger_ids.apply(_id_part(1))
    enriched['Deck'] = cabins.apply(_cabin_part(0))
    enriched['CabinNum'] = cabins.apply(_cabin_part(1, cast=int))
    enriched['Side'] = cabins.apply(_cabin_part(2))
    enriched['Surname'] = enriched['Name'].apply(_surname)
    return enriched

# Derive the identifier-based columns on both splits
train, test = (advanced_feature_engineering(frame) for frame in (train, test))

# GroupSize: how many passengers (train and test together) share a Group id;
# Solo flags passengers whose group has exactly one member.
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()
for df in (train, test):
    sizes = df['Group'].map(group_sizes)
    df['GroupSize'] = sizes
    df['Solo'] = (df['GroupSize'] == 1).astype(int)
print("Basic features done")

In [None]:
# Group-based imputation
def group_based_imputation(train_df, test_df):
    """Fill missing HomePlanet/Deck/Side values from the passenger's Group.

    For each of the three columns, the most frequent non-missing value per
    ``Group`` is computed over train and test combined, and rows whose value
    is missing receive their group's value. Groups with no known value for a
    column stay NaN.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing ``Group``, ``HomePlanet``, ``Deck`` and ``Side``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)`` copies with the gaps filled. The inputs are
        not modified, matching the other feature helpers in this file.
    """
    def _first_mode(values):
        # Compute the mode once; an all-NaN group yields an empty result.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # Work on copies so the caller's frames are never mutated in place.
    train_df = train_df.copy()
    test_df = test_df.copy()

    combined = pd.concat([train_df, test_df], ignore_index=True)
    by_group = combined.groupby('Group')

    for col in ('HomePlanet', 'Deck', 'Side'):
        mapping = by_group[col].apply(_first_mode).to_dict()
        for df in (train_df, test_df):
            mask = df[col].isna()
            df.loc[mask, col] = df.loc[mask, 'Group'].map(mapping)
    return train_df, test_df

# Fill missing HomePlanet/Deck/Side from the most common value within each Group
train, test = group_based_imputation(train, test)

# Remaining imputation
def impute_remaining(df):
    """Return a copy of ``df`` with the remaining missing values filled.

    Steps, in order:
      1. Missing spending amounts of passengers whose ``CryoSleep`` is True
         are set to 0.
      2. Categorical columns with gaps are filled with that column's mode.
      3. Numeric columns with gaps are filled with that column's median.

    All statistics are computed from ``df`` itself; the input is not
    modified.
    """
    filled = df.copy()
    spend_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
    numeric_features = ['Age'] + spend_features + ['CabinNum']

    # CryoSleep is not altered during step 1, so the flag is computed once.
    asleep = filled['CryoSleep'] == True
    for name in spend_features:
        filled.loc[asleep & filled[name].isna(), name] = 0

    for name in categorical_features:
        column = filled[name]
        if column.isna().any():
            filled[name] = column.fillna(column.mode()[0])

    for name in numeric_features:
        column = filled[name]
        if column.isna().any():
            filled[name] = column.fillna(column.median())

    return filled

# Fill whatever is still missing, each split using its own statistics
train, test = (impute_remaining(frame) for frame in (train, test))
print("Imputation done")

In [None]:
# Spending and interaction features
def create_all_features(df):
    """Return a copy of ``df`` with spending, interaction and age features.

    Expects the columns ``RoomService``, ``FoodCourt``, ``ShoppingMall``,
    ``Spa``, ``VRDeck``, ``Age``, ``CryoSleep``, ``HomePlanet``,
    ``Destination``, ``Deck``, ``Side`` and ``VIP``.

    Added columns:
      * Spending: ``TotalSpent``, per-category ``*_ratio``, ``LuxurySpent``,
        ``BasicSpent``, ``LuxuryRatio``, ``SpentPerAge``, ``SpendingBin``,
        per-category ``*_spent`` flags, ``*_log`` (log1p) transforms,
        ``NumSpendingCategories``, ``Spa_VRDeck_RoomService`` and
        ``FoodCourt_RoomService``.
      * Interactions: string concatenations of categorical pairs, plus the
        categorical ``AgeGroup``.
      * Age flags: ``IsChild``, ``IsTeen``, ``IsYoungAdult``, ``IsAdult``,
        ``IsSenior``.

    ``AgeGroup`` uses ``include_lowest=True`` and an open-ended top bin so
    that Age == 0 is labelled ``'Child'`` and ages above 100 are labelled
    ``'Senior'``, in agreement with ``IsChild`` (Age <= 12) and ``IsSenior``
    (Age > 60). With right-closed bins starting at 0, both would otherwise
    fall outside every bin and become NaN.

    The input frame is not modified.
    """
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Spending features
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    for col in spending_cols:
        # +1 in the denominator avoids division by zero for non-spenders.
        df[f'{col}_ratio'] = df[col] / (df['TotalSpent'] + 1)
    df['LuxurySpent'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['BasicSpent'] = df['FoodCourt'] + df['ShoppingMall']
    df['LuxuryRatio'] = df['LuxurySpent'] / (df['TotalSpent'] + 1)
    df['SpentPerAge'] = df['TotalSpent'] / (df['Age'] + 1)
    df['SpendingBin'] = pd.cut(
        df['TotalSpent'],
        bins=[-1, 0, 500, 2000, float('inf')],
        labels=[0, 1, 2, 3],
    ).astype(int)
    spent_flags = [f'{col}_spent' for col in spending_cols]
    for col, flag in zip(spending_cols, spent_flags):
        df[flag] = (df[col] > 0).astype(int)
    for col in spending_cols + ['TotalSpent', 'LuxurySpent', 'BasicSpent']:
        df[f'{col}_log'] = np.log1p(df[col])
    df['NumSpendingCategories'] = df[spent_flags].sum(axis=1)
    # Same sum as LuxurySpent; kept as its own column because the feature
    # list selects it by name.
    df['Spa_VRDeck_RoomService'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['FoodCourt_RoomService'] = df['FoodCourt'] + df['RoomService']

    # Interaction features
    cryo = df['CryoSleep'].astype(str)
    home = df['HomePlanet'].astype(str)
    dest = df['Destination'].astype(str)
    df['CryoSleep_HomePlanet'] = cryo + '_' + home
    df['CryoSleep_Destination'] = cryo + '_' + dest
    df['Deck_Side'] = df['Deck'].astype(str) + '_' + df['Side'].astype(str)
    df['HomePlanet_Destination'] = home + '_' + dest
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 17, 25, 40, 60, float('inf')],
        labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior'],
        include_lowest=True,  # keep Age == 0 inside the first bin
    )
    df['AgeGroup_CryoSleep'] = df['AgeGroup'].astype(str) + '_' + cryo
    df['VIP_HomePlanet'] = df['VIP'].astype(str) + '_' + home

    # Age features
    age = df['Age']
    df['IsChild'] = (age <= 12).astype(int)
    df['IsTeen'] = ((age > 12) & (age <= 17)).astype(int)
    df['IsYoungAdult'] = ((age > 17) & (age <= 25)).astype(int)
    df['IsAdult'] = ((age > 25) & (age <= 60)).astype(int)
    df['IsSenior'] = (age > 60).astype(int)
    return df

# Build the spending / interaction / age features on both splits
train, test = (create_all_features(frame) for frame in (train, test))
print("All features created")

In [None]:
# Encode categorical variables
cat_cols_to_encode = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Deck',
    'Side',
    'CryoSleep_HomePlanet',
    'CryoSleep_Destination',
    'Deck_Side',
    'HomePlanet_Destination',
    'AgeGroup',
    'AgeGroup_CryoSleep',
    'VIP_HomePlanet',
]

for col in cat_cols_to_encode:
    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)
    # Fit on both splits together so a label present in either one gets a code.
    combined = pd.concat([train_labels, test_labels])
    le = LabelEncoder().fit(combined)
    train[f'{col}_enc'] = le.transform(train_labels)
    test[f'{col}_enc'] = le.transform(test_labels)

print(f"Encoded {len(cat_cols_to_encode)} categorical columns")

In [None]:
# Define features
# The model input is assembled from named groups; concatenation order matches
# the column order the experiment has used so far.
_encoded_features = [
    'HomePlanet_enc', 'CryoSleep_enc', 'Destination_enc', 'VIP_enc', 'Deck_enc', 'Side_enc',
    'CryoSleep_HomePlanet_enc', 'CryoSleep_Destination_enc', 'Deck_Side_enc',
    'HomePlanet_Destination_enc', 'AgeGroup_enc', 'AgeGroup_CryoSleep_enc', 'VIP_HomePlanet_enc',
]
_raw_numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
_identifier_features = ['Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo']
_spending_summary_features = [
    'TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin',
    'NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService',
]
_spending_ratio_features = [
    'RoomService_ratio', 'FoodCourt_ratio', 'ShoppingMall_ratio', 'Spa_ratio', 'VRDeck_ratio',
]
_spending_flag_features = [
    'RoomService_spent', 'FoodCourt_spent', 'ShoppingMall_spent', 'Spa_spent', 'VRDeck_spent',
]
_spending_log_features = [
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log',
    'TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log',
]
_age_flag_features = ['IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior']

feature_cols = [
    *_encoded_features,
    *_raw_numeric_features,
    *_identifier_features,
    *_spending_summary_features,
    *_spending_ratio_features,
    *_spending_flag_features,
    *_spending_log_features,
    *_age_flag_features,
]

# Plain NumPy arrays for the CV loops; the target is cast to 0/1 integers.
X = train[feature_cols].to_numpy()
y = train['Transported'].astype(int).to_numpy()
X_test = test[feature_cols].to_numpy()

print(f"X shape: {X.shape}, Features: {len(feature_cols)}")

In [None]:
# First, train CatBoost with original params to get baseline
cat_params_baseline = dict(
    depth=6,
    learning_rate=0.05,
    iterations=1000,
    l2_leaf_reg=3.0,
    random_seed=42,
    verbose=False,
)

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Out-of-fold probabilities for train, fold-averaged probabilities for test.
oof_cat = np.zeros(len(X))
test_cat = np.zeros(len(X_test))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = CatBoostClassifier(**cat_params_baseline)
    # The validation fold doubles as the early-stopping eval set.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
    )

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_cat[val_idx] = val_proba
    test_cat += model.predict_proba(X_test)[:, 1] / n_folds

    fold_acc = accuracy_score(y_val, (oof_cat[val_idx] >= 0.5).astype(int))
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: {fold_acc:.5f}")

baseline_acc = accuracy_score(y, (oof_cat >= 0.5).astype(int))
print(f"\nCatBoost Baseline OOF Accuracy: {baseline_acc:.5f} (+/- {np.std(fold_scores):.5f})")

In [None]:
# Create CatBoost-only submission with baseline params
# Threshold the fold-averaged test probabilities at 0.5 to get boolean labels.
test_binary = (test_cat >= 0.5).astype(bool)
submission = test[['PassengerId']].assign(Transported=test_binary)

submission.to_csv('/home/submission/submission.csv', index=False)
print("CatBoost-only submission saved")
print(f"Predicted transported rate: {test_binary.mean():.4f}")
print(submission.head())

In [None]:
# Optuna hyperparameter tuning for CatBoost
def objective(trial):
    """Return the mean 5-fold validation accuracy of one sampled CatBoost config.

    Samples ``depth``, ``learning_rate``, ``iterations`` and ``l2_leaf_reg``
    from ``trial``, trains one model per stratified fold of the module-level
    ``X`` / ``y`` (early stopping after 50 rounds on that fold's validation
    part), and averages the per-fold accuracies.
    """
    params = dict(
        depth=trial.suggest_int('depth', 4, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.03, 0.1),
        iterations=trial.suggest_int('iterations', 500, 1500),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        random_seed=42,
        verbose=False,
    )

    def _fold_accuracy(fit_idx, holdout_idx):
        # Train on one split and score hard predictions on its holdout part.
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_holdout, y_holdout = X[holdout_idx], y[holdout_idx]
        clf = CatBoostClassifier(**params)
        clf.fit(X_fit, y_fit, eval_set=(X_holdout, y_holdout), early_stopping_rounds=50)
        return accuracy_score(y_holdout, clf.predict(X_holdout))

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = [
        _fold_accuracy(fit_idx, holdout_idx)
        for fit_idx, holdout_idx in splitter.split(X, y)
    ]
    return np.mean(accuracies)

# Run Optuna optimization
# Seeded TPE sampler; the study maximizes the value returned by `objective`.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

best_trial = study.best_trial
print(f"\nBest trial accuracy: {best_trial.value:.5f}")
print(f"Best params: {best_trial.params}")

In [None]:
# Train final model with best params
best_params = {**study.best_trial.params, 'random_seed': 42, 'verbose': False}

# Out-of-fold probabilities for train, fold-averaged probabilities for test.
oof_tuned = np.zeros(len(X))
test_tuned = np.zeros(len(X_test))
fold_scores_tuned = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = CatBoostClassifier(**best_params)
    # NOTE(review): early stopping is 100 rounds here but 50 inside the
    # Optuna objective -- confirm the mismatch is intended.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
    )

    val_proba = model.predict_proba(X_val)[:, 1]
    oof_tuned[val_idx] = val_proba
    test_tuned += model.predict_proba(X_test)[:, 1] / n_folds

    fold_acc = accuracy_score(y_val, (oof_tuned[val_idx] >= 0.5).astype(int))
    fold_scores_tuned.append(fold_acc)
    print(f"Fold {fold+1}: {fold_acc:.5f}")

tuned_acc = accuracy_score(y, (oof_tuned >= 0.5).astype(int))
print(f"\nTuned CatBoost OOF Accuracy: {tuned_acc:.5f} (+/- {np.std(fold_scores_tuned):.5f})")
print(f"Improvement over baseline: {tuned_acc - baseline_acc:+.5f}")

In [None]:
# Save tuned submission if better
# The submission file is overwritten only when the tuned model's OOF accuracy
# strictly beats the baseline; otherwise the file on disk is left untouched.
tuned_is_better = tuned_acc > baseline_acc
final_acc = tuned_acc if tuned_is_better else baseline_acc

if tuned_is_better:
    test_binary_tuned = (test_tuned >= 0.5).astype(bool)
    submission_tuned = test[['PassengerId']].assign(Transported=test_binary_tuned)
    submission_tuned.to_csv('/home/submission/submission.csv', index=False)
    print(f"Tuned CatBoost submission saved (CV: {tuned_acc:.5f})")
else:
    print(f"Baseline CatBoost is better, keeping original submission (CV: {baseline_acc:.5f})")

print(f"\nFinal submission CV accuracy: {final_acc:.5f}")