# Baseline Model: XGBoost with Feature Engineering

Following the seed prompt strategy:
1. Feature engineering (Group, Cabin, Spending features)
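2. Missing value imputation (zero spending for CryoSleep passengers, then per-column mode/median; the group-based imputation from the strategy is not implemented in this baseline)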
3. XGBoost with recommended hyperparameters
4. 5-fold StratifiedKFold cross-validation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test splits from their fixed locations.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the size of each split, then the class balance of the target column.
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")
target_share = train['Transported'].value_counts(normalize=True)
print(f"\nTarget distribution:")
print(target_share)

Train shape: (8693, 14)
Test shape: (4277, 13)

Target distribution:
Transported
True     0.503624
False    0.496376
Name: proportion, dtype: float64


In [2]:
def feature_engineering(df, is_train=True):
    """Return a copy of ``df`` extended with the engineered feature columns.

    Columns are appended in this order: ``Group`` and ``PassengerNum`` (the two
    '_'-separated pieces of ``PassengerId``), ``Deck``/``CabinNum``/``Side``
    (the three '/'-separated pieces of ``Cabin``), ``TotalSpent``, one
    ``<amenity>_spent`` flag per spending column, ``AnySpending``, a
    ``<column>_log`` (log1p) column for each spending column and for
    ``TotalSpent``, five age-band flags, and ``Surname``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``PassengerId``, ``Cabin``, ``Age``, ``Name`` and the five
        spending columns. The input frame is not modified.
    is_train : bool, default True
        Accepted so train and test call sites look alike; the body never
        reads it.

    Returns
    -------
    pandas.DataFrame
        The augmented copy.
    """
    amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    out = df.copy()

    def id_segment(index):
        """Build an extractor for the index-th '_'-separated piece of an id."""
        def extract(passenger_id):
            return passenger_id.split('_')[index]
        return extract

    def cabin_segment(index):
        """Build an extractor for the index-th '/'-separated piece of a cabin."""
        def extract(cabin):
            # A missing cabin stays missing in the derived column.
            if pd.isna(cabin):
                return np.nan
            return cabin.split('/')[index]
        return extract

    def cabin_number(cabin):
        """Middle cabin piece as an int, or NaN when the cabin is missing."""
        return np.nan if pd.isna(cabin) else int(cabin.split('/')[1])

    def surname_of(name):
        """Last whitespace-separated token of the name, or NaN when missing."""
        return name.split()[-1] if pd.notna(name) else np.nan

    # 1. PassengerId pieces
    out['Group'] = out['PassengerId'].apply(id_segment(0)).astype(int)
    out['PassengerNum'] = out['PassengerId'].apply(id_segment(1)).astype(int)

    # 2. Cabin pieces
    out['Deck'] = out['Cabin'].apply(cabin_segment(0))
    out['CabinNum'] = out['Cabin'].apply(cabin_number)
    out['Side'] = out['Cabin'].apply(cabin_segment(2))

    # 3. Spending: row total (missing amounts are skipped by sum), per-amenity
    #    "spent anything" flags, an overall flag, and log1p transforms.
    out['TotalSpent'] = out[amenities].sum(axis=1)
    for amenity in amenities:
        out[f'{amenity}_spent'] = out[amenity].gt(0).astype(int)
    out['AnySpending'] = out['TotalSpent'].gt(0).astype(int)
    for source in [*amenities, 'TotalSpent']:
        out[f'{source}_log'] = np.log1p(out[source])

    # 4. Age bands; a missing age compares False everywhere, so all flags are 0.
    age = out['Age']
    age_bands = {
        'IsChild': age.le(12),
        'IsTeen': age.gt(12) & age.le(17),
        'IsYoungAdult': age.gt(17) & age.le(25),
        'IsAdult': age.gt(25) & age.le(60),
        'IsSenior': age.gt(60),
    }
    for flag, in_band in age_bands.items():
        out[flag] = in_band.astype(int)

    # 5. Surname from the passenger name
    out['Surname'] = out['Name'].apply(surname_of)

    return out

# Run both splits through the same feature pipeline.
train = feature_engineering(train, is_train=True)
test = feature_engineering(test, is_train=False)

# Progress message followed by the widened train column count.
print("Feature engineering complete", f"Train columns: {train.shape[1]}", sep="\n")

Feature engineering complete
Train columns: 38


In [3]:
# Count group members over train and test combined, so both splits are mapped
# from one shared Group -> size lookup.
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()

# Attach the size, and a flag for groups of exactly one, to each split.
for split in (train, test):
    split['GroupSize'] = split['Group'].map(group_sizes)
    split['Solo'] = split['GroupSize'].eq(1).astype(int)

print(f"GroupSize distribution:")
print(train['GroupSize'].value_counts().sort_index())

GroupSize distribution:
GroupSize
1    4805
2    1682
3    1020
4     412
5     265
6     174
7     231
8     104
Name: count, dtype: int64


In [4]:
def _zero_fill_cryosleep_spending(frame, spending_cols):
    """Set missing spending to 0, in place, on rows whose CryoSleep is True."""
    # Elementwise comparison on purpose: the column may hold True/False/NaN.
    asleep = frame['CryoSleep'] == True  # noqa: E712
    for col in spending_cols:
        frame.loc[asleep & frame[col].isna(), col] = 0
    return frame


def impute_missing(df, train_df=None):
    """Fill missing values and rebuild the features derived from them.

    Steps, in order:

    1. Missing spending amounts on CryoSleep rows are set to 0.
    2. Categorical columns are filled with a column mode ('Unknown' when the
       reference column has no mode at all).
    3. Numerical columns are filled with a column median.
    4. ``TotalSpent``, the ``*_spent`` flags, ``AnySpending``, the ``*_log``
       columns and the age-band flags are recomputed from the filled values.

    No group information is used; every fill value is a per-column statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is copied, never modified in place. Must contain
        the categorical, numerical and spending columns listed in the body.
    train_df : pandas.DataFrame, optional
        Frame the modes and medians are computed from. When omitted (the
        default) the statistics come from ``df`` itself. When given, it must
        contain the same columns; it is copied and receives the same
        CryoSleep zero-fill before statistics are taken, so both sources are
        treated alike.

    Returns
    -------
    pandas.DataFrame
        The imputed copy of ``df``.
    """
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
    num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']

    df = df.copy()

    # CryoSleep passengers should have 0 spending
    _zero_fill_cryosleep_spending(df, spending_cols)

    # Source of the fill statistics: the frame itself unless a reference
    # frame was supplied.
    if train_df is None:
        reference = df
    else:
        reference = _zero_fill_cryosleep_spending(train_df.copy(), spending_cols)

    # Categorical columns - impute with mode (computed once per column)
    for col in cat_cols:
        if df[col].isna().any():
            modes = reference[col].mode()
            fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
            df[col] = df[col].fillna(fill_value)

    # Numerical columns - impute with median
    for col in num_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(reference[col].median())

    # Recalculate derived features after imputation
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    for col in spending_cols:
        df[f'{col}_spent'] = (df[col] > 0).astype(int)
    df['AnySpending'] = (df['TotalSpent'] > 0).astype(int)
    for col in spending_cols + ['TotalSpent']:
        df[f'{col}_log'] = np.log1p(df[col])

    # Age features
    df['IsChild'] = (df['Age'] <= 12).astype(int)
    df['IsTeen'] = ((df['Age'] > 12) & (df['Age'] <= 17)).astype(int)
    df['IsYoungAdult'] = ((df['Age'] > 17) & (df['Age'] <= 25)).astype(int)
    df['IsAdult'] = ((df['Age'] > 25) & (df['Age'] <= 60)).astype(int)
    df['IsSenior'] = (df['Age'] > 60).astype(int)

    return df

# Fill the gaps in each split; no reference frame is passed, so each call
# works from the frame it is given.
train = impute_missing(train)
test = impute_missing(test)

# List only the train columns that still contain nulls after imputation.
remaining_nulls = train.isnull().sum()
print("Missing values after imputation:")
print(remaining_nulls[remaining_nulls > 0])

Missing values after imputation:
Cabin      199
Name       200
Surname    200
dtype: int64


In [5]:
# Encode categorical variables as integer codes
from sklearn.preprocessing import LabelEncoder

cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

label_encoders = {}
for col in cat_cols:
    # Everything is compared as text, so booleans become 'True'/'False'.
    train_text = train[col].astype(str)
    test_text = test[col].astype(str)
    # One encoder per column, fitted on the train and test values together so
    # that every label present in either split is known at transform time.
    combined = pd.concat([train_text, test_text])
    le = LabelEncoder().fit(combined)
    train[f'{col}_enc'] = le.transform(train_text)
    test[f'{col}_enc'] = le.transform(test_text)
    label_encoders[col] = le

print("Encoding complete")

Encoding complete


In [None]:
# Model inputs, assembled from named groups; the concatenation order below is
# the column order of X and X_test.
encoded_features = [
    'HomePlanet_enc', 'CryoSleep_enc', 'Destination_enc', 'VIP_enc', 'Deck_enc', 'Side_enc',
]
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
identity_features = ['Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo']
spending_summary_features = ['TotalSpent', 'AnySpending']
spending_flag_features = [
    'RoomService_spent', 'FoodCourt_spent', 'ShoppingMall_spent', 'Spa_spent', 'VRDeck_spent',
]
spending_log_features = [
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log', 'TotalSpent_log',
]
age_band_features = ['IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior']

feature_cols = (
    encoded_features
    + numerical_features
    + identity_features
    + spending_summary_features
    + spending_flag_features
    + spending_log_features
    + age_band_features
)

# Plain arrays for the model: features, integer target, and test features.
X = train[feature_cols].values
y = train['Transported'].astype(int).values
X_test = test[feature_cols].values

print(f"X shape: {X.shape}", f"y shape: {y.shape}", f"X_test shape: {X_test.shape}", sep="\n")

In [None]:
# XGBoost with recommended hyperparameters from strategy
# These keyword arguments are unpacked into xgb.XGBClassifier for every fold.
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.067,
    'n_estimators': 850,
    'reg_lambda': 3.06,
    'reg_alpha': 4.58,
    'colsample_bytree': 0.92,
    'subsample': 0.95,
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'logloss'
}

# 5-fold Stratified Cross-Validation
# NOTE(review): the split is stratified on y only; the 'Group' column is not
# passed as a grouping key, so rows sharing a Group can fall on both sides of
# a fold. Confirm whether that is acceptable for the reported CV accuracy.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# oof_preds: one validation probability per training row, written by the fold
#            in which that row is held out.
# test_preds: running average of the per-fold test probabilities.
# fold_scores: validation accuracy of each fold, in fold order.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # A fresh classifier per fold. After the loop, `model` remains bound to
    # the estimator fitted in the final fold.
    # NOTE(review): eval_set is supplied but no early-stopping argument is set
    # here or in xgb_params, so it looks like monitoring only - confirm.
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predictions
    # Column 1 of predict_proba is taken as the score and thresholded at 0.5.
    val_pred_proba = model.predict_proba(X_val)[:, 1]
    val_pred = (val_pred_proba >= 0.5).astype(int)
    
    # Store held-out probabilities; add this fold's 1/n_folds share of the
    # test probabilities so the total is their mean over folds.
    oof_preds[val_idx] = val_pred_proba
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds
    
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.5f}")

# Mean and standard deviation of the per-fold accuracies.
print(f"\nMean CV Accuracy: {np.mean(fold_scores):.5f} (+/- {np.std(fold_scores):.5f})")

In [None]:
# Accuracy of the stitched out-of-fold probabilities at the 0.5 cut-off.
oof_binary = (oof_preds >= 0.5).astype(int)
overall_acc = accuracy_score(y, oof_binary)
print(f"Overall OOF Accuracy: {overall_acc:.5f}")

# Submission frame: passenger ids next to boolean predictions taken from the
# fold-averaged test probabilities.
test_binary = (test_preds >= 0.5).astype(bool)
submission = pd.DataFrame({'PassengerId': test['PassengerId']})
submission['Transported'] = test_binary

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved with {len(submission)} predictions")
print(submission.head())

In [None]:
# Rank features by importance. `model` is the estimator left bound by the last
# iteration of the CV loop, so the ranking describes that single fit only.
importance_table = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
)
importance_df = importance_table.sort_values('importance', ascending=False)

print("Top 15 features:")
print(importance_df.head(15))