# Baseline XGBoost Model for Titanic Survival Prediction

This notebook implements a baseline model with:
- Feature engineering (Title, FamilySize, IsAlone, Has_Cabin, Deck, Name_length, Sex_Code)
- Age imputation using the median age per Title/Pclass/Sex group
- XGBoost classifier evaluated with 5-fold stratified cross-validation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test splits from the fixed data directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Sanity report: dataset dimensions, then the class balance of the target
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
survival_share = train['Survived'].value_counts(normalize=True)
print(f"\nTarget distribution:")
print(survival_share)

In [None]:
# Feature Engineering Function
def engineer_features(df, is_train=True):
    """Return a copy of ``df`` with the engineered feature columns added.

    Added columns: ``Title``, ``FamilySize``, ``IsAlone``, ``Has_Cabin``,
    ``Deck``, ``Name_length`` and ``Sex_Code``. Missing ``Embarked`` values
    are filled with ``'S'`` and missing ``Fare`` values with the median fare
    of the passenger's ``Pclass`` (falling back to the overall median).

    Args:
        df: Passenger DataFrame with ``Name``, ``SibSp``, ``Parch``,
            ``Cabin``, ``Sex``, ``Embarked``, ``Fare`` and ``Pclass`` columns.
        is_train: Accepted for interface compatibility; the current
            implementation does not read it.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Common titles map to themselves or to their usual equivalent;
    # every other known title is folded into a single 'Rare' bucket.
    common_titles = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    }
    rare_titles = (
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
        'Sir', 'Jonkheer', 'Dona',
    )
    title_lookup = {**common_titles, **dict.fromkeys(rare_titles, 'Rare')}

    out = df.copy()

    # Title: the word directly followed by a period inside the name.
    # Titles missing from the lookup (or not extracted) also become 'Rare'.
    raw_title = out['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    out['Title'] = raw_title.map(title_lookup).fillna('Rare')

    # Family: the passenger plus siblings/spouses plus parents/children
    family_size = out['SibSp'] + out['Parch'] + 1
    out['FamilySize'] = family_size
    out['IsAlone'] = (family_size == 1).astype(int)

    # Cabin: presence flag and deck letter ('U' when the cabin is unknown)
    cabin = out['Cabin']
    out['Has_Cabin'] = cabin.notna().astype(int)
    out['Deck'] = cabin.str[0].fillna('U')

    out['Name_length'] = out['Name'].apply(len)
    out['Sex_Code'] = (out['Sex'] == 'male').astype(int)
    out['Embarked'] = out['Embarked'].fillna('S')

    # Fare: per-class median first, then the overall median as a last resort
    fare = out['Fare']
    if fare.isna().any():
        class_median = out.groupby('Pclass')['Fare'].transform('median')
        fare = fare.fillna(class_median)
        out['Fare'] = fare.fillna(fare.median())

    return out

# Build engineered versions of both splits (train first, then test)
train_fe, test_fe = (
    engineer_features(frame, is_train=flag)
    for frame, flag in ((train, True), (test, False))
)

title_counts = train_fe['Title'].value_counts()
print("Title distribution:")
print(title_counts)

In [None]:
# Age imputation using median by Title, Pclass, Sex
def impute_age(train_df, test_df):
    """Fill missing ``Age`` values in both frames from group medians.

    Medians are computed over the concatenation of ``train_df`` and
    ``test_df`` so both frames are imputed from the same statistics. Each
    missing age is filled from the first available source:

    1. median age of the passenger's (``Title``, ``Pclass``, ``Sex``) group,
    2. median age of the passenger's ``Title``,
    3. overall median age.

    A group can exist while having no known ages, in which case its median
    is NaN; step 2 then applies instead of jumping straight to the overall
    median.

    Args:
        train_df: Training frame with ``Title``, ``Pclass``, ``Sex`` and
            ``Age`` columns.
        test_df: Test frame with the same four columns.

    Returns:
        The tuple ``(train_df, test_df)``. Both frames have their ``Age``
        column overwritten in place; the same objects are returned.
    """
    combined = pd.concat([train_df, test_df], axis=0)
    group_keys = ['Title', 'Pclass', 'Sex']

    group_medians = (
        combined.groupby(group_keys)['Age'].median().rename('_group_median')
    )
    title_medians = combined.groupby('Title')['Age'].median()
    overall_median = combined['Age'].median()

    def _fill_missing_ages(df):
        """Return a float array of ``df['Age']`` with gaps filled in order."""
        ages = df['Age'].to_numpy(dtype=float, copy=True)

        # Left-join on the key columns so every row gets its group's median
        # (NaN when the group is unknown or has no recorded ages).
        by_group = df[group_keys].join(group_medians, on=group_keys)
        by_title = df['Title'].map(title_medians)

        # Work on positional arrays so duplicate index labels cannot
        # interfere with alignment.
        for candidate in (by_group['_group_median'], by_title):
            still_missing = np.isnan(ages)
            ages[still_missing] = candidate.to_numpy(dtype=float)[still_missing]

        ages[np.isnan(ages)] = overall_median
        return ages

    train_df['Age'] = _fill_missing_ages(train_df)
    test_df['Age'] = _fill_missing_ages(test_df)

    return train_df, test_df

# Fill missing ages in both splits, then report how many gaps remain
train_fe, test_fe = impute_age(train_fe, test_fe)

print(
    f"Missing Age in train: {train_fe['Age'].isna().sum()}",
    f"Missing Age in test: {test_fe['Age'].isna().sum()}",
    sep="\n",
)

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Add integer label codes for ``Title``, ``Embarked`` and ``Deck``.

    Each encoder is fitted on the union of the train and test values so a
    category receives the same code in both frames. The frames themselves
    are encoded separately rather than being concatenated and split again:
    concatenating frames with different columns fills the gaps with NaN,
    which would turn a train-only integer column such as ``Survived`` into
    floats and add an all-NaN copy of it to the test frame.

    Args:
        train_df: Training frame with ``Title``, ``Embarked`` and ``Deck``.
        test_df: Test frame with the same three columns.

    Returns:
        ``(train_encoded, test_encoded)``: copies of the inputs with
        ``Title_Code``, ``Embarked_Code`` and ``Deck_Code`` appended. The
        input frames are not modified.
    """
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    code_columns = (
        ('Title', 'Title_Code'),
        ('Embarked', 'Embarked_Code'),
        ('Deck', 'Deck_Code'),
    )
    for source_col, code_col in code_columns:
        encoder = LabelEncoder()
        # Fit on every value seen in either frame so codes are consistent
        encoder.fit(pd.concat([train_df[source_col], test_df[source_col]], axis=0))
        train_encoded[code_col] = encoder.transform(train_df[source_col])
        test_encoded[code_col] = encoder.transform(test_df[source_col])

    return train_encoded, test_encoded

# Attach the label-encoded *_Code columns to both splits
train_fe, test_fe = encode_features(train_fe, test_fe)

# Model inputs; the order here fixes the column order of the feature matrix
feature_cols = [
    # raw / lightly encoded passenger attributes
    'Pclass',
    'Sex_Code',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked_Code',
    # engineered features
    'Title_Code',
    'FamilySize',
    'IsAlone',
    'Has_Cabin',
    'Deck_Code',
    'Name_length',
]

n_features = len(feature_cols)
print(f"\nFeatures: {feature_cols}")
print(f"Number of features: {n_features}")

In [None]:
# Convert the selected columns to NumPy arrays for the model
X, X_test = (frame[feature_cols].to_numpy() for frame in (train_fe, test_fe))
y = train_fe['Survived'].to_numpy()
test_ids = test_fe['PassengerId'].to_numpy()

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# 5-fold stratified cross-validation of the XGBoost baseline
from sklearn.metrics import accuracy_score

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by every model built from this configuration
xgb_params = dict(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

oof_preds = np.zeros(len(X))        # out-of-fold class predictions
test_preds = np.zeros(len(X_test))  # fold-averaged P(Survived = 1)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, verbose=False)

    # Class predictions for the held-out fold
    val_pred = model.predict(X_val)
    oof_preds[val_idx] = val_pred

    # Each fold contributes an equal share of the averaged test probability
    fold_test_proba = model.predict_proba(X_test)[:, 1]
    test_preds += fold_test_proba / kfold.n_splits

    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

# Accuracy over all out-of-fold predictions, plus the spread across folds
cv_score = accuracy_score(y, oof_preds)
print(f"\n{'='*50}")
print(f"Overall CV Accuracy: {cv_score:.4f} (+/- {np.std(fold_scores):.4f})")
print(f"Mean Fold Accuracy: {np.mean(fold_scores):.4f}")

In [None]:
# Turn the averaged test predictions into 0/1 labels (cut-off at 0.5)
test_preds_binary = (test_preds >= 0.5).astype(int)

submission = pd.DataFrame(
    dict(PassengerId=test_ids, Survived=test_preds_binary)
)

# Write the submission file, then echo a short summary
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} rows")
print(submission.head(10))
survived_counts = submission['Survived'].value_counts()
print(f"\nSurvived distribution in submission:")
print(survived_counts)

In [None]:
# Feature importance
import matplotlib.pyplot as plt

# Refit once on the full training set to read per-feature importances
final_model = xgb.XGBClassifier(**xgb_params)
final_model.fit(X, y, verbose=False)

# One row per feature, most important first
importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': final_model.feature_importances_}
)
importance = importance.sort_values('importance', ascending=False)

print("Feature Importance:")
print(importance.to_string(index=False))