# Baseline Model - Titanic Survival Prediction

This baseline implements:
- Feature engineering: Title, FamilySize, IsAlone, Deck, HasCabin, AgeBin, FareBin
- Age imputation using median by Pclass/Sex/Title
- XGBoost with 5-fold Stratified CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test splits from their fixed locations on disk.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Sanity report: frame sizes first, then the class balance of the label.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
survived_share = train['Survived'].value_counts(normalize=True)
print(f"\nTarget distribution:")
print(survived_share)

In [None]:
# Feature Engineering Function
def engineer_features(df, is_train=True):
    """Return a copy of ``df`` with the baseline engineered columns added.

    Adds ``Title``, ``FamilySize``, ``IsAlone``, ``Deck`` and ``HasCabin``,
    encodes ``Sex`` as 0/1, and fills missing ``Embarked`` and ``Fare``
    values. The input frame itself is not modified.

    Args:
        df: Passenger frame with ``Name``, ``SibSp``, ``Parch``, ``Cabin``,
            ``Sex``, ``Embarked``, ``Fare`` and ``Pclass`` columns.
        is_train: Accepted for call-site compatibility; currently unused.

    Returns:
        A new DataFrame holding the original plus the engineered columns.
    """
    df = df.copy()

    # 1. Title: the word directly before the first "." in Name ("Mr", "Dr", ...).
    #    Raw string so that "\." is a regex escape, not a Python string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Fold spelling variants into the common titles and bucket the rest as Rare.
    title_mapping = {
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    df['Title'] = df['Title'].replace(title_mapping)

    # 2. Family features: the passenger plus siblings/spouses and parents/children.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Deck is the first character of Cabin; 'U' marks an unknown cabin.
    df['Deck'] = df['Cabin'].str[0].fillna('U')
    df['HasCabin'] = df['Cabin'].notna().astype(int)

    # 4. Sex encoding
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # 5. Embarked - missing values are filled with 'S'.
    df['Embarked'] = df['Embarked'].fillna('S')

    # 6. Fare - fill missing values with the median fare of the same Pclass,
    #    computed within this frame. Grouping (rather than looping over 1/2/3)
    #    covers whatever class values are actually present.
    if df['Fare'].isna().any():
        class_median_fare = df.groupby('Pclass')['Fare'].transform('median')
        df['Fare'] = df['Fare'].fillna(class_median_fare)

    return df

In [None]:
# Run the shared feature-engineering step on both splits.
train_fe = engineer_features(train, is_train=True)
test_fe = engineer_features(test, is_train=False)

# Show how the two derived categoricals are distributed in the training split.
for heading, column in (("Title distribution:", 'Title'),
                        ("\nDeck distribution:", 'Deck')):
    print(heading)
    print(train_fe[column].value_counts())

In [None]:
# Age imputation using median by Pclass/Sex/Title
def impute_age(train_df, test_df):
    """Fill missing ``Age`` values with group medians, most specific first.

    Medians are computed over the concatenation of both frames. Each missing
    age is filled from the first level that yields a value:

    1. median age of the passenger's Pclass/Sex/Title group,
    2. median age of the passenger's Pclass/Sex group,
    3. overall median age.

    A group whose ages are *all* missing has no median, so its rows fall
    through to the next level instead of jumping straight to the overall
    median.

    Args:
        train_df: Training frame with ``Pclass``, ``Sex``, ``Title``, ``Age``.
        test_df: Test frame with the same columns.

    Returns:
        The same two frames, as ``(train_df, test_df)``; their ``Age``
        columns are overwritten in place.
    """
    combined = pd.concat([train_df, test_df], ignore_index=True)
    n_train = len(train_df)

    age = combined['Age']
    for keys in (['Pclass', 'Sex', 'Title'], ['Pclass', 'Sex']):
        if not age.isna().any():
            break
        # Per-row median of the row's group, based on the originally observed
        # ages only (never on values imputed at an earlier level).
        group_median = combined.groupby(keys)['Age'].transform('median')
        age = age.fillna(group_median)

    # Final fallback for anything still missing.
    age = age.fillna(combined['Age'].median())

    # `combined` is train rows followed by test rows; assign positionally so
    # the callers' own indexes do not matter.
    train_df['Age'] = age.iloc[:n_train].to_numpy()
    test_df['Age'] = age.iloc[n_train:].to_numpy()

    return train_df, test_df

# Impute ages on both splits, then confirm that no gaps are left.
train_fe, test_fe = impute_age(train_fe, test_fe)
train_age_gaps = train_fe['Age'].isna().sum()
test_age_gaps = test_fe['Age'].isna().sum()
print(f"Age missing in train: {train_age_gaps}")
print(f"Age missing in test: {test_age_gaps}")

In [None]:
# Create Age and Fare bins
def create_bins(train_df, test_df):
    """Add integer ``AgeBin`` and ``FareBin`` columns to both frames.

    Bin edges are learned from ``train_df`` only and then reused for
    ``test_df``, so a given age or fare gets the same code in both frames:

    * ``AgeBin``: 5 equal-width bins over the training age range.
    * ``FareBin``: training fare quartiles (fewer bins if quantile edges
      coincide and are dropped).

    Test values below/above the training range fall into the first/last bin.
    Rows whose ``Age``/``Fare`` is missing get ``AgeBin`` 2 / ``FareBin`` 1.

    Args:
        train_df: Training frame with ``Age`` and ``Fare`` columns.
        test_df: Test frame with ``Age`` and ``Fare`` columns.

    Returns:
        The same two frames, as ``(train_df, test_df)``; the bin columns are
        added in place.
    """
    def _open_ended(edges):
        # Replace the outer edges with +/- infinity so values outside the
        # training range still land in the first or last bin.
        return [float('-inf'), *edges[1:-1], float('inf')]

    # labels=False yields integer bin codes directly, which keeps working
    # when duplicates='drop' leaves fewer than four fare bins.
    train_age, age_edges = pd.cut(
        train_df['Age'], bins=5, labels=False, retbins=True)
    train_fare, fare_edges = pd.qcut(
        train_df['Fare'], q=4, labels=False, retbins=True, duplicates='drop')

    # Apply the training edges to the test frame.
    test_age = pd.cut(test_df['Age'], bins=_open_ended(age_edges), labels=False)
    test_fare = pd.cut(test_df['Fare'], bins=_open_ended(fare_edges), labels=False)

    # Missing inputs produce NaN codes; give them a fixed default bin.
    train_df['AgeBin'] = train_age.fillna(2).astype(int)
    test_df['AgeBin'] = test_age.fillna(2).astype(int)
    train_df['FareBin'] = train_fare.fillna(1).astype(int)
    test_df['FareBin'] = test_fare.fillna(1).astype(int)

    return train_df, test_df

# Add the binned columns, then show how many training rows fall in each age bin.
train_fe, test_fe = create_bins(train_fe, test_fe)
age_bin_counts = train_fe['AgeBin'].value_counts().sort_index()
print("AgeBin distribution:")
print(age_bin_counts)

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Add integer ``<column>_Code`` columns for Title, Embarked and Deck.

    One ``LabelEncoder`` is fitted per column on the values of both frames
    together, so train and test share the same value-to-code mapping. Both
    frames are modified in place and returned as ``(train_df, test_df)``.
    """
    categorical_cols = ('Title', 'Embarked', 'Deck')

    for col in categorical_cols:
        code_col = col + '_Code'
        # Fit on the union of both frames so every value seen in either one
        # is known to the encoder.
        all_values = pd.concat([train_df[col], test_df[col]])
        encoder = LabelEncoder().fit(all_values)
        for frame in (train_df, test_df):
            frame[code_col] = encoder.transform(frame[col])

    return train_df, test_df

# Add the Title_Code / Embarked_Code / Deck_Code columns to both splits.
train_fe, test_fe = encode_features(train_fe, test_fe)
print("Encoded features created")

In [None]:
# Select features for modeling, grouped by where each column comes from.
raw_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
family_cabin_cols = ['FamilySize', 'IsAlone', 'HasCabin']
encoded_cols = ['Title_Code', 'Embarked_Code', 'Deck_Code']
binned_cols = ['AgeBin', 'FareBin']
feature_cols = raw_cols + family_cabin_cols + encoded_cols + binned_cols

# Plain arrays: the label vector first, then the two design matrices, which
# share the same column order.
y = train_fe['Survived'].values
X = train_fe[feature_cols].values
X_test = test_fe[feature_cols].values

print(f"Features: {feature_cols}")
print(f"X shape: {X.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# 5-Fold Stratified Cross-Validation with XGBoost
# The fold count is defined once so the splitter and the test-prediction
# average below cannot drift apart.
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))         # out-of-fold P(Survived=1) for each training row
test_preds = np.zeros(len(X_test))   # running mean of the per-fold test probabilities
fold_scores = []                     # validation accuracy of each fold

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # A fresh model per fold, with identical hyper-parameters and seed.
    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Positive-class probability, thresholded at 0.5 for the accuracy score.
    val_pred_proba = model.predict_proba(X_val)[:, 1]
    val_pred = (val_pred_proba > 0.5).astype(int)

    oof_preds[val_idx] = val_pred_proba
    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += model.predict_proba(X_test)[:, 1] / n_splits

    # Calculate accuracy
    fold_acc = (val_pred == y_val).mean()
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

mean_acc = np.mean(fold_scores)
std_acc = np.std(fold_scores)
print(f"\nCV Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

In [None]:
# Feature importance
import matplotlib.pyplot as plt

# Importances are read from `model`, i.e. whichever estimator that name was
# last bound to (in this file: the one fitted in the final CV fold).
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]  # most important first

print("\nFeature Importance:")
top_features = [(feature_cols[i], feature_importance[i]) for i in sorted_idx[:10]]
for feature_name, score in top_features:
    print(f"  {feature_name}: {score:.4f}")

In [None]:
# Create submission: threshold the averaged test probabilities at 0.5.
test_pred_binary = (test_preds > 0.5).astype(int)

# One row per test passenger, in the original test-file order.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': test_pred_binary,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('/home/submission/submission.csv', index=False)

print(f"Submission saved with {len(submission)} predictions")
print(submission.head())
predicted_survival_rate = test_pred_binary.mean()
print(f"\nSurvival rate in predictions: {predicted_survival_rate:.4f}")