# Titanic Baseline - Comprehensive Feature Engineering

This notebook implements the key features from the strategy:
- Title extraction from Name
- Family features (FamilySize, IsAlone)
- Cabin features (Has_Cabin, Deck)
- Age/Fare handling with binning
- 10-fold Stratified CV

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the train/test CSVs and print their shapes plus the class balance
# of the target column.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f'Train shape: {train.shape}, Test shape: {test.shape}')
target_share = train['Survived'].value_counts(normalize=True).to_dict()
print(f'Target distribution: {target_share}')

In [None]:
# Feature Engineering Function
def engineer_features(df):
    """Return a copy of ``df`` with the engineered Titanic features added.

    Reads the columns ``Name``, ``SibSp``, ``Parch``, ``Cabin``, ``Pclass``,
    ``Sex``, ``Age``, ``Fare`` and ``Embarked``. On the returned copy it
    fills missing ``Age``, ``Fare`` and ``Embarked`` values and adds
    ``Title``, ``FamilySize``, ``IsAlone``, ``FamilySizeCategory``,
    ``Has_Cabin``, ``Deck``, ``AgeBand``, ``FareBand`` and ``Name_Length``.
    The input frame is not modified.

    NOTE: every statistic used here (group/overall medians, the Embarked
    mode, the Fare quantile edges) is computed from ``df`` itself, so two
    frames passed in separate calls are imputed and binned independently.
    """
    df = df.copy()

    # 1. Title extraction from Name: the word immediately before a '.'
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Normalise spelling variants and collapse uncommon titles into 'Rare'
    title_mapping = {
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare',
        'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare',
        'Major': 'Rare', 'Rev': 'Rare', 'Sir': 'Rare',
        'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    df['Title'] = df['Title'].replace(title_mapping)

    # Anything outside the common set (including names with no title
    # match, which are NaN at this point) also becomes 'Rare'
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Rare']
    df['Title'] = df['Title'].where(df['Title'].isin(common_titles), 'Rare')

    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Open-ended top bin so that families larger than 11 are still 'Large'
    # instead of silently becoming NaN
    df['FamilySizeCategory'] = pd.cut(
        df['FamilySize'],
        bins=[0, 1, 4, np.inf],
        labels=['Single', 'Small', 'Large'],
    )

    # 3. Cabin features
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].str[0].fillna('U')  # U for Unknown

    # 4. Age: fill missing values with the median of the (Pclass, Sex) group
    df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(
        lambda x: x.fillna(x.median())
    )
    # Groups with no known age stay NaN above; fall back to the overall median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # include_lowest keeps an age of exactly 0 in the 'Child' band
    df['AgeBand'] = pd.cut(
        df['Age'],
        bins=[0, 16, 32, 48, 64, 100],
        labels=['Child', 'Young', 'Middle', 'Senior', 'Elder'],
        include_lowest=True,
    )

    # 5. Fare: median imputation, then quantile bands
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    fare_labels = ['Low', 'Medium', 'High', 'VeryHigh']
    # Ask qcut for integer bin codes rather than labels: with
    # duplicates='drop' the number of bins can shrink when quantile edges
    # coincide, and passing a fixed-length label list would then raise.
    fare_codes = pd.qcut(
        df['Fare'], len(fare_labels), labels=False, duplicates='drop'
    )
    # Code k maps to fare_labels[k]; -1 marks rows qcut could not place
    df['FareBand'] = pd.Categorical.from_codes(
        fare_codes.fillna(-1).astype(int).to_numpy(),
        categories=fare_labels,
        ordered=True,
    )

    # 6. Embarked: fill missing with the most frequent port, when one exists
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # 7. Additional features (.str.len() tolerates a missing Name)
    df['Name_Length'] = df['Name'].str.len()

    return df

# Run the feature pipeline on each frame separately, then print a quick
# sanity summary of two of the new training columns.
train_fe, test_fe = (engineer_features(frame) for frame in (train, test))

print('Feature engineering complete!')
title_counts = train_fe['Title'].value_counts().to_dict()
family_counts = train_fe['FamilySize'].value_counts().sort_index().to_dict()
print(f'Title distribution: {title_counts}')
print(f'FamilySize distribution: {family_counts}')

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Label-encode the categorical columns of both frames consistently.

    Works on copies of the inputs. For every categorical column, one
    ``LabelEncoder`` is fitted on the string form of the train and test
    values together, so both frames share the same integer codes and a
    value present in only one frame is still encodable.

    Returns the tuple ``(encoded_train, encoded_test)``.
    """
    encoded_train, encoded_test = train_df.copy(), test_df.copy()

    categorical = (
        'Sex', 'Embarked', 'Title', 'Deck',
        'AgeBand', 'FareBand', 'FamilySizeCategory',
    )

    for name in categorical:
        # Cast to str first so categoricals and missing values are plain labels
        train_labels = encoded_train[name].astype(str)
        test_labels = encoded_test[name].astype(str)

        encoder = LabelEncoder().fit(
            pd.concat([train_labels, test_labels], axis=0)
        )
        encoded_train[name] = encoder.transform(train_labels)
        encoded_test[name] = encoder.transform(test_labels)

    return encoded_train, encoded_test

# Encode both frames with shared label codes
train_encoded, test_encoded = encode_features(train_fe, test_fe)

# Model inputs: the raw columns first, then the engineered ones
raw_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
engineered_cols = ['Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Deck',
                   'AgeBand', 'FareBand', 'FamilySizeCategory', 'Name_Length']
feature_cols = raw_cols + engineered_cols

# Plain NumPy arrays for scikit-learn
X = train_encoded[feature_cols].to_numpy()
y = train_encoded['Survived'].to_numpy()
X_test = test_encoded[feature_cols].to_numpy()

print(f'Feature matrix shape: {X.shape}')
print(f'Features: {feature_cols}')

In [None]:
# 10-Fold Stratified Cross-Validation with RandomForest
from sklearn.metrics import accuracy_score

# Both models are scored with the same shuffled, seeded 10-fold stratified split
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Baseline: random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_scores = cross_val_score(rf_model, X, y, cv=skf, scoring='accuracy')
rf_fold_text = [format(score, '.4f') for score in rf_scores]
print(f'RandomForest CV Accuracy: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}')
print(f'Fold scores: {rf_fold_text}')

# Comparison: gradient boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_scores = cross_val_score(gb_model, X, y, cv=skf, scoring='accuracy')
gb_fold_text = [format(score, '.4f') for score in gb_scores]
print(f'\nGradientBoosting CV Accuracy: {gb_scores.mean():.4f} ± {gb_scores.std():.4f}')
print(f'Fold scores: {gb_fold_text}')

In [None]:
# Fit the final model on the full training matrix and predict the test set.
# GradientBoosting is chosen here up front; the choice is not derived
# programmatically from any cross-validation scores.
final_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X, y)

# Class predictions for the test rows
predictions = final_model.predict(X_test)

# Per-feature importance, most important first
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print('Feature Importance:')
print(feature_importance.to_string(index=False))

In [None]:
# Pair each test PassengerId with its prediction and write the submission CSV
submission = test[['PassengerId']].assign(Survived=predictions)

submission.to_csv('/home/submission/submission.csv', index=False)
print(f'Submission saved with {len(submission)} predictions')
print(submission.head())
prediction_counts = submission['Survived'].value_counts().to_dict()
print(f'\nPrediction distribution: {prediction_counts}')