# Baseline Model for Titanic Survival Prediction

This notebook implements a comprehensive baseline with:
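- Feature engineering (Title, FamilySize, IsAlone, FamilySizeGroup, Has_Cabin, AgeBin, FareBin)
- Missing-value handling (Age and Fare imputed with training-set group medians; Embarked filled with 'S')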
- 5-fold Stratified CV
- Random Forest classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSVs from disk
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Sanity report: dimensions of both frames, then the class balance of the label
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")
target_share = train['Survived'].value_counts(normalize=True)
print(f"\nTarget distribution:")
print(target_share)

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Stack train and test into one frame so every engineered feature is built
# identically for both; `is_train` records where each row came from.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

# The test file has no label column; add a NaN placeholder so the columns line up.
test['Survived'] = np.nan

df = pd.concat((train, test), ignore_index=True)

print(f"Combined shape: {df.shape}")

Combined shape: (1309, 13)


In [3]:
# Feature Engineering

# 1. Extract Title from Name
def extract_title(name):
    """Return the honorific embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Raw value of the ``Name`` column.

    Returns
    -------
    str
        The captured title, or ``'Unknown'`` when no title pattern is found
        or when ``name`` is not a string (e.g. NaN / None for a missing name).
    """
    import re

    # Missing names reach this function as float NaN or None; re.search would
    # raise TypeError on them, so treat them like a name without a title.
    if not isinstance(name, str):
        return 'Unknown'

    match = re.search(r' ([A-Za-z]+)\.', name)
    return match.group(1) if match else 'Unknown'

df['Title'] = df['Name'].map(extract_title)

# Collapse raw titles into five buckets: the four common titles map to
# themselves, spelling variants fold into Miss/Mrs, and the listed honorifics
# become 'Rare'.
title_mapping = {
    **{title: title for title in ('Mr', 'Miss', 'Mrs', 'Master')},
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    **dict.fromkeys(
        ('Dr', 'Rev', 'Col', 'Major', 'Capt',
         'Lady', 'Countess', 'Sir', 'Don', 'Dona',
         'Jonkheer'),
        'Rare',
    ),
}
# Any title that is not a key of the mapping (including 'Unknown') is also 'Rare'.
df['Title'] = df['Title'].map(title_mapping).fillna('Rare')

title_counts = df['Title'].value_counts()
print("Title distribution:")
print(title_counts)

Title distribution:
Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: count, dtype: int64


In [4]:
# 2. Family Features
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger)
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
# IsAlone is 1 exactly when FamilySize equals 1, otherwise 0
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

# Family size groups
def family_size_group(size):
    """Bucket a family size into an ordinal code.

    Returns 0 when ``size`` equals 1 (alone), 1 when ``size`` is at most 4
    (small), and 2 for anything else (large).
    """
    if size == 1:
        return 0  # Alone
    # Small families are capped at four members; everything else is large.
    return 1 if size <= 4 else 2

# Encode every family size with the alone / small / large bucket code
df['FamilySizeGroup'] = df['FamilySize'].map(family_size_group)

family_size_counts = df['FamilySize'].value_counts().sort_index()
print("FamilySize distribution:")
print(family_size_counts)

FamilySize distribution:
FamilySize
1     790
2     235
3     159
4      43
5      22
6      25
7      16
8       8
11     11
Name: count, dtype: int64


In [5]:
# 3. Cabin Features
# Has_Cabin is 1 where a Cabin value is recorded and 0 where it is missing
cabin_missing = df['Cabin'].isna()
df['Has_Cabin'] = (~cabin_missing).astype(int)

has_cabin_counts = df['Has_Cabin'].value_counts()
print(f"Has_Cabin distribution:")
print(has_cabin_counts)

Has_Cabin distribution:
Has_Cabin
0    1014
1     295
Name: count, dtype: int64


In [6]:
# 4. Age Imputation - using median by (Sex, Pclass, Title)
# Medians come from training rows only, so test ages never influence them.
train_mask = df['is_train'].eq(1)

age_medians = (
    df.loc[train_mask]
    .groupby(['Sex', 'Pclass', 'Title'])['Age']
    .median()
)

def impute_age(row):
    """Return the row's age, filling a missing value from training medians.

    Reads the module-level ``age_medians``, ``df`` and ``train_mask``.
    A present age is returned unchanged. A missing age is replaced by the
    first usable value from this cascade:

    1. the training median for the row's (Sex, Pclass, Title) group,
    2. the training median for the row's (Sex, Pclass) group,
    3. the overall training median age.

    A level is skipped both when the group key is absent (KeyError) and when
    the group exists but its median is NaN because every age in that group is
    missing. Without the NaN check the lookup "succeeds" and a NaN would be
    returned, leaving the age unimputed.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Age', 'Sex', 'Pclass' and 'Title'.

    Returns
    -------
    The original age, or the imputed median.
    """
    if not pd.isna(row['Age']):
        return row['Age']

    sex, pclass = row['Sex'], row['Pclass']

    # Level 1: precomputed (Sex, Pclass, Title) medians.
    try:
        estimate = age_medians.loc[(sex, pclass, row['Title'])]
    except KeyError:
        estimate = None
    if not pd.isna(estimate):
        return estimate

    # Level 2: coarser (Sex, Pclass) medians, computed only on this rare path.
    train_rows = df[train_mask]
    try:
        estimate = train_rows.groupby(['Sex', 'Pclass'])['Age'].median().loc[(sex, pclass)]
    except KeyError:
        estimate = None
    if not pd.isna(estimate):
        return estimate

    # Level 3: overall training median.
    return train_rows['Age'].median()

# Impute row by row; the result is written back only after every row has been
# processed, so the fallback medians are computed from the pre-imputation ages.
imputed_age = df.apply(impute_age, axis=1)
df['Age'] = imputed_age

print(f"Missing Age after imputation: {df['Age'].isna().sum()}")

Missing Age after imputation: 0


In [7]:
# 5. Age Binning
def age_bin(age):
    """Map an age to an ordinal bucket code.

    Codes: 0 = Child (age <= 12), 1 = Teen (<= 19), 2 = Adult (<= 64),
    3 = Senior (everything else).
    """
    # Inclusive upper bounds for codes 0, 1 and 2, checked in ascending order.
    for code, upper_bound in enumerate((12, 19, 64)):
        if age <= upper_bound:
            return code
    return 3  # Senior

# Encode every (already imputed) age with its bucket code
df['AgeBin'] = df['Age'].map(age_bin)

age_bin_counts = df['AgeBin'].value_counts().sort_index()
print("AgeBin distribution:")
print(age_bin_counts)

AgeBin distribution:
AgeBin
0     102
1     179
2    1015
3      13
Name: count, dtype: int64


In [8]:
# 6. Fare Handling
# Per-Pclass median fare, computed from training rows only; used to fill
# missing fares.
fare_medians = (
    df.loc[train_mask]
    .groupby('Pclass')['Fare']
    .median()
)

def impute_fare(row):
    """Return the row's fare, or its Pclass median when the fare is missing.

    Reads the module-level ``fare_medians`` series, indexed by Pclass.
    """
    fare = row['Fare']
    if not pd.isna(fare):
        return fare
    # Missing fare: substitute the median for the passenger's class.
    return fare_medians.loc[row['Pclass']]

# Fill missing fares with the per-class training median
df['Fare'] = df.apply(impute_fare, axis=1)

# Fare binning using quartiles of the training fares.
# The edge array is built explicitly instead of overwriting elements of the
# array returned by `.values`: pandas can hand back a read-only array there
# (Copy-on-Write), which makes in-place assignment fail.
inner_edges = df.loc[train_mask, 'Fare'].quantile([0.25, 0.5, 0.75]).tolist()
fare_bins = np.array([
    -0.001,                 # just below zero so a fare of 0 lands in the first bin
    *inner_edges,           # training 25th / 50th / 75th percentiles
    df['Fare'].max() + 1,   # above the largest fare in train + test, so nothing falls outside
])

# Bins are right-inclusive; the labels 0..3 become plain ints.
df['FareBin'] = pd.cut(df['Fare'], bins=fare_bins, labels=[0, 1, 2, 3]).astype(int)

print("FareBin distribution:")
print(df['FareBin'].value_counts().sort_index())

FareBin distribution:
FareBin
0    337
1    321
2    321
3    330
Name: count, dtype: int64


In [9]:
# 7. Embarked - missing ports are filled with the hard-coded value 'S'
df['Embarked'] = df['Embarked'].fillna('S')

# Encode categorical features as small integers.
# NOTE: the lookups are keyed on the original string labels, so re-running
# this cell on already-encoded columns would map every value to NaN.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
    'Title': {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

print("\nFeature encoding complete")


Feature encoding complete


In [10]:
# Columns fed to the model, in the order they appear in the feature matrix
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'FamilySizeGroup',
    'Has_Cabin', 'AgeBin', 'FareBin',
]

# Undo the earlier concatenation using the is_train flag
train_df = df.loc[df['is_train'].eq(1)].copy()
test_df = df.loc[df['is_train'].eq(0)].copy()

# Plain NumPy arrays for scikit-learn
X = train_df[features].to_numpy()
y = train_df['Survived'].to_numpy()
X_test = test_df[features].to_numpy()
test_ids = test_df['PassengerId'].to_numpy()

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

X shape: (891, 12)
y shape: (891,)
X_test shape: (418, 12)


In [11]:
# 5-Fold Stratified Cross-Validation with Random Forest
from sklearn.metrics import accuracy_score

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Take the fold count from the splitter so the test-prediction average below
# stays correct if n_splits is ever changed (it was a hard-coded 5 before).
n_folds = kfold.get_n_splits()

fold_scores = []
# Out-of-fold label predictions: each training row is predicted by the one
# model that did not see it.
oof_predictions = np.zeros(len(X))
# Running mean of the predicted test labels across folds, i.e. the share of
# fold models that predict class 1 for each test row.
test_predictions = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Random Forest with reasonable parameters; a fresh model per fold with a
    # fixed seed keeps the run reproducible.
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_split=4,
        min_samples_leaf=2,
        criterion='entropy',
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Validation predictions
    val_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_pred

    # Test predictions (average of the predicted labels across folds)
    test_predictions += model.predict(X_test) / n_folds

    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

# Summary over folds (np.std is the population standard deviation, ddof=0)
mean_cv = np.mean(fold_scores)
std_cv = np.std(fold_scores)
print(f"\nCV Accuracy: {mean_cv:.4f} ± {std_cv:.4f}")

Fold 1: Accuracy = 0.8492


Fold 2: Accuracy = 0.8315


Fold 3: Accuracy = 0.8315


Fold 4: Accuracy = 0.8315


Fold 5: Accuracy = 0.8371

CV Accuracy: 0.8361 ± 0.0069


In [12]:
# Create submission
# test_predictions holds the per-row average of the fold models' predicted
# labels; a row is labelled 1 when that average is at least 0.5.
majority_vote = test_predictions >= 0.5
final_predictions = majority_vote.astype(int)

submission = pd.DataFrame(
    data={
        'PassengerId': test_ids.astype(int),
        'Survived': final_predictions,
    }
)

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(submission.head())

Submission saved with 418 predictions
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1


In [13]:
# Feature importance
import matplotlib.pyplot as plt

# NOTE: `model` is the estimator left over from the last iteration of the CV
# loop, so these importances describe that single fold's forest only.
feature_importance = model.feature_importances_
importance_table = pd.DataFrame(
    {'feature': features, 'importance': feature_importance}
)
# Most influential features first
feature_df = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_df.to_string(index=False))


Feature Importance:
        feature  importance
          Title    0.229102
            Sex    0.198011
           Fare    0.123021
            Age    0.096276
         Pclass    0.082613
     FamilySize    0.061560
      Has_Cabin    0.060400
FamilySizeGroup    0.054683
        FareBin    0.042847
         AgeBin    0.025418
       Embarked    0.018260
        IsAlone    0.007808
