# Baseline Model with Feature Engineering

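This notebook implements a baseline Random Forest model with key feature engineering:
- Title extraction from Name (rare titles grouped together)
- Family features (FamilySize, IsAlone)
- Age imputation by Title (median per title)
- Cabin features (Has_Cabin)
- Fare imputation (median per Pclass) and Embarked imputation (mode)
- Age and Fare binning (AgeBin, FareBin)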

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
# Read both competition splits from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Report (rows, columns) for each split.
for split_name, split_frame in (('Train', train), ('Test', test)):
    print(f"{split_name} shape: {split_frame.shape}")

# Relative frequency of each target class in the training split.
print(f"\nTarget distribution:")
target_share = train['Survived'].value_counts(normalize=True)
print(target_share)

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Combine train and test for consistent feature engineering.
#
# `assign` returns tagged copies, so the raw `train` / `test` frames are left
# untouched and this cell can be re-run safely. `test` has no 'Survived'
# column; `concat` fills it with NaN for the test rows, so no placeholder
# column has to be written into `test` first.
# 'is_train' (1 = train row, 0 = test row) is used later to split back.
df = pd.concat(
    [train.assign(is_train=1), test.assign(is_train=0)],
    axis=0,
    ignore_index=True,
)

print(f"Combined shape: {df.shape}")

Combined shape: (1309, 13)


In [3]:
# Feature Engineering

# 1. Extract Title from Name
# The pattern captures the run of letters that follows a space and ends with
# a period (e.g. " Mr."). It is written as a raw string so that `\.` reaches
# the regex engine as an escaped dot; in a plain string literal `\.` is an
# invalid escape sequence and triggers a warning on recent Python versions.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print("Title distribution:")
print(df['Title'].value_counts())

Title distribution:
Title
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Major         2
Mlle          2
Ms            2
Mme           1
Don           1
Sir           1
Lady          1
Capt          1
Countess      1
Jonkheer      1
Dona          1
Name: count, dtype: int64


In [4]:
# 2. Group rare titles
# Common titles are kept, spelling variants are folded into them
# (Mlle/Ms -> Miss, Mme -> Mrs), and the remaining titles go to 'Rare'.
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare', 'Mlle': 'Miss',
    'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare', 'Jonkheer': 'Rare',
    'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs', 'Capt': 'Rare', 'Sir': 'Rare'
}
# `.map` yields NaN for any value that is not a key of the mapping. Falling
# back to 'Rare' keeps unseen or missing titles out of the later steps as
# NaN, and makes the cell safe to re-run ('Rare' itself is not a key).
df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
print("\nGrouped Title distribution:")
print(df['Title'].value_counts())


Grouped Title distribution:
Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: count, dtype: int64


In [5]:
# 3. Fill missing Age with median by Title
# NOTE(review): `df` holds train and test rows, so the medians are computed
# over both splits.
age_by_title = df.groupby('Title')['Age'].median()
print("Median Age by Title:")
print(age_by_title)

# Look up each row's title median and use it only where Age is missing.
# Unlike indexing `age_by_title[title]` in a loop, `map` returns NaN instead
# of raising KeyError for a title that has no entry (e.g. a NaN title, which
# groupby drops), so such rows simply stay missing and show up in the count.
df['Age'] = df['Age'].fillna(df['Title'].map(age_by_title))

print(f"\nMissing Age after imputation: {df['Age'].isnull().sum()}")

Median Age by Title:
Title
Master     4.0
Miss      22.0
Mr        29.0
Mrs       35.0
Rare      47.5
Name: Age, dtype: float64

Missing Age after imputation: 0


In [6]:
# 4. Fill missing Embarked with mode
# Assign the result back instead of calling `fillna(..., inplace=True)` on
# `df['Embarked']`: the in-place call acts on an intermediate Series, which
# pandas deprecates and which does not update `df` under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
print(f"Missing Embarked: {df['Embarked'].isnull().sum()}")

# 5. Fill missing Fare with median by Pclass
fare_by_pclass = df.groupby('Pclass')['Fare'].median()
# Look up each row's class median and use it only where Fare is missing.
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_by_pclass))
print(f"Missing Fare: {df['Fare'].isnull().sum()}")

Missing Embarked: 0
Missing Fare: 0


In [7]:
# 6. Family Features
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
relatives_aboard = df['SibSp'].add(df['Parch'])
df['FamilySize'] = relatives_aboard.add(1)
# IsAlone is 1 exactly when FamilySize equals 1, otherwise 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

print("FamilySize distribution:")
family_size_counts = df['FamilySize'].value_counts().sort_index()
print(family_size_counts)
print(f"\nIsAlone: {df['IsAlone'].value_counts()}")

FamilySize distribution:
FamilySize
1     790
2     235
3     159
4      43
5      22
6      25
7      16
8       8
11     11
Name: count, dtype: int64

IsAlone: IsAlone
1    790
0    519
Name: count, dtype: int64


In [8]:
# 7. Cabin Features
# Has_Cabin is 1 where a Cabin value is present and 0 where it is missing.
cabin_missing = df['Cabin'].isna()
df['Has_Cabin'] = (~cabin_missing).astype(int)
print(f"Has_Cabin distribution: {df['Has_Cabin'].value_counts()}")

Has_Cabin distribution: Has_Cabin
0    1014
1     295
Name: count, dtype: int64


In [9]:
# 8. Age Binning
# Five fixed-width age bands delimited by these edges, coded 0 through 4.
age_edges = [0, 16, 32, 48, 64, 100]
age_codes = [0, 1, 2, 3, 4]
age_band = pd.cut(df['Age'], bins=age_edges, labels=age_codes)
# The categorical codes are stored as plain integers for the model.
df['AgeBin'] = age_band.astype(int)
print("AgeBin distribution:")
age_bin_counts = df['AgeBin'].value_counts().sort_index()
print(age_bin_counts)

AgeBin distribution:
AgeBin
0    142
1    751
2    297
3    106
4     13
Name: count, dtype: int64


In [10]:
# 9. Fare Binning
# Quantile-based split of Fare into four bins, coded 0 through 3.
fare_quartile = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3])
# The categorical codes are stored as plain integers for the model.
df['FareBin'] = fare_quartile.astype(int)
print("FareBin distribution:")
fare_bin_counts = df['FareBin'].value_counts().sort_index()
print(fare_bin_counts)

FareBin distribution:
FareBin
0    337
1    321
2    328
3    323
Name: count, dtype: int64


In [11]:
# 10. Encode categorical features
# One LabelEncoder per column; each is kept under its own name so that its
# fitted `classes_` can be inspected afterwards.
le_sex, le_embarked, le_title = LabelEncoder(), LabelEncoder(), LabelEncoder()

column_encoders = (
    ('Sex', le_sex),
    ('Embarked', le_embarked),
    ('Title', le_title),
)
# Replace each string column with the integer codes from its encoder.
for column_name, encoder in column_encoders:
    df[column_name] = encoder.fit_transform(df[column_name])

print("Encoded features:")
print(f"Sex: {le_sex.classes_}")
print(f"Embarked: {le_embarked.classes_}")
print(f"Title: {le_title.classes_}")

Encoded features:
Sex: ['female' 'male']
Embarked: ['C' 'Q' 'S']
Title: ['Master' 'Miss' 'Mr' 'Mrs' 'Rare']


In [12]:
# Select features for model
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'AgeBin', 'FareBin',
]

# Split back to train and test using the 'is_train' flag (1 = train, 0 = test).
train_rows = df['is_train'].eq(1)
test_rows = df['is_train'].eq(0)
train_df = df.loc[train_rows].copy()
test_df = df.loc[test_rows].copy()

# Model inputs for both splits, plus the integer target for the train split.
X, X_test = train_df[features], test_df[features]
y = train_df['Survived'].astype(int)

for label, data in (('X', X), ('y', y), ('X_test', X_test)):
    print(f"{label} shape: {data.shape}")

X shape: (891, 13)
y shape: (891,)
X_test shape: (418, 13)


In [13]:
# Train Random Forest with Stratified K-Fold CV
# Hyperparameters are collected in one dict so they can be read at a glance.
rf_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)

# Stratified K-Fold Cross Validation: 5 shuffled folds with a fixed seed,
# scored by accuracy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, X, y, cv=skf, scoring='accuracy')

print(f"CV Scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean():.5f} (+/- {scores.std():.5f})")

CV Scores: [0.8547486  0.8258427  0.8258427  0.83146067 0.83707865]
Mean CV Accuracy: 0.83499 (+/- 0.01072)


In [14]:
# Train on full data and make predictions
# `fit` returns the fitted estimator, so prediction can be chained onto it.
predictions = rf.fit(X, y).predict(X_test)

# Feature importance
# Pair each feature name with its importance, then order from most to least
# important.
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

Feature Importance:
       feature  importance
1          Sex    0.296379
7        Title    0.173969
5         Fare    0.110058
0       Pclass    0.095698
2          Age    0.077259
8   FamilySize    0.053333
10   Has_Cabin    0.053287
12     FareBin    0.037754
3        SibSp    0.034514
11      AgeBin    0.020977
6     Embarked    0.020917
4        Parch    0.016677
9      IsAlone    0.009179


In [15]:
# Create submission
# Two integer columns: the test rows' PassengerId and the predicted Survived
# label. The frame keeps the index of `test_df`.
submission = (
    test_df[['PassengerId']]
    .astype(int)
    .assign(Survived=predictions.astype(int))
)

# Write the file without the index column.
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission shape: {submission.shape}")
print(submission.head(10))
print(f"\nPrediction distribution:")
predicted_counts = submission['Survived'].value_counts()
print(predicted_counts)

Submission shape: (418, 2)
     PassengerId  Survived
891          892         0
892          893         0
893          894         0
894          895         0
895          896         1
896          897         0
897          898         1
898          899         0
899          900         1
900          901         0

Prediction distribution:
Survived
0    268
1    150
Name: count, dtype: int64
