# Baseline Model: Random Forest with Feature Engineering

Following the strategy:
- Extract Title from Name
- Create FamilySize, IsAlone features
- Create family/group survival features (target encoding)
- Use Stratified K-Fold CV (k=5)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTarget distribution:")
print(train['Survived'].value_counts(normalize=True))

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Feature Engineering Function
def engineer_features(df, is_train=True):
    """Apply all feature engineering"""
    df = df.copy()
    
    # 1. Extract Title from Name
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    
    # Group rare titles
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    }
    df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
    
    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # 3. Extract Surname for family grouping
    df['Surname'] = df['Name'].str.split(',').str[0]
    
    # 4. Has_Cabin feature
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    
    # 5. Deck from Cabin
    df['Deck'] = df['Cabin'].str[0].fillna('Unknown')
    
    # 6. Fill missing Age using median by Title
    age_by_title = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(age_by_title)
    # If still missing, use overall median
    df['Age'] = df['Age'].fillna(df['Age'].median())
    
    # 7. Fill missing Embarked with mode
    df['Embarked'] = df['Embarked'].fillna('S')
    
    # 8. Fill missing Fare with median
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # 9. Age*Class interaction
    df['Age_Class'] = df['Age'] * df['Pclass']
    
    # 10. Name length
    df['Name_Length'] = df['Name'].str.len()
    
    # 11. Ticket group size
    ticket_counts = df['Ticket'].value_counts()
    df['TicketGroupSize'] = df['Ticket'].map(ticket_counts)
    
    return df

# Apply feature engineering
train_fe = engineer_features(train, is_train=True)
test_fe = engineer_features(test, is_train=False)

print("Feature engineering complete!")
print(f"Train columns: {train_fe.columns.tolist()}")
print(f"\nNew features created: Title, FamilySize, IsAlone, Surname, Has_Cabin, Deck, Age_Class, Name_Length, TicketGroupSize")

Feature engineering complete!
Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Surname', 'Has_Cabin', 'Deck', 'Age_Class', 'Name_Length', 'TicketGroupSize']

New features created: Title, FamilySize, IsAlone, Surname, Has_Cabin, Deck, Age_Class, Name_Length, TicketGroupSize


In [3]:
# Create family/group survival features (target encoding)
# IMPORTANT: This must be done carefully to avoid leakage in CV

def create_group_survival_features(train_df, test_df):
    """Create group survival features using training data only"""
    combined = pd.concat([train_df, test_df], ignore_index=True)
    
    # Family group: Surname + FamilySize
    combined['FamilyGroup'] = combined['Surname'] + '_' + combined['FamilySize'].astype(str)
    
    # Compute survival rates from training data only
    train_part = combined[combined['Survived'].notna()].copy()
    
    # Family group survival
    family_survival = train_part.groupby('FamilyGroup')['Survived'].mean()
    combined['FamilySurvival'] = combined['FamilyGroup'].map(family_survival)
    
    # Ticket group survival
    ticket_survival = train_part.groupby('Ticket')['Survived'].mean()
    combined['TicketSurvival'] = combined['Ticket'].map(ticket_survival)
    
    # Fill NaN with 0.5 (neutral) for groups not in training
    combined['FamilySurvival'] = combined['FamilySurvival'].fillna(0.5)
    combined['TicketSurvival'] = combined['TicketSurvival'].fillna(0.5)
    
    # Split back
    train_out = combined[combined['Survived'].notna()].copy()
    test_out = combined[combined['Survived'].isna()].copy()
    
    return train_out, test_out

train_fe, test_fe = create_group_survival_features(train_fe, test_fe)

print("Group survival features created!")
print(f"\nFamilySurvival stats: {train_fe['FamilySurvival'].describe()}")
print(f"\nTicketSurvival stats: {train_fe['TicketSurvival'].describe()}")

Group survival features created!

FamilySurvival stats: count    891.000000
mean       0.383838
std        0.454712
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: FamilySurvival, dtype: float64

TicketSurvival stats: count    891.000000
mean       0.383838
std        0.451338
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: TicketSurvival, dtype: float64


In [4]:
# Prepare features for modeling
def prepare_features(df):
    """Prepare final feature set for modeling"""
    df = df.copy()
    
    # Encode categorical variables
    df['Sex'] = (df['Sex'] == 'male').astype(int)
    
    # Encode Embarked
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    df['Embarked'] = df['Embarked'].map(embarked_map)
    
    # Encode Title
    title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
    df['Title'] = df['Title'].map(title_map)
    
    # Encode Deck
    deck_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8, 'Unknown': 0}
    df['Deck'] = df['Deck'].map(deck_map).fillna(0)
    
    # Select features
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
                'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Deck',
                'Age_Class', 'Name_Length', 'TicketGroupSize',
                'FamilySurvival', 'TicketSurvival']
    
    return df[features]

X = prepare_features(train_fe)
y = train_fe['Survived'].astype(int)
X_test = prepare_features(test_fe)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nFeatures: {X.columns.tolist()}")

X shape: (891, 17)
y shape: (891,)
X_test shape: (418, 17)

Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Deck', 'Age_Class', 'Name_Length', 'TicketGroupSize', 'FamilySurvival', 'TicketSurvival']


In [5]:
# Cross-validation with Stratified K-Fold
from sklearn.metrics import accuracy_score

# Random Forest model
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Stratified K-Fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    rf.fit(X_train, y_train)
    val_pred = rf.predict(X_val)
    score = accuracy_score(y_val, val_pred)
    scores.append(score)
    print(f"Fold {fold+1}: Accuracy = {score:.4f}")

mean_score = np.mean(scores)
std_score = np.std(scores)
print(f"\nCV Accuracy: {mean_score:.4f} ± {std_score:.4f}")

Fold 1: Accuracy = 0.9832


Fold 2: Accuracy = 0.9888


Fold 3: Accuracy = 0.9944


Fold 4: Accuracy = 1.0000


Fold 5: Accuracy = 0.9775

CV Accuracy: 0.9888 ± 0.0079


In [6]:
# Train final model on full training data
rf_final = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_final.fit(X, y)

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_final.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance.to_string(index=False))

Feature Importance:
        feature  importance
 TicketSurvival    0.401635
 FamilySurvival    0.363073
            Sex    0.058258
          Title    0.057007
      Age_Class    0.028423
    Name_Length    0.018662
         Pclass    0.012439
           Fare    0.011141
           Deck    0.010520
            Age    0.009982
TicketGroupSize    0.008687
      Has_Cabin    0.005717
     FamilySize    0.005007
          SibSp    0.004576
          Parch    0.002169
        IsAlone    0.001600
       Embarked    0.001104


In [7]:
# Generate predictions
predictions = rf_final.predict(X_test)

# Create submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype(int)
})

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved! Shape: {submission.shape}")
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
print(f"\nFirst 10 rows:")
print(submission.head(10))

Submission saved! Shape: (418, 2)

Prediction distribution:
Survived
0    258
1    160
Name: count, dtype: int64

First 10 rows:
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         1
8          900         1
9          901         0
