# Baseline Random Forest with Feature Engineering

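This baseline implements:
- Title extraction from Name (rare titles grouped, then mapped to an ordinal code)
- Family features (FamilySize, IsAlone)
- Has_Cabin binary flag
- Name_length (character count of Name)
- Integer encoding of Sex and Embarked
- Missing value imputation (Age by Pclass/Sex median, Fare by median, Embarked with 'S')
- 5-fold Stratified CV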

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Read both CSV splits into DataFrames
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the dimensions of each split
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

# Share of each Survived class (fractions, not counts)
survived_share = train['Survived'].value_counts(normalize=True)
print(f'\nTarget distribution:')
print(survived_share)

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
def feature_engineering(df, is_train=True):
    """Return a copy of ``df`` with engineered, encoded and imputed columns.

    The input frame is not modified. Added columns: ``Title``, ``FamilySize``,
    ``IsAlone``, ``Has_Cabin`` and ``Name_length``. Columns overwritten in the
    copy: ``Sex``, ``Embarked``, ``Age`` and ``Fare``.

    All imputation statistics (Age and Fare medians) are computed from ``df``
    itself, so two frames passed separately are each filled with their own
    medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Name, Sex, Pclass, Age, SibSp, Parch, Fare,
        Cabin and Embarked.
    is_train : bool, default True
        Not read by this function; kept so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        The transformed copy. ``Cabin`` is left as-is and may still hold NaN.
    """
    df = df.copy()

    # 1. Title: the first alphabetic word that follows a space and ends in '.'
    #    (raw string so that '\.' is a regex escape, not a string escape)
    raw_title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Single lookup from raw title to ordinal code: spelling variants share
    # the code of their canonical title, rare titles share code 5.
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_codes = {
        'Mr': 1,
        'Miss': 2, 'Mlle': 2, 'Ms': 2,
        'Mrs': 3, 'Mme': 3,
        'Master': 4,
    }
    title_codes.update({title: 5 for title in rare_titles})

    # Unrecognised or missing titles become 0; the cast keeps the dtype
    # integer regardless of whether any title was unmapped.
    df['Title'] = raw_title.map(title_codes).fillna(0).astype(int)

    # 2. Family features (the +1 counts the passenger themselves)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Has_Cabin binary flag
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)

    # 4. Name length (vectorised; yields NaN instead of raising on a missing Name)
    df['Name_length'] = df['Name'].str.len()

    # 5. Sex encoding
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # 6. Embarked encoding; missing ports are filled with 'S' first
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # 7. Age imputation: median of the passenger's (Pclass, Sex) group.
    #    Rows whose group has no known age (or whose group key is missing)
    #    stay NaN here and are filled with the overall median below.
    group_median_age = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # 8. Fare imputation with the frame-wide median
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    return df

# Transform each split independently with the same function
train_fe = feature_engineering(train, is_train=True)
test_fe = feature_engineering(test, is_train=False)

print('Feature engineering complete!')
print(f'Train columns: {train_fe.columns.tolist()}')

# Show only the columns that still contain at least one null
null_counts = train_fe.isnull().sum()
print(f'\nMissing values in train after FE:')
print(null_counts[null_counts > 0])

Feature engineering complete!
Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Name_length']

Missing values in train after FE:
Cabin    687
dtype: int64


In [3]:
# Columns fed to the model, in the order they appear in the matrices
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Name_length',
]

# Plain NumPy arrays: feature matrices for both splits, labels for train only
y = train_fe['Survived'].values
X = train_fe[features].values
X_test = test_fe[features].values

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'X_test shape: {X_test.shape}')

# Sanity check: count NaN cells remaining in each feature matrix
print(f'\nNaN in X: {np.isnan(X).sum()}')
print(f'NaN in X_test: {np.isnan(X_test).sum()}')

X shape: (891, 12)
y shape: (891,)
X_test shape: (418, 12)

NaN in X: 0
NaN in X_test: 0


In [4]:
# Random Forest hyperparameters, collected in one place
rf_params = dict(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)

# Five shuffled, class-stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold
cv_scores = cross_val_score(rf, X, y, cv=skf, scoring='accuracy')

print('5-Fold Stratified Cross-Validation Results:')
print(f'Fold scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})')

5-Fold Stratified Cross-Validation Results:
Fold scores: [0.83798883 0.82022472 0.79213483 0.82022472 0.83707865]
Mean CV Accuracy: 0.8215 (+/- 0.0166)


In [5]:
# Refit the forest on every training row
rf.fit(X, y)

# Pair each feature name with its importance, largest first
feature_importance = (
    pd.DataFrame({'feature': features, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
)

print('Feature Importances:')
print(feature_importance.to_string(index=False))

Feature Importances:
    feature  importance
      Title    0.256882
        Sex    0.217659
       Fare    0.107770
Name_length    0.087179
     Pclass    0.085661
        Age    0.071725
 FamilySize    0.056126
  Has_Cabin    0.049027
      SibSp    0.029582
   Embarked    0.017572
      Parch    0.013188
    IsAlone    0.007628


In [6]:
# Predict a label for every row of the test matrix
test_predictions = rf.predict(X_test)

# Two-column frame: passenger id alongside its predicted label
submission = pd.DataFrame(
    {'PassengerId': test['PassengerId'], 'Survived': test_predictions}
)

# Primary copy of the submission file
submission.to_csv('/home/submission/submission.csv', index=False)
print(f'Submission saved with {len(submission)} predictions')
print(submission.head(10))

# Second copy kept next to the experiment's code
submission.to_csv('/home/code/experiments/001_baseline/submission.csv', index=False)
print('\nSubmission also saved to experiment folder')

Submission saved with 418 predictions
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0

Submission also saved to experiment folder
