# Baseline: Random Forest with Feature Engineering

Based on strategy recommendations:
- Title extraction from Name
- Family features (FamilySize, IsAlone)
- Has_Cabin feature
- Missing-value imputation (Age: median by Title/Pclass/Sex; Embarked: training mode; Fare: training median by Pclass)
- Stratified K-Fold CV (k=5)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSV files into DataFrames
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the size of each split, then the relative frequency of each target class
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")
print(
    "\nTarget distribution:",
    train['Survived'].value_counts(normalize=True),
    sep="\n",
)

In [None]:
# Feature Engineering Function
def engineer_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is left untouched. Columns read: ``Name``, ``SibSp``,
    ``Parch`` and ``Cabin``. Columns added:

    - ``Title``: the first run of letters in ``Name`` that is preceded by a
      space and followed by a period (e.g. ``Mr``), with spelling variants
      and uncommon titles collapsed through the mapping below. NaN when the
      name is missing or contains no such pattern.
    - ``FamilySize``: ``SibSp + Parch + 1``.
    - ``IsAlone``: 1 when ``FamilySize`` equals 1, otherwise 0.
    - ``Has_Cabin``: 1 when ``Cabin`` is not missing, otherwise 0.
    - ``Name_Length``: number of characters in ``Name``; 0 when missing.
    """
    df = df.copy()

    # 1. Title extraction from Name (MOST IMPORTANT)
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Map variants onto their common form and pool uncommon titles as 'Rare'
    title_mapping = {
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare',
        'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare',
        'Major': 'Rare', 'Rev': 'Rare', 'Sir': 'Rare',
        'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    df['Title'] = df['Title'].replace(title_mapping)

    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Has_Cabin feature
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)

    # 4. Name length (correlates with social status)
    # Vectorised .str.len() yields NaN for a missing name where apply(len)
    # would raise TypeError; those rows are given length 0.
    df['Name_Length'] = df['Name'].str.len().fillna(0).astype('int64')

    return df

# Derive the engineered columns for both splits with the same function
train, test = (engineer_features(frame) for frame in (train, test))

# Show how many training rows fall under each (collapsed) title
print("Title distribution:", train['Title'].value_counts(), sep="\n")

In [None]:
# Handle missing values
def fill_missing(train_df, test_df):
    """Impute missing ``Age``, ``Embarked`` and ``Fare`` values in place.

    Both frames are modified directly and also returned.

    - ``Age``: median of the matching (``Title``, ``Pclass``, ``Sex``) group,
      computed over train and test rows pooled together. Rows whose group is
      unknown, or whose group has no observed age at all, receive the pooled
      overall median instead.
    - ``Embarked``: most frequent value in ``train_df``. Left untouched if
      ``train_df`` has no non-missing ``Embarked`` value.
    - ``Fare``: median fare of the row's ``Pclass`` in ``train_df``, falling
      back to the overall training median when that class has no usable
      median.

    Args:
        train_df: Training frame with ``Title``, ``Pclass``, ``Sex``,
            ``Age``, ``Embarked`` and ``Fare`` columns.
        test_df: Test frame with the same columns.

    Returns:
        The ``(train_df, test_df)`` pair that was passed in.
    """
    group_cols = ['Title', 'Pclass', 'Sex']

    # Age statistics pool both splits (no target information is involved).
    combined = pd.concat([train_df, test_df], sort=False)
    overall_age_median = combined['Age'].median()
    # A group with only missing ages has a NaN median; dropping it routes
    # those rows to the overall-median fallback instead of leaving NaN.
    age_by_group = (
        combined.groupby(group_cols)['Age'].median().dropna().to_dict()
    )

    # Embarked and Fare statistics come from the training split only. They
    # are computed once, up front, before either frame is modified.
    embarked_modes = train_df['Embarked'].mode()
    fare_by_class = train_df.groupby('Pclass')['Fare'].median()
    overall_fare_median = train_df['Fare'].median()

    for df in (train_df, test_df):
        age_missing = df['Age'].isna()
        if age_missing.any():
            keys = zip(*(df.loc[age_missing, col] for col in group_cols))
            df.loc[age_missing, 'Age'] = [
                age_by_group.get(key, overall_age_median) for key in keys
            ]

        if not embarked_modes.empty:
            # Assign the column back: fillna(inplace=True) on a selected
            # column does not update the parent frame under Copy-on-Write.
            df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

        fare_missing = df['Fare'].isna()
        if fare_missing.any():
            class_fare = df.loc[fare_missing, 'Pclass'].map(fare_by_class)
            df.loc[fare_missing, 'Fare'] = (
                class_fare.fillna(overall_fare_median).to_numpy()
            )

    return train_df, test_df

# Impute Age / Embarked / Fare on both splits
train, test = fill_missing(train, test)

# Confirm how many gaps remain in the imputed columns
print(
    "Missing values after filling:",
    f"Train Age missing: {train['Age'].isna().sum()}",
    f"Test Age missing: {test['Age'].isna().sum()}",
    f"Test Fare missing: {test['Fare'].isna().sum()}",
    sep="\n",
)

In [None]:
# Encode categorical variables
def encode_features(train_df, test_df):
    """Add integer-coded versions of the categorical columns to both frames.

    Adds ``Sex_Code`` (1 for ``'male'``, 0 otherwise) plus ``Title_Code`` and
    ``Embarked_Code``, each produced by a ``LabelEncoder`` fitted on the
    values of train and test together so both frames share one code table.
    The frames are modified in place and returned as ``(train_df, test_df)``.
    """
    frames = (train_df, test_df)

    # Binary flag for sex
    for frame in frames:
        frame['Sex_Code'] = (frame['Sex'] == 'male').astype(int)

    # Label-encode Title first, then Embarked, one encoder per column
    for source_col, code_col in (
        ('Title', 'Title_Code'),
        ('Embarked', 'Embarked_Code'),
    ):
        encoder = LabelEncoder()
        encoder.fit(pd.concat([frame[source_col] for frame in frames]))
        for frame in frames:
            frame[code_col] = encoder.transform(frame[source_col])

    return train_df, test_df

# Add the integer-coded columns (Sex_Code, Title_Code, Embarked_Code) to both
# splits; encode_features assigns them on the frames it is given and returns
# that same pair.
train, test = encode_features(train, test)

print("Encoded features created successfully")

In [None]:
# Columns fed to the model; their order fixes the column order of X / X_test
feature_cols = [
    'Pclass',
    'Sex_Code',
    'Age',
    'Fare',
    'Title_Code',
    'FamilySize',
    'IsAlone',
    'Embarked_Code',
    'Has_Cabin',
    'SibSp',
    'Parch',
    'Name_Length',
]

# Plain arrays for the training features, the target, and the test features
X, y = train[feature_cols].values, train['Survived'].values
X_test = test[feature_cols].values

print(
    f"Feature matrix shape: {X.shape}",
    f"Features: {feature_cols}",
    sep="\n",
)

In [None]:
# Train Random Forest with Stratified K-Fold CV
# Fixed random_state on both the forest and the splitter keeps runs repeatable.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# Stratified K-Fold CV: 5 shuffled folds, scored by accuracy
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf, X, y, cv=skf, scoring='accuracy')

# Mean and standard deviation across folds (the "±" sign was previously
# emitted as the mis-decoded two-character sequence "Â±").
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"Fold scores: {cv_scores}")

In [None]:
# Fit the forest on every training row, then label the test rows
rf.fit(X, y)
predictions = rf.predict(X_test)

# Pair each feature name with its importance, most important first
importance_df = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nFeature Importances:", importance_df, sep="\n")

In [None]:
# Assemble the two-column submission table and write it without the index
submission = pd.DataFrame(
    {'PassengerId': test['PassengerId'], 'Survived': predictions}
)
submission.to_csv('/home/submission/submission.csv', index=False)

# Summarise what was written: row count, first rows, predicted class shares
print(f"\nSubmission saved with {len(submission)} predictions")
print(submission.head())
print(
    "\nPrediction distribution:",
    submission['Survived'].value_counts(normalize=True),
    sep="\n",
)