# Experiment 003: Random Forest WITHOUT Age

Following the evolved strategy (Loop 3):
- **Key insight**: Age is the primary source of distribution shift (56.9% in adversarial validation)
- **Hypothesis**: Removing Age will narrow the CV-LB gap
- **Previous analysis**: RF without Age achieved 0.8384 CV (best so far)

Features to use:
- Pclass, Sex, SibSp, Parch, Fare, Embarked
- Title, FamilySize, IsAlone, Has_Cabin
- NO Age, NO Age_Bin, NO IsChild

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the raw train/test splits from the shared data directory.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report both dataset shapes followed by the class balance of the target.
# A single print with sep="\n" emits one item per line, exactly as four
# separate print calls would.
print(
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    f"\nTarget distribution:",
    train['Survived'].value_counts(normalize=True),
    sep="\n",
)

In [None]:
def engineer_features_no_age(df):
    """Feature engineering WITHOUT Age-related features.

    Args:
        df: Passenger DataFrame. The columns ``Name``, ``SibSp``, ``Parch``
            and ``Cabin`` are read.

    Returns:
        A copy of ``df`` with four columns added: ``Title`` (honorific taken
        from ``Name``, with uncommon ones grouped), ``FamilySize``,
        ``IsAlone`` and ``Has_Cabin``. The input frame is not modified.
    """
    df = df.copy()

    # 1. Title extraction (MOST IMPORTANT - captures age/gender info indirectly)
    # The pattern grabs the run of letters that follows a space and ends with
    # a period, e.g. "Mr" in "Braund, Mr. Owen Harris". It must be a raw
    # string: "\." is not a valid Python string escape and triggers a
    # SyntaxWarning on recent interpreters when written in a normal literal.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse infrequent honorifics into one bucket, then fold the
    # French/alternate spellings into their common equivalents.
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # 2. Family features
    # The +1 counts the passenger themselves.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Cabin features
    # 1 when a cabin value is recorded, 0 when it is missing.
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)

    return df

# Run the Age-free feature engineering over both splits in one step.
train, test = map(engineer_features_no_age, (train, test))

# Show the per-title passenger counts found in the training split.
print("Titles in train:", train['Title'].value_counts().to_dict())

In [None]:
def fill_missing_values_no_age(train_df, test_df):
    """Impute Embarked and Fare in both splits; Age is left untouched.

    Every fill value is computed from the training frame only and then
    applied to both frames. Works on copies and returns them as
    ``(train, test)``; the inputs are not modified.
    """
    train_out = train_df.copy()
    test_out = test_df.copy()
    both_frames = (train_out, test_out)

    # Embarked: most frequent training value fills the gaps in both splits.
    most_common_port = train_out['Embarked'].mode()[0]
    for frame in both_frames:
        frame['Embarked'] = frame['Embarked'].fillna(most_common_port)

    # Fare: per-class training median fills the gaps within that class.
    for pclass in (1, 2, 3):
        class_rows = train_out['Pclass'] == pclass
        class_median = train_out.loc[class_rows, 'Fare'].median()
        for frame in both_frames:
            needs_fill = (frame['Pclass'] == pclass) & frame['Fare'].isna()
            frame.loc[needs_fill, 'Fare'] = class_median

    return train_out, test_out

# Impute Embarked/Fare in both splits (fill values come from the training split).
train, test = fill_missing_values_no_age(train_df=train, test_df=test)

# Sanity check: count the nulls left in each imputed column.
print("Missing values after imputation:")
print("Train Embarked:", train['Embarked'].isnull().sum())
print("Train Fare:", train['Fare'].isnull().sum())
print("Test Embarked:", test['Embarked'].isnull().sum())
print("Test Fare:", test['Fare'].isnull().sum())

In [None]:
def prepare_features_no_age(train_df, test_df):
    """Build the Age-free model inputs from the engineered frames.

    Sex, Embarked and Title are integer-encoded with a LabelEncoder fitted
    on the train and test values together, so both splits share one
    mapping. Works on copies of the inputs.

    Returns:
        ``(X_train, y_train, X_test, features)`` where ``features`` is the
        list of column names selected for the model.
    """
    train_enc = train_df.copy()
    test_enc = test_df.copy()

    # One encoder per categorical column, fitted on the union of both splits.
    for column in ['Sex', 'Embarked', 'Title']:
        all_values = pd.concat([train_enc[column], test_enc[column]])
        encoder = LabelEncoder().fit(all_values)
        train_enc[column] = encoder.transform(train_enc[column])
        test_enc[column] = encoder.transform(test_enc[column])

    # Model inputs: Age and anything derived from it are deliberately absent.
    features = [
        'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked',
        'Title', 'FamilySize', 'IsAlone', 'Has_Cabin'
    ]

    return train_enc[features], train_enc['Survived'], test_enc[features], features

# Build the encoded design matrices and target from the imputed frames.
X_train, y_train, X_test, features = prepare_features_no_age(train_df=train, test_df=test)

# Report the selected features and the resulting matrix shapes, one per line.
print(
    f"Features ({len(features)}): {features}",
    f"\nNOTE: Age is NOT included - this is intentional!",
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    sep="\n",
)

In [None]:
# Evaluate a Random Forest with stratified 5-fold cross-validation.

# Seeded, shuffled splitter so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Seeded forest with depth and leaf-size limits, using all available cores.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# One accuracy score per fold.
cv_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring='accuracy', cv=skf)

# Fold scores, their summary statistics, and the running comparison table.
print(
    f"CV Scores: {cv_scores}",
    f"Mean CV Accuracy: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})",
    f"Min: {cv_scores.min():.5f}, Max: {cv_scores.max():.5f}",
    f"\nComparison:",
    f"  exp_000 (RF with Age): CV 0.8339 → LB 0.7799",
    f"  exp_001 (Stacking):    CV 0.8271 → LB 0.7727",
    f"  exp_002 (RF no Age):   CV {cv_scores.mean():.4f} → LB ???",
    sep="\n",
)

In [None]:
# Refit the forest on every training row before inspecting or predicting with it.
rf.fit(X_train, y_train)

# Pair each feature name with its importance and rank from highest to lowest.
importance_df = pd.DataFrame(
    {'feature': features, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)

# Print the ranked table without the index column, then the interpretive note.
print(
    "Feature Importance (without Age):",
    importance_df.to_string(index=False),
    "\nNote: Title now captures some age-related signal indirectly (Master=young boys, etc.)",
    sep="\n",
)

In [None]:
# Re-read the untouched test file; only its PassengerId column is used below.
test_original = pd.read_csv('/home/data/test.csv')

# Predict a label for every test row with the fitted forest.
test_predictions = rf.predict(X_test)

# Assemble the two-column submission and write it without the index.
submission = pd.DataFrame({
    'PassengerId': test_original['PassengerId'],
    'Survived': test_predictions,
})
submission.to_csv('/home/submission/submission.csv', index=False)

# Confirm the row count, preview the file, and show the predicted class counts.
print(
    f"Submission saved with {len(submission)} predictions",
    submission.head(),
    f"\nPrediction distribution: {pd.Series(test_predictions).value_counts().to_dict()}",
    sep="\n",
)

In [None]:
# Final experiment report: banner, model description, CV result, comparison
# with earlier runs, and a projected leaderboard range obtained by
# subtracting 0.04 and 0.054 from the mean CV accuracy.
# NOTE(review): the banner says "EXPERIMENT 003" while the comparison row
# labels this run "exp_002" - confirm which identifier is intended.
print(
    "="*60,
    "EXPERIMENT 003 SUMMARY: RF WITHOUT AGE",
    "="*60,
    f"Model: RandomForest (n_estimators=200, max_depth=6)",
    f"Features: {len(features)} features (NO Age!)",
    f"\nCV Accuracy: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})",
    f"\nComparison with previous experiments:",
    f"  exp_000 (RF with Age):    CV 0.8339 → LB 0.7799 (gap: 5.4%)",
    f"  exp_001 (Stacking):       CV 0.8271 → LB 0.7727 (gap: 5.4%)",
    f"  exp_002 (RF no Age):      CV {cv_scores.mean():.4f}",
    f"\nExpected LB (if gap narrows): ~{cv_scores.mean() - 0.04:.4f} to {cv_scores.mean() - 0.054:.4f}",
    "="*60,
    sep="\n",
)