# Simple Random Forest with Fewer Features

## Goal: Reduce CV-LB Gap (Overfitting)

Previous baseline: CV 0.8316, LB 0.7584 (7.3% gap)

This experiment:
- Use only 6 core features: Sex, Pclass, SibSp, Parch, Fare, Embarked (a 7-feature variant that adds Age is also evaluated for comparison)
- Simple Random Forest with max_depth=5
- Minimal feature engineering to avoid overfitting
- Age imputation uses medians computed from TRAIN only, applied to both train and test (avoids leaking test statistics)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load data
# Read the train and test CSVs from fixed absolute paths into DataFrames.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Print each frame's (rows, columns) as a quick sanity check on the load.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
# Minimal preprocessing - avoid complex feature engineering
def _impute_age(df, group_medians, fallback):
    """Return df['Age'] with gaps filled by (Sex, Pclass) median, then `fallback`.

    `group_medians` is a Series indexed by (Sex, Pclass). Rows whose group is
    absent from it, or whose group median is itself NaN, receive `fallback`.
    """
    lookup = group_medians.rename('_group_median').reset_index()
    # Left merge keeps one output row per input row, in input order, because
    # the (Sex, Pclass) keys in `lookup` are unique.
    matched = df[['Sex', 'Pclass']].merge(lookup, on=['Sex', 'Pclass'], how='left')
    group_fill = matched['_group_median'].to_numpy()
    ages = df['Age']
    # Positional fill (ndarray) so the result does not depend on df's index.
    return ages.where(ages.notna(), group_fill).fillna(fallback)


def preprocess_simple(train_df, test_df):
    """Simple preprocessing with minimal features to reduce overfitting.

    Works on copies, so the input frames are left untouched. Every statistic
    used for filling (Fare median, Age medians) is computed from the train
    frame only and then applied to both frames.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Embarked', 'Fare', 'Age'.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Copies with 'Sex_Code' and 'Embarked_Code' added and with 'Embarked',
        'Fare' and 'Age' filled.
    """
    train_data = train_df.copy()
    test_data = test_df.copy()

    # 1. Sex encoding: 1 for 'male', 0 for anything else
    train_data['Sex_Code'] = (train_data['Sex'] == 'male').astype(int)
    test_data['Sex_Code'] = (test_data['Sex'] == 'male').astype(int)

    # 2. Embarked - fill missing with the hard-coded value 'S', then encode.
    #    Values outside the map become NaN in Embarked_Code.
    train_data['Embarked'] = train_data['Embarked'].fillna('S')
    test_data['Embarked'] = test_data['Embarked'].fillna('S')
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    train_data['Embarked_Code'] = train_data['Embarked'].map(embarked_map)
    test_data['Embarked_Code'] = test_data['Embarked'].map(embarked_map)

    # 3. Fare - fill missing with median from TRAIN only
    train_fare_median = train_data['Fare'].median()
    train_data['Fare'] = train_data['Fare'].fillna(train_fare_median)
    test_data['Fare'] = test_data['Fare'].fillna(train_fare_median)

    # 4. Age - fill missing with the TRAIN median of the row's (Sex, Pclass)
    #    group. A group whose train ages are all missing still appears in the
    #    groupby result with a NaN median (no KeyError), so the overall train
    #    median is used as fallback for both unseen groups and NaN medians.
    age_medians = train_data.groupby(['Sex', 'Pclass'])['Age'].median()
    train_age_median = train_data['Age'].median()
    train_data['Age'] = _impute_age(train_data, age_medians, train_age_median)
    test_data['Age'] = _impute_age(test_data, age_medians, train_age_median)

    return train_data, test_data

# Build the cleaned train/test frames that the later cells read from.
train_processed, test_processed = preprocess_simple(train, test)

# Count remaining NaNs per column in the filled/encoded columns of each frame.
print(f"Missing values in train:")
print(train_processed[['Age', 'Fare', 'Embarked_Code']].isna().sum())
print(f"\nMissing values in test:")
print(test_processed[['Age', 'Fare', 'Embarked_Code']].isna().sum())

In [None]:
# Core feature set: six columns, no Age
feature_cols_simple = [
    'Pclass',
    'Sex_Code',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked_Code',
]

# Seven-column variant: the same columns with Age inserted after Sex_Code
feature_cols_with_age = feature_cols_simple[:2] + ['Age'] + feature_cols_simple[2:]

print(f"Simple features (6): {feature_cols_simple}")
print(f"With Age (7): {feature_cols_with_age}")

In [None]:
# Convert the processed frames to plain NumPy arrays for scikit-learn.

# Target vector and the ids needed for the submission file
y = train_processed['Survived'].to_numpy()
test_ids = test_processed['PassengerId'].to_numpy()

# 6-feature matrices (train / test)
X_simple = train_processed[feature_cols_simple].to_numpy()
X_test_simple = test_processed[feature_cols_simple].to_numpy()

# 7-feature matrices (train / test), i.e. the same columns plus Age
X_with_age = train_processed[feature_cols_with_age].to_numpy()
X_test_with_age = test_processed[feature_cols_with_age].to_numpy()

print(f"X_simple shape: {X_simple.shape}")
print(f"X_with_age shape: {X_with_age.shape}")

In [None]:
# 5-Fold Stratified CV with Simple Random Forest
def evaluate_model(X, y, X_test, model_params, n_splits=5):
    """Cross-validate a RandomForestClassifier and average its test probabilities.

    A fresh forest is fitted per stratified fold (shuffled, random_state=42).
    Each fold contributes hard label predictions for its held-out rows and
    1/n_splits of its second predict_proba column on X_test (presumably the
    Survived=1 class; this depends on the label ordering in y).

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing
        Training features and labels.
    X_test : array-like
        Rows to score with every fold model.
    model_params : dict
        Keyword arguments forwarded to RandomForestClassifier.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        (accuracy over all out-of-fold predictions, standard deviation of the
        per-fold accuracies, averaged test probabilities, list of per-fold
        accuracies).
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_scores = []
    test_preds = np.zeros(len(X_test))
    oof_preds = np.zeros(len(X))

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        forest = RandomForestClassifier(**model_params)
        forest.fit(X[fit_rows], y[fit_rows])

        # Hard labels for the held-out rows of this fold
        holdout_labels = y[holdout_rows]
        holdout_pred = forest.predict(X[holdout_rows])
        oof_preds[holdout_rows] = holdout_pred

        # Each fold adds an equal share to the running test-probability mean
        fold_test_proba = forest.predict_proba(X_test)[:, 1]
        test_preds += fold_test_proba / n_splits

        fold_acc = accuracy_score(holdout_labels, holdout_pred)
        fold_scores.append(fold_acc)
        print(f"  Fold {fold+1}: {fold_acc:.4f}")

    cv_score = accuracy_score(y, oof_preds)
    score_spread = np.std(fold_scores)
    return cv_score, score_spread, test_preds, fold_scores

# Conservative random-forest settings (shallow trees, larger leaves) intended
# to limit overfitting; passed as keyword arguments to RandomForestClassifier.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Test 1: Simple features (6 features)
# Print a banner, then cross-validate the forest on the 6-column matrix.
print("="*50)
print("Test 1: Simple RF with 6 features")
print("="*50)
# evaluate_model returns, in order: overall out-of-fold accuracy, std of the
# fold accuracies, fold-averaged test probabilities, and the fold accuracies.
cv_simple, std_simple, preds_simple, folds_simple = evaluate_model(
    X_simple, y, X_test_simple, rf_params
)
print(f"\nCV Accuracy: {cv_simple:.4f} (+/- {std_simple:.4f})")
# Fold accuracies are formatted to 4 decimals for display only.
print(f"Fold scores: {[f'{s:.4f}' for s in folds_simple]}")

In [None]:
# Test 2: With Age (7 features)
# Same procedure and hyperparameters as the 6-feature run, but on the
# 7-column matrix that includes Age.
print("\n" + "="*50)
print("Test 2: Simple RF with 7 features (+ Age)")
print("="*50)
# evaluate_model returns, in order: overall out-of-fold accuracy, std of the
# fold accuracies, fold-averaged test probabilities, and the fold accuracies.
cv_with_age, std_with_age, preds_with_age, folds_with_age = evaluate_model(
    X_with_age, y, X_test_with_age, rf_params
)
print(f"\nCV Accuracy: {cv_with_age:.4f} (+/- {std_with_age:.4f})")
# Fold accuracies are formatted to 4 decimals for display only.
print(f"Fold scores: {[f'{s:.4f}' for s in folds_with_age]}")

In [None]:
# Side-by-side report of the two cross-validation runs
print("\n" + "="*50)
print("COMPARISON")
print("="*50)
print(f"6 features: CV = {cv_simple:.4f} (+/- {std_simple:.4f})")
print(f"7 features: CV = {cv_with_age:.4f} (+/- {std_with_age:.4f})")
print(f"\nPrevious baseline (13 features XGBoost): CV = 0.8316, LB = 0.7584")
print(f"\nExpected LB (assuming 7% gap): {cv_simple - 0.07:.4f} to {cv_with_age - 0.07:.4f}")

# Keep the run with the higher CV accuracy; a tie goes to the 6-feature model.
best_preds, best_cv, best_std, best_features = (
    (preds_simple, cv_simple, std_simple, '6 features')
    if cv_simple >= cv_with_age
    else (preds_with_age, cv_with_age, std_with_age, '7 features')
)

print(f"\nUsing {best_features} for submission")

In [None]:
# Create submission with best model
# best_preds holds the fold-averaged probabilities from evaluate_model;
# values of 0.5 or above are labelled 1, everything else 0.
test_preds_binary = (best_preds >= 0.5).astype(int)

# Two-column frame: passenger id and the predicted label.
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_preds_binary
})

# Written without the index column. NOTE(review): the target directory is
# assumed to exist already; to_csv does not create it.
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} rows")
# Sanity check on the predicted class balance.
print(f"\nSurvived distribution:")
print(submission['Survived'].value_counts())
print(f"\nSurvival rate: {submission['Survived'].mean():.3f}")

In [None]:
# Refit one forest on the full training set with the chosen feature set and
# report its importances (this model is separate from the fold models that
# produced the submission predictions).
feature_names = (
    feature_cols_simple if best_features == '6 features' else feature_cols_with_age
)
final_model = RandomForestClassifier(**rf_params)
final_model.fit(X_simple if feature_names is feature_cols_simple else X_with_age, y)

# One row per feature, highest importance first
importance = pd.DataFrame(
    {'feature': feature_names, 'importance': final_model.feature_importances_}
)
importance = importance.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(importance.to_string(index=False))

In [None]:
# Summary
# Recap of the selected model and how it compares with the earlier baseline.
print("\n" + "="*50)
print("EXPERIMENT SUMMARY")
print("="*50)
# Read the hyperparameters from rf_params so this line cannot drift from the
# settings that were actually used to train the models.
print(
    f"Model: Random Forest (max_depth={rf_params['max_depth']}, "
    f"n_estimators={rf_params['n_estimators']})"
)
print(f"Features: {best_features}")
print(f"CV Accuracy: {best_cv:.4f} (+/- {best_std:.4f})")
print(f"\nComparison to baseline:")
# Baseline figures are fixed numbers carried over from the previous experiment.
print(f"  Baseline XGBoost (13 features): CV=0.8316, LB=0.7584 (gap=7.3%)")
print(f"  This model ({best_features}): CV={best_cv:.4f}")
print(f"\nExpected LB improvement: Simpler model should have smaller CV-LB gap")