# Baseline Model - XGBoost with Feature Engineering

This notebook follows the strategy below:
1. Title extraction from Name
2. FamilySize and IsAlone features
3. Has_Cabin feature
4. Age imputation using median by Pclass, Sex, Title
5. XGBoost with 10-fold Stratified CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read both competition splits from disk
train, test = map(pd.read_csv, ('/home/data/train.csv', '/home/data/test.csv'))

# Report the (rows, columns) of each split
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

# Share of each Survived class in the training data
survival_share = train['Survived'].value_counts(normalize=True)
print("\nTarget distribution:")
print(survival_share)

In [None]:
def extract_title(name):
    """Return the honorific embedded in a passenger name, or "" if none.

    The title is the first run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``.

    Args:
        name: Passenger name. Non-string values (``None``, ``NaN``) are
            treated as having no title instead of raising ``TypeError``.

    Returns:
        The matched title without its trailing period, or an empty string
        when no title is present.
    """
    import re

    # re.search raises TypeError on anything that is not str, so a missing
    # name must be short-circuited before the regex runs.
    if not isinstance(name, str):
        return ""

    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

def process_features(df, is_train=True, age_medians=None):
    """Add the engineered columns and fill Embarked/Fare gaps on a copy of ``df``.

    The returned frame gains ``Title``, ``FamilySize``, ``IsAlone`` and
    ``Has_Cabin``; missing ``Embarked`` becomes ``'S'`` and missing ``Fare``
    is replaced by the median fare of the row's ``Pclass`` within ``df``.

    ``is_train`` and ``age_medians`` are accepted but not read by this function.
    """
    out = df.copy()

    # 1. Title: extract it, then fold rare honorifics and spelling variants
    #    into the common categories with a single lookup table.
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = dict.fromkeys(rare_titles, 'Rare')
    title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    out['Title'] = out['Name'].apply(extract_title).replace(title_aliases)

    # 2. Family: the passenger plus siblings/spouses and parents/children
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = out['FamilySize'].eq(1).astype(int)

    # 3. Cabin: flag whether a cabin value was recorded at all
    out['Has_Cabin'] = (~out['Cabin'].isna()).astype(int)

    # 4. Embarked: blanks are assigned to 'S'
    out['Embarked'] = out['Embarked'].fillna('S')

    # 5. Fare: only touched when something is missing
    if out['Fare'].isna().any():
        def _fill_with_class_median(fares):
            return fares.fillna(fares.median())

        out['Fare'] = out.groupby('Pclass')['Fare'].transform(_fill_with_class_median)

    return out

# Run the shared feature engineering over each split
train_processed, test_processed = (
    process_features(split, is_train=flag)
    for split, flag in ((train, True), (test, False))
)

# Show how many training passengers fall under each title
title_counts = train_processed['Title'].value_counts()
print("Title distribution:", title_counts, sep="\n")

In [None]:
# Age imputation using median by Pclass, Sex, Title
# Calculate medians from training data only
def impute_age(train_df, test_df):
    """Fill missing ``Age`` values using medians learned from the training set.

    Each missing age is replaced by the median training age of the row's
    (``Pclass``, ``Sex``, ``Title``) group. Rows whose group is absent from
    the training data, or whose group has no known training ages, fall back
    to the overall training median.

    Args:
        train_df: Training frame with ``Pclass``, ``Sex``, ``Title``, ``Age``.
        test_df: Test frame with the same columns.

    Returns:
        ``(train_df, test_df)`` copies with ``Age`` filled; the inputs are
        not modified.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()

    group_cols = ['Pclass', 'Sex', 'Title']

    # Both statistics come from the raw training ages, computed once before
    # any filling, so train and test rows share the same fallback value.
    overall_median = train_df['Age'].median()
    # A group whose ages are all missing yields a NaN median; dropping it
    # routes those rows to the overall fallback instead of leaving them NaN.
    group_medians = (
        train_df.groupby(group_cols)['Age'].median().dropna().to_dict()
    )

    def fill_missing_ages(df):
        """Fill the NaN ages of ``df`` in place and return it."""
        missing = df['Age'].isna()
        if missing.any():
            keys = df.loc[missing, group_cols].itertuples(index=False, name=None)
            df.loc[missing, 'Age'] = [
                group_medians.get(key, overall_median) for key in keys
            ]
        return df

    return fill_missing_ages(train_df), fill_missing_ages(test_df)

# Replace the processed frames with their age-imputed versions
imputed_frames = impute_age(train_processed, test_processed)
train_processed, test_processed = imputed_frames

# Count the Age values that are still missing in each split
remaining_train = train_processed['Age'].isna().sum()
remaining_test = test_processed['Age'].isna().sum()

print("Missing values after imputation:")
print(f"Train Age missing: {remaining_train}")
print(f"Test Age missing: {remaining_test}")

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Map the Sex, Embarked and Title columns to integer codes.

    Both frames are copied and encoded with the same lookup tables. A value
    that is absent from a table becomes NaN, as ``Series.map`` does.

    Returns:
        ``(train_df, test_df)`` encoded copies.
    """
    code_tables = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'S': 0, 'C': 1, 'Q': 2},
        'Title': {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5},
    }

    encoded = []
    for source in (train_df, test_df):
        frame = source.copy()
        for column, codes in code_tables.items():
            frame[column] = frame[column].map(codes)
        encoded.append(frame)

    return tuple(encoded)

# Integer-encode the categorical columns of both splits
encoded_pair = encode_features(train_processed, test_processed)
train_encoded, test_encoded = encoded_pair

# Columns handed to the model, in matrix column order
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'Has_Cabin',
]

# Plain arrays for the model: design matrices and the training target
X, X_test = (frame[features].values for frame in (train_encoded, test_encoded))
y = train_encoded['Survived'].values

print(f"Feature matrix shape: {X.shape}", f"Test matrix shape: {X_test.shape}", sep="\n")
print(f"\nFeatures: {features}")

In [None]:
# Stratified 10-fold cross-validation of the XGBoost classifier.
# Each fold trains a fresh model, records hold-out accuracy, and adds its
# share of the test-set survival probabilities to a running average.
from sklearn.metrics import accuracy_score

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Hyper-parameters shared by every model trained in this notebook
xgb_params = dict(
    learning_rate=0.1,
    max_depth=4,
    n_estimators=200,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    use_label_encoder=False,
)

cv_scores = []
oof_predictions = np.zeros(X.shape[0])        # hold-out class label per training row
test_predictions = np.zeros(X_test.shape[0])  # fold-averaged P(Survived=1) per test row

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, verbose=False)

    # Hard labels for the held-out rows, stored at their original positions
    val_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_pred

    # Each fold contributes 1/n_splits of the final test probability
    test_predictions += model.predict_proba(X_test)[:, 1] / kfold.n_splits

    fold_acc = accuracy_score(y_val, val_pred)
    cv_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

# Summary: per-fold mean/std and accuracy over all out-of-fold labels
mean_acc, std_acc = np.mean(cv_scores), np.std(cv_scores)
oof_acc = accuracy_score(y, oof_predictions)

print("\n" + "=" * 50)
print(f"Mean CV Accuracy: {mean_acc:.4f} (+/- {std_acc:.4f})")
print(f"Overall OOF Accuracy: {oof_acc:.4f}")

In [None]:
# Turn the averaged probabilities into 0/1 labels (threshold 0.5) and save them
test_pred_binary = np.greater_equal(test_predictions, 0.5).astype(int)

# One row per test passenger: id plus predicted label
submission = test[['PassengerId']].assign(Survived=test_pred_binary)
submission.to_csv('/home/submission/submission.csv', index=False)

print("Submission saved to /home/submission/submission.csv")
print("\nSubmission preview:", submission.head(10), sep="\n")
print("\nPrediction distribution:", submission['Survived'].value_counts(), sep="\n")

In [None]:
# Rank the features by the importance the model assigns them
import matplotlib.pyplot as plt

# Fit one model on the full training set; it is used only for this ranking
final_model = xgb.XGBClassifier(**xgb_params)
final_model.fit(X, y, verbose=False)

# Pair each feature name with its score, highest first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': final_model.feature_importances_,
})
importance = importance_table.sort_values('importance', ascending=False)

print("Feature Importance:", importance, sep="\n")