# Experiment 002: Advanced Feature Engineering

The `AnySpending` feature dominates the model's feature importance (0.82). This experiment tries to reduce that dominance by:
1. Removing AnySpending from features
2. Creating spending ratios and categories
3. Adding interaction features (CryoSleep × HomePlanet is key!)
4. Group-based features
5. Better imputation using group information

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSVs from the fixed data directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Report the dimensions of both tables
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
def advanced_feature_engineering(df):
    """Derive id, cabin and surname columns from the raw passenger table.

    Works on a copy of ``df`` and returns that copy with these columns added:

    * ``Group`` / ``PassengerNum`` - the two ``_``-separated parts of
      ``PassengerId``, converted to ``int``.
    * ``Deck`` / ``CabinNum`` / ``Side`` - the three ``/``-separated parts of
      ``Cabin`` (the middle one converted to ``int``).
    * ``Surname`` - the last whitespace-separated token of ``Name``.

    Rows whose ``Cabin`` or ``Name`` is missing get NaN in the columns
    derived from that field.
    """
    out = df.copy()

    def _id_part(position):
        # Parser for one integer component of "<group>_<number>".
        return lambda passenger_id: int(passenger_id.split('_')[position])

    def _cabin_part(position, convert=None):
        # Parser for one component of "<deck>/<num>/<side>"; NaN passes through.
        def pick(cabin):
            if pd.isna(cabin):
                return np.nan
            piece = cabin.split('/')[position]
            return piece if convert is None else convert(piece)
        return pick

    def _last_token(name):
        # Surname is taken to be the final token; used for family grouping.
        return np.nan if pd.isna(name) else name.split()[-1]

    passenger_ids = out['PassengerId']
    out['Group'] = passenger_ids.apply(_id_part(0))
    out['PassengerNum'] = passenger_ids.apply(_id_part(1))

    cabins = out['Cabin']
    out['Deck'] = cabins.apply(_cabin_part(0))
    out['CabinNum'] = cabins.apply(_cabin_part(1, int))
    out['Side'] = cabins.apply(_cabin_part(2))

    out['Surname'] = out['Name'].apply(_last_token)

    return out

# Run the id / cabin / surname parsing on both splits (train first, then test)
train, test = map(advanced_feature_engineering, (train, test))
print("Basic feature engineering complete")

In [None]:
# Count members per travel group over train and test together, so a group
# that is split across the two files still gets its full size
all_data = pd.concat([frame[['Group']] for frame in (train, test)], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()

# Attach the group size and a lone-traveller flag to each split in place
for df in (train, test):
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = df['GroupSize'].eq(1).astype(int)

print(f"GroupSize distribution: {train['GroupSize'].value_counts().sort_index().to_dict()}")

In [None]:
def group_based_imputation(train_df, test_df):
    """Fill missing HomePlanet / Deck / Side from other members of the same Group.

    For each of the three columns, the most frequent non-missing value per
    ``Group`` is computed over train and test combined, and rows where the
    column is missing receive their group's value.  A value stays missing
    when no member of the group (in either frame) has one.

    The premise, taken from the original notes, is that passengers in a
    group share these characteristics.

    Args:
        train_df: Training frame with ``Group``, ``HomePlanet``, ``Deck`` and
            ``Side`` columns.
        test_df: Test frame with the same columns.

    Returns:
        ``(train_df, test_df)`` as new DataFrames.  The inputs are left
        unmodified, matching the other feature functions in this file.
    """
    group_cols = ('HomePlanet', 'Deck', 'Side')

    def _first_mode(values):
        # Series.mode() drops NaN and is empty when the group has no known value.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # Look across both splits: members of one group can be in either file.
    combined = pd.concat([train_df, test_df], ignore_index=True)
    lookups = {
        col: combined.groupby('Group')[col].apply(_first_mode).to_dict()
        for col in group_cols
    }

    filled = []
    for frame in (train_df, test_df):
        frame = frame.copy()  # do not write into the caller's DataFrame
        for col, lookup in lookups.items():
            missing = frame[col].isna()
            frame.loc[missing, col] = frame.loc[missing, 'Group'].map(lookup)
        filled.append(frame)

    return filled[0], filled[1]

# Fill HomePlanet / Deck / Side from group-mates, then rebind both splits
train, test = group_based_imputation(train_df=train, test_df=test)
print("Group-based imputation complete")

In [None]:
def impute_remaining(df):
    """Fill every remaining gap in the modelling columns of one frame.

    Operates on a copy of ``df`` and returns it.  Three passes, in order:

    1. Missing spending amounts of passengers whose ``CryoSleep`` is True
       are set to 0.
    2. Categorical columns are filled with their most frequent value in
       this frame (``'Unknown'`` if the column has no values at all).
    3. Numerical columns are filled with their median in this frame.

    Statistics are computed from the frame passed in, so train and test are
    each imputed from their own values.
    """
    out = df.copy()

    # Pass 1: cryo-sleepers are assumed not to spend (domain knowledge)
    in_cryo = out['CryoSleep'] == True
    for amenity in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        out.loc[in_cryo & out[amenity].isna(), amenity] = 0

    # Pass 2: categoricals -> mode
    for name in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
        if not out[name].isna().any():
            continue
        modes = out[name].mode()
        fill_value = 'Unknown' if modes.empty else modes[0]
        out[name] = out[name].fillna(fill_value)

    # Pass 3: numericals -> median
    for name in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']:
        if not out[name].isna().any():
            continue
        out[name] = out[name].fillna(out[name].median())

    return out

# Impute each split from its own statistics, then confirm nothing is left missing
train, test = map(impute_remaining, (train, test))
print(f"Missing values after imputation: {train.isnull().sum().sum()} in train, {test.isnull().sum().sum()} in test")

In [None]:
def create_spending_features(df):
    """Add derived spending columns to a copy of ``df`` and return it.

    Reads the five amenity columns (``RoomService``, ``FoodCourt``,
    ``ShoppingMall``, ``Spa``, ``VRDeck``) plus ``Age`` and adds:

    * ``TotalSpent`` and a ``<amenity>_ratio`` share for each amenity
    * ``LuxurySpent``, ``BasicSpent``, ``LuxuryRatio``, ``SpentPerAge``
    * ``SpendingBin`` - 0 for no spending, 1 up to 500, 2 up to 2000, 3 above
    * ``<amenity>_spent`` 0/1 flags and ``NumSpendingCategories``
    * ``<column>_log`` (``log1p``) for the amenities and the three totals
    * ``Spa_VRDeck_RoomService`` and ``FoodCourt_RoomService`` sums

    Ratios divide by ``TotalSpent + 1`` (and ``Age + 1``) so zero totals do
    not cause a division by zero.
    """
    out = df.copy()
    amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Overall spend and the +1-smoothed denominator shared by all ratios
    total = out[amenities].sum(axis=1)
    out['TotalSpent'] = total
    smoothed_total = total + 1

    # Share of spending per amenity
    for amenity in amenities:
        out[f'{amenity}_ratio'] = out[amenity] / smoothed_total

    # Luxury (Spa, VRDeck, RoomService) versus basic (FoodCourt, ShoppingMall)
    out['LuxurySpent'] = out['Spa'] + out['VRDeck'] + out['RoomService']
    out['BasicSpent'] = out['FoodCourt'] + out['ShoppingMall']
    out['LuxuryRatio'] = out['LuxurySpent'] / smoothed_total

    # Spend relative to age
    out['SpentPerAge'] = total / (out['Age'] + 1)

    # Ordinal spend level: none / low / medium / high
    spend_level = pd.cut(
        total,
        bins=[-1, 0, 500, 2000, float('inf')],
        labels=[0, 1, 2, 3],
    )
    out['SpendingBin'] = spend_level.astype(int)

    # 0/1 flag per amenity: did the passenger spend anything there?
    flag_columns = [f'{amenity}_spent' for amenity in amenities]
    for amenity, flag in zip(amenities, flag_columns):
        out[flag] = (out[amenity] > 0).astype(int)

    # log1p of every raw and aggregated amount
    for source in amenities + ['TotalSpent', 'LuxurySpent', 'BasicSpent']:
        out[f'{source}_log'] = np.log1p(out[source])

    # How many distinct amenities were used
    out['NumSpendingCategories'] = out[flag_columns].sum(axis=1)

    # Key combinations from top kernels
    out['Spa_VRDeck_RoomService'] = out['Spa'] + out['VRDeck'] + out['RoomService']
    out['FoodCourt_RoomService'] = out['FoodCourt'] + out['RoomService']

    return out

# Derive the spending columns for both splits (train first, then test)
train, test = map(create_spending_features, (train, test))
print("Spending features created")

In [None]:
def create_interaction_features(df):
    """Add categorical interaction columns and age buckets to a copy of ``df``.

    Reads ``CryoSleep``, ``HomePlanet``, ``Destination``, ``Deck``, ``Side``,
    ``VIP`` and ``Age``.  Adds:

    * ``"<a>_<b>"`` string crosses: ``CryoSleep_HomePlanet``,
      ``CryoSleep_Destination``, ``Deck_Side``, ``HomePlanet_Destination``,
      ``AgeGroup_CryoSleep``, ``VIP_HomePlanet``
    * ``AgeGroup`` - categorical bucket of ``Age`` with edges
      0 / 12 / 17 / 25 / 40 / 60 and no upper limit
    * ``IsChild``, ``IsTeen``, ``IsYoungAdult``, ``IsAdult``, ``IsSenior``
      0/1 flags

    Args:
        df: Frame containing the columns listed above.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df = df.copy()

    def _cross(left, right):
        # Join two columns as "<left value>_<right value>" strings.
        return df[left].astype(str) + '_' + df[right].astype(str)

    # CryoSleep x HomePlanet (flagged as the strongest signal in the EDA notes)
    df['CryoSleep_HomePlanet'] = _cross('CryoSleep', 'HomePlanet')

    # CryoSleep x Destination
    df['CryoSleep_Destination'] = _cross('CryoSleep', 'Destination')

    # Deck x Side
    df['Deck_Side'] = _cross('Deck', 'Side')

    # HomePlanet x Destination
    df['HomePlanet_Destination'] = _cross('HomePlanet', 'Destination')

    # Age groups.  pd.cut intervals are right-closed and, by default, open on
    # the left of the first bin, so Age == 0 used to land in no bin and became
    # NaN ('nan' after astype(str)) even though IsChild is 1 for it.
    # include_lowest=True puts age 0 in 'Child'; the open-ended last edge keeps
    # 'Senior' consistent with IsSenior (Age > 60, no upper limit).
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 17, 25, 40, 60, np.inf],
        labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior'],
        include_lowest=True,
    )

    # Age x CryoSleep (must come after AgeGroup is created)
    df['AgeGroup_CryoSleep'] = _cross('AgeGroup', 'CryoSleep')

    # VIP x HomePlanet
    df['VIP_HomePlanet'] = _cross('VIP', 'HomePlanet')

    # Binary age flags.  Note IsAdult spans both the 'Adult' and 'MiddleAge'
    # AgeGroup buckets (25 < Age <= 60).
    age = df['Age']
    df['IsChild'] = (age <= 12).astype(int)
    df['IsTeen'] = ((age > 12) & (age <= 17)).astype(int)
    df['IsYoungAdult'] = ((age > 17) & (age <= 25)).astype(int)
    df['IsAdult'] = ((age > 25) & (age <= 60)).astype(int)
    df['IsSenior'] = (age > 60).astype(int)

    return df

# Build the crossed categoricals and age buckets for both splits
train, test = map(create_interaction_features, (train, test))
print("Interaction features created")

In [None]:
# Integer-encode the categorical columns.  Each encoder is fitted on the
# string form of train + test together so both splits share one mapping.
cat_cols_to_encode = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side',
    'CryoSleep_HomePlanet', 'CryoSleep_Destination', 'Deck_Side',
    'HomePlanet_Destination', 'AgeGroup', 'AgeGroup_CryoSleep', 'VIP_HomePlanet'
]

label_encoders = {}
for col in cat_cols_to_encode:
    combined = pd.concat([frame[col].astype(str) for frame in (train, test)])
    le = LabelEncoder().fit(combined)
    label_encoders[col] = le  # kept so the mapping can be inspected later
    train[col + '_enc'] = le.transform(train[col].astype(str))
    test[col + '_enc'] = le.transform(test[col].astype(str))

print(f"Encoded {len(cat_cols_to_encode)} categorical columns")

In [None]:
# Define features - EXCLUDING AnySpending to force model to learn from other signals
# The order of this list fixes the column order of X / X_test below and the
# labels paired with model.feature_importances_ later in the file.
feature_cols = [
    # Encoded categoricals
    'HomePlanet_enc', 'CryoSleep_enc', 'Destination_enc', 'VIP_enc', 'Deck_enc', 'Side_enc',
    # Interaction features (encoded)
    'CryoSleep_HomePlanet_enc', 'CryoSleep_Destination_enc', 'Deck_Side_enc',
    'HomePlanet_Destination_enc', 'AgeGroup_enc', 'AgeGroup_CryoSleep_enc', 'VIP_HomePlanet_enc',
    # Numerical
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    # Group features
    'Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo',
    # Spending features (NOT AnySpending)
    # NOTE(review): 'Spa_VRDeck_RoomService' is built from the same expression as
    # 'LuxurySpent' in create_spending_features, so the two columns are identical.
    'TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin',
    'NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService',
    # Spending ratios
    'RoomService_ratio', 'FoodCourt_ratio', 'ShoppingMall_ratio', 'Spa_ratio', 'VRDeck_ratio',
    # Binary spending indicators
    'RoomService_spent', 'FoodCourt_spent', 'ShoppingMall_spent', 'Spa_spent', 'VRDeck_spent',
    # Log spending
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log',
    'TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log',
    # Age features
    'IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior'
]

# Training matrix, integer target (Transported cast to int) and test matrix,
# all as NumPy arrays with columns in feature_cols order.
X = train[feature_cols].values
y = train['Transported'].astype(int).values
X_test = test[feature_cols].values

print(f"X shape: {X.shape}")
print(f"Features: {len(feature_cols)}")

In [None]:
# XGBoost with same hyperparameters as baseline
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.067,
    'n_estimators': 850,
    'reg_lambda': 3.06,
    'reg_alpha': 4.58,
    'colsample_bytree': 0.92,
    'subsample': 0.95,
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'logloss'
}

# 5-fold stratified cross-validation with a fixed shuffle seed
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Accumulators: out-of-fold probabilities, fold-averaged test probabilities,
# and the per-fold validation accuracies
fold_scores = []
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # A fresh model per fold; the validation fold is passed as eval_set
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Positive-class probabilities for the held-out rows and for the test set
    val_pred_proba = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred_proba
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds

    # Fold accuracy at the 0.5 threshold
    val_pred = (val_pred_proba >= 0.5).astype(int)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.5f}")

print(f"\nMean CV Accuracy: {np.mean(fold_scores):.5f} (+/- {np.std(fold_scores):.5f})")

In [None]:
# Reference score this experiment is compared against
baseline_acc = 0.80674

# Accuracy of the pooled out-of-fold predictions at the 0.5 threshold
oof_binary = (oof_preds >= 0.5).astype(int)
overall_acc = accuracy_score(y_true=y, y_pred=oof_binary)
print(f"Overall OOF Accuracy: {overall_acc:.5f}")

# Difference to the baseline, printed as a raw delta and scaled by 100
improvement = overall_acc - baseline_acc
print(f"Improvement over baseline: {improvement:+.5f} ({improvement*100:+.2f}%)")

In [None]:
# Feature importance - check if AnySpending dominance is reduced.
# `model` is whatever the CV loop assigned last, i.e. the final fold's model.
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 20 features:")
print(importance_df.head(20))

# How concentrated is the importance at the top of the ranking?
ranked_importance = importance_df['importance']
print(f"\nTop feature importance: {ranked_importance.iloc[0]:.4f}")
print(f"Top 5 features account for: {ranked_importance.head(5).sum():.4f}")

In [None]:
# Turn the fold-averaged test probabilities into booleans at the 0.5 threshold
test_binary = test_preds >= 0.5

# One row per test passenger: id plus predicted Transported flag
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_binary,
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(submission.head())