# Experiment 010: Target Encoding + New Features

A fundamentally different approach to break through the CV ≈ 0.817 plateau:
1. Target encoding for categorical features (captures category-target relationships)
2. Cabin region features (spatial patterns)
3. Family size feature (family correlation)

Goal: reach CV > 0.82089, the threshold needed to beat exp_003's leaderboard (LB) score of 0.8045.

In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

# Make sure category_encoders is importable, installing it on demand.
try:
    from category_encoders import TargetEncoder
    print("category_encoders available")
except ImportError:
    print("Installing category_encoders...")
    import subprocess
    import sys
    # Run pip through the interpreter backing this kernel so the package is
    # installed into the environment the notebook actually imports from, and
    # let a failed install raise here instead of surfacing as a second,
    # less informative ImportError on the line below.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'category_encoders', '-q'],
        check=True,
    )
    from category_encoders import TargetEncoder
    print("category_encoders installed")

# Read the train and test CSV files into DataFrames and report their shapes.
train_path = '/home/data/train.csv'
test_path = '/home/data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print(f"Train: {train.shape}, Test: {test.shape}")

category_encoders available
Train: (8693, 14), Test: (4277, 13)


In [2]:
# Feature engineering with NEW features
def feature_engineering(df):
    """Split the composite PassengerId, Cabin and Name columns into features.

    Works on a copy; the input frame is left untouched.

    Added columns:
        Group, PassengerNum: integer parts of PassengerId on either side of '_'.
        Deck, CabinNum, Side: the three '/'-separated parts of Cabin
            (CabinNum cast to int); NaN wherever Cabin is missing.
        Surname: last whitespace-separated token of Name, or 'Unknown'
            when Name is missing.
    """
    def _id_part(position):
        # Integer value of the requested '_'-separated piece of a PassengerId.
        return lambda passenger_id: int(passenger_id.split('_')[position])

    def _cabin_part(position, cast=None):
        # Requested '/'-separated piece of a Cabin string, NaN for missing cabins.
        def extract(cabin):
            if pd.isna(cabin):
                return np.nan
            piece = cabin.split('/')[position]
            return piece if cast is None else cast(piece)
        return extract

    def _surname(name):
        return 'Unknown' if pd.isna(name) else name.split()[-1]

    out = df.copy()

    # Identifier-derived features
    out['Group'] = out['PassengerId'].apply(_id_part(0))
    out['PassengerNum'] = out['PassengerId'].apply(_id_part(1))

    # Cabin-derived features
    out['Deck'] = out['Cabin'].apply(_cabin_part(0))
    out['CabinNum'] = out['Cabin'].apply(_cabin_part(1, cast=int))
    out['Side'] = out['Cabin'].apply(_cabin_part(2))

    # Surname, used later for the family-size feature
    out['Surname'] = out['Name'].apply(_surname)

    return out

# Derive the ID / cabin / surname columns on both frames.
train = feature_engineering(train)
test = feature_engineering(test)

# GroupSize: number of passengers sharing a Group id, counted over train and
# test together; Solo flags groups of one.
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()
for df in [train, test]:
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = (df['GroupSize'] == 1).astype(int)

# NEW: Family size from surname, counted over train and test together.
# feature_engineering labels every missing name 'Unknown'; counting that
# sentinel would give all such passengers one shared, very large "family".
# It is left out of the lookup so those rows map to NaN (size unknown).
all_surnames = pd.concat([train['Surname'], test['Surname']])
surname_counts = all_surnames[all_surnames != 'Unknown'].value_counts().to_dict()
for df in [train, test]:
    df['FamilySize'] = df['Surname'].map(surname_counts)

print("Basic features done")

Basic features done


In [3]:
# Group-based imputation
def group_based_imputation(train_df, test_df):
    """Fill missing HomePlanet / Deck / Side values from the passenger's group.

    For each of the three columns, the most frequent non-missing value per
    'Group' is computed over train and test stacked together, and used to fill
    the missing entries of that column. Groups with no known value stay NaN.

    Both input frames are modified in place and also returned as a tuple
    ``(train_df, test_df)``.
    """
    def _most_common(values):
        # First mode of the group, or NaN when the group has no known value.
        modes = values.mode()
        return modes[0] if len(modes) > 0 else np.nan

    pooled = pd.concat([train_df, test_df], ignore_index=True)
    for col in ('HomePlanet', 'Deck', 'Side'):
        lookup = pooled.groupby('Group')[col].apply(_most_common).to_dict()
        for frame in (train_df, test_df):
            missing = frame[col].isna()
            frame.loc[missing, col] = frame.loc[missing, 'Group'].map(lookup)
    return train_df, test_df

# Fill missing HomePlanet/Deck/Side from each passenger's group, using group
# modes computed over train and test together.
train, test = group_based_imputation(train, test)

# Remaining imputation
def impute_remaining(df):
    """Fill the missing values left in ``df`` and return the result as a copy.

    Steps, in order:
      1. Missing spending amounts of passengers with CryoSleep == True become 0.
      2. Categorical columns are filled with their most frequent value.
      3. Numeric columns are filled with their median.

    All statistics are computed from ``df`` itself.
    """
    spending = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
    numeric = ['Age'] + spending + ['CabinNum']

    out = df.copy()

    # CryoSleep is not modified until step 2, so the flag can be computed once.
    asleep = out['CryoSleep'] == True
    for col in spending:
        out.loc[asleep & out[col].isna(), col] = 0

    for col in categorical:
        column = out[col]
        if column.isna().any():
            out[col] = column.fillna(column.mode()[0])

    for col in numeric:
        column = out[col]
        if column.isna().any():
            out[col] = column.fillna(column.median())

    return out

# Fill whatever is still missing; each frame is imputed from its own statistics.
train, test = [impute_remaining(frame) for frame in (train, test)]
print("Imputation done")

Imputation done


In [4]:
# Create features including NEW cabin region features
def create_features(df):
    """Add spending, interaction, age and cabin-region features to a copy of ``df``.

    Reads the five spending columns plus 'Age', 'CryoSleep', 'HomePlanet',
    'Destination', 'Deck', 'Side', 'VIP' and 'CabinNum'. The input frame is
    not modified.

    Returns:
        A new DataFrame containing the original columns and the derived ones.
    """
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Spending features
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    for col in spending_cols:
        # +1 in the denominator keeps the ratio defined for zero spenders.
        df[f'{col}_ratio'] = df[col] / (df['TotalSpent'] + 1)
        df[f'{col}_spent'] = (df[col] > 0).astype(int)
        df[f'{col}_log'] = np.log1p(df[col])

    df['LuxurySpent'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['BasicSpent'] = df['FoodCourt'] + df['ShoppingMall']
    df['LuxuryRatio'] = df['LuxurySpent'] / (df['TotalSpent'] + 1)
    df['SpentPerAge'] = df['TotalSpent'] / (df['Age'] + 1)
    # Bins: 0 = nothing spent, 1 = (0, 500], 2 = (500, 2000], 3 = above 2000.
    df['SpendingBin'] = pd.cut(df['TotalSpent'], bins=[-1, 0, 500, 2000, float('inf')],
                               labels=[0, 1, 2, 3]).astype(int)
    # Number of spending categories with a non-zero amount.
    df['NumSpendingCategories'] = df[[f'{col}_spent' for col in spending_cols]].sum(axis=1)
    df['TotalSpent_log'] = np.log1p(df['TotalSpent'])
    df['LuxurySpent_log'] = np.log1p(df['LuxurySpent'])
    df['BasicSpent_log'] = np.log1p(df['BasicSpent'])
    # Same sum as LuxurySpent; kept as its own column because it is created
    # under this name in the original feature set.
    df['Spa_VRDeck_RoomService'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['FoodCourt_RoomService'] = df['FoodCourt'] + df['RoomService']

    # Interaction features (string concatenations of two categoricals)
    df['CryoSleep_HomePlanet'] = df['CryoSleep'].astype(str) + '_' + df['HomePlanet'].astype(str)
    df['CryoSleep_Destination'] = df['CryoSleep'].astype(str) + '_' + df['Destination'].astype(str)
    df['Deck_Side'] = df['Deck'].astype(str) + '_' + df['Side'].astype(str)
    df['HomePlanet_Destination'] = df['HomePlanet'].astype(str) + '_' + df['Destination'].astype(str)
    # include_lowest=True closes the first bin on the left so Age == 0 is
    # labelled 'Child' (consistent with IsChild below) rather than falling
    # outside every bin and becoming the string 'nan'.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 17, 25, 40, 60, 100],
                            labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior'],
                            include_lowest=True).astype(str)
    df['AgeGroup_CryoSleep'] = df['AgeGroup'] + '_' + df['CryoSleep'].astype(str)
    df['VIP_HomePlanet'] = df['VIP'].astype(str) + '_' + df['HomePlanet'].astype(str)

    # Age indicator features
    df['IsChild'] = (df['Age'] <= 12).astype(int)
    df['IsTeen'] = ((df['Age'] > 12) & (df['Age'] <= 17)).astype(int)
    df['IsYoungAdult'] = ((df['Age'] > 17) & (df['Age'] <= 25)).astype(int)
    df['IsAdult'] = ((df['Age'] > 25) & (df['Age'] <= 60)).astype(int)
    df['IsSenior'] = (df['Age'] > 60).astype(int)

    # NEW: Cabin region features (from top kernel): one-hot bands of 300 cabin
    # numbers, with the first and last bands open-ended.
    df['Cabin_region1'] = (df['CabinNum'] < 300).astype(int)
    df['Cabin_region2'] = ((df['CabinNum'] >= 300) & (df['CabinNum'] < 600)).astype(int)
    df['Cabin_region3'] = ((df['CabinNum'] >= 600) & (df['CabinNum'] < 900)).astype(int)
    df['Cabin_region4'] = ((df['CabinNum'] >= 900) & (df['CabinNum'] < 1200)).astype(int)
    df['Cabin_region5'] = ((df['CabinNum'] >= 1200) & (df['CabinNum'] < 1500)).astype(int)
    df['Cabin_region6'] = ((df['CabinNum'] >= 1500) & (df['CabinNum'] < 1800)).astype(int)
    df['Cabin_region7'] = (df['CabinNum'] >= 1800).astype(int)

    return df

# Build the derived feature columns on both frames.
train, test = [create_features(frame) for frame in (train, test)]
print("Features created")

Features created


In [5]:
# Categorical columns that will be target-encoded inside the CV loop:
# the raw categoricals first, then the interaction columns.
cat_cols_for_te = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side',
    'CryoSleep_HomePlanet', 'CryoSleep_Destination', 'Deck_Side',
    'HomePlanet_Destination', 'AgeGroup', 'AgeGroup_CryoSleep', 'VIP_HomePlanet',
]

# The encoder is given string categories, so cast every such column on both frames.
for df in (train, test):
    for col in cat_cols_for_te:
        df[col] = df[col].astype(str)

# Numerical features (passed to the model unchanged). The list is assembled
# piecewise but keeps the exact column order of the original flat list.
_spend = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_features = (
    ['Age'] + _spend
    + ['Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo', 'FamilySize']
    + ['TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin']
    + ['NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService']
    + [f'{name}_ratio' for name in _spend]
    + [f'{name}_spent' for name in _spend]
    + [f'{name}_log' for name in _spend]
    + ['TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log']
    + ['IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior']
    + [f'Cabin_region{i}' for i in range(1, 8)]
)

# Binary target as an integer NumPy array.
y = train['Transported'].astype(int).values
print(f"Categorical columns for target encoding: {len(cat_cols_for_te)}")
print(f"Numerical features: {len(num_features)}")

Categorical columns for target encoding: 13
Numerical features: 51


In [None]:
# CatBoost on target-encoded categoricals. The encoder is re-fit inside every
# fold on that fold's training rows only, so validation and test rows are
# encoded without seeing their own targets.
# NOTE(review): the validation fold is also the early-stopping eval_set, so the
# reported fold accuracy may be slightly optimistic. Confirm this is acceptable.
cat_params = dict(
    depth=8,
    learning_rate=0.051,
    iterations=755,
    l2_leaf_reg=3.52,
    random_seed=42,
    verbose=False,
)

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train))    # out-of-fold P(Transported) per training row
test_preds = np.zeros(len(test))    # fold-averaged P(Transported) per test row
fold_scores = []


def _design_matrix(encoder, frame):
    """Target-encode the categorical columns of ``frame`` and append its numeric columns."""
    encoded = encoder.transform(frame[cat_cols_for_te]).reset_index(drop=True)
    numeric = frame[num_features].reset_index(drop=True)
    return pd.concat([encoded, numeric], axis=1)


# The test frame is the same for every fold, so its index is reset once up front.
X_test_fold = test.reset_index(drop=True)

for fold, (train_idx, val_idx) in enumerate(skf.split(train, y)):
    # Rows and labels for this fold
    X_train_fold = train.iloc[train_idx].reset_index(drop=True)
    X_val_fold = train.iloc[val_idx].reset_index(drop=True)
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Fit the encoder on the training part of the fold only
    te = TargetEncoder(cols=cat_cols_for_te, smoothing=1.0)
    te.fit(X_train_fold[cat_cols_for_te], y_train_fold)

    # Encoded categoricals followed by the numeric features
    X_train_final = _design_matrix(te, X_train_fold)
    X_val_final = _design_matrix(te, X_val_fold)
    X_test_final = _design_matrix(te, X_test_fold)

    # Train with early stopping monitored on the validation fold
    model = CatBoostClassifier(**cat_params)
    model.fit(
        X_train_final, y_train_fold,
        eval_set=(X_val_final, y_val_fold),
        early_stopping_rounds=100,
    )

    # Store out-of-fold probabilities and accumulate the test average
    val_proba = model.predict_proba(X_val_final)[:, 1]
    oof_preds[val_idx] = val_proba
    test_preds += model.predict_proba(X_test_final)[:, 1] / n_folds

    fold_acc = accuracy_score(y_val_fold, (val_proba >= 0.5).astype(int))
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: {fold_acc:.5f}")

final_acc = accuracy_score(y, (oof_preds >= 0.5).astype(int))
print(f"\nTarget Encoding CatBoost CV: {final_acc:.5f} (+/- {np.std(fold_scores):.5f})")

In [None]:
# Compare this run's CV accuracy with the earlier experiments' reference scores.
cv_exp_008 = 0.81698   # exp_008 (multi-seed)
cv_exp_003 = 0.81951   # exp_003 (best LB)
cv_needed = 0.82089    # CV threshold quoted for beating LB 0.8045

print("\n=== COMPARISON ===")
print(f"Target Encoding CV: {final_acc:.5f}")
print(f"exp_008 (multi-seed): 0.81698")
print(f"exp_003 (best LB):    0.81951")
print(f"\nImprovement over exp_008: {final_acc - cv_exp_008:+.5f}")
print(f"Gap to exp_003:          {final_acc - cv_exp_003:+.5f}")

# Report whether the CV score clears the threshold for a leaderboard gain
print(f"\nTo beat LB 0.8045, need CV > 0.82089")
verdict = (
    ">>> SUBMIT THIS! CV exceeds threshold."
    if final_acc > cv_needed
    else f">>> CV is {cv_needed - final_acc:.5f} below threshold."
)
print(verdict)

In [None]:
# Create submission: threshold the fold-averaged test probabilities at 0.5 and
# write one boolean prediction per PassengerId.
test_binary = (test_preds >= 0.5).astype(bool)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_binary
})

# Create the output directory first so a missing folder cannot fail the write
# after the whole training run has completed.
submission_path = Path('/home/submission/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved")
print(f"CV: {final_acc:.5f}")
# Sanity check: the predicted positive rate should be close to the training rate.
print(f"Predicted transported rate: {test_binary.mean():.4f}")
print(f"Training transported rate: {y.mean():.4f}")
print(submission.head())