# Experiment 007: Weighted Ensemble Favoring CatBoost

Following evaluator's recommendation:
- DO NOT submit exp_005 (stacking) - predicted to underperform
- Try weighted ensemble: 0.6*CatBoost + 0.2*XGB + 0.2*LGB
- CatBoost is clearly the best single model
- Check prediction rate - target â‰¤ 51.7% (exp_003's rate with best LB)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (8693, 14), Test: (4277, 13)


In [2]:
# Feature engineering (same as previous experiments)
def feature_engineering(df):
    df = df.copy()
    df['Group'] = df['PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df['PassengerNum'] = df['PassengerId'].apply(lambda x: int(x.split('_')[1]))
    df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else np.nan)
    df['CabinNum'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if pd.notna(x) else np.nan)
    df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else np.nan)
    return df

train = feature_engineering(train)
test = feature_engineering(test)

# GroupSize
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()
for df in [train, test]:
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = (df['GroupSize'] == 1).astype(int)
print("Basic features done")

Basic features done


In [3]:
# Group-based imputation
def group_based_imputation(train_df, test_df):
    combined = pd.concat([train_df, test_df], ignore_index=True)
    for col in ['HomePlanet', 'Deck', 'Side']:
        group_mode = combined.groupby('Group')[col].apply(
            lambda x: x.mode()[0] if len(x.mode()) > 0 else np.nan).to_dict()
        for df in [train_df, test_df]:
            mask = df[col].isna()
            df.loc[mask, col] = df.loc[mask, 'Group'].map(group_mode)
    return train_df, test_df

train, test = group_based_imputation(train, test)

# Remaining imputation
def impute_remaining(df):
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in spending_cols:
        mask = (df['CryoSleep'] == True) & (df[col].isna())
        df.loc[mask, col] = 0
    for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mode()[0])
    for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())
    return df

train = impute_remaining(train)
test = impute_remaining(test)
print("Imputation done")

Imputation done


In [4]:
# Create features
def create_features(df):
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    for col in spending_cols:
        df[f'{col}_ratio'] = df[col] / (df['TotalSpent'] + 1)
        df[f'{col}_spent'] = (df[col] > 0).astype(int)
        df[f'{col}_log'] = np.log1p(df[col])
    
    df['LuxurySpent'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['BasicSpent'] = df['FoodCourt'] + df['ShoppingMall']
    df['LuxuryRatio'] = df['LuxurySpent'] / (df['TotalSpent'] + 1)
    df['SpentPerAge'] = df['TotalSpent'] / (df['Age'] + 1)
    df['SpendingBin'] = pd.cut(df['TotalSpent'], bins=[-1, 0, 500, 2000, float('inf')], labels=[0, 1, 2, 3]).astype(int)
    df['NumSpendingCategories'] = sum(df[f'{col}_spent'] for col in spending_cols)
    df['TotalSpent_log'] = np.log1p(df['TotalSpent'])
    df['LuxurySpent_log'] = np.log1p(df['LuxurySpent'])
    df['BasicSpent_log'] = np.log1p(df['BasicSpent'])
    df['Spa_VRDeck_RoomService'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['FoodCourt_RoomService'] = df['FoodCourt'] + df['RoomService']
    
    df['CryoSleep_HomePlanet'] = df['CryoSleep'].astype(str) + '_' + df['HomePlanet'].astype(str)
    df['CryoSleep_Destination'] = df['CryoSleep'].astype(str) + '_' + df['Destination'].astype(str)
    df['Deck_Side'] = df['Deck'].astype(str) + '_' + df['Side'].astype(str)
    df['HomePlanet_Destination'] = df['HomePlanet'].astype(str) + '_' + df['Destination'].astype(str)
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 17, 25, 40, 60, 100], 
                            labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior']).astype(str)
    df['AgeGroup_CryoSleep'] = df['AgeGroup'] + '_' + df['CryoSleep'].astype(str)
    df['VIP_HomePlanet'] = df['VIP'].astype(str) + '_' + df['HomePlanet'].astype(str)
    
    df['IsChild'] = (df['Age'] <= 12).astype(int)
    df['IsTeen'] = ((df['Age'] > 12) & (df['Age'] <= 17)).astype(int)
    df['IsYoungAdult'] = ((df['Age'] > 17) & (df['Age'] <= 25)).astype(int)
    df['IsAdult'] = ((df['Age'] > 25) & (df['Age'] <= 60)).astype(int)
    df['IsSenior'] = (df['Age'] > 60).astype(int)
    return df

train = create_features(train)
test = create_features(test)
print("Features created")

Features created


In [5]:
# Encode categorical features
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side',
                'CryoSleep_HomePlanet', 'CryoSleep_Destination', 'Deck_Side',
                'HomePlanet_Destination', 'AgeGroup', 'AgeGroup_CryoSleep', 'VIP_HomePlanet']

for col in cat_features:
    le = LabelEncoder()
    combined = pd.concat([train[col].astype(str), test[col].astype(str)])
    le.fit(combined)
    train[col + '_enc'] = le.transform(train[col].astype(str))
    test[col + '_enc'] = le.transform(test[col].astype(str))

# Define feature columns
num_features = [
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo',
    'TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin',
    'NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService',
    'RoomService_ratio', 'FoodCourt_ratio', 'ShoppingMall_ratio', 'Spa_ratio', 'VRDeck_ratio',
    'RoomService_spent', 'FoodCourt_spent', 'ShoppingMall_spent', 'Spa_spent', 'VRDeck_spent',
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log',
    'TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log',
    'IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior'
]

feature_cols = [col + '_enc' for col in cat_features] + num_features
X = train[feature_cols].values
y = train['Transported'].astype(int).values
X_test = test[feature_cols].values

print(f"X shape: {X.shape}, Features: {len(feature_cols)}")

X shape: (8693, 56), Features: 56


In [6]:
# Model hyperparameters - use best params from exp_003 for CatBoost
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.067,
    'n_estimators': 850,
    'reg_lambda': 3.06,
    'reg_alpha': 4.58,
    'colsample_bytree': 0.92,
    'subsample': 0.95,
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'logloss'
}

lgb_params = {
    'num_leaves': 330,  # Back to original params for better performance
    'learning_rate': 0.087,
    'n_estimators': 739,
    'feature_fraction': 0.66,
    'bagging_fraction': 0.87,
    'bagging_freq': 1,
    'lambda_l1': 6.18,
    'lambda_l2': 0.01,
    'random_state': 42,
    'verbose': -1
}

# Best CatBoost params from exp_003 (Optuna tuned)
cat_params = {
    'depth': 8,
    'learning_rate': 0.051,
    'iterations': 755,
    'l2_leaf_reg': 3.52,
    'random_seed': 42,
    'verbose': False
}

print("Model parameters defined")

Model parameters defined


In [7]:
# Train base models with 5-fold CV
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# OOF predictions
oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

# Test predictions
test_xgb = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))

# Fold scores
xgb_scores, lgb_scores, cat_scores = [], [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # XGBoost
    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    oof_xgb[val_idx] = model_xgb.predict_proba(X_val)[:, 1]
    test_xgb += model_xgb.predict_proba(X_test)[:, 1] / n_folds
    xgb_scores.append(accuracy_score(y_val, (oof_xgb[val_idx] >= 0.5).astype(int)))
    
    # LightGBM
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    oof_lgb[val_idx] = model_lgb.predict_proba(X_val)[:, 1]
    test_lgb += model_lgb.predict_proba(X_test)[:, 1] / n_folds
    lgb_scores.append(accuracy_score(y_val, (oof_lgb[val_idx] >= 0.5).astype(int)))
    
    # CatBoost
    model_cat = CatBoostClassifier(**cat_params)
    model_cat.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    oof_cat[val_idx] = model_cat.predict_proba(X_val)[:, 1]
    test_cat += model_cat.predict_proba(X_test)[:, 1] / n_folds
    cat_scores.append(accuracy_score(y_val, (oof_cat[val_idx] >= 0.5).astype(int)))
    
    print(f"Fold {fold+1}: XGB={xgb_scores[-1]:.5f}, LGB={lgb_scores[-1]:.5f}, CAT={cat_scores[-1]:.5f}")

print(f"\nXGBoost Mean CV: {np.mean(xgb_scores):.5f} (+/- {np.std(xgb_scores):.5f})")
print(f"LightGBM Mean CV: {np.mean(lgb_scores):.5f} (+/- {np.std(lgb_scores):.5f})")
print(f"CatBoost Mean CV: {np.mean(cat_scores):.5f} (+/- {np.std(cat_scores):.5f})")

Fold 1: XGB=0.80851, LGB=0.81196, CAT=0.82001


Fold 2: XGB=0.80449, LGB=0.80736, CAT=0.81196


Fold 3: XGB=0.81024, LGB=0.80506, CAT=0.82231


Fold 4: XGB=0.82106, LGB=0.81530, CAT=0.81761


Fold 5: XGB=0.80207, LGB=0.79747, CAT=0.80898

XGBoost Mean CV: 0.80927 (+/- 0.00656)
LightGBM Mean CV: 0.80743 (+/- 0.00612)
CatBoost Mean CV: 0.81617 (+/- 0.00498)


In [8]:
# Weighted ensemble: 0.6*CatBoost + 0.2*XGB + 0.2*LGB
weights = [0.2, 0.2, 0.6]  # XGB, LGB, CatBoost

oof_weighted = weights[0]*oof_xgb + weights[1]*oof_lgb + weights[2]*oof_cat
test_weighted = weights[0]*test_xgb + weights[1]*test_lgb + weights[2]*test_cat

# Evaluate weighted ensemble
weighted_acc = accuracy_score(y, (oof_weighted >= 0.5).astype(int))

# Compare all approaches
xgb_acc = accuracy_score(y, (oof_xgb >= 0.5).astype(int))
lgb_acc = accuracy_score(y, (oof_lgb >= 0.5).astype(int))
cat_acc = accuracy_score(y, (oof_cat >= 0.5).astype(int))
simple_avg = (oof_xgb + oof_lgb + oof_cat) / 3
simple_avg_acc = accuracy_score(y, (simple_avg >= 0.5).astype(int))

print("=== COMPARISON ===")
print(f"XGBoost OOF:           {xgb_acc:.5f}")
print(f"LightGBM OOF:          {lgb_acc:.5f}")
print(f"CatBoost OOF:          {cat_acc:.5f}")
print(f"Simple Average:        {simple_avg_acc:.5f}")
print(f"Weighted (0.2/0.2/0.6): {weighted_acc:.5f}")
print(f"\nexp_003 (best LB):     0.81951")

=== COMPARISON ===
XGBoost OOF:           0.80927
LightGBM OOF:          0.80743
CatBoost OOF:          0.81617
Simple Average:        0.81456
Weighted (0.2/0.2/0.6): 0.81709

exp_003 (best LB):     0.81951


In [9]:
# Check prediction rates - CRITICAL for LB performance
print("\n=== PREDICTION RATES ===")
print(f"Training rate:          {y.mean():.4f}")
print(f"XGBoost pred rate:      {(test_xgb >= 0.5).mean():.4f}")
print(f"LightGBM pred rate:     {(test_lgb >= 0.5).mean():.4f}")
print(f"CatBoost pred rate:     {(test_cat >= 0.5).mean():.4f}")
print(f"Simple avg pred rate:   {((test_xgb + test_lgb + test_cat)/3 >= 0.5).mean():.4f}")
print(f"Weighted pred rate:     {(test_weighted >= 0.5).mean():.4f}")
print(f"\nexp_003 pred rate:      0.517 (best LB)")
print(f"exp_004 pred rate:      0.538 (worse LB)")


=== PREDICTION RATES ===
Training rate:          0.5036
XGBoost pred rate:      0.5018
LightGBM pred rate:     0.5027
CatBoost pred rate:     0.5198
Simple avg pred rate:   0.5050
Weighted pred rate:     0.5090

exp_003 pred rate:      0.517 (best LB)
exp_004 pred rate:      0.538 (worse LB)


In [10]:
# Create submission with threshold 0.5
test_binary = (test_weighted >= 0.5).astype(bool)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_binary
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved")
print(f"Weighted ensemble CV: {weighted_acc:.5f}")
print(f"Predicted transported rate: {test_binary.mean():.4f}")
print(f"Training transported rate: {y.mean():.4f}")
print(submission.head())

Submission saved
Weighted ensemble CV: 0.81709
Predicted transported rate: 0.5090
Training transported rate: 0.5036
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
