# Experiment 005: Threshold Tuning + CatBoost Native Categorical Handling

Following strategy priorities:
1. Threshold tuning (quick win, hasn't been tried)
2. CatBoost native categorical handling (research suggests it outperforms label encoding)

Goal: Improve LB generalization, not just CV. Current CV-LB gap is 1.5%.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (8693, 14), Test: (4277, 13)


In [2]:
# Feature engineering (same as previous experiments)
def feature_engineering(df):
    df = df.copy()
    df['Group'] = df['PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df['PassengerNum'] = df['PassengerId'].apply(lambda x: int(x.split('_')[1]))
    df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else 'Unknown')
    df['CabinNum'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if pd.notna(x) else np.nan)
    df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else 'Unknown')
    return df

train = feature_engineering(train)
test = feature_engineering(test)

# GroupSize
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()
for df in [train, test]:
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = (df['GroupSize'] == 1).astype(int)
print("Basic features done")

Basic features done


In [3]:
# Group-based imputation
def group_based_imputation(train_df, test_df):
    combined = pd.concat([train_df, test_df], ignore_index=True)
    
    for col in ['HomePlanet', 'Deck', 'Side']:
        group_mode = combined.groupby('Group')[col].apply(
            lambda x: x.mode()[0] if len(x.mode()) > 0 and x.mode()[0] != 'Unknown' else 'Unknown'
        ).to_dict()
        
        for df in [train_df, test_df]:
            mask = (df[col].isna()) | (df[col] == 'Unknown')
            df.loc[mask, col] = df.loc[mask, 'Group'].map(group_mode)
    
    return train_df, test_df

train, test = group_based_imputation(train, test)

# Remaining imputation
def impute_remaining(df):
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
    # CryoSleep passengers have 0 spending
    for col in spending_cols:
        mask = (df['CryoSleep'] == True) & (df[col].isna())
        df.loc[mask, col] = 0
    
    # Categorical - fill with mode
    for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
        if df[col].isna().any() or (df[col] == 'Unknown').any():
            mode_val = df[col].replace('Unknown', np.nan).mode()[0]
            df[col] = df[col].replace('Unknown', mode_val)
            df[col] = df[col].fillna(mode_val)
    
    # Numerical - fill with median
    for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())
    
    return df

train = impute_remaining(train)
test = impute_remaining(test)
print("Imputation done")

Imputation done


In [4]:
# Create spending and interaction features
def create_features(df):
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
    # Spending features
    df['TotalSpent'] = df[spending_cols].sum(axis=1)
    for col in spending_cols:
        df[f'{col}_ratio'] = df[col] / (df['TotalSpent'] + 1)
        df[f'{col}_spent'] = (df[col] > 0).astype(int)
        df[f'{col}_log'] = np.log1p(df[col])
    
    df['LuxurySpent'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['BasicSpent'] = df['FoodCourt'] + df['ShoppingMall']
    df['LuxuryRatio'] = df['LuxurySpent'] / (df['TotalSpent'] + 1)
    df['SpentPerAge'] = df['TotalSpent'] / (df['Age'] + 1)
    df['SpendingBin'] = pd.cut(df['TotalSpent'], bins=[-1, 0, 500, 2000, float('inf')], labels=[0, 1, 2, 3]).astype(int)
    df['NumSpendingCategories'] = sum(df[f'{col}_spent'] for col in spending_cols)
    df['TotalSpent_log'] = np.log1p(df['TotalSpent'])
    df['LuxurySpent_log'] = np.log1p(df['LuxurySpent'])
    df['BasicSpent_log'] = np.log1p(df['BasicSpent'])
    df['Spa_VRDeck_RoomService'] = df['Spa'] + df['VRDeck'] + df['RoomService']
    df['FoodCourt_RoomService'] = df['FoodCourt'] + df['RoomService']
    
    # Interaction features (keep as strings for CatBoost native handling)
    df['CryoSleep_HomePlanet'] = df['CryoSleep'].astype(str) + '_' + df['HomePlanet'].astype(str)
    df['CryoSleep_Destination'] = df['CryoSleep'].astype(str) + '_' + df['Destination'].astype(str)
    df['Deck_Side'] = df['Deck'].astype(str) + '_' + df['Side'].astype(str)
    df['HomePlanet_Destination'] = df['HomePlanet'].astype(str) + '_' + df['Destination'].astype(str)
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 17, 25, 40, 60, 100], 
                            labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior']).astype(str)
    df['AgeGroup_CryoSleep'] = df['AgeGroup'] + '_' + df['CryoSleep'].astype(str)
    df['VIP_HomePlanet'] = df['VIP'].astype(str) + '_' + df['HomePlanet'].astype(str)
    
    # Age features
    df['IsChild'] = (df['Age'] <= 12).astype(int)
    df['IsTeen'] = ((df['Age'] > 12) & (df['Age'] <= 17)).astype(int)
    df['IsYoungAdult'] = ((df['Age'] > 17) & (df['Age'] <= 25)).astype(int)
    df['IsAdult'] = ((df['Age'] > 25) & (df['Age'] <= 60)).astype(int)
    df['IsSenior'] = (df['Age'] > 60).astype(int)
    
    return df

train = create_features(train)
test = create_features(test)
print("All features created")

All features created


In [5]:
# Define features - using native categorical handling for CatBoost
# Categorical features (keep as strings)
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side',
                'CryoSleep_HomePlanet', 'CryoSleep_Destination', 'Deck_Side',
                'HomePlanet_Destination', 'AgeGroup', 'AgeGroup_CryoSleep', 'VIP_HomePlanet']

# Numerical features
num_features = [
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo',
    'TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin',
    'NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService',
    'RoomService_ratio', 'FoodCourt_ratio', 'ShoppingMall_ratio', 'Spa_ratio', 'VRDeck_ratio',
    'RoomService_spent', 'FoodCourt_spent', 'ShoppingMall_spent', 'Spa_spent', 'VRDeck_spent',
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log',
    'TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log',
    'IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior'
]

feature_cols = cat_features + num_features
cat_indices = list(range(len(cat_features)))  # First N columns are categorical

# Convert categorical columns to string type for CatBoost
for col in cat_features:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)

X = train[feature_cols]
y = train['Transported'].astype(int).values
X_test = test[feature_cols]

print(f"X shape: {X.shape}, Features: {len(feature_cols)}")
print(f"Categorical features: {len(cat_features)}, Numerical: {len(num_features)}")

X shape: (8693, 56), Features: 56
Categorical features: 13, Numerical: 43


In [6]:
# Train CatBoost with native categorical handling
cat_params = {
    'depth': 8,
    'learning_rate': 0.051,
    'iterations': 755,
    'l2_leaf_reg': 3.52,
    'random_seed': 42,
    'verbose': False,
    'cat_features': cat_indices  # Native categorical handling!
}

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    model = CatBoostClassifier(**cat_params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds
    
    fold_acc = accuracy_score(y_val, (oof_preds[val_idx] >= 0.5).astype(int))
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1}: {fold_acc:.5f}")

baseline_acc = accuracy_score(y, (oof_preds >= 0.5).astype(int))
print(f"\nCatBoost Native Cat OOF Accuracy: {baseline_acc:.5f} (+/- {np.std(fold_scores):.5f})")

Fold 1: 0.81944


Fold 2: 0.80851


Fold 3: 0.81484


Fold 4: 0.82911


Fold 5: 0.80898

CatBoost Native Cat OOF Accuracy: 0.81617 (+/- 0.00762)


In [7]:
# THRESHOLD TUNING - find optimal threshold
print("\n=== THRESHOLD TUNING ===")
thresholds = np.arange(0.40, 0.60, 0.01)
best_threshold = 0.5
best_acc = 0

for t in thresholds:
    acc = accuracy_score(y, (oof_preds >= t).astype(int))
    if acc > best_acc:
        best_acc = acc
        best_threshold = t
    print(f"Threshold {t:.2f}: {acc:.5f}")

print(f"\nBest threshold: {best_threshold:.2f} with accuracy {best_acc:.5f}")
print(f"Improvement from threshold tuning: {best_acc - baseline_acc:+.5f}")


=== THRESHOLD TUNING ===
Threshold 0.40: 0.81077
Threshold 0.41: 0.81180
Threshold 0.42: 0.81226
Threshold 0.43: 0.81341
Threshold 0.44: 0.81433
Threshold 0.45: 0.81537
Threshold 0.46: 0.81824
Threshold 0.47: 0.81928
Threshold 0.48: 0.81847
Threshold 0.49: 0.81686
Threshold 0.50: 0.81617
Threshold 0.51: 0.81583
Threshold 0.52: 0.81571
Threshold 0.53: 0.81663
Threshold 0.54: 0.81617
Threshold 0.55: 0.81629
Threshold 0.56: 0.81399
Threshold 0.57: 0.81261
Threshold 0.58: 0.81192
Threshold 0.59: 0.81226

Best threshold: 0.47 with accuracy 0.81928
Improvement from threshold tuning: +0.00311


In [8]:
# Compare to previous best (exp_003: CV 0.81951)
exp003_cv = 0.81951
print(f"\n=== COMPARISON ===")
print(f"exp_003 (tuned CatBoost, label encoding): {exp003_cv:.5f}")
print(f"This exp (native cat, threshold={best_threshold:.2f}): {best_acc:.5f}")
print(f"Improvement: {best_acc - exp003_cv:+.5f}")


=== COMPARISON ===
exp_003 (tuned CatBoost, label encoding): 0.81951
This exp (native cat, threshold=0.47): 0.81928
Improvement: -0.00023


In [9]:
# Create submission with best threshold
test_binary = (test_preds >= best_threshold).astype(bool)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_binary
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with threshold={best_threshold:.2f}")
print(f"Predicted transported rate: {test_binary.mean():.4f}")
print(f"Training transported rate: {y.mean():.4f}")
print(submission.head())

Submission saved with threshold=0.47
Predicted transported rate: 0.5378
Training transported rate: 0.5036
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


In [None]:
# The native categorical handling didn't help - let's try threshold tuning on the original label-encoded model
# Re-train with label encoding to get OOF predictions for threshold tuning

from sklearn.preprocessing import LabelEncoder

# Reload and process data with label encoding
train_le = pd.read_csv('/home/data/train.csv')
test_le = pd.read_csv('/home/data/test.csv')

# Apply same feature engineering
train_le = feature_engineering(train_le)
test_le = feature_engineering(test_le)

# GroupSize
for df in [train_le, test_le]:
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = (df['GroupSize'] == 1).astype(int)

train_le, test_le = group_based_imputation(train_le, test_le)
train_le = impute_remaining(train_le)
test_le = impute_remaining(test_le)
train_le = create_features(train_le)
test_le = create_features(test_le)

# Label encode categorical features
for col in cat_features:
    le = LabelEncoder()
    combined = pd.concat([train_le[col].astype(str), test_le[col].astype(str)])
    le.fit(combined)
    train_le[col + '_enc'] = le.transform(train_le[col].astype(str))
    test_le[col + '_enc'] = le.transform(test_le[col].astype(str))

# Use encoded features
feature_cols_le = [col + '_enc' for col in cat_features] + num_features
X_le = train_le[feature_cols_le].values
X_test_le = test_le[feature_cols_le].values

print(f"Label-encoded features: {len(feature_cols_le)}")

In [None]:
# Train label-encoded CatBoost and apply threshold tuning
cat_params_le = {
    'depth': 8,
    'learning_rate': 0.051,
    'iterations': 755,
    'l2_leaf_reg': 3.52,
    'random_seed': 42,
    'verbose': False
}

oof_le = np.zeros(len(X_le))
test_le_preds = np.zeros(len(X_test_le))
fold_scores_le = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_le, y)):
    X_train, X_val = X_le[train_idx], X_le[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    model = CatBoostClassifier(**cat_params_le)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    
    oof_le[val_idx] = model.predict_proba(X_val)[:, 1]
    test_le_preds += model.predict_proba(X_test_le)[:, 1] / n_folds
    
    fold_acc = accuracy_score(y_val, (oof_le[val_idx] >= 0.5).astype(int))
    fold_scores_le.append(fold_acc)
    print(f"Fold {fold+1}: {fold_acc:.5f}")

le_baseline_acc = accuracy_score(y, (oof_le >= 0.5).astype(int))
print(f"\nLabel-Encoded CatBoost OOF Accuracy: {le_baseline_acc:.5f} (+/- {np.std(fold_scores_le):.5f})")

In [None]:
# Threshold tuning on label-encoded model
print("\\n=== THRESHOLD TUNING (Label-Encoded) ===")
best_threshold_le = 0.5
best_acc_le = 0

for t in thresholds:
    acc = accuracy_score(y, (oof_le >= t).astype(int))
    if acc > best_acc_le:
        best_acc_le = acc
        best_threshold_le = t
    print(f"Threshold {t:.2f}: {acc:.5f}")

print(f"\\nBest threshold: {best_threshold_le:.2f} with accuracy {best_acc_le:.5f}")
print(f"Improvement from threshold tuning: {best_acc_le - le_baseline_acc:+.5f}")

# Final comparison
print(f"\\n=== FINAL COMPARISON ===")
print(f"exp_003 (tuned CatBoost, threshold=0.5): 0.81951")
print(f"Native cat + threshold={best_threshold:.2f}: {best_acc:.5f}")
print(f"Label-encoded + threshold={best_threshold_le:.2f}: {best_acc_le:.5f}")

In [None]:
# Save the best submission
if best_acc_le > best_acc:
    print(f"Label-encoded with threshold {best_threshold_le:.2f} is best!")
    test_binary_final = (test_le_preds >= best_threshold_le).astype(bool)
    final_acc = best_acc_le
    final_threshold = best_threshold_le
else:
    print(f"Native cat with threshold {best_threshold:.2f} is best!")
    test_binary_final = (test_preds >= best_threshold).astype(bool)
    final_acc = best_acc
    final_threshold = best_threshold

submission_final = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_binary_final
})

submission_final.to_csv('/home/submission/submission.csv', index=False)
print(f"\\nFinal submission saved with threshold={final_threshold:.2f}")
print(f"Final CV accuracy: {final_acc:.5f}")
print(f"Predicted transported rate: {test_binary_final.mean():.4f}")