# Target Encoding + Product Features Experiment

This notebook implements the evaluator's top priority: target encoding with proper cross-validation.

**Strategy Priority**: Priority 1 - Fix Critical Gaps

**Expected CV**: RMSLE in roughly the 0.05–0.06 range

**Key Improvements**:
1. Target encoding for 'Sex' feature (sklearn's TargetEncoder with cv=5)
2. Product features (log1p + pairwise interactions)
3. Fix data leakage by moving binning inside CV loop
4. Apply to both XGBoost and CatBoost

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import TargetEncoder
import warnings
warnings.filterwarnings('ignore')

# Fix the global NumPy seed; SEED is also reused by the CV splitter and the models
SEED = 42
np.random.seed(SEED)

print("Loading data...")

# Read the train and test tables from the workspace data directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/code/data/train.csv', '/home/code/data/test.csv')
)

# Quick sanity report on what was loaded
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Columns: {list(train_df.columns)}")

Loading data...
Train shape: (8000, 9)
Test shape: (2000, 9)
Columns: ['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']


def create_product_features(df, numerical_features):
    """Return a copy of ``df`` extended with log, product and ratio features.

    Args:
        df: Frame holding the raw columns. It is copied, never modified.
            Besides ``numerical_features`` it must contain 'Age', 'Height',
            'Weight', 'Duration', 'Heart_Rate' and 'Body_Temp'.
        numerical_features: Column names that each get a ``<name>_log1p``
            companion column.

    Returns:
        A new DataFrame with the original columns followed by the log1p
        columns, five ``product_*`` columns and two ``ratio_*`` columns.
    """
    out = df.copy()

    # log1p transform of every requested numeric column
    for col in numerical_features:
        out[f'{col}_log1p'] = np.log1p(out[col])

    # Hand-picked pairwise interactions (Weight*Duration, Duration*Heart_Rate, ...)
    product_pairs = (
        ('Weight', 'Duration'),
        ('Duration', 'Heart_Rate'),
        ('Height', 'Weight'),
        ('Age', 'Weight'),
        ('Body_Temp', 'Duration'),
    )
    for left, right in product_pairs:
        out[f'product_{left}_{right}'] = out[left] * out[right]

    # Ratios; the small epsilon keeps a zero denominator from dividing by zero
    ratio_pairs = (
        ('Weight', 'Height'),
        ('Heart_Rate', 'Duration'),
    )
    for numerator, denominator in ratio_pairs:
        out[f'ratio_{numerator}_{denominator}'] = out[numerator] / (out[denominator] + 1e-6)

    return out

def create_binned_features(df, numerical_features, n_bins=15, fit_df=None):
    """Add quantile-binned versions of numeric columns to a copy of ``df``.

    Bin edges are learned on ``fit_df`` and then applied to ``df``. Inside a
    CV loop, pass the training fold as ``fit_df`` when binning the validation
    or test frame: re-fitting the edges on each frame separately makes the
    same bin index mean different value ranges in train, validation and test.

    Args:
        df: Frame to add the binned columns to. It is copied, not modified.
        numerical_features: Columns to discretize; each yields
            ``<name>_binned``.
        n_bins: Number of quantile bins requested from KBinsDiscretizer.
        fit_df: Frame the bin edges are learned from. Defaults to ``df``
            itself, which reproduces the previous fit-and-transform
            behaviour.

    Returns:
        A new DataFrame with one integer ``<name>_binned`` column per feature.
    """
    reference = df if fit_df is None else fit_df
    df_binned = df.copy()

    for feature in numerical_features:
        binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
        # Learn the edges on the reference frame only, then bin ``df`` with them
        binner.fit(reference[[feature]])
        binned_values = binner.transform(df[[feature]])
        # transform() returns shape (n, 1); flatten it before storing as a column
        df_binned[f'{feature}_binned'] = binned_values.ravel().astype(int)

    return df_binned

def create_target_encoding(X_train, y_train, X_val, X_test, categorical_features, cv=5,
                           target_type='continuous'):
    """Add cross-fitted target-encoded columns for the categorical features.

    The training rows are encoded with ``TargetEncoder.fit_transform``, the
    only code path in which scikit-learn applies its internal cross-fitting.
    Calling ``fit`` and then ``transform`` on the same rows would instead
    encode each training row with statistics that include its own target.
    Validation and test rows are encoded with ``transform``, which uses the
    encoding learned from the whole training fold.

    Args:
        X_train: Training-fold features (copied, not modified).
        y_train: Target values aligned with ``X_train``.
        X_val: Validation-fold features (copied, not modified).
        X_test: Test features (copied, not modified).
        categorical_features: Columns to encode; each yields
            ``<name>_target_enc``.
        cv: Number of internal folds used for cross-fitting the training rows.
        target_type: Passed to ``TargetEncoder``. Defaults to 'continuous'
            because 'auto' can infer a whole-number regression target as
            multiclass, which produces one output column per class.

    Returns:
        Tuple ``(X_train_enc, X_val_enc, X_test_enc)`` of new DataFrames.
    """
    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()

    for feature in categorical_features:
        # A fresh encoder per feature keeps fitted state from carrying over
        encoder = TargetEncoder(
            cv=cv, smooth='auto', target_type=target_type, random_state=SEED
        )
        column = f'{feature}_target_enc'

        # Cross-fitted encoding for the training rows (no self-target leakage)
        train_encoded = encoder.fit_transform(X_train[[feature]], y_train)
        # Full-training-fold encoding for rows the encoder has never seen
        val_encoded = encoder.transform(X_val[[feature]])
        test_encoded = encoder.transform(X_test[[feature]])

        # np.asarray(...).ravel() flattens the (n, 1) result whether the
        # encoder returns an ndarray or a DataFrame
        X_train_enc[column] = np.asarray(train_encoded).ravel()
        X_val_enc[column] = np.asarray(val_encoded).ravel()
        X_test_enc[column] = np.asarray(test_encoded).ravel()

    return X_train_enc, X_val_enc, X_test_enc

In [2]:
def create_product_features(df, numerical_features):
    """Return a copy of ``df`` extended with log, product and ratio features.

    Args:
        df: Frame holding the raw columns. It is copied, never modified.
            Besides ``numerical_features`` it must contain 'Age', 'Height',
            'Weight', 'Duration', 'Heart_Rate' and 'Body_Temp'.
        numerical_features: Column names that each get a ``<name>_log1p``
            companion column.

    Returns:
        A new DataFrame with the original columns followed by the log1p
        columns, five ``product_*`` columns and two ``ratio_*`` columns.
    """
    out = df.copy()

    # log1p transform of every requested numeric column
    for col in numerical_features:
        out[f'{col}_log1p'] = np.log1p(out[col])

    # Hand-picked pairwise interactions (Weight*Duration, Duration*Heart_Rate, ...)
    product_pairs = (
        ('Weight', 'Duration'),
        ('Duration', 'Heart_Rate'),
        ('Height', 'Weight'),
        ('Age', 'Weight'),
        ('Body_Temp', 'Duration'),
    )
    for left, right in product_pairs:
        out[f'product_{left}_{right}'] = out[left] * out[right]

    # Ratios; the small epsilon keeps a zero denominator from dividing by zero
    ratio_pairs = (
        ('Weight', 'Height'),
        ('Heart_Rate', 'Duration'),
    )
    for numerator, denominator in ratio_pairs:
        out[f'ratio_{numerator}_{denominator}'] = out[numerator] / (out[denominator] + 1e-6)

    return out

def create_binned_features(df, numerical_features, n_bins=15, fit_df=None):
    """Add quantile-binned versions of numeric columns to a copy of ``df``.

    Bin edges are learned on ``fit_df`` and then applied to ``df``. Inside a
    CV loop, pass the training fold as ``fit_df`` when binning the validation
    or test frame: re-fitting the edges on each frame separately makes the
    same bin index mean different value ranges in train, validation and test.

    Args:
        df: Frame to add the binned columns to. It is copied, not modified.
        numerical_features: Columns to discretize; each yields
            ``<name>_binned``.
        n_bins: Number of quantile bins requested from KBinsDiscretizer.
        fit_df: Frame the bin edges are learned from. Defaults to ``df``
            itself, which reproduces the previous fit-and-transform
            behaviour.

    Returns:
        A new DataFrame with one integer ``<name>_binned`` column per feature.
    """
    reference = df if fit_df is None else fit_df
    df_binned = df.copy()

    for feature in numerical_features:
        binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
        # Learn the edges on the reference frame only, then bin ``df`` with them
        binner.fit(reference[[feature]])
        binned_values = binner.transform(df[[feature]])
        # transform() returns shape (n, 1); flatten it before storing as a column
        df_binned[f'{feature}_binned'] = binned_values.ravel().astype(int)

    return df_binned

def create_target_encoding(X_train, y_train, X_val, X_test, categorical_features, cv=5,
                           target_type='continuous'):
    """Add cross-fitted target-encoded columns for the categorical features.

    The training rows are encoded with ``TargetEncoder.fit_transform``, the
    only code path in which scikit-learn applies its internal cross-fitting.
    Calling ``fit`` and then ``transform`` on the same rows would instead
    encode each training row with statistics that include its own target.
    Validation and test rows are encoded with ``transform``, which uses the
    encoding learned from the whole training fold.

    Args:
        X_train: Training-fold features (copied, not modified).
        y_train: Target values aligned with ``X_train``.
        X_val: Validation-fold features (copied, not modified).
        X_test: Test features (copied, not modified).
        categorical_features: Columns to encode; each yields
            ``<name>_target_enc``.
        cv: Number of internal folds used for cross-fitting the training rows.
        target_type: Passed to ``TargetEncoder``. Defaults to 'continuous'
            because 'auto' can infer a whole-number regression target as
            multiclass, which produces one output column per class.

    Returns:
        Tuple ``(X_train_enc, X_val_enc, X_test_enc)`` of new DataFrames.
    """
    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()

    for feature in categorical_features:
        # A fresh encoder per feature keeps fitted state from carrying over
        encoder = TargetEncoder(
            cv=cv, smooth='auto', target_type=target_type, random_state=SEED
        )
        column = f'{feature}_target_enc'

        # Cross-fitted encoding for the training rows (no self-target leakage)
        train_encoded = encoder.fit_transform(X_train[[feature]], y_train)
        # Full-training-fold encoding for rows the encoder has never seen
        val_encoded = encoder.transform(X_val[[feature]])
        test_encoded = encoder.transform(X_test[[feature]])

        # transform() returns an ndarray by default, so calling ``.values`` on
        # it fails; np.asarray(...).ravel() works for ndarray and DataFrame
        X_train_enc[column] = np.asarray(train_encoded).ravel()
        X_val_enc[column] = np.asarray(val_encoded).ravel()
        X_test_enc[column] = np.asarray(test_encoded).ravel()

    return X_train_enc, X_val_enc, X_test_enc

## Prepare Cross-Validation

Use 5-fold CV with seed 42 as specified in the strategy.

In [None]:
# Cross-validation setup: shuffled K-fold split, seeded for reproducibility
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)

# Prediction buffers, one out-of-fold / test pair per model
n_train_rows, n_test_rows = len(train_df), len(test_df)
oof_predictions_xgb, test_predictions_xgb = np.zeros(n_train_rows), np.zeros(n_test_rows)
oof_predictions_cat, test_predictions_cat = np.zeros(n_train_rows), np.zeros(n_test_rows)

# Per-fold RMSLE scores for each model
cv_scores_xgb, cv_scores_cat = [], []

print(f"Starting {n_folds}-fold CV...")

# Column roles used by the feature-engineering helpers
numerical_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
categorical_features = ['Sex']

## Train Models with Advanced Features

Train both XGBoost and CatBoost with target encoding and product features.

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool

# Model parameters
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': SEED,
    'n_jobs': -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

cat_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': SEED,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'CPU'
}

# Rounds without validation improvement before training stops (both models)
EARLY_STOPPING_ROUNDS = 50
# Number of quantile bins per numeric feature
N_BINS = 15

# Product features of the test set do not depend on the fold, so build them once.
# create_product_features returns a copy, so test_df itself is left untouched.
test_feat_base = create_product_features(test_df, numerical_features)

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), start=1):
    print(f"\n{'='*60}")
    print(f"Training fold {fold}/{n_folds}")
    print(f"{'='*60}")

    # Split data and separate the target from the features
    train_fold, val_fold = train_df.iloc[train_idx], train_df.iloc[val_idx]
    y_train, y_val = train_fold['Calories'].copy(), val_fold['Calories'].copy()

    # Create product features (log1p + interactions); the returned frames are
    # fresh copies, so adding columns to them below never touches train_df
    X_train = create_product_features(train_fold.drop('Calories', axis=1), numerical_features)
    X_val = create_product_features(val_fold.drop('Calories', axis=1), numerical_features)
    X_test_feat = test_feat_base.copy()

    # Create binned features. Bin edges are learned on the training fold ONLY
    # and then applied to validation/test, so a given bin index means the same
    # value range everywhere and no validation/test statistics leak into the
    # features.
    for feature in numerical_features:
        binner = KBinsDiscretizer(n_bins=N_BINS, encode='ordinal', strategy='quantile')
        binner.fit(X_train[[feature]])
        for frame in (X_train, X_val, X_test_feat):
            frame[f'{feature}_binned'] = binner.transform(frame[[feature]]).ravel().astype(int)

    # Create target encoding (fitted on the training fold inside the helper)
    X_train_enc, X_val_enc, X_test_enc = create_target_encoding(
        X_train, y_train, X_val, X_test_feat, categorical_features, cv=5
    )

    # Define feature columns for each model
    # CatBoost: every engineered column, with the raw categorical and the
    # binned columns declared as categorical
    catboost_features = [col for col in X_train_enc.columns if col != 'id']
    cat_features = categorical_features + [col for col in catboost_features if col.endswith('_binned')]
    cat_feature_indices = [catboost_features.index(col) for col in cat_features if col in catboost_features]

    # XGBoost: numeric columns only. A raw object/string categorical column is
    # rejected by XGBoost; the target-encoded column carries that information.
    xgb_features = [
        col for col in catboost_features
        if pd.api.types.is_numeric_dtype(X_train_enc[col])
    ]

    print(f"Total features: {len(catboost_features)}")
    print(f"XGBoost numeric features: {len(xgb_features)}")
    print(f"CatBoost categorical features: {len(cat_feature_indices)}")

    # Train XGBoost. Early stopping is configured on the estimator: the
    # fit()-level keyword was deprecated in XGBoost 1.6 and removed in 2.0.
    print("\nTraining XGBoost...")
    model_xgb = XGBRegressor(**xgb_params, early_stopping_rounds=EARLY_STOPPING_ROUNDS)
    model_xgb.fit(
        X_train_enc[xgb_features], y_train,
        eval_set=[(X_val_enc[xgb_features], y_val)],
        verbose=False
    )

    # Predict with XGBoost
    val_pred_xgb = model_xgb.predict(X_val_enc[xgb_features])
    oof_predictions_xgb[val_idx] = val_pred_xgb

    # Train CatBoost
    print("\nTraining CatBoost...")
    train_pool = Pool(X_train_enc[catboost_features], y_train, cat_features=cat_feature_indices)
    val_pool = Pool(X_val_enc[catboost_features], y_val, cat_features=cat_feature_indices)

    model_cat = CatBoostRegressor(**cat_params)
    model_cat.fit(train_pool, eval_set=val_pool, early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)

    # Predict with CatBoost
    val_pred_cat = model_cat.predict(val_pool)
    oof_predictions_cat[val_idx] = val_pred_cat

    # Calculate RMSLE for this fold; predictions are clipped at 0 because the
    # log-based metric is undefined for negative values
    rmsle_xgb = np.sqrt(mean_squared_log_error(y_val, np.clip(val_pred_xgb, 0, None)))
    rmsle_cat = np.sqrt(mean_squared_log_error(y_val, np.clip(val_pred_cat, 0, None)))

    cv_scores_xgb.append(rmsle_xgb)
    cv_scores_cat.append(rmsle_cat)

    print(f"\nFold {fold} RMSLE - XGBoost: {rmsle_xgb:.6f}, CatBoost: {rmsle_cat:.6f}")

    # Predict on test set; each fold contributes 1/n_folds of the final average
    test_pred_xgb = model_xgb.predict(X_test_enc[xgb_features])
    test_predictions_xgb += test_pred_xgb / n_folds

    test_pool = Pool(X_test_enc[catboost_features], cat_features=cat_feature_indices)
    test_pred_cat = model_cat.predict(test_pool)
    test_predictions_cat += test_pred_cat / n_folds

## Results Analysis

In [None]:
# Aggregate the per-fold RMSLE scores into mean and spread for each model
mean_rmsle_xgb, std_rmsle_xgb = np.mean(cv_scores_xgb), np.std(cv_scores_xgb)
mean_rmsle_cat, std_rmsle_cat = np.mean(cv_scores_cat), np.std(cv_scores_cat)

# Headline summary
banner = "="*60
print("\n" + banner)
print("FINAL RESULTS")
print(banner)
print(f"\nXGBoost CV RMSLE: {mean_rmsle_xgb:.6f} ± {std_rmsle_xgb:.6f}")
print(f"CatBoost CV RMSLE: {mean_rmsle_cat:.6f} ± {std_rmsle_cat:.6f}")

# Fold-by-fold breakdown
print(f"\nIndividual folds:")
for i, (score_xgb, score_cat) in enumerate(zip(cv_scores_xgb, cv_scores_cat)):
    print(f"  Fold {i+1}: XGBoost={score_xgb:.6f}, CatBoost={score_cat:.6f}")

# Signed difference against the previously recorded baseline scores
print(f"\nComparison to baseline:")
print(f"  Baseline XGBoost: 0.020470")
print(f"  This XGBoost:     {mean_rmsle_xgb:.6f} ({mean_rmsle_xgb-0.020470:+.6f})")
print(f"  Baseline CatBoost: 0.202383")
print(f"  This CatBoost:     {mean_rmsle_cat:.6f} ({mean_rmsle_cat-0.202383:+.6f})")

# Report whether each mean score lands inside the expected band (inclusive)
target_range = (0.055, 0.065)
print(f"\nTarget range: {target_range}")
print(f"XGBoost in range: {target_range[0] <= mean_rmsle_xgb <= target_range[1]}")
print(f"CatBoost in range: {target_range[0] <= mean_rmsle_cat <= target_range[1]}")