# Simple Target Encoding Implementation

Clean implementation avoiding dtype issues.

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import KBinsDiscretizer
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
import warnings
warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)

print("Loading data...")
train_df = pd.read_csv('/home/code/data/train.csv')
test_df = pd.read_csv('/home/code/data/test.csv')

print(f"Train: {train_df.shape}, Test: {test_df.shape}")

Loading data...
Train: (8000, 9), Test: (2000, 9)


In [6]:
def get_target_encoding(X_train, y_train, X_val, X_test, col='Sex', smoothing=20):
    """Simple target encoding with smoothing"""
    # Global mean
    global_mean = y_train.mean()
    
    # Category stats from training data - y_train is numpy array, need to use X_train
    stats = pd.DataFrame({
        'target': y_train,
        'category': X_train[col]
    }).groupby('category')['target'].agg(['count', 'mean'])
    
    # Smoothing function
    def encode(series):
        result = []
        for val in series:
            if val in stats.index:
                count = stats.loc[val, 'count']
                mean = stats.loc[val, 'mean']
                # Apply smoothing
                smoothed = (count * mean + smoothing * global_mean) / (count + smoothing)
                result.append(smoothed)
            else:
                result.append(global_mean)
        return np.array(result)
    
    # Apply encoding
    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()
    
    X_train_enc[f'{col}_target_enc'] = encode(X_train[col])
    X_val_enc[f'{col}_target_enc'] = encode(X_val[col])
    X_test_enc[f'{col}_target_enc'] = encode(X_test[col])
    
    return X_train_enc, X_val_enc, X_test_enc

def add_features(df, num_cols):
    """Add engineered features"""
    df_new = df.copy()
    
    # Log transforms
    for col in num_cols:
        df_new[f'{col}_log1p'] = np.log1p(df_new[col])
    
    # Products
    df_new['Weight_Duration'] = df_new['Weight'] * df_new['Duration']
    df_new['Duration_Heart_Rate'] = df_new['Duration'] * df_new['Heart_Rate']
    df_new['Height_Weight'] = df_new['Height'] * df_new['Weight']
    
    # Ratios
    df_new['Weight_Height'] = df_new['Weight'] / (df_new['Height'] + 1e-6)
    
    return df_new

def add_binned_features(df, num_cols, n_bins=15):
    """Add binned features"""
    df_new = df.copy()
    
    for col in num_cols:
        binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
        binned = binner.fit_transform(df_new[[col]])
        df_new[f'{col}_binned'] = binned.astype(int)
    
    return df_new

In [7]:
# Setup
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)

oof_xgb = np.zeros(len(train_df))
test_xgb = np.zeros(len(test_df))

oof_cat = np.zeros(len(train_df))
test_cat = np.zeros(len(test_df))

scores_xgb = []
scores_cat = []

num_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

print("Starting CV...")

Starting CV...


In [8]:
# Model params
xgb_params = {
    'n_estimators': 800,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': SEED,
    'n_jobs': -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

cat_params = {
    'iterations': 800,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': SEED,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'CPU'
}

fold = 1
for train_idx, val_idx in kf.split(train_df):
    print(f"\nFold {fold}/{n_folds}")
    
    # Split
    X_tr, X_va = train_df.iloc[train_idx].copy(), train_df.iloc[val_idx].copy()
    y_tr, y_va = X_tr['Calories'].values, X_va['Calories'].values
    
    X_tr = X_tr.drop('Calories', axis=1)
    X_va = X_va.drop('Calories', axis=1)
    X_te = test_df.copy()
    
    # Add features
    X_tr = add_features(X_tr, num_features)
    X_va = add_features(X_va, num_features)
    X_te = add_features(X_te, num_features)
    
    # Add binned features
    X_tr = add_binned_features(X_tr, num_features)
    X_va = add_binned_features(X_va, num_features)
    X_te = add_binned_features(X_te, num_features)
    
    # Target encoding
    X_tr_enc, X_va_enc, X_te_enc = get_target_encoding(X_tr, y_tr, X_va, X_te)
    
    # Prepare features - drop original Sex, keep target encoding
    features = [c for c in X_tr_enc.columns if c != 'id' and c != 'Sex']
    
    # XGBoost data (ensure all numeric)
    X_tr_xgb = X_tr_enc[features].astype(float)
    X_va_xgb = X_va_enc[features].astype(float)
    X_te_xgb = X_te_enc[features].astype(float)
    
    # CatBoost categorical features (binned features)
    cat_features = [c for c in features if c.endswith('_binned')]
    cat_indices = [features.index(c) for c in cat_features]
    
    print(f"Features: {len(features)}, Cat features: {len(cat_features)}")
    
    # Train XGBoost
    model_xgb = XGBRegressor(**xgb_params)
    model_xgb.fit(X_tr_xgb, y_tr)
    
    pred_va_xgb = model_xgb.predict(X_va_xgb)
    oof_xgb[val_idx] = pred_va_xgb
    
    # Train CatBoost
    train_pool = Pool(X_tr_enc[features], y_tr, cat_features=cat_indices)
    val_pool = Pool(X_va_enc[features], y_va, cat_features=cat_indices)
    
    model_cat = CatBoostRegressor(**cat_params)
    model_cat.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)
    
    pred_va_cat = model_cat.predict(val_pool)
    oof_cat[val_idx] = pred_va_cat
    
    # Scores
    score_xgb = np.sqrt(mean_squared_log_error(y_va, np.clip(pred_va_xgb, 0, None)))
    score_cat = np.sqrt(mean_squared_log_error(y_va, np.clip(pred_va_cat, 0, None)))
    
    scores_xgb.append(score_xgb)
    scores_cat.append(score_cat)
    
    print(f"RMSLE - XGBoost: {score_xgb:.6f}, CatBoost: {score_cat:.6f}")
    
    # Test predictions
    test_xgb += model_xgb.predict(X_te_xgb) / n_folds
    test_cat += model_cat.predict(X_te_enc[features]) / n_folds
    
    fold += 1


Fold 1/5
Features: 23, Cat features: 6


RMSLE - XGBoost: 0.209380, CatBoost: 0.201078

Fold 2/5
Features: 23, Cat features: 6


RMSLE - XGBoost: 0.204923, CatBoost: 0.195446

Fold 3/5
Features: 23, Cat features: 6


RMSLE - XGBoost: 0.216927, CatBoost: 0.205409

Fold 4/5
Features: 23, Cat features: 6


RMSLE - XGBoost: 0.220970, CatBoost: 0.212426

Fold 5/5
Features: 23, Cat features: 6


RMSLE - XGBoost: 0.205605, CatBoost: 0.194318


In [9]:
# Results
mean_xgb = np.mean(scores_xgb)
std_xgb = np.std(scores_xgb)

mean_cat = np.mean(scores_cat)
std_cat = np.std(scores_cat)

print("\n" + "="*50)
print("FINAL RESULTS")
print("="*50)
print(f"XGBoost: {mean_xgb:.6f} ± {std_xgb:.6f}")
print(f"CatBoost: {mean_cat:.6f} ± {std_cat:.6f}")
print(f"\nIndividual folds: {scores_xgb}")
print(f"Individual folds: {scores_cat}")

# Compare to baseline
print(f"\nComparison:")
print(f"  Baseline XGBoost: 0.020470")
print(f"  This XGBoost:     {mean_xgb:.6f} ({mean_xgb-0.020470:+.6f})")
print(f"  Baseline CatBoost: 0.202383")
print(f"  This CatBoost:     {mean_cat:.6f} ({mean_cat-0.202383:+.6f})")

# Target range
target = (0.055, 0.065)
print(f"\nTarget range: {target}")
print(f"XGBoost in range: {target[0] <= mean_xgb <= target[1]}")
print(f"CatBoost in range: {target[0] <= mean_cat <= target[1]}")


FINAL RESULTS
XGBoost: 0.211561 ± 0.006350
CatBoost: 0.201736 ± 0.006675

Individual folds: [0.20938008156178967, 0.20492327279021907, 0.21692692791378476, 0.22097016177492476, 0.20560541043184194]
Individual folds: [0.20107822177280574, 0.1954460192587355, 0.2054091662416508, 0.21242649429114654, 0.1943183137535832]

Comparison:
  Baseline XGBoost: 0.020470
  This XGBoost:     0.211561 (+0.191091)
  Baseline CatBoost: 0.202383
  This CatBoost:     0.201736 (-0.000647)

Target range: (0.055, 0.065)
XGBoost in range: False
CatBoost in range: False


In [10]:
# Create submissions
train_min = train_df['Calories'].min()
train_max = train_df['Calories'].max()

# XGBoost submission
sub_xgb = pd.DataFrame({
    'id': test_df['id'],
    'Calories': np.clip(test_xgb, train_min, train_max)
})
sub_xgb.to_csv('/home/submission/submission_003_xgb_simple.csv', index=False)

# CatBoost submission
sub_cat = pd.DataFrame({
    'id': test_df['id'],
    'Calories': np.clip(test_cat, train_min, train_max)
})
sub_cat.to_csv('/home/submission/submission_003_cat_simple.csv', index=False)

print("Submissions saved:")
print("  /home/submission/submission_003_xgb_simple.csv")
print("  /home/submission/submission_003_cat_simple.csv")

# Save OOF predictions
pd.DataFrame({
    'id': train_df['id'],
    'oof_prediction': oof_xgb
}).to_csv('/home/code/experiments/oof_003_xgb_simple.csv', index=False)

pd.DataFrame({
    'id': train_df['id'],
    'oof_prediction': oof_cat
}).to_csv('/home/code/experiments/oof_003_cat_simple.csv', index=False)

print("\nOOF predictions saved:")
print("  /home/code/experiments/oof_003_xgb_simple.csv")
print("  /home/code/experiments/oof_003_cat_simple.csv")

Submissions saved:
  /home/submission/submission_003_xgb_simple.csv
  /home/submission/submission_003_cat_simple.csv

OOF predictions saved:
  /home/code/experiments/oof_003_xgb_simple.csv
  /home/code/experiments/oof_003_cat_simple.csv
