# Experiment 003: Winning Kernel Approach

Implementing the winning kernel's techniques:
- ExtractFeatures: BMI calculation and column rounding
- MEstimateEncoder on select categorical features
- 9-fold CV
- XGBoost with tuned hyperparameters

In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from category_encoders import MEstimateEncoder
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

# Single seed shared by NumPy's global RNG and every seeded component below
RANDOM_SEED = 73
np.random.seed(seed=RANDOM_SEED)

In [2]:
# Read the three competition CSV files from the data directory
FILE_PATH = "/home/data"
TARGET = "NObeyesdad"

train, test, sample_sub = (
    pd.read_csv(os.path.join(FILE_PATH, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Quick sanity check: frame sizes and how the target classes are balanced
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target distribution:\n{train[TARGET].value_counts()}")

Train shape: (20758, 18)
Test shape: (13840, 17)
Target distribution:
NObeyesdad
Obesity_Type_III       4046
Obesity_Type_II        3248
Normal_Weight          3082
Obesity_Type_I         2910
Insufficient_Weight    2523
Overweight_Level_II    2522
Overweight_Level_I     2427
Name: count, dtype: int64


In [3]:
# Feature engineering functions from winning kernel
def extract_features(x):
    """Return a copy of ``x`` with a BMI column added and selected columns rounded.

    BMI is computed as ``Weight / Height ** 2``. The columns ``FCVC``, ``NCP``,
    ``CH2O``, ``FAF`` and ``TUE`` are rounded to the nearest whole number and
    cast to integers. The input frame itself is left unmodified.
    """
    rounded_cols = ['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

    # assign() works on a copy, so the caller's frame is never touched
    with_bmi = x.assign(BMI=x['Weight'] / (x['Height'] ** 2))

    # Round first, then cast, so values are not truncated by the int conversion
    as_integers = {
        name: with_bmi[name].round().astype('int') for name in rounded_cols
    }
    return with_bmi.assign(**as_integers)

# Wrap extract_features as a scikit-learn transformer object.
# NOTE(review): the cells visible below call extract_features directly and do
# not reference this object - confirm whether it is still needed.
ExtractFeatures = FunctionTransformer(func=extract_features)

In [4]:
# Categorical columns that are passed to MEstimateEncoder
# (the eight columns chosen in the winning kernel)
cols_to_encode = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
]

print(f"Columns to encode with MEstimateEncoder: {cols_to_encode}")
print(f"Number of columns: {len(cols_to_encode)}")

Columns to encode with MEstimateEncoder: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
Number of columns: 8


In [None]:
# Build the modelling matrices: drop the identifier (and the target for the
# training frame), then run the shared feature extraction on each frame
y = train[TARGET]
X = extract_features(train.drop(columns=[TARGET, 'id']))
X_test = extract_features(test.drop(columns=['id']))

# Report what the feature extraction added on top of the raw columns
print(f"Feature extracted X shape: {X.shape}")
print(f"New features: {[col for col in X.columns if col not in train.columns]}")
print(f"BMI stats: min={X['BMI'].min():.2f}, max={X['BMI'].max():.2f}, mean={X['BMI'].mean():.2f}")

In [None]:
# XGBoost hyperparameters taken from the winning kernel (already tuned there)
xgb_params = dict(
    n_estimators=131,
    max_depth=5,
    learning_rate=0.0937675874371929,
    subsample=0.8552791251123193,
    colsample_bytree=0.8490259266383225,
    min_child_weight=3,
    reg_alpha=0.00019085504732938993,
    reg_lambda=0.0004776998473727695,
    random_state=RANDOM_SEED,  # same seed as the rest of the notebook
    tree_method='hist',
)

# Echo the configuration, one parameter per line
print("XGBoost parameters:")
for k, v in xgb_params.items():
    print(f"  {k}: {v}")

In [None]:
# Assemble the model: encode the selected categorical columns, then classify.
# make_pipeline names the steps after their lowercased class names.
encoder_step = MEstimateEncoder(cols=cols_to_encode)
classifier_step = XGBClassifier(**xgb_params)
model = make_pipeline(encoder_step, classifier_step)

print("Pipeline created:")
print(model)

In [None]:
# Cross-validation setup
# Stratified splitting keeps the class proportions similar in every fold;
# shuffling with a fixed seed makes the fold assignment reproducible.
n_splits = 9
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

print(f"Using {n_splits}-fold Stratified CV")
print(f"Total samples: {len(X)}")
print(f"Samples per fold: ~{len(X) // n_splits}")

In [None]:
# Run cross-validation
# The target column holds class-name strings. The target-based encoder and
# XGBoost both need numeric labels, so the model is fit on integer codes
# (classes numbered in sorted order) rather than on the raw strings.
class_names = sorted(y.unique())
target_mapping = {cls: idx for idx, cls in enumerate(class_names)}
y_encoded = y.map(target_mapping).astype(int)

fold_scores = []
# Out-of-fold class probabilities, one row per training sample and one column
# per class code (column j is expected to correspond to class_names[j])
oof_predictions = np.zeros((len(X), len(class_names)))

print("Starting cross-validation...")
print(f"Classes: {class_names}")
print(f"Target mapping: {target_mapping}")

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y_encoded)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded.iloc[train_idx], y_encoded.iloc[val_idx]

    # Clone so every fold starts from an unfitted encoder and classifier
    fold_model = clone(model)
    fold_model.fit(X_train, y_train)

    # Predict (integer codes, compared against the integer-coded truth)
    val_pred = fold_model.predict(X_val)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)

    print(f"  Fold accuracy: {fold_acc:.4f}")

    # Store OOF predictions
    val_pred_proba = fold_model.predict_proba(X_val)
    oof_predictions[val_idx] = val_pred_proba

# Calculate overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*50}")
print(f"CV Results:")
print(f"  Mean accuracy: {cv_score:.4f} ± {cv_std:.4f}")
print(f"  Individual folds: {[f'{score:.4f}' for score in fold_scores]}")
print(f"{'='*50}")

In [None]:
# Generate predictions on test set
# The pipeline needs numeric labels, so train on integer class codes (classes
# numbered in sorted order) and translate predictions back to class names.
label_names = sorted(y.unique())
label_to_id = {name: idx for idx, name in enumerate(label_names)}
id_to_label = dict(enumerate(label_names))

print("Training final model on full training data...")
final_model = clone(model)
final_model.fit(X, y.map(label_to_id).astype(int))

print("Generating test predictions...")
encoded_predictions = final_model.predict(X_test)
# Decode integer codes back to the original class-name strings
test_predictions = pd.Series(encoded_predictions).map(id_to_label).to_numpy()

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    TARGET: test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction distribution:\n{submission[TARGET].value_counts()}")

# Save submission (create the output directory first so to_csv cannot fail on a missing folder)
submission_path = "/home/submission/submission_003.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

In [None]:
# Analyze feature importance
print("Analyzing feature importance...")

# Get the XGBoost model from the pipeline
xgb_model = final_model.named_steps['xgbclassifier']
feature_names = X.columns.tolist()

# Create importance dataframe, most important feature first.
# The index is reset after sorting so that the row label equals the 0-based
# rank; without this the index still holds the original column position.
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False).reset_index(drop=True)

print("Top 10 features:")
print(importance_df.head(10))

# Check BMI importance
bmi_row = importance_df[importance_df['feature'] == 'BMI']
bmi_importance = bmi_row['importance'].iloc[0]
print(f"\nBMI importance: {bmi_importance:.4f}")
print(f"BMI rank: {bmi_row.index[0] + 1}")