# Experiment 004: LGBM Baseline with Enhanced Features

Implementing LGBM model with enhanced features (same as exp_001):
- WHO_BMI_Categories (71.88% standalone accuracy)
- Weight_Height_Ratio
- Lifestyle interactions (FCVC_NCP, CH2O_FAF, FAF_TUE)
- ColumnTransformer + OrdinalEncoder (leakage-free)
- 5-fold CV

This provides model diversity for ensembling and follows winning kernel's approach.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from lightgbm import LGBMClassifier
import os

# Set seeds for reproducibility
RANDOM_SEED = 73
np.random.seed(RANDOM_SEED)

In [2]:
# Load data
FILE_PATH = "/home/data"
train = pd.read_csv(os.path.join(FILE_PATH, "train.csv"))
test = pd.read_csv(os.path.join(FILE_PATH, "test.csv"))
sample_sub = pd.read_csv(os.path.join(FILE_PATH, "sample_submission.csv"))

TARGET = "NObeyesdad"
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target distribution:\n{train[TARGET].value_counts()}")

Train shape: (20758, 18)
Test shape: (13840, 17)
Target distribution:
NObeyesdad
Obesity_Type_III       4046
Obesity_Type_II        3248
Normal_Weight          3082
Obesity_Type_I         2910
Insufficient_Weight    2523
Overweight_Level_II    2522
Overweight_Level_I     2427
Name: count, dtype: int64


In [3]:
# Feature engineering functions (same as exp_001)
def engineer_features(df):
    """Engineer enhanced features for the model"""
    df = df.copy()
    
    # 1. BMI (critical feature)
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    
    # 2. WHO BMI Categories (71.88% standalone accuracy)
    def categorize_bmi(bmi):
        if bmi < 18.5:
            return 'Underweight'
        elif 18.5 <= bmi < 25:
            return 'Normal'
        elif 25 <= bmi < 30:
            return 'Overweight'
        elif 30 <= bmi < 35:
            return 'Obese_I'
        elif 35 <= bmi < 40:
            return 'Obese_II'
        else:
            return 'Obese_III'
    
    df['WHO_BMI_Categories'] = df['BMI'].apply(categorize_bmi)
    
    # 3. Weight to Height Ratio
    df['Weight_Height_Ratio'] = df['Weight'] / df['Height']
    
    # 4. Age groups (5 bins)
    df['Age_Group'] = pd.cut(df['Age'], bins=[0, 19, 30, 45, 60, 100], 
                             labels=['Teen', 'Young_Adult', 'Adult', 'Middle_Age', 'Senior'])
    
    # 5. Lifestyle interactions
    df['FCVC_NCP'] = df['FCVC'] * df['NCP']  # Food consumption * meals
    df['CH2O_FAF'] = df['CH2O'] * df['FAF']  # Water * activity
    df['FAF_TUE'] = df['FAF'] * df['TUE']    # Activity * tech use
    
    # 6. Age interactions
    df['Age_Height'] = df['Age'] * df['Height']
    df['Age_Weight'] = df['Age'] * df['Weight']
    
    return df

# Apply feature engineering
train = engineer_features(train)
test = engineer_features(test)

print(f"Feature engineered train shape: {train.shape}")
print(f"Feature engineered test shape: {test.shape}")
print(f"New features: {[col for col in train.columns if col not in ['id', TARGET]]}")

Feature engineered train shape: (20758, 27)
Feature engineered test shape: (13840, 26)
New features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'BMI', 'WHO_BMI_Categories', 'Weight_Height_Ratio', 'Age_Group', 'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE', 'Age_Height', 'Age_Weight']


In [4]:
# Define feature groups
TARGET = "NObeyesdad"
ID_COL = "id"

# Separate features and target
X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET]
X_test = test.drop(columns=[ID_COL])

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Total features: {len(categorical_cols) + len(numerical_cols)}")

Categorical columns (9): ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'WHO_BMI_Categories']
Numerical columns (16): ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI', 'Weight_Height_Ratio', 'Age_Group', 'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE', 'Age_Height', 'Age_Weight']
Total features: 25


In [5]:
# Create preprocessing pipeline (leakage-free)
# OrdinalEncoder with handle_unknown='use_encoded_value' prevents leakage
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

print("Preprocessing pipeline created:")
print(preprocessor)

Preprocessing pipeline created:
ColumnTransformer(transformers=[('cat',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=-1),
                                 ['Gender', 'family_history_with_overweight',
                                  'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC',
                                  'MTRANS', 'WHO_BMI_Categories']),
                                ('num', 'passthrough',
                                 ['Age', 'Height', 'Weight', 'FCVC', 'NCP',
                                  'CH2O', 'FAF', 'TUE', 'BMI',
                                  'Weight_Height_Ratio', 'Age_Group',
                                  'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE',
                                  'Age_Height', 'Age_Weight'])])


In [6]:
# LGBM parameters (starting with defaults, will tune later)
lgbm_params = {
    'random_state': RANDOM_SEED,
    'verbose': -1  # Suppress warnings
}

print("LGBM parameters:")
for k, v in lgbm_params.items():
    print(f"  {k}: {v}")

LGBM parameters:
  random_state: 73
  verbose: -1


In [7]:
# Cross-validation setup
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

print(f"Using {n_splits}-fold Stratified CV")
print(f"Total samples: {len(X)}")
print(f"Samples per fold: ~{len(X) // n_splits}")

Using 5-fold Stratified CV
Total samples: 20758
Samples per fold: ~4151


In [None]:
# Run cross-validation
fold_scores = []

print("Starting cross-validation...")
print(f"Classes: {sorted(y.unique())}")

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Fit preprocessor and transform training data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)
    
    # Train LGBM model
    model = LGBMClassifier(**lgbm_params)
    model.fit(X_train_processed, y_train)
    
    # Predict and evaluate
    val_pred = model.predict(X_val_processed)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)
    
    print(f"  Fold accuracy: {fold_acc:.4f}")

# Calculate overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*50}")
print(f"CV Results:")
print(f"  Mean accuracy: {cv_score:.4f} Â± {cv_std:.4f}")
print(f"  Individual folds: {[f'{score:.4f}' for score in fold_scores]}")
print(f"{'='*50}")

In [None]:
# Generate predictions on test set
print("Training final model on full training data...")

# Fit preprocessor on full data
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Train final model
final_model = LGBMClassifier(**lgbm_params)
final_model.fit(X_processed, y)

print("Generating test predictions...")
test_predictions = final_model.predict(X_test_processed)

# Create submission
submission = pd.DataFrame({
    'id': test[ID_COL],
    TARGET: test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction distribution:\n{submission[TARGET].value_counts()}")

# Save submission
submission_path = "/home/submission/submission_004.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

In [None]:
# Analyze feature importance
print("Analyzing feature importance...")

# Get feature names after preprocessing
# OrdinalEncoder preserves column order, so we can reconstruct feature names
feature_names = numerical_cols + categorical_cols

# Create importance dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 features:")
print(importance_df.head(15))

# Check BMI-related features
bmi_features = [f for f in feature_names if 'BMI' in f]
if bmi_features:
    print(f"\nBMI-related features:")
    for feat in bmi_features:
        imp = importance_df[importance_df['feature'] == feat]['importance'].iloc[0]
        rank = importance_df[importance_df['feature'] == feat].index[0] + 1
        print(f"  {feat}: importance={imp:.4f}, rank={rank}")