# Experiment 004: LGBM Baseline with Enhanced Features

This notebook trains a LightGBM (LGBM) model on the same enhanced features as exp_001:
- WHO_BMI_Categories (71.88% standalone accuracy)
- Weight_Height_Ratio
- Lifestyle interactions (FCVC_NCP, CH2O_FAF, FAF_TUE)
- ColumnTransformer + OrdinalEncoder (leakage-free)
- Stratified K-fold CV (9 folds for the `lgb.train` run, 5 folds for the `LGBMClassifier` run)

This provides model diversity for ensembling and follows the winning kernel's approach.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Name of the label column; the encoding step below and later cells refer to it.
TARGET = 'NObeyesdad'

# Read the train and test CSVs.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

for split_name, frame in (("Train", train), ("Test", test)):
    print(f"{split_name} shape: {frame.shape}")

# Keep the test ids as a plain array for building the submission later.
test_ids = test['id'].to_numpy()

# Replace the string class labels with integer codes. `le` is kept so that
# predictions can be mapped back to the original class names.
le = LabelEncoder().fit(train[TARGET])
train[TARGET] = le.transform(train[TARGET])

print(f"Target classes: {le.classes_}")
print(f"Encoded target distribution:\n{train[TARGET].value_counts()}")

In [2]:
# Sanity check only: the frames and the encoded target come from the first
# cell, nothing is reloaded here.
target_col = train[TARGET]
report_lines = [
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    f"Target distribution:\n{target_col.value_counts()}",
    f"Target dtype: {target_col.dtype}",
    f"Target unique values: {target_col.unique()}",
]
print("\n".join(report_lines))

Train shape: (20758, 18)
Test shape: (13840, 17)
Target distribution:
NObeyesdad
Obesity_Type_III       4046
Obesity_Type_II        3248
Normal_Weight          3082
Obesity_Type_I         2910
Insufficient_Weight    2523
Overweight_Level_II    2522
Overweight_Level_I     2427
Name: count, dtype: int64


In [3]:
# Feature engineering functions (same as exp_001)
def engineer_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified. Columns read: ``Weight``, ``Height``,
    ``Age``, ``FCVC``, ``NCP``, ``CH2O``, ``FAF`` and ``TUE``.

    Columns added:
        BMI                  ``Weight / Height**2``.
        WHO_BMI_Categories   Integer code 0..5 for the BMI bands
                             ``<18.5``, ``[18.5, 25)``, ``[25, 30)``,
                             ``[30, 35)``, ``[35, 40)`` and ``>=40``.
                             NaN where BMI is missing.
        Weight_Height_Ratio  ``Weight / Height``.
        Age_Group            pandas Categorical (not a plain integer column)
                             labelled 0..4 for the right-closed age intervals
                             ``(0, 19]``, ``(19, 30]``, ``(30, 45]``,
                             ``(45, 60]``, ``(60, 100]``; ages outside these
                             intervals become NaN.
        FCVC_NCP, CH2O_FAF, FAF_TUE, Age_Height, Age_Weight
                             Element-wise products of the two named columns.
    """
    df = df.copy()

    # 1. BMI (critical feature)
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)

    # 2. WHO BMI Categories (71.88% standalone accuracy)
    # np.digitize returns i with edges[i-1] <= bmi < edges[i], which is exactly
    # the 0..5 banding above, computed for the whole column at once.
    who_edges = [18.5, 25, 30, 35, 40]
    who_codes = pd.Series(
        np.digitize(df['BMI'].to_numpy(dtype=float), who_edges),
        index=df.index,
    )
    # A NaN BMI compares false against every edge and would otherwise land in
    # the top band (5); keep it missing instead of labelling it Obese_III.
    bmi_missing = df['BMI'].isna()
    if bmi_missing.any():
        who_codes = who_codes.astype(float).mask(bmi_missing)
    df['WHO_BMI_Categories'] = who_codes

    # 3. Weight to Height Ratio
    df['Weight_Height_Ratio'] = df['Weight'] / df['Height']

    # 4. Age groups (5 bins). pd.cut with explicit labels yields a Categorical
    #    whose categories are the integers 0..4.
    df['Age_Group'] = pd.cut(df['Age'], bins=[0, 19, 30, 45, 60, 100],
                             labels=[0, 1, 2, 3, 4])

    # 5. Lifestyle interactions
    df['FCVC_NCP'] = df['FCVC'] * df['NCP']  # Food consumption * meals
    df['CH2O_FAF'] = df['CH2O'] * df['FAF']  # Water * activity
    df['FAF_TUE'] = df['FAF'] * df['TUE']    # Activity * tech use

    # 6. Age interactions
    df['Age_Height'] = df['Age'] * df['Height']
    df['Age_Weight'] = df['Age'] * df['Weight']

    return df

# Add the engineered columns to both splits.
train, test = (engineer_features(split) for split in (train, test))

for split_name, frame in (("train", train), ("test", test)):
    print(f"Feature engineered {split_name} shape: {frame.shape}")

# Every column except the id and the label.
feature_cols = [col for col in train.columns if col not in ('id', TARGET)]
print(f"New features: {feature_cols}")

Feature engineered train shape: (20758, 27)
Feature engineered test shape: (13840, 26)
New features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'BMI', 'WHO_BMI_Categories', 'Weight_Height_Ratio', 'Age_Group', 'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE', 'Age_Height', 'Age_Weight']


In [4]:
# Column roles: raw string columns are ordinal-encoded, everything else
# (raw numerics and all engineered features) is passed through unchanged.
categorical_cols = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS',
]
numeric_cols = [
    'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
    'BMI', 'WHO_BMI_Categories', 'Weight_Height_Ratio', 'Age_Group',
    'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE', 'Age_Height', 'Age_Weight',
]

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

# Categories not seen during fit are encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Output column order is the transformer order: encoded categoricals first,
# then the passthrough numerics.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ordinal_encoder, categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

# Split label from features, then fit the preprocessor on the full training frame.
y = train[TARGET]
X = train.drop(columns=[TARGET])
X_processed = preprocessor.fit_transform(X)

print(f"Processed X shape: {X_processed.shape}")
print(f"Target y shape: {y.shape}")
print(f"Unique classes in y: {y.unique()}")
print(f"Number of classes: {y.nunique()}")

# Missing-value check on both the feature matrix and the label.
print(f"NaN values in X_processed: {np.isnan(X_processed).sum()}")
print(f"NaN values in y: {y.isna().sum()}")

Categorical columns (9): ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'WHO_BMI_Categories']
Numerical columns (16): ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI', 'Weight_Height_Ratio', 'Age_Group', 'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE', 'Age_Height', 'Age_Weight']
Total features: 25


In [5]:
# Train LGBM model with 9-fold CV (matching winning kernel)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Single source of truth for the fold count and the class count; both were
# previously repeated as literals / recomputed in several places.
n_folds = 9
n_classes = y.nunique()

# Stratified CV with a fixed shuffle seed
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Booster parameters do not depend on the fold, so they are built once
# instead of on every loop iteration.
params = {
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': 'multi_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': -1
}

# Per-fold accuracy, plus out-of-fold class probabilities
# (one row per training sample, one column per class).
fold_scores = []
oof_predictions = np.zeros((len(X_processed), n_classes))

print(f"Training LGBM with {n_folds}-fold CV...")

for fold, (train_idx, val_idx) in enumerate(cv.split(X_processed, y)):
    print(f"Fold {fold+1}/{n_folds}")

    X_train, X_val = X_processed[train_idx], X_processed[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train with early stopping on the validation fold. Note that the same
    # fold is then used for scoring, so the fold accuracy is measured on the
    # data that selected best_iteration.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Class probabilities for the validation fold, then the arg-max class code
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred_labels = np.argmax(val_pred, axis=1)

    # Accuracy is computed directly on the integer codes: mapping both sides
    # back through the label encoder cannot change which predictions match.
    fold_score = accuracy_score(y_val, val_pred_labels)
    fold_scores.append(fold_score)

    # Store OOF predictions
    oof_predictions[val_idx] = val_pred

    print(f"Fold {fold+1} Accuracy: {fold_score:.4f}")

# After the loop, `model` refers to the booster of the last fold only.

# Overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\nCV Score: {cv_score:.4f} ± {cv_std:.4f}")
print(f"Fold scores: {fold_scores}")

Preprocessing pipeline created:
ColumnTransformer(transformers=[('cat',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=-1),
                                 ['Gender', 'family_history_with_overweight',
                                  'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC',
                                  'MTRANS', 'WHO_BMI_Categories']),
                                ('num', 'passthrough',
                                 ['Age', 'Height', 'Weight', 'FCVC', 'NCP',
                                  'CH2O', 'FAF', 'TUE', 'BMI',
                                  'Weight_Height_Ratio', 'Age_Group',
                                  'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE',
                                  'Age_Height', 'Age_Weight'])])


In [6]:
# Prepare test data for prediction
test_processed = preprocessor.transform(test)

# `model` is only the booster left behind by the last CV fold, i.e. it never
# saw that fold's validation rows. For the submission, refit on every
# training row with the same parameters, using the last fold's early-stopped
# iteration count as the number of boosting rounds.
final_rounds = model.best_iteration or model.current_iteration()
final_booster = lgb.train(
    params,
    lgb.Dataset(X_processed, label=y),
    num_boost_round=final_rounds,
)

# Predict class probabilities on the test set and take the arg-max class code
test_pred = final_booster.predict(test_processed)
test_pred_labels = np.argmax(test_pred, axis=1)

# Convert integer codes back to the original class names
test_pred_original = le.inverse_transform(test_pred_labels)

print(f"Test predictions shape: {test_pred_original.shape}")
print(f"Unique predictions: {np.unique(test_pred_original)}")

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    TARGET: test_pred_original
})

print(f"Submission shape: {submission.shape}")
print(f"Submission head:\n{submission.head()}")

# Save submission
submission_path = '/home/submission/submission_lgbm_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Also save OOF predictions for potential ensembling: one probability column
# per class, plus the row id and the (integer-encoded) true label.
oof_df = pd.DataFrame(
    oof_predictions,
    columns=[f'pred_class_{i}' for i in range(oof_predictions.shape[1])],
)
oof_df['id'] = train['id'].values
oof_df[TARGET] = train[TARGET].values

oof_path = '/home/submission/oof_lgbm_baseline.csv'
oof_df.to_csv(oof_path, index=False)
print(f"OOF predictions saved to: {oof_path}")

LGBM parameters:
  random_state: 73
  verbose: -1


In [7]:
# Cross-validation setup
# NOTE(review): RANDOM_SEED and lgbm_params are used by this and the following
# cells but are not defined in any visible cell, so a fresh Restart & Run All
# stops with a NameError. The values below are reconstructed from the recorded
# "LGBM parameters" output (random_state: 73, verbose: -1); that RANDOM_SEED
# shared the value 73 is an assumption — confirm against the original config.
RANDOM_SEED = 73
lgbm_params = {'random_state': RANDOM_SEED, 'verbose': -1}

# This rebinds `cv`, replacing the 9-fold splitter used by the lgb.train run.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

print(f"Using {n_splits}-fold Stratified CV")
print(f"Total samples: {len(X)}")
print(f"Samples per fold: ~{len(X) // n_splits}")

Using 5-fold Stratified CV
Total samples: 20758
Samples per fold: ~4151


In [None]:
# Run cross-validation with the scikit-learn style LGBMClassifier.
# The preprocessor is re-fitted on the training part of each fold, so the
# validation rows never influence the encoder.
# `lgbm_params` must already be defined by an earlier cell.
fold_scores = []

print("Starting cross-validation...")
print(f"Classes: {sorted(y.unique())}")

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit preprocessor on the training part only, then apply it to both parts
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    # Convert to dense arrays (LGBM issue with sparse matrices in some versions)
    if hasattr(X_train_processed, 'toarray'):
        X_train_processed = X_train_processed.toarray()
        X_val_processed = X_val_processed.toarray()

    # Ensure we have float32 for LGBM
    X_train_processed = X_train_processed.astype(np.float32)
    X_val_processed = X_val_processed.astype(np.float32)

    # Train LGBM model. The class is reached through the `lgb` module alias
    # because the bare name LGBMClassifier is never imported in this notebook.
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(X_train_processed, y_train)

    # Predict and evaluate
    val_pred = model.predict(X_val_processed)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)

    print(f"  Fold accuracy: {fold_acc:.4f}")

# Calculate overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*50}")
print(f"CV Results:")
print(f"  Mean accuracy: {cv_score:.4f} ± {cv_std:.4f}")
print(f"  Individual folds: {[f'{score:.4f}' for score in fold_scores]}")
print(f"{'='*50}")

In [None]:
# Generate predictions on test set
print("Training final model on full training data...")

# Fit preprocessor on full data. `test` is the feature-engineered test frame;
# the ColumnTransformer picks the columns it needs by name (the undefined
# name X_test was used here before).
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test)

# Train final model. The class is reached through the `lgb` module alias
# because the bare name LGBMClassifier is never imported in this notebook.
# `lgbm_params` must already be defined by an earlier cell.
final_model = lgb.LGBMClassifier(**lgbm_params)
final_model.fit(X_processed, y)

print("Generating test predictions...")
# `y` holds label-encoded integers, so the classifier predicts integer codes;
# map them back to the original class names before writing the submission.
test_predictions = le.inverse_transform(final_model.predict(X_test_processed))

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    TARGET: test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction distribution:\n{submission[TARGET].value_counts()}")

# Save submission
submission_path = "/home/submission/submission_004.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

In [None]:
# Analyze feature importance
print("Analyzing feature importance...")

# Get feature names after preprocessing.
# The ColumnTransformer emits its transformers in declaration order: the
# ordinal-encoded categorical columns first, then the passthrough numeric
# columns. The names must follow that order or every importance is attached
# to the wrong feature.
feature_names = categorical_cols + numeric_cols
assert len(feature_names) == len(final_model.feature_importances_), \
    "feature name list does not match the model's feature count"

# Create importance dataframe, sorted from most to least important. The index
# is reset so that index + 1 is the rank of each feature.
importance_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': final_model.feature_importances_
    })
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

print("Top 15 features:")
print(importance_df.head(15))

# Check BMI-related features
bmi_features = [f for f in feature_names if 'BMI' in f]
if bmi_features:
    print(f"\nBMI-related features:")
    for feat in bmi_features:
        feat_row = importance_df[importance_df['feature'] == feat]
        imp = feat_row['importance'].iloc[0]
        # Position in the sorted frame (0-based) + 1
        rank = feat_row.index[0] + 1
        print(f"  {feat}: importance={imp:.4f}, rank={rank}")