# Baseline XGBoost Model

Simple baseline following the seed prompt strategy with basic feature engineering.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target distribution:\n{train['NObeyesdad'].value_counts(normalize=True)}")

Train shape: (20758, 18)
Test shape: (13840, 17)
Target distribution:
NObeyesdad
Obesity_Type_III       0.194913
Obesity_Type_II        0.156470
Normal_Weight          0.148473
Obesity_Type_I         0.140187
Insufficient_Weight    0.121544
Overweight_Level_II    0.121495
Overweight_Level_I     0.116919
Name: proportion, dtype: float64


In [2]:
# Basic feature engineering
def engineer_features(df):
    df = df.copy()
    
    # BMI calculation - critical feature
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    
    # Age groups
    df['Age_Group'] = pd.cut(df['Age'], 
                            bins=[0, 18, 30, 45, 60, 100], 
                            labels=['0-18', '19-30', '31-45', '46-60', '60+'])
    
    # Simple interactions
    df['Age_Height'] = df['Age'] * df['Height']
    df['Age_Weight'] = df['Age'] * df['Weight']
    
    return df

train_fe = engineer_features(train)
test_fe = engineer_features(test)

In [3]:
# Prepare features
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 
                       'SMOKE', 'SCC', 'CALC', 'MTRANS', 'Age_Group']
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 
                     'BMI', 'Age_Height', 'Age_Weight']

# Encode categorical features
encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Fit on combined data to handle any unseen categories
    combined = pd.concat([train_fe[col], test_fe[col]], axis=0).astype(str)
    le.fit(combined)
    train_fe[col] = le.transform(train_fe[col].astype(str))
    test_fe[col] = le.transform(test_fe[col].astype(str))
    encoders[col] = le

X = train_fe[numerical_features + categorical_features]
y = train_fe['NObeyesdad']
X_test = test_fe[numerical_features + categorical_features]

print(f"Feature matrix shape: {X.shape}")
print(f"Test matrix shape: {X_test.shape}")

Feature matrix shape: (20758, 20)
Test matrix shape: (13840, 20)


In [4]:
# Stratified 5-fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []
oof_predictions = np.zeros((len(X), len(train['NObeyesdad'].unique())))
test_predictions = np.zeros((len(X_test), len(train['NObeyesdad'].unique())))

# Get class labels
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
class_names = le_target.classes_

print(f"Classes: {class_names}")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold + 1}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # Create XGBoost datasets
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Parameters
    params = {
        'objective': 'multi:softprob',
        'num_class': len(class_names),
        'eval_metric': 'mlogloss',
        'tree_method': 'hist',
        'device': 'cuda',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42
    }
    
    # Train
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, 'val')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Predict
    val_pred = model.predict(dval)
    test_pred = model.predict(xgb.DMatrix(X_test))
    
    # Store predictions
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / 5
    
    # Calculate accuracy
    val_pred_labels = np.argmax(val_pred, axis=1)
    fold_accuracy = accuracy_score(y_val, val_pred_labels)
    fold_scores.append(fold_accuracy)
    
    print(f"Fold {fold + 1} Accuracy: {fold_accuracy:.4f}")

# Overall CV score
oof_pred_labels = np.argmax(oof_predictions, axis=1)
cv_accuracy = accuracy_score(y_encoded, oof_pred_labels)
print(f"\nCV Accuracy: {cv_accuracy:.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual folds: {fold_scores}")

Classes: ['Insufficient_Weight' 'Normal_Weight' 'Obesity_Type_I' 'Obesity_Type_II'
 'Obesity_Type_III' 'Overweight_Level_I' 'Overweight_Level_II']

Fold 1


Fold 1 Accuracy: 0.9109

Fold 2


Fold 2 Accuracy: 0.9037

Fold 3


Fold 3 Accuracy: 0.9121

Fold 4


Fold 4 Accuracy: 0.9034

Fold 5


Fold 5 Accuracy: 0.9056

CV Accuracy: 0.9071 ± 0.0037
Individual folds: [0.9108863198458574, 0.9036608863198459, 0.9120905587668593, 0.9033967718622019, 0.9055649241146712]


In [5]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'NObeyesdad': le_target.inverse_transform(np.argmax(test_predictions, axis=1))
})

submission.to_csv('/home/submission/submission_001_baseline_xgboost.csv', index=False)
print(f"Submission saved. Shape: {submission.shape}")
print(f"Submission distribution:\n{submission['NObeyesdad'].value_counts(normalize=True)}")

Submission saved. Shape: (13840, 2)
Submission distribution:
NObeyesdad
Obesity_Type_III       0.189668
Normal_Weight          0.153324
Obesity_Type_II        0.152890
Obesity_Type_I         0.149566
Overweight_Level_II    0.126445
Insufficient_Weight    0.123916
Overweight_Level_I     0.104191
Name: proportion, dtype: float64
