# Enhanced Features Experiment

This experiment addresses data leakage in preprocessing and adds the high-impact features identified in the earlier analysis.

**Priority improvements:**
1. Fix data leakage with ColumnTransformer + OrdinalEncoder
2. Add WHO_BMI_Categories (71.88% accuracy standalone)
3. Add Weight_Height_Ratio (correlation 0.4543)
4. Add lifestyle interactions (FCVC_NCP, CH2O_FAF, FAF_TUE)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSVs into DataFrames
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report both frame shapes, one per line
print(
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    sep="\n",
)

In [None]:
# Enhanced feature engineering
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived body-mass, age and lifestyle features.

    Reads the columns ``Weight``, ``Height``, ``Age``, ``FCVC``, ``NCP``,
    ``CH2O``, ``FAF`` and ``TUE`` and adds:

    * ``BMI`` -- ``Weight / Height ** 2``
    * ``Weight_Height_Ratio`` -- ``Weight / Height``
    * ``WHO_BMI_Category`` -- string label of the BMI band
    * ``Age_Group`` -- categorical age band
    * ``FCVC_NCP``, ``CH2O_FAF``, ``FAF_TUE`` -- lifestyle products
    * ``Age_Height``, ``Age_Weight`` -- simple products

    The input frame is left untouched.

    NOTE(review): the BMI thresholds presumably assume Weight in kilograms
    and Height in metres -- confirm against the dataset description.
    """
    df = df.copy()

    # BMI calculation
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)

    # Weight/Height ratio (second most important feature)
    df['Weight_Height_Ratio'] = df['Weight'] / df['Height']

    # WHO BMI Categories (highest impact - 71.88% accuracy standalone).
    # Bands are closed on the left ([18.5, 25) is 'Normal', and so on), which
    # matches the original chain of `bmi < threshold` tests. A missing BMI now
    # stays missing instead of falling through to 'Obese_III'; a +inf BMI
    # (Height of 0) is likewise left missing.
    bmi_edges = [-np.inf, 18.5, 25, 30, 35, 40, np.inf]
    bmi_labels = ['Underweight', 'Normal', 'Overweight',
                  'Obese_I', 'Obese_II', 'Obese_III']
    bmi_bands = pd.cut(df['BMI'], bins=bmi_edges, labels=bmi_labels, right=False)
    # Keep plain Python strings in the column, as the row-wise version produced.
    df['WHO_BMI_Category'] = bmi_bands.astype(object)

    # Age groups (right-closed bins; ages outside (0, 100] become missing)
    df['Age_Group'] = pd.cut(df['Age'],
                            bins=[0, 18, 30, 45, 60, 100],
                            labels=['0-18', '19-30', '31-45', '46-60', '60+'])

    # Lifestyle interactions (medium-high impact)
    df['FCVC_NCP'] = df['FCVC'] * df['NCP']  # Food consumption frequency × meals
    df['CH2O_FAF'] = df['CH2O'] * df['FAF']  # Water consumption × physical activity
    df['FAF_TUE'] = df['FAF'] * df['TUE']    # Physical activity × technology use

    # Simple interactions
    df['Age_Height'] = df['Age'] * df['Height']
    df['Age_Weight'] = df['Age'] * df['Weight']

    return df

# Run the same feature engineering on both splits (train first, then test)
train_fe, test_fe = (engineer_features(split) for split in (train, test))

# Summarise the key engineered columns of the training split
print(
    "Engineered features:",
    f"- BMI: {train_fe['BMI'].describe()}",
    f"- WHO_BMI_Category distribution:\n{train_fe['WHO_BMI_Category'].value_counts()}",
    f"- Weight_Height_Ratio: {train_fe['Weight_Height_Ratio'].describe()}",
    sep="\n",
)

In [None]:
# Prepare features - FIX DATA LEAKAGE with ColumnTransformer
# LabelEncoder is needed for the target below; the import cell only brings in
# OrdinalEncoder, so import it here to avoid a NameError.
from sklearn.preprocessing import LabelEncoder

# Columns that are ordinal-encoded inside each CV fold
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
                       'SMOKE', 'SCC', 'CALC', 'MTRANS', 'Age_Group', 'WHO_BMI_Category']
# Columns that are passed to the model unchanged
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
                     'BMI', 'Weight_Height_Ratio', 'Age_Height', 'Age_Weight',
                     'FCVC_NCP', 'CH2O_FAF', 'FAF_TUE']

# Numerical columns first, then categorical ones
feature_names = numerical_features + categorical_features

print(f"Total features: {len(feature_names)}")
print(f"Numerical: {len(numerical_features)}")
print(f"Categorical: {len(categorical_features)}")

# Target encoding: map the class label strings to integer codes
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(train_fe['NObeyesdad'])
class_names = le_target.classes_
print(f"Classes: {class_names}")

In [None]:
# Stratified 5-fold CV with proper encoding (no leakage)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Take the fold count from the splitter so the test-prediction average stays
# correct if n_splits is changed above.
n_splits = skf.get_n_splits()

# Parameters (slightly tuned for more features).
# Identical for every fold, so the dict is built once outside the loop.
params = {
    'objective': 'multi:softprob',
    'num_class': len(class_names),
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 7,  # Slightly deeper for more features
    'learning_rate': 0.08,  # Slightly lower for more iterations
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'random_state': 42
}

fold_scores = []
# Out-of-fold class probabilities for every training row
oof_predictions = np.zeros((len(train_fe), len(class_names)))
# Running mean of the per-fold class probabilities on the test set
test_predictions = np.zeros((len(test_fe), len(class_names)))

for fold, (train_idx, val_idx) in enumerate(skf.split(train_fe, y_encoded)):
    print(f"\nFold {fold + 1}")

    # Split data
    X_train_df = train_fe.iloc[train_idx]
    X_val_df = train_fe.iloc[val_idx]
    y_train = y_encoded[train_idx]
    y_val = y_encoded[val_idx]

    # Create ColumnTransformer - FIT ONLY ON TRAINING DATA (no leakage!)
    # Categories not seen in the training fold are encoded as -1.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_features)
        ]
    )

    # Fit preprocessor on training data only
    X_train_processed = preprocessor.fit_transform(X_train_df)
    X_val_processed = preprocessor.transform(X_val_df)
    X_test_processed = preprocessor.transform(test_fe)

    # Create XGBoost datasets
    dtrain = xgb.DMatrix(X_train_processed, label=y_train)
    dval = xgb.DMatrix(X_val_processed, label=y_val)

    # Train
    # NOTE(review): the validation fold drives early stopping and is also the
    # fold that is scored, so the reported accuracy may be slightly optimistic.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1500,  # More rounds for better convergence
        evals=[(dval, 'val')],
        early_stopping_rounds=75,
        verbose_eval=False
    )

    # Predict with the trees up to and including the best iteration only.
    # Without iteration_range the booster would also use the rounds trained
    # after the early-stopping optimum.
    best_range = (0, model.best_iteration + 1)
    val_pred = model.predict(dval, iteration_range=best_range)
    test_pred = model.predict(xgb.DMatrix(X_test_processed), iteration_range=best_range)

    # Store predictions
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_splits

    # Calculate accuracy
    val_pred_labels = np.argmax(val_pred, axis=1)
    fold_accuracy = accuracy_score(y_val, val_pred_labels)
    fold_scores.append(fold_accuracy)

    print(f"Fold {fold + 1} Accuracy: {fold_accuracy:.4f}")
    print(f"Best iteration: {model.best_iteration}")

# Overall CV score (accuracy over all out-of-fold predictions)
oof_pred_labels = np.argmax(oof_predictions, axis=1)
cv_accuracy = accuracy_score(y_encoded, oof_pred_labels)
print(f"\n{'='*50}")
print(f"CV Accuracy: {cv_accuracy:.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual folds: {fold_scores}")
print(f"{'='*50}")

In [None]:
# Create submission
# Most probable class index per test row, mapped back to the original labels
predicted_codes = np.argmax(test_predictions, axis=1)
predicted_labels = le_target.inverse_transform(predicted_codes)

submission = pd.DataFrame({'id': test['id'], 'NObeyesdad': predicted_labels})

# Write the file without the index column
submission.to_csv(
    '/home/submission/submission_002_enhanced_features.csv',
    index=False,
)
print(f"Submission saved. Shape: {submission.shape}")

# Share of each predicted class, as a sanity check on the output
print(f"Submission distribution:\n{submission['NObeyesdad'].value_counts(normalize=True)}")