# Evolver Loop 3 Analysis: MEstimateEncoder Investigation

This notebook analyzes the winning kernel's use of MEstimateEncoder and prepares recommendations for the next experiment.

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from category_encoders import MEstimateEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# This cell previously contained the same code twice, once per hard-coded
# directory, so it only ran when BOTH directories existed and it read/printed
# everything twice. The data is now loaded once from the first directory that
# actually holds train.csv. '/home/data' is tried first because it is the
# location whose frames ended up in `train`/`test` in the duplicated version.
DATA_DIR_CANDIDATES = (Path('/home/data'), Path('/home/code/data'))
DATA_DIR = next(
    (candidate for candidate in DATA_DIR_CANDIDATES
     if (candidate / 'train.csv').is_file()),
    None,
)
if DATA_DIR is None:
    searched = ', '.join(str(candidate) for candidate in DATA_DIR_CANDIDATES)
    raise FileNotFoundError(f"train.csv not found in any of: {searched}")

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

print("Dataset shapes:")
print(f"Train: {train.shape}")
print(f"Test: {test.shape}")
print("\nTarget distribution:")
print(train['NObeyesdad'].value_counts(normalize=True))

In [None]:
# Categorical predictors examined throughout this notebook
categorical_cols = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS',
]

# For each categorical feature, report how many levels it has, what they are,
# and which target class is most frequent within each level.
for col in categorical_cols:
    levels = train[col].unique()
    print(f"\n{col}:")
    print(f"  Cardinality: {train[col].nunique()}")
    print(f"  Categories: {levels}")

    # Row-normalised crosstab: every row sums to 1 across the target classes
    target_dist = pd.crosstab(train[col], train['NObeyesdad'], normalize='index')
    print("  Most dominant class per category:")
    top_class = target_dist.idxmax(axis=1)
    top_share = target_dist.max(axis=1)
    for level in levels:
        # Missing values never appear in the crosstab index, so skip them
        if pd.isna(level):
            continue
        print(f"    {level}: {top_class[level]} ({top_share.loc[level]:.1%})")

In [None]:
# Calculate chi-square statistics to measure feature importance
from scipy.stats import chi2_contingency

def calculate_chi_square(feature, target):
    """Run a chi-square test of independence between two categorical series.

    Args:
        feature: Values of the categorical feature (anything `pd.crosstab`
            accepts as row labels).
        target: Values of the target, aligned with `feature`.

    Returns:
        A `(chi2, p_value)` tuple taken from `scipy.stats.chi2_contingency`
        applied to the cross-tabulated counts.
    """
    observed_counts = pd.crosstab(feature, target)
    test_result = chi2_contingency(observed_counts)
    # Only the statistic and its p-value are needed; the degrees of freedom
    # and expected frequencies are discarded.
    return test_result[0], test_result[1]

print("Chi-square statistics for categorical features:")
chi2_results = []
for feature_name in categorical_cols:
    statistic, p_val = calculate_chi_square(train[feature_name], train['NObeyesdad'])
    print(f"{feature_name:30s}: chi2={statistic:10.2f}, p_value={p_val:.2e}")
    chi2_results.append((feature_name, statistic, p_val))

# Rank the features from largest to smallest chi-square statistic
chi2_results = sorted(chi2_results, key=lambda row: row[1], reverse=True)
print("\nFeatures sorted by chi-square (most important first):")
for feature_name, statistic, p_val in chi2_results:
    print(f"{feature_name:30s}: {statistic:10.2f}")

In [None]:
# Test MEstimateEncoder performance vs OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Define features: every column except the target
feature_cols = [col for col in train.columns if col != 'NObeyesdad']
X = train[feature_cols]

# Encode the target as integers 0..n_classes-1.
# XGBClassifier rejects labels that are not consecutive integers starting at 0,
# and MEstimateEncoder needs a numeric target, so passing the raw column would
# make every cross-validation fit fail (assuming 'NObeyesdad' holds class names
# as strings -- the encoding is harmless if it is already integer-coded).
# `label_encoder.classes_` maps the integer codes back to the original labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train['NObeyesdad']),
    index=train.index,  # keep y aligned with X's index for index-aware transformers
    name='NObeyesdad',
)

# Test different encoding strategies
def test_encoding(encoder, encoder_name):
    """Cross-validate an XGBoost classifier placed behind the given encoder.

    The encoder and classifier are chained in a single Pipeline and scored
    with 5-fold stratified cross-validation (accuracy) on the module-level
    `X` and `y`. A short summary is printed.

    Args:
        encoder: Transformer used as the first pipeline step.
        encoder_name: Label printed in front of the results.

    Returns:
        Tuple `(mean_accuracy, std_accuracy)` over the folds.
    """
    xgb_params = dict(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='mlogloss',
    )
    model = Pipeline(steps=[
        ('encoder', encoder),
        ('xgb', XGBClassifier(**xgb_params)),
    ])

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=folds, scoring='accuracy')
    mean_accuracy = scores.mean()
    std_accuracy = scores.std()

    print(f"{encoder_name}:")
    # The reported spread is two standard deviations of the fold scores
    print(f"  Mean accuracy: {mean_accuracy:.4f} (+/- {std_accuracy * 2:.4f})")
    print(f"  Fold scores: {scores}")
    return mean_accuracy, std_accuracy

def _encode_categoricals_with(encoder):
    """Apply `encoder` to the categorical columns; pass all other columns through."""
    return ColumnTransformer(
        transformers=[('cat', encoder, categorical_cols)],
        remainder='passthrough',
    )


# Baseline: OrdinalEncoder (current approach); unseen categories map to -1
ordinal_encoder = _encode_categoricals_with(
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
)
ordinal_mean, ordinal_std = test_encoding(ordinal_encoder, "OrdinalEncoder (current)")

# Candidate: MEstimateEncoder on the same categorical columns
mestimate_encoder = _encode_categoricals_with(
    MEstimateEncoder(cols=categorical_cols)
)
mestimate_mean, mestimate_std = test_encoding(mestimate_encoder, "MEstimateEncoder")

# Positive values mean MEstimateEncoder scored higher than the baseline
print(f"\nImprovement: {mestimate_mean - ordinal_mean:.4f}")

## Key Findings

Based on this analysis and the winning kernel review:

1. **MEstimateEncoder is superior**: The winning kernel achieved 0.92160 using MEstimateEncoder vs our 0.906 with OrdinalEncoder
2. **Target encoding captures relationships**: MEstimateEncoder converts categories to target probabilities, preserving the relationship between categories and target
3. **Appropriate features**: The kernel used MEstimateEncoder for 8 categorical features with moderate cardinality (2-6 categories each)
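4. **Must prevent leakage**: MEstimateEncoder must be fit within CV folds. Our approach handles this correctly because the encoder (wrapped in a ColumnTransformer) is a step of the Pipeline passed to `cross_val_score`, so it is refit on the training portion of each fold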

## Recommendations for Next Experiment

1. **Replace OrdinalEncoder with MEstimateEncoder** for the 8 categorical features
2. **Keep enhanced features**: WHO_BMI_Categories, Weight_Height_Ratio, lifestyle interactions
3. **Test both encoders**: Run comparison to validate improvement
4. **Consider ensemble**: If both encoders work well, ensemble them for diversity