# Loop 3 Analysis: Diagnosing Target Encoding Failure

Investigate why adding target encoding and product features degraded the out-of-fold RMSLE (lower is better) from 0.020 to 0.211.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

# Fixed seed so the KFold splits used below are reproducible.
SEED = 42
np.random.seed(SEED)

print("Loading data and OOF predictions...")

# Raw competition data
train_df = pd.read_csv('/home/code/data/train.csv')
test_df = pd.read_csv('/home/code/data/test.csv')

# Out-of-fold predictions: the baseline model and the target-encoding experiment
oof_baseline = pd.read_csv('/home/submission/oof_predictions.csv')
oof_target_enc = pd.read_csv('/home/code/experiments/oof_003_xgb_simple.csv')

# Report the shape of every loaded frame, one per line
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"Baseline OOF shape: {oof_baseline.shape}",
    f"Target encoding OOF shape: {oof_target_enc.shape}",
    sep="\n",
)

In [None]:
# Per-fold RMSLE of both models, computed from their out-of-fold predictions.
# NOTE(review): this assumes the OOF files are row-aligned with train_df and
# that this KFold configuration matches the one used to produce them - confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

baseline_scores = []
target_enc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), start=1):
    y_val = train_df['Calories'].iloc[val_idx].values

    # Baseline file keeps its predictions in 'Calories' (not 'oof_prediction')
    pred_baseline = oof_baseline['Calories'].iloc[val_idx].values
    # Clip at zero: the log-based metric is undefined for negative predictions
    msle_baseline = mean_squared_log_error(y_val, np.clip(pred_baseline, 0, None))
    score_baseline = np.sqrt(msle_baseline)
    baseline_scores.append(score_baseline)

    # Target-encoding experiment keeps its predictions in 'oof_prediction'
    pred_target_enc = oof_target_enc['oof_prediction'].iloc[val_idx].values
    msle_target_enc = mean_squared_log_error(y_val, np.clip(pred_target_enc, 0, None))
    score_target_enc = np.sqrt(msle_target_enc)
    target_enc_scores.append(score_target_enc)

    print(f"Fold {fold}: Baseline={score_baseline:.6f}, TargetEnc={score_target_enc:.6f} (diff: {score_target_enc-score_baseline:+.6f})")

# Mean and spread across folds, then the gap between the two models
print(
    f"\nMean Baseline: {np.mean(baseline_scores):.6f} ± {np.std(baseline_scores):.6f}",
    f"Mean TargetEnc: {np.mean(target_enc_scores):.6f} ± {np.std(target_enc_scores):.6f}",
    f"\nDegradation: {np.mean(target_enc_scores) - np.mean(baseline_scores):+.6f}",
    sep="\n",
)

In [None]:
# Analyze prediction differences sample by sample
y_true = train_df['Calories'].values
# The baseline OOF file stores its predictions in the 'Calories' column (the
# fold-wise scoring reads that same column); indexing it with 'oof_prediction'
# raises a KeyError.
pred_baseline = oof_baseline['Calories'].values
pred_target_enc = oof_target_enc['oof_prediction'].values

# Absolute errors on the raw target scale. Note this is not the log-scale error
# that the RMSLE metric penalises, so the counts below are a rough guide only.
error_baseline = np.abs(pred_baseline - y_true)
error_target_enc = np.abs(pred_target_enc - y_true)

# Where does target encoding perform worse / better than the baseline?
worse_mask = error_target_enc > error_baseline
better_mask = error_target_enc < error_baseline

print(f"Samples where target encoding is WORSE: {worse_mask.sum()} ({worse_mask.mean()*100:.1f}%)")
print(f"Samples where target encoding is BETTER: {better_mask.sum()} ({better_mask.mean()*100:.1f}%)")
print(f"Samples with equal error: {(error_target_enc == error_baseline).sum()}")

# Analyze by target value: are the degraded samples high- or low-calorie rows?
print("\n=== Analysis by Target Value ===")
print(f"Mean target (worse samples): {y_true[worse_mask].mean():.2f}")
print(f"Mean target (better samples): {y_true[better_mask].mean():.2f}")
print(f"Mean target (all samples): {y_true.mean():.2f}")

# Analyze by Sex: share of each group's rows that got worse
print("\n=== Analysis by Sex ===")
for sex in ['male', 'female']:
    mask = train_df['Sex'] == sex
    worse_pct = (worse_mask & mask).sum() / mask.sum() * 100
    print(f"{sex}: {worse_pct:.1f}% of samples are worse with target encoding")

In [None]:
# No feature-importance data is loaded in this cell; it only records the
# candidate explanations to investigate next.
print("Note: Need to re-run model with feature importance to analyze which features are being used.")
print(
    "\nHypotheses for performance degradation:",
    "1. Target encoding on 'Sex' is overfitting (only 2 categories)",
    "2. Product features are adding noise",
    "3. Too many features causing overfitting with current hyperparameters",
    "4. Smoothing parameter (p_smooth=20) is inappropriate",
    "5. Binned features + target encoding + products = too much complexity",
    sep="\n",
)

In [None]:
# Correlation check between the engineered features and the target
print("Loading engineered features from the target encoding experiment...")

# The features are not stored anywhere, so they are recreated below.
# These are the numeric input columns used for the per-column transforms.
num_features = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
]

# Add features function from the experiment
def add_features(df, num_cols):
    """Return a copy of ``df`` extended with log, product and ratio features.

    Args:
        df: Input frame; must contain the ``Weight``, ``Duration``,
            ``Heart_Rate`` and ``Height`` columns plus every column in
            ``num_cols``.
        num_cols: Columns that each get a ``<col>_log1p`` companion column.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    enriched = df.copy()

    # log1p companion for every requested numeric column
    for name in num_cols:
        enriched[f'{name}_log1p'] = np.log1p(enriched[name])

    # Pairwise products, stored as '<left>_<right>'
    product_pairs = (
        ('Weight', 'Duration'),
        ('Duration', 'Heart_Rate'),
        ('Height', 'Weight'),
    )
    for left, right in product_pairs:
        enriched[f'{left}_{right}'] = enriched[left] * enriched[right]

    # Weight-to-height ratio; the small epsilon avoids division by zero
    safe_height = enriched['Height'] + 1e-6
    enriched['Weight_Height'] = enriched['Weight'] / safe_height

    return enriched

# Add target encoding function
def get_target_encoding(X_train, y_train, X_val, X_test, col='Sex', smoothing=20):
    """Add a smoothed target-mean encoding of ``col`` to three frames.

    Per-category statistics are computed from ``X_train[col]`` and ``y_train``
    only, then applied to the train, validation and test frames. A category
    with ``count`` training rows and target mean ``mean`` is encoded as::

        (count * mean + smoothing * global_mean) / (count + smoothing)

    Values that have no encoding from the training data (unseen categories,
    missing values) receive the global training mean.

    Note that the training frame is encoded with statistics that include each
    row's own target value.

    Args:
        X_train: Training features; must contain ``col``.
        y_train: Training targets, positionally matching ``X_train``.
        X_val: Validation features; must contain ``col``.
        X_test: Test features; must contain ``col``.
        col: Name of the categorical column to encode.
        smoothing: Weight of the global mean in the smoothed estimate.

    Returns:
        Tuple ``(X_train_enc, X_val_enc, X_test_enc)`` of copies of the inputs,
        each with an extra ``f'{col}_target_enc'`` column.
    """
    global_mean = y_train.mean()
    stats = pd.DataFrame({
        'target': y_train,
        'category': X_train[col]
    }).groupby('category')['target'].agg(['count', 'mean'])

    # One smoothed value per category, computed once instead of once per row.
    smoothed = (
        (stats['count'] * stats['mean'] + smoothing * global_mean)
        / (stats['count'] + smoothing)
    )

    def encode(series):
        # Vectorised lookup; values without an encoding come back as NaN and
        # are replaced by the global mean. Casting to object keeps the lookup
        # independent of the column's dtype (e.g. categorical).
        mapped = pd.Series(series).astype(object).map(smoothed)
        return mapped.fillna(global_mean).to_numpy(dtype=float)

    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()

    encoded_name = f'{col}_target_enc'
    X_train_enc[encoded_name] = encode(X_train[col])
    X_val_enc[encoded_name] = encode(X_val[col])
    X_test_enc[encoded_name] = encode(X_test[col])

    return X_train_enc, X_val_enc, X_test_enc

# Recreate the experiment's features on the first CV fold only (not the full
# training set) and inspect how they correlate with the target.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
train_idx, val_idx = next(iter(kf.split(train_df)))  # first fold

X_tr = train_df.iloc[train_idx].copy()
X_va = train_df.iloc[val_idx].copy()
y_tr = X_tr['Calories'].values
y_va = X_va['Calories'].values

# Drop the target, then add the engineered features
X_tr = add_features(X_tr.drop('Calories', axis=1), num_features)
X_va = add_features(X_va.drop('Calories', axis=1), num_features)

# Target encoding fitted on the training part of the fold; the encoded test
# frame is not needed here
X_tr_enc, X_va_enc, _ = get_target_encoding(X_tr, y_tr, X_va, test_df.copy())

# Pearson correlation of every feature with the target
print("\n=== Correlation Analysis ===")
feature_cols = [c for c in X_tr_enc.columns if c not in ['id', 'Sex']]
correlations = {
    col: np.corrcoef(X_tr_enc[col], y_tr)[0, 1]
    for col in feature_cols
}

# Strongest relationships first, regardless of sign
sorted_corr = sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)

print("Top correlations with target:")
for col, corr in sorted_corr[:15]:
    print(f"  {col}: {corr:.4f}")

print("\nTarget encoding correlation:")
if 'Sex_target_enc' in correlations:
    print(f"  Sex_target_enc: {correlations['Sex_target_enc']:.4f}")

# Is the target-encoded column largely redundant with another feature?
print("\n=== Feature Redundancy Check ===")
if 'Sex_target_enc' in X_tr_enc.columns:
    sex_enc_corr = {}
    for col in feature_cols:
        if col == 'Sex_target_enc':
            continue
        corr = np.corrcoef(X_tr_enc['Sex_target_enc'], X_tr_enc[col])[0, 1]
        if abs(corr) > 0.5:
            sex_enc_corr[col] = corr

    if not sex_enc_corr:
        print("No features highly correlated with Sex_target_enc (|corr| > 0.5)")
    else:
        print("Features highly correlated with Sex_target_enc:")
        for col, corr in sex_enc_corr.items():
            print(f"  {col}: {corr:.4f}")

In [None]:
# Human-readable recap of the diagnosis. The headline scores in section 1 are
# fixed text, not recomputed; section 2 uses the masks computed earlier.
banner = "=" * 60
print(banner)
print("DIAGNOSTIC SUMMARY")
print(banner)

print(
    "\n1. PERFORMANCE DEGRADATION:",
    "   Baseline XGBoost: 0.02047",
    "   With target encoding + products: 0.21156",
    "   Degradation: +0.19109 (+933% worse)",
    sep="\n",
)

print(
    "\n2. PREDICTION ANALYSIS:",
    f"   Target encoding is WORSE on {worse_mask.sum()} samples ({worse_mask.mean()*100:.1f}%)",
    f"   Target encoding is BETTER on {better_mask.sum()} samples ({better_mask.mean()*100:.1f}%)",
    sep="\n",
)

print(
    "\n3. KEY HYPOTHESES:",
    "   a) 'Sex' has only 2 categories - target encoding may overfit",
    "   b) Product features may add noise rather than signal",
    "   c) Too many features (23) with same hyperparameters causes overfitting",
    "   d) Smoothing=20 may be inappropriate for this dataset",
    "   e) Manual target encoding may have implementation issues",
    sep="\n",
)

print(
    "\n4. RECOMMENDED NEXT STEPS:",
    "   - Run ablation study: test target encoding only, products only, both",
    "   - Try sklearn's TargetEncoder with internal cross-fitting",
    "   - Test different smoothing parameters (5, 10, 50, 100)",
    "   - Increase regularization (reg_alpha, reg_lambda, reduce depth)",
    "   - Remove binned features (may be redundant with target encoding)",
    "   - Check for data leakage in target encoding implementation",
    sep="\n",
)

In [None]:
# Collect the headline numbers and hypotheses in one dict.
# NOTE: `findings` only lives in memory; no file is written in this cell.
baseline_mean = np.mean(baseline_scores)
target_enc_mean = np.mean(target_enc_scores)

findings = {
    'baseline_score': baseline_mean,
    'target_enc_score': target_enc_mean,
    'degradation': target_enc_mean - baseline_mean,
    'worse_samples_pct': worse_mask.mean() * 100,
    'better_samples_pct': better_mask.mean() * 100,
    'hypotheses': [
        'Sex has only 2 categories - target encoding may overfit',
        'Product features may add noise rather than signal',
        'Too many features (23) with same hyperparameters causes overfitting',
        'Smoothing=20 may be inappropriate',
        'Manual target encoding implementation may have issues',
    ],
}

print(
    "Findings saved to analysis notebook.",
    "\nNext: Run ablation studies and test alternative implementations.",
    sep="\n",
)