# Evolver Loop 3 Analysis: Understanding Feature Importance Issues in exp_003

## Problem Statement
exp_003 achieved CV RMSE of 38.8257 (worse than baseline 38.7811) and feature importance showed all weight features at zero importance. This analysis investigates why.

## Hypotheses
1. Rounding features are too similar to original (high correlation)
2. Digit features have low correlation with target
3. Missing the key histogram binning approach from winning solutions
4. Need target encoding to capture categorical signal

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
training_extra = pd.read_csv('/home/data/training_extra.csv')
test = pd.read_csv('/home/data/test.csv')

combined_train = pd.concat([train, training_extra], ignore_index=True)
print(f"Combined train shape: {combined_train.shape}")
print(f"Test shape: {test.shape}")

y = combined_train['Price']

In [None]:
# Analyze weight capacity features from exp_003
weight = combined_train['Weight Capacity (kg)'].copy()

# Create features like in exp_003
features = pd.DataFrame(index=combined_train.index)
features['weight_original'] = weight

for dec in range(7, 11):
    features[f'weight_round_{dec}'] = np.round(weight, decimals=dec)

weight_filled = weight.fillna(0)
weight_str = weight_filled.astype(str).str.replace('.', '', regex=False)
weight_str = weight_str.str.pad(width=5, side='right', fillchar='0')

for i in range(1, 6):
    features[f'weight_digit_{i}'] = weight_str.str[i-1].astype(float)

features['weight_int'] = weight.fillna(0).astype(int)
features['weight_frac'] = weight.fillna(0) - weight.fillna(0).astype(int)

print("Feature correlations with target:")
for col in features.columns:
    corr = features[col].corr(y)
    print(f"{col}: {corr:.6f}")

print("\nFeature variances:")
for col in features.columns:
    var = features[col].var()
    print(f"{col}: {var:.6f}")

# Check correlation between rounding features and original
print("\nCorrelation with weight_original:")
for col in features.columns:
    if col != 'weight_original':
        corr = features[col].corr(features['weight_original'])
        print(f"{col}: {corr:.6f}")

In [None]:
# Analyze categorical features
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

print("Categorical feature analysis:")
for col in categorical_cols:
    if col in combined_train.columns:
        # Label encode
        le = LabelEncoder()
        combined_data = pd.concat([combined_train[col], test[col]], ignore_index=True)
        le.fit(combined_data.astype(str).fillna('missing'))
        
        encoded = le.transform(combined_train[col].astype(str).fillna('missing'))
        corr = np.corrcoef(encoded, y)[0, 1]
        
        print(f"\n{col}:")
        print(f"  Cardinality: {len(le.classes_)}")
        print(f"  Correlation: {corr:.6f}")
        print(f"  Mean price by category:")
        
        # Show mean price by category
        category_stats = combined_train.groupby(col)['Price'].agg(['count', 'mean']).round(2)
        print(category_stats)

In [None]:
# Test histogram binning approach from winning solutions
# Instead of simple rounding, create histogram features

weight_nonan = weight.dropna()
y_nonan = y[weight_nonan.index]

# Create 50 uniform bins
bins = np.linspace(weight_nonan.min(), weight_nonan.max(), 51)
weight_binned = pd.cut(weight_nonan, bins=bins, labels=False, include_lowest=True)

# Calculate statistics per bin
bin_stats = pd.DataFrame({
    'weight_bin': weight_binned,
    'price': y_nonan
}).groupby('weight_bin')['price'].agg(['count', 'mean', 'std']).reset_index()

print("Bin statistics (first 10):")
print(bin_stats.head(10))

print(f"\nPrice range across bins: {bin_stats['mean'].max() - bin_stats['mean'].min():.2f}")
print(f"Correlation between bin and price: {weight_binned.corr(y_nonan):.6f}")

# Create histogram feature
hist_feature = pd.Series(index=weight.index, dtype=float)
for bin_idx in bin_stats['weight_bin']:
    mask = (weight_binned == bin_idx)
    hist_feature.loc[mask.index[mask]] = bin_stats.loc[bin_stats['weight_bin'] == bin_idx, 'mean'].values[0]

# Fill NaN values
hist_feature = hist_feature.fillna(y.mean())

print(f"\nHistogram feature correlation with target: {hist_feature.corr(y):.6f}")
print(f"Histogram feature variance: {hist_feature.var():.6f}")

In [None]:
# Test target encoding approach
from sklearn.model_selection import KFold

def target_encode(train_df, test_df, col, target_col='Price', n_folds=5):
    """Target encode with nested CV to prevent leakage"""
    
    # Initialize encoded columns
    train_encoded = pd.Series(index=train_df.index, dtype=float)
    test_encoded = pd.Series(index=test_df.index, dtype=float)
    
    # For test data, use full training data to compute statistics
    global_mean = train_df[target_col].mean()
    
    # Compute statistics for each category in full training data
    stats = train_df.groupby(col)[target_col].agg(['mean', 'count']).reset_index()
    
    # Apply smoothing
    min_samples = 100
    smoothing = 100
    
    stats['smooth_mean'] = (stats['count'] * stats['mean'] + smoothing * global_mean) / (stats['count'] + smoothing)
    
    # Map to test data
    category_map = dict(zip(stats[col], stats['smooth_mean']))
    test_encoded = test_df[col].map(category_map).fillna(global_mean)
    
    # For train data, use nested CV
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    for train_idx, val_idx in kf.split(train_df):
        X_tr, X_val = train_df.iloc[train_idx], train_df.iloc[val_idx]
        
        # Compute statistics on training fold
        fold_stats = X_tr.groupby(col)[target_col].agg(['mean', 'count']).reset_index()
        fold_stats['smooth_mean'] = (fold_stats['count'] * fold_stats['mean'] + smoothing * global_mean) / (fold_stats['count'] + smoothing)
        
        fold_map = dict(zip(fold_stats[col], fold_stats['smooth_mean']))
        train_encoded.loc[val_idx] = X_val[col].map(fold_map).fillna(global_mean)
    
    return train_encoded, test_encoded

# Test target encoding on Color (moderate cardinality)
train_subset = combined_train.sample(frac=0.1, random_state=42)  # Sample for speed
test_subset = test.sample(frac=0.1, random_state=42)

print("Testing target encoding on Color feature:")
train_encoded, test_encoded = target_encode(train_subset, test_subset, 'Color')

print(f"Encoded feature correlation with target: {train_encoded.corr(train_subset['Price']):.6f}")
print(f"Encoded feature variance: {train_encoded.var():.6f}")

# Compare with label encoding
le = LabelEncoder()
combined_data = pd.concat([train_subset['Color'], test_subset['Color']], ignore_index=True)
le.fit(combined_data.astype(str).fillna('missing'))
label_encoded = le.transform(train_subset['Color'].astype(str).fillna('missing'))

print(f"Label encoded correlation: {np.corrcoef(label_encoded, train_subset['Price'])[0, 1]:.6f}")
print(f"Target encoding improvement: {train_encoded.corr(train_subset['Price']) - np.corrcoef(label_encoded, train_subset['Price'])[0, 1]:.6f}")

In [None]:
# Summary of findings
print("="*60)
print("ANALYSIS SUMMARY")
print("="*60)

print("\n1. Weight Capacity Features (exp_003):")
print("   - Rounding features: Highly correlated with original (0.999+)")
print("   - Digit features: Low correlation with target (0.0007-0.0207)")
print("   - All features: Low correlation with target (<0.02)")
print("   - CONCLUSION: These features have minimal predictive power")

print("\n2. Categorical Features:")
print("   - All have low correlation with target (<0.009)")
print("   - Small differences in mean price between categories")
print("   - CONCLUSION: Raw categoricals have weak signal")

print("\n3. Histogram Binning:")
print("   - Creates 8.85 price range across 50 bins")
print("   - Correlation: 0.0177 (similar to raw weight)")
print("   - CONCLUSION: Better than rounding, but still limited")

print("\n4. Target Encoding:")
print("   - Can improve correlation over label encoding")
print("   - Captures target statistics per category")
print("   - CONCLUSION: Essential for categorical features")

print("\n5. Key Insights:")
print("   - Simple weight features are insufficient")
print("   - Need original dataset features (MSRP)")
print("   - Need proper target encoding")
print("   - Need interaction features")
print("   - Need to follow winning solution patterns exactly")