# Evolver Loop 3 Analysis: Understanding Feature Importance Issues in exp_003

## Problem Statement
exp_003 achieved a CV RMSE of 38.8257 (worse than the baseline's 38.7811), and its feature-importance report assigned zero importance to every weight-derived feature. This analysis investigates why.

## Hypotheses
1. Rounding features are too similar to original (high correlation)
2. Digit features have low correlation with target
3. Missing the key histogram binning approach from winning solutions
4. Need target encoding to capture categorical signal

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the three competition CSVs from the fixed data directory.
train = pd.read_csv('/home/data/train.csv')
training_extra = pd.read_csv('/home/data/training_extra.csv')
test = pd.read_csv('/home/data/test.csv')

# Stack the base and extra training rows into one frame with a fresh
# 0..n-1 index so that downstream feature frames can share that index.
combined_train = pd.concat(
    (train, training_extra),
    ignore_index=True,
)

combined_shape = combined_train.shape
test_shape = test.shape
print(f"Combined train shape: {combined_shape}")
print(f"Test shape: {test_shape}")

# Regression target used by the correlation checks in later cells.
y = combined_train['Price']

In [None]:
# Rebuild the weight-capacity features used in exp_003 and report, for each
# one, its correlation with the target, its variance, and its correlation
# with the raw weight column.
weight = combined_train['Weight Capacity (kg)'].copy()

# The feature frame shares the training index so every column lines up with `y`.
features = pd.DataFrame(index=combined_train.index)
features['weight_original'] = weight

# Copies of the raw weight rounded to 7, 8, 9 and 10 decimal places.
for dec in (7, 8, 9, 10):
    features[f'weight_round_{dec}'] = np.round(weight, dec)

# Digit features: stringify the zero-filled weight, drop the decimal point,
# and right-pad with '0' so every string has at least five characters.
# NOTE(review): digits are taken by position from the left of that string, so
# values with a different number of integer digits (e.g. 5.5 -> "55000",
# 12.5 -> "12500") do not share place values per column -- confirm this
# matches what exp_003 intended.
weight_filled = weight.fillna(0)
weight_str = (
    weight_filled.astype(str)
    .str.replace('.', '', regex=False)
    .str.pad(width=5, side='right', fillchar='0')
)

for i in range(1, 6):
    position = i - 1  # column names are 1-based, string positions 0-based
    features[f'weight_digit_{i}'] = weight_str.str.get(position).astype(float)

# Integer part (via astype(int) truncation) and the remaining fractional
# part, both computed on the zero-filled weight.
weight_whole = weight_filled.astype(int)
features['weight_int'] = weight_whole
features['weight_frac'] = weight_filled - weight_whole

print("Feature correlations with target:")
for col in features.columns:
    print(f"{col}: {features[col].corr(y):.6f}")

print("\nFeature variances:")
for col in features.columns:
    print(f"{col}: {features[col].var():.6f}")

# How closely does each engineered feature track the raw weight?
print("\nCorrelation with weight_original:")
baseline_weight = features['weight_original']
for col in features.columns:
    if col == 'weight_original':
        continue
    print(f"{col}: {features[col].corr(baseline_weight):.6f}")