# Evolver Loop 5 Analysis: Debugging Feature Importance and Adding Histogram Binning

## Objectives
1. Debug the zero feature importance issue in exp_004
2. Implement histogram binning (1st place technique)
3. Fix potential feature name issues
4. Prepare for next experiment

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Read the three competition CSVs needed to reproduce exp_004. The two
# labelled files are stacked into one training frame below.
print("Loading exp_004 features...")
train, training_extra, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/data/train.csv',
        '/home/data/training_extra.csv',
        '/home/data/test.csv',
    )
)
# ignore_index=True renumbers the stacked rows 0..n-1.
combined_train = pd.concat([train, training_extra], ignore_index=True)

# Recreate features from exp_004 (simplified version)
def create_debug_features():
    """Rebuild a reduced exp_004 feature set for debugging.

    Reads the module-level ``combined_train`` frame and returns a DataFrame
    on the same index with these columns:

    * ``weight_original`` - the raw 'Weight Capacity (kg)' value
    * ``weight_round_7`` .. ``weight_round_10`` - that value rounded to
      7, 8, 9 and 10 decimals
    * ``weight_capacity_mean_price`` - mean 'Price' over all rows of
      ``combined_train`` that share the row's weight capacity
    """
    weight_col = 'Weight Capacity (kg)'
    capacity = combined_train[weight_col].copy()

    # Baseline features: the raw value plus its rounded variants
    columns = {'weight_original': capacity}
    columns.update(
        (f'weight_round_{places}', np.round(capacity, decimals=places))
        for places in range(7, 11)
    )

    # Simple groupby feature (mean only, enough to test importance extraction)
    mean_price_by_capacity = combined_train.groupby(weight_col)['Price'].mean()
    columns['weight_capacity_mean_price'] = capacity.map(mean_price_by_capacity)

    return pd.DataFrame(columns, index=combined_train.index)

# Build the debug feature matrix and print a quick sanity report on it.
X_debug = create_debug_features()
print(f"Debug features shape: {X_debug.shape}")
print(f"Feature names: {X_debug.columns.tolist()}")
print("\nSample values:")
print(X_debug.head())

# Missing values per column
print("\nNaN counts:")
print(X_debug.isnull().sum())

# min / max / std per column; a std of 0 would flag a constant column
print("\nValue ranges:")
for col, values in X_debug.items():
    lo, hi, spread = values.min(), values.max(), values.std()
    print(f"{col}: {lo:.6f} - {hi:.6f} (std: {spread:.6f})")

## Test Feature Importance Extraction

In [None]:
# Fit one XGBoost model on a single fold of the debug features and compare
# three ways of reading feature importance back from it.
y = combined_train['Price'].values

# Simple train/validation split: take the first fold of a shuffled 5-fold
# split without materialising the other four.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = next(iter(kf.split(X_debug)))

X_train, X_val = X_debug.iloc[train_idx], X_debug.iloc[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# Train XGBoost
model = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# Try different importance extraction methods. Each attempt is wrapped so a
# failure in one method is reported and the remaining methods still run.
print("Method 1: model.get_booster().get_score()")
try:
    importance1 = model.get_booster().get_score(importance_type='gain')
    print(f"Importance dict: {importance1}")
except Exception as e:
    print(f"Error: {e}")

print("\nMethod 2: model.get_booster().get_score() with fmap")
try:
    # Create feature map
    feature_names = X_debug.columns.tolist()
    with open('/tmp/feature_map.txt', 'w') as f:
        for i, name in enumerate(feature_names):
            # Clean feature names - remove special characters
            clean_name = name.replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')
            # A feature-map line has exactly three fields:
            # <feature id> <feature name> <type>, where 'q' marks a
            # quantitative feature. An extra column makes the file unparsable.
            f.write(f'{i}\t{clean_name}\tq\n')

    importance2 = model.get_booster().get_score(fmap='/tmp/feature_map.txt', importance_type='gain')
    print(f"Importance dict: {importance2}")
except Exception as e:
    print(f"Error: {e}")

print("\nMethod 3: model.feature_importances_")
try:
    importance3 = model.feature_importances_
    for name, imp in zip(X_debug.columns, importance3):
        print(f"{name}: {imp:.6f}")
except Exception as e:
    print(f"Error: {e}")

# Test predictions to verify model is working
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f"\nValidation RMSE: {rmse:.6f}")

## Test with Cleaned Feature Names

In [None]:
# Test with cleaned feature names (no special characters)
def create_clean_features():
    """Build the baseline weight features under punctuation-free column names.

    Reads the module-level ``combined_train`` frame. The returned DataFrame
    shares its index and holds the raw 'Weight Capacity (kg)' value, that
    value rounded to 7-10 decimals, and the mean 'Price' of all rows with the
    same weight capacity.
    """
    source_col = 'Weight Capacity (kg)'
    weight = combined_train[source_col].copy()

    # Column names contain only letters, digits and underscores
    features = weight.to_frame(name='weight_original')
    for dec in (7, 8, 9, 10):
        features[f'weight_round_{dec}'] = weight.round(dec)

    # Per-capacity mean price, stored under a cleaned name
    price_by_weight = combined_train.groupby(source_col)['Price'].mean()
    features['weight_capacity_mean_price'] = weight.map(price_by_weight)

    return features

X_clean = create_clean_features()

# Rename columns to be XGBoost-friendly: spaces and hyphens become
# underscores, parentheses are dropped.
clean_names = {
    col: col.replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')
    for col in X_clean.columns
}

X_clean_renamed = X_clean.rename(columns=clean_names)
print(f"Cleaned feature names: {X_clean_renamed.columns.tolist()}")

# Train with cleaned names, on the same fold as the earlier debug model
X_train_clean = X_clean_renamed.iloc[train_idx]
model_clean = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
model_clean.fit(X_train_clean, y_train)

print("\nFeature importance with cleaned names:")
importance_clean = model_clean.get_booster().get_score(importance_type='gain')
for name, imp in importance_clean.items():
    print(f"{name}: {imp:.6f}")

# Check if any features have zero importance.
# get_score() only lists features that were used in at least one split, so a
# zero-importance feature shows up as a *missing* key. Iterating the dict
# alone can never find one; check every column name against the dict instead.
zero_importance = [
    name for name in X_clean_renamed.columns
    if importance_clean.get(name, 0) == 0
]
if zero_importance:
    print(f"\nFeatures with zero importance: {zero_importance}")
else:
    print("\nAll features have non-zero importance!")

## Implement Histogram Binning

Based on the 1st-place solution by Chris Deotte.

In [None]:
def create_histogram_features(train_df, test_df, target_col='Price', group_col='Weight Capacity (kg)', n_bins=50):
    """
    Create histogram bin features following 1st place solution.

    For every distinct value of ``group_col`` in ``train_df``, count how many
    of that group's ``target_col`` values fall into each of ``n_bins`` bins.
    Bin edges are the 0..100 percentiles (``n_bins + 1`` evenly spaced) of the
    non-missing training targets. Each bin is half-open ``[left, right)``
    except the last, which also includes its right edge (the same convention
    as ``np.histogram``).

    Args:
        train_df: DataFrame containing ``group_col`` and ``target_col``.
        test_df: DataFrame containing ``group_col``.
        target_col: Numeric column whose distribution is histogrammed.
        group_col: Column whose distinct values define the groups.
        n_bins: Number of histogram bins (and output columns); must be >= 1.

    Returns:
        ``(hist_features_train, hist_features_test)``: DataFrames with columns
        ``hist_bin_0`` .. ``hist_bin_{n_bins-1}`` (in that order), one row per
        row of ``train_df`` / ``test_df``. Rows whose group was not seen in
        training, or whose group key is missing, get 0 in every bin.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1 or ``target_col`` has no
            non-missing value in ``train_df``.

    NOTE: the counts are computed from all of ``train_df`` and merged back
    onto the same rows, so each training row's histogram includes that row's
    own target value.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins}")

    bin_names = [f'hist_bin_{i}' for i in range(n_bins)]

    target = train_df[target_col].to_numpy(dtype=float)
    has_target = ~np.isnan(target)
    if not has_target.any():
        raise ValueError(f"Column {target_col!r} has no non-missing values to build bins from")

    # Create bins based on training data. Missing targets are excluded first:
    # a single NaN would otherwise turn every percentile edge into NaN.
    price_bins = np.percentile(target[has_target], np.linspace(0, 100, n_bins + 1))
    print(f"Creating {n_bins} histogram bins for {group_col}")
    print(f"Price bin edges: {price_bins[:5]}...{price_bins[-5:]}")

    # Compute histograms for each group in one vectorized pass instead of
    # calling a Python function once per group.
    print(f"Computing histograms for each {group_col} group...")
    # Missing group keys get code -1 and are left out of the counts.
    group_codes, group_keys = pd.factorize(train_df[group_col], sort=True)
    countable = has_target & (group_codes >= 0)

    # side='right' puts a value equal to an edge into the bin that starts at
    # that edge; clipping folds the overall maximum into the last bin.
    bin_idx = np.searchsorted(price_bins, target[countable], side='right') - 1
    bin_idx = np.clip(bin_idx, 0, n_bins - 1)

    # Flatten (group, bin) into one integer so a single bincount yields the
    # whole groups x bins count table.
    n_groups = len(group_keys)
    flat_idx = group_codes[countable] * n_bins + bin_idx
    counts = np.bincount(flat_idx, minlength=n_groups * n_bins).reshape(n_groups, n_bins)

    group_histograms_df = pd.DataFrame(
        counts,
        index=pd.Index(group_keys, name=group_col),
        columns=bin_names,
    )
    print(f"Histogram features shape: {group_histograms_df.shape}")

    # Merge to train data
    train_with_hist = train_df[[group_col]].merge(
        group_histograms_df,
        left_on=group_col,
        right_index=True,
        how='left'
    )

    # For test data, use same histograms computed from train
    test_with_hist = test_df[[group_col]].merge(
        group_histograms_df,
        left_on=group_col,
        right_index=True,
        how='left'
    )

    # Keep only the histogram columns; groups not seen in training have no
    # match in the merge and are filled with 0.
    hist_features_train = train_with_hist.drop(columns=[group_col]).fillna(0)
    hist_features_test = test_with_hist.drop(columns=[group_col]).fillna(0)

    return hist_features_train, hist_features_test

# Smoke-test the histogram builder on the full training frame and show the
# shape and a corner of the result.
print("Testing histogram feature creation...")
hist_train, hist_test = create_histogram_features(
    train_df=combined_train,
    test_df=test,
    target_col='Price',
    group_col='Weight Capacity (kg)',
    n_bins=20,  # Use 20 bins for faster testing
)

train_shape, test_shape = hist_train.shape, hist_test.shape
print(f"\nTrain histogram features shape: {train_shape}")
print(f"Test histogram features shape: {test_shape}")
print("\nSample histogram features:")
# First five rows of the first five bin columns
print(hist_train.iloc[:5, :5])

## Test Combined Features (Baseline + Histogram)

In [None]:
# Put the baseline columns and the histogram columns side by side
X_combined = pd.concat([X_clean_renamed, hist_train], axis=1)
print(f"Combined features shape: {X_combined.shape}")

# Fit on the same training fold and hyper-parameters as the earlier models
X_train_combined = X_combined.iloc[train_idx]
combined_params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
model_combined = xgb.XGBRegressor(**combined_params)
model_combined.fit(X_train_combined, y_train)

# Rank every reported feature by gain and show the ten largest
print("\nTop 10 features by importance:")
importance_combined = model_combined.get_booster().get_score(importance_type='gain')
sorted_importance = sorted(
    importance_combined.items(),
    key=lambda item: item[1],
    reverse=True,
)
for name, imp in sorted_importance[:10]:
    print(f"{name}: {imp:.6f}")

# Collect the histogram columns that received a positive gain; columns
# absent from the importance dict count as 0.
hist_feature_names = [col for col in X_combined.columns if col.startswith('hist_bin_')]
hist_importance = []
for name in hist_feature_names:
    gain = importance_combined.get(name, 0)
    if gain > 0:
        hist_importance.append((name, gain))

print(f"\nHistogram features with non-zero importance: {len(hist_importance)}")
if hist_importance:
    print("Top histogram features:")
    top_hist = sorted(hist_importance, key=lambda item: item[1], reverse=True)[:5]
    for name, imp in top_hist:
        print(f"  {name}: {imp:.6f}")

## Key Findings for Next Experiment

Based on this debugging analysis:

In [None]:
# Summary of the findings from this notebook, printed one entry per line.
# Entries that start with "\n" open a new numbered section after a blank line.
findings_report = (
    "\n" + "="*60,
    "KEY FINDINGS FOR EXPERIMENT 005",
    "="*60,
    "1. Feature names with special characters (spaces, parentheses) cause issues",
    "   - Clean names: weight_capacity_mean_price (works)",
    "   - Original names: 'Weight Capacity (kg)_mean_price' (may fail)",
    "\n2. Histogram binning is implementable and creates valuable features",
    "   - 20 bins create 20 features per group key",
    "   - Some histogram bins show non-zero importance",
    "\n3. Combined approach (baseline + histogram) is promising",
    "   - Feature importance extraction works with cleaned names",
    "   - Can identify which features are actually valuable",
    "\n4. Next experiment should:",
    "   - Use cleaned feature names throughout",
    "   - Implement histogram binning (50 bins as per 1st place)",
    "   - Apply to multiple group keys (Weight Capacity, Brand, etc.)",
    "   - Monitor feature importance to validate features are being used",
)
for report_line in findings_report:
    print(report_line)