# Evolver Loop 2: Deep Dive into Feature Engineering Opportunities

Based on evaluator feedback and initial analysis, this notebook explores:
1. Advanced Weight Capacity feature engineering (histogram binning, quantile bins, interactions)
2. Target encoding strategies for categorical features
3. Interaction feature creation and validation
4. Count encoding and other proven techniques from winning solutions

Goal: Identify specific feature engineering approaches to close the 0.165 RMSE gap to target.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import KBinsDiscretizer
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
train_extra = pd.read_csv('/home/data/training_extra.csv')

# Combine train and extra as done in baseline
combined_train = pd.concat([train, train_extra], ignore_index=True)
print(f"Combined training data: {combined_train.shape}")
print(f"Test data: {test.shape}")

# Basic info
print("\nTarget statistics:")
print(combined_train['Price'].describe())

Combined training data: (3994318, 11)
Test data: (200000, 10)

Target statistics:
count    3.994318e+06
mean     8.136217e+01
std      3.893868e+01
min      1.500000e+01
25%      4.747002e+01
50%      8.098495e+01
75%      1.148550e+02
max      1.500000e+02
Name: Price, dtype: float64


In [5]:
# Analyze Weight Capacity patterns in detail
print("="*60)
print("WEIGHT CAPACITY DEEP DIVE")
print("="*60)

weight_col = 'Weight Capacity (kg)'
print(f"\nBasic stats:")
print(combined_train[weight_col].describe())

# Check for missing values
missing_count = combined_train[weight_col].isna().sum()
print(f"\nMissing values: {missing_count:,} ({missing_count/len(combined_train)*100:.2f}%)")

# Check distribution (excluding missing)
weight_non_missing = combined_train[weight_col].dropna()
print(f"\nDistribution (non-missing):")
print(f"  Zeros: {(weight_non_missing == 0).sum():,} ({(weight_non_missing == 0).mean()*100:.2f}%)")
print(f"  Negative: {(weight_non_missing < 0).sum():,} ({(weight_non_missing < 0).mean()*100:.2f}%)")
print(f"  Positive: {(weight_non_missing > 0).sum():,} ({(weight_non_missing > 0).mean()*100:.2f}%)")

# Look at relationship with target (using rows with non-missing weight)
valid_rows = combined_train[weight_col].notna()
print(f"\nCorrelation with Price (valid rows): {combined_train.loc[valid_rows, weight_col].corr(combined_train.loc[valid_rows, 'Price']):.4f}")

# Check if there are patterns in the decimal places
combined_train['weight_decimal'] = (combined_train[weight_col] % 1).round(10)
decimal_stats = combined_train.groupby('weight_decimal')['Price'].agg(['mean', 'std', 'count']).sort_values('count', ascending=False)
print(f"\nTop 10 most common decimal values:")
print(decimal_stats.head(10))

WEIGHT CAPACITY DEEP DIVE

Basic stats:


count    3.992510e+06
mean     1.801042e+01
std      6.973969e+00
min      5.000000e+00
25%      1.206896e+01
50%      1.805436e+01
75%      2.398751e+01
max      3.000000e+01
Name: Weight Capacity (kg), dtype: float64

Missing values: 1,808 (0.05%)

Distribution (non-missing):
  Zeros: 0 (0.00%)


  Negative: 0 (0.00%)
  Positive: 3,992,510 (100.00%)



Correlation with Price (valid rows): 0.0177



Top 10 most common decimal values:
                     mean        std  count
weight_decimal                             
0.000000        77.798619  39.576711  60677
0.898250        84.355394  40.679164   1677
0.908437        78.151331  38.040513   1562
0.898382        77.630059  38.767438   1433
0.824082        83.088466  37.932090   1270
0.817374        85.946458  37.501260   1165
0.837673        78.936614  39.913323   1124
0.829310        82.682448  40.529524   1057
0.879323        84.342518  38.987022   1012
0.814070        80.631850  41.312545    995


In [6]:
# Test different binning strategies for Weight Capacity
print("\n" + "="*60)
print("TESTING WEIGHT CAPACITY BINNING STRATEGIES")
print("="*60)

# Use only non-missing values for binning analysis
weight_for_binning = combined_train[combined_train[weight_col].notna()].copy()

# Remove zeros for binning analysis (they're a special case)
weight_nonzero = weight_for_binning[weight_for_binning[weight_col] != 0].copy()

print(f"Using {len(weight_nonzero):,} non-zero, non-missing rows for binning analysis")

binning_results = []

# Test different numbers of bins
for n_bins in [5, 8, 10, 15, 20, 30, 50]:
    # Equal-width binning
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    weight_nonzero[f'weight_bin_uniform_{n_bins}'] = discretizer.fit_transform(weight_nonzero[[weight_col]]).flatten()
    
    # Calculate target variance within bins
    bin_stats = weight_nonzero.groupby(f'weight_bin_uniform_{n_bins}')['Price'].agg(['mean', 'std', 'count'])
    binning_results.append({
        'strategy': 'uniform',
        'n_bins': n_bins,
        'avg_bin_std': bin_stats['std'].mean(),
        'min_bin_count': bin_stats['count'].min(),
        'price_range': bin_stats['mean'].max() - bin_stats['mean'].min()
    })
    
    # Quantile binning
    discretizer_q = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    weight_nonzero[f'weight_bin_quantile_{n_bins}'] = discretizer_q.fit_transform(weight_nonzero[[weight_col]]).flatten()
    
    bin_stats_q = weight_nonzero.groupby(f'weight_bin_quantile_{n_bins}')['Price'].agg(['mean', 'std', 'count'])
    binning_results.append({
        'strategy': 'quantile',
        'n_bins': n_bins,
        'avg_bin_std': bin_stats_q['std'].mean(),
        'min_bin_count': bin_stats_q['count'].min(),
        'price_range': bin_stats_q['mean'].max() - bin_stats_q['mean'].min()
    })

binning_df = pd.DataFrame(binning_results)
print("\nBinning strategy comparison:")
print(binning_df.sort_values('price_range', ascending=False).head(10))

# Best strategies based on price range (signal) and min bin count (stability)
best_uniform = binning_df[binning_df['strategy'] == 'uniform'].sort_values('price_range', ascending=False).iloc[0]
best_quantile = binning_df[binning_df['strategy'] == 'quantile'].sort_values('price_range', ascending=False).iloc[0]

print(f"\nBest uniform binning: {best_uniform['n_bins']} bins (price range: {best_uniform['price_range']:.2f})")
print(f"Best quantile binning: {best_quantile['n_bins']} bins (price range: {best_quantile['price_range']:.2f})")


TESTING WEIGHT CAPACITY BINNING STRATEGIES


Using 3,992,510 non-zero, non-missing rows for binning analysis



Binning strategy comparison:
    strategy  n_bins  avg_bin_std  min_bin_count  price_range
12   uniform      50    38.907152          14607     8.848311
13  quantile      50    38.913814          76503     5.998738
10   uniform      30    38.903957          80724     5.704248
8    uniform      20    38.907561         115443     5.241227
11  quantile      30    38.915683         129316     5.232910
9   quantile      20    38.919904         195303     4.266600
6    uniform      15    38.910155         161557     4.259689
7   quantile      15    38.920063         263684     3.746829
4    uniform      10    38.910452         291492     3.464889
2    uniform       8    38.913851         395057     3.187601

Best uniform binning: 50 bins (price range: 8.85)
Best quantile binning: 50 bins (price range: 6.00)


In [7]:
# Analyze categorical features for target encoding potential
print("\n" + "="*60)
print("CATEGORICAL FEATURE ANALYSIS FOR TARGET ENCODING")
print("="*60)

cat_features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 
                'Waterproof', 'Style', 'Color']

target_encoding_analysis = []

for feature in cat_features:
    # Basic stats
    n_unique = combined_train[feature].nunique()
    value_counts = combined_train[feature].value_counts()
    
    # Calculate target statistics
    target_stats = combined_train.groupby(feature)['Price'].agg(['mean', 'std', 'count'])
    
    # Check stability (correlation between two halves)
    half1, half2 = combined_train.sample(frac=0.5, random_state=42), combined_train.sample(frac=0.5, random_state=43)
    means1 = half1.groupby(feature)['Price'].mean()
    means2 = half2.groupby(feature)['Price'].mean()
    common_cats = means1.index.intersection(means2.index)
    
    if len(common_cats) > 5:
        stability_corr = means1[common_cats].corr(means2[common_cats])
    else:
        stability_corr = np.nan
    
    # Categories with sufficient samples (>100)
    sufficient_cats = (target_stats['count'] >= 100).sum()
    
    target_encoding_analysis.append({
        'feature': feature,
        'n_unique': n_unique,
        'sufficient_cats': sufficient_cats,
        'stability_corr': stability_corr,
        'price_range': target_stats['mean'].max() - target_stats['mean'].min(),
        'max_count': value_counts.iloc[0],
        'min_count': value_counts.iloc[-1]
    })

te_df = pd.DataFrame(target_encoding_analysis)
print("\nTarget encoding potential:")
print(te_df.sort_values('price_range', ascending=False))

# Features with highest encoding potential
best_for_te = te_df.nlargest(3, 'price_range')['feature'].tolist()
print(f"\nTop 3 features for target encoding: {best_for_te}")


CATEGORICAL FEATURE ANALYSIS FOR TARGET ENCODING



Target encoding potential:
              feature  n_unique  sufficient_cats  stability_corr  price_range  \
6               Color         6                6         0.99631     1.926271   
1            Material         4                4             NaN     1.540674   
0               Brand         5                5             NaN     1.537524   
2                Size         3                3             NaN     0.410370   
5               Style         3                3             NaN     0.314476   
3  Laptop Compartment         2                2             NaN     0.069704   
4          Waterproof         2                2             NaN     0.027099   

   max_count  min_count  
6     688257     617024  
1    1060882     903632  
0     801035     749340  
2    1354487    1239751  
5    1329677    1262519  
3    1972937    1922848  
4    1969205    1930789  

Top 3 features for target encoding: ['Color', 'Material', 'Brand']


In [8]:
# Test interaction features
print("\n" + "="*60)
print("INTERACTION FEATURE ANALYSIS")
print("="*60)

# Create interaction combinations
interactions = [
    ('Brand', 'Material'),
    ('Size', 'Style'),
    ('Brand', 'Size'),
    ('Material', 'Style'),
    ('Brand', 'Color'),
    ('Size', 'Color')
]

interaction_results = []

for feat1, feat2 in interactions:
    # Create interaction
    interaction_name = f"{feat1}_{feat2}"
    combined_train[interaction_name] = combined_train[feat1].astype(str) + '_' + combined_train[feat2].astype(str)
    
    # Stats
    n_unique = combined_train[interaction_name].nunique()
    target_stats = combined_train.groupby(interaction_name)['Price'].agg(['mean', 'std', 'count'])
    
    # Filter for combinations with sufficient data
    sufficient_combos = target_stats[target_stats['count'] >= 50]
    
    if len(sufficient_combos) > 5:
        price_range = sufficient_combos['mean'].max() - sufficient_combos['mean'].min()
        avg_std = sufficient_combos['std'].mean()
        
        interaction_results.append({
            'interaction': interaction_name,
            'n_unique': n_unique,
            'sufficient_combos': len(sufficient_combos),
            'price_range': price_range,
            'avg_std': avg_std,
            'signal_score': price_range / avg_std if avg_std > 0 else 0
        })

interaction_df = pd.DataFrame(interaction_results)
print("\nInteraction feature analysis:")
print(interaction_df.sort_values('signal_score', ascending=False))

# Best interactions
best_interactions = interaction_df.nlargest(3, 'signal_score')['interaction'].tolist()
print(f"\nTop 3 interactions: {best_interactions}")


INTERACTION FEATURE ANALYSIS



Interaction feature analysis:
      interaction  n_unique  sufficient_combos  price_range    avg_std  \
2      Brand_Size        24                 24    19.350779  39.453170   
5      Size_Color        28                 28     6.574969  39.234953   
1      Size_Style        16                 16     4.610007  39.762607   
3  Material_Style        20                 20     4.551122  39.348497   
4     Brand_Color        42                 42     3.402197  39.054657   
0  Brand_Material        30                 30     3.172758  39.130819   

   signal_score  
2      0.490475  
5      0.167579  
1      0.115938  
3      0.115662  
4      0.087114  
0      0.081081  

Top 3 interactions: ['Brand_Size', 'Size_Color', 'Size_Style']


In [None]:
# Test count encoding potential
print("\n" + "="*60)
print("COUNT ENCODING ANALYSIS")
print("="*60)

# Count encoding can capture popularity/rarity which might correlate with price
for feature in cat_features:
    # Calculate counts
    counts = combined_train[feature].value_counts()
    combined_train[f'{feature}_count'] = combined_train[feature].map(counts)
    
    # Correlation with target
    corr_with_target = combined_train[f'{feature}_count'].corr(combined_train['Price'])
    
    # Unique count values
    n_unique_counts = combined_train[f'{feature}_count'].nunique()
    
    print(f"{feature:20s}: corr={corr_with_target:6.4f}, unique_counts={n_unique_counts:4d}")

# Count encoding looks promising for features with reasonable correlation

In [None]:
# Summary of findings for next experiment
print("\n" + "="*60)
print("KEY FINDINGS FOR EXPERIMENT DESIGN")
print("="*60)

print("\n1. WEIGHT CAPACITY FEATURE ENGINEERING:")
print(f"   - Best uniform binning: {best_uniform['n_bins']} bins")
print(f"   - Best quantile binning: {best_quantile['n_bins']} bins")
print(f"   - Decimal patterns exist and should be captured")
print(f"   - Zero and negative values need special handling")

print("\n2. TARGET ENCODING PRIORITY:")
for i, feat in enumerate(best_for_te, 1):
    price_range = te_df[te_df['feature'] == feat]['price_range'].iloc[0]
    print(f"   {i}. {feat}: price range {price_range:.2f}")

print("\n3. INTERACTION FEATURES:")
for i, interaction in enumerate(best_interactions, 1):
    signal_score = interaction_df[interaction_df['interaction'] == interaction]['signal_score'].iloc[0]
    print(f"   {i}. {interaction}: signal score {signal_score:.2f}")

print("\n4. COUNT ENCODING:")
print("   - Should be applied to all categorical features")
print("   - Captures popularity/rarity patterns")

print("\n5. NEXT STEPS:")
print("   - Implement k-fold target encoding with smoothing")
print("   - Create interaction features with proper cross-validation")
print("   - Add histogram and quantile bins for Weight Capacity")
print("   - Include count encoding for all categoricals")
print("   - Tune hyperparameters (lower learning rate, more trees)")