# Experiment 005: Cleaned Feature Names + Histogram Binning

## Objectives
1. Fix feature naming issues (remove special characters)
2. Add histogram binning (1st place technique - 50 bins)
3. Apply to multiple group keys
4. Monitor feature importance to validate features are being used

## Expected Improvements
- Cleaned names: Enable proper feature importance tracking
- Histogram bins: Capture full price distribution within groups
- Multiple keys: Brand, Material, Size, Color, Weight Capacity
- Target: Beat 38.660840 (exp_004) and close gap to 38.616280

In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
train = pd.read_csv('/home/data/train.csv')
training_extra = pd.read_csv('/home/data/training_extra.csv')
test = pd.read_csv('/home/data/test.csv')
combined_train = pd.concat([train, training_extra], ignore_index=True)

print(f"Train shape: {train.shape}")
print(f"Training extra shape: {training_extra.shape}")
print(f"Combined train shape: {combined_train.shape}")
print(f"Test shape: {test.shape}")

# Separate features and target
y = combined_train['Price'].values
print(f"Target shape: {y.shape}")

Loading data...


Train shape: (300000, 11)
Training extra shape: (3694318, 11)
Combined train shape: (3994318, 11)
Test shape: (200000, 10)
Target shape: (3994318,)


## 1. Cleaned Baseline Features

Remove special characters from feature names to enable proper importance tracking.

In [6]:
def create_cleaned_baseline_features(df):
    """Create baseline features with cleaned names (no special characters)"""
    features = pd.DataFrame(index=df.index)
    
    # Weight Capacity features
    if 'Weight Capacity (kg)' in df.columns:
        weight = df['Weight Capacity (kg)'].copy()
        
        # Original
        features['weight_capacity'] = weight
        
        # Rounding (7-10 decimals)
        features['weight_round_7'] = weight.round(7)
        features['weight_round_8'] = weight.round(8)
        features['weight_round_9'] = weight.round(9)
        features['weight_round_10'] = weight.round(10)
        
        # Digit extraction (handle NaN properly)
        weight_str = weight.astype(str)
        
        # Extract digits before decimal
        features['weight_int_part'] = weight.fillna(0).astype(int)
        
        # Extract first few decimals as integers
        decimals = weight_str.str.split('.').str[1].fillna('0')
        features['weight_dec_1'] = decimals.str[:1].fillna('0').astype(int)
        features['weight_dec_2'] = decimals.str[:2].fillna('0').astype(int)
    
    # Label encode categorical features
    categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
    
    for col in categorical_cols:
        if col in df.columns:
            # Clean column name for feature
            clean_name = col.lower().replace(' ', '_')
            features[f'{clean_name}_encoded'] = df[col].astype('category').cat.codes
    
    return features

print("Creating cleaned baseline features...")
train_baseline = create_cleaned_baseline_features(combined_train)
test_baseline = create_cleaned_baseline_features(test)

print(f"Baseline features shape: {train_baseline.shape}")
print(f"Baseline feature names: {list(train_baseline.columns)}")

Creating cleaned baseline features...


Baseline features shape: (3994318, 15)
Baseline feature names: ['weight_capacity', 'weight_round_7', 'weight_round_8', 'weight_round_9', 'weight_round_10', 'weight_int_part', 'weight_dec_1', 'weight_dec_2', 'brand_encoded', 'material_encoded', 'size_encoded', 'laptop_compartment_encoded', 'waterproof_encoded', 'style_encoded', 'color_encoded']


## 2. Groupby Statistics with Cleaned Names

Recreate groupby statistics from exp_004 but with cleaned feature names.

In [7]:
def create_groupby_statistics(df, target_col='Price'):
    """Create groupby statistics with cleaned feature names"""
    features = pd.DataFrame(index=df.index)
    
    # Group keys to use
    group_keys = [
        'Weight Capacity (kg)',
        'Brand', 
        'Material', 
        'Size',
        'Laptop Compartment',
        'Waterproof', 
        'Style',
        'Color'
    ]
    
    # Statistics to compute
    stats = ['mean', 'std', 'count', 'min', 'max', 'median']
    
    # Check if this is training data (has target) or test data
    is_training = target_col in df.columns
    
    if is_training:
        # Training data - use nested CV to prevent leakage
        n_folds = 5
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        
        # Initialize storage for statistics
        for key in group_keys:
            if key in df.columns:
                clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
                for stat in stats:
                    feature_name = f'{clean_key}_{stat}_price'
                    features[feature_name] = np.nan
        
        # Compute statistics using nested CV
        for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
            df_train = df.iloc[train_idx]
            df_val = df.iloc[val_idx]
            
            for key in group_keys:
                if key not in df.columns:
                    continue
                    
                clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
                
                # Compute statistics on training fold
                grouped = df_train.groupby(key)[target_col]
                stat_funcs = {
                    'mean': grouped.mean(),
                    'std': grouped.std(),
                    'count': grouped.count(),
                    'min': grouped.min(),
                    'max': grouped.max(),
                    'median': grouped.median()
                }
                
                # Map to validation fold
                for stat in stats:
                    feature_name = f'{clean_key}_{stat}_price'
                    features.loc[df_val.index, feature_name] = df_val[key].map(stat_funcs[stat])
        
        # Fill any remaining NaN values (should be minimal)
        features = features.fillna(features.mean())
    else:
        # Test data - use statistics from full training data
        # For now, return empty features - we'll compute this after training
        # Or we can pass training data as a parameter
        pass
    
    return features

In [8]:
def create_histogram_bins(df, target_col='Price', n_bins=50, training_df=None):
    """Create histogram bins of target distribution within groups"""
    features = pd.DataFrame(index=df.index)
    
    # Group keys to apply histogram binning
    group_keys = [
        'Weight Capacity (kg)',
        'Brand',
        'Material', 
        'Size',
        'Color'
    ]
    
    # Check if this is training data
    is_training = target_col in df.columns
    
    if is_training:
        # Training data - use nested CV
        n_folds = 5
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        
        # Initialize storage for histogram features
        for key in group_keys:
            if key in df.columns:
                clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
                for i in range(n_bins):
                    feature_name = f'{clean_key}_hist_bin_{i}'
                    features[feature_name] = 0.0
        
        # Compute histograms using nested CV
        for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
            df_train = df.iloc[train_idx]
            df_val = df.iloc[val_idx]
            
            for key in group_keys:
                if key not in df.columns:
                    continue
                
                clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
                
                # Create histogram bins based on training data
                # Use percentiles to create bins
                train_prices = df_train[target_col].values
                bin_edges = np.percentile(train_prices, np.linspace(0, 100, n_bins + 1))
                
                # For each group in training data, compute histogram
                group_hist = {}
                for group_val, group_data in df_train.groupby(key):
                    hist, _ = np.histogram(group_data[target_col].values, bins=bin_edges)
                    # Normalize to get proportions
                    hist = hist / len(group_data) if len(group_data) > 0 else np.zeros(n_bins)
                    group_hist[group_val] = hist
                
                # Map to validation data
                for i in range(n_bins):
                    feature_name = f'{clean_key}_hist_bin_{i}'
                    # Get histogram for each validation row's group
                    val_hists = df_val[key].map(lambda x: group_hist.get(x, np.zeros(n_bins)))
                    features.loc[df_val.index, feature_name] = val_hists.apply(lambda x: x[i])
        
        # Fill NaN values if any
        features = features.fillna(0)
    else:
        # Test data - need training data to compute histograms
        if training_df is None:
            raise ValueError("For test data, training_df parameter is required")
        
        # Use full training data to compute histograms
        for key in group_keys:
            if key not in df.columns:
                continue
            
            clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
            
            # Create histogram bins based on full training data
            train_prices = training_df[target_col].values
            bin_edges = np.percentile(train_prices, np.linspace(0, 100, n_bins + 1))
            
            # For each group in training data, compute histogram
            group_hist = {}
            for group_val, group_data in training_df.groupby(key):
                hist, _ = np.histogram(group_data[target_col].values, bins=bin_edges)
                hist = hist / len(group_data) if len(group_data) > 0 else np.zeros(n_bins)
                group_hist[group_val] = hist
            
            # Map to test data
            for i in range(n_bins):
                feature_name = f'{clean_key}_hist_bin_{i}'
                test_hists = df[key].map(lambda x: group_hist.get(x, np.zeros(n_bins)))
                features[feature_name] = test_hists.apply(lambda x: x[i])
        
        features = features.fillna(0)
    
    return features

## 4. Combine All Features

In [13]:
# Combine all feature sets
print("Combining all features...")

# Start with baseline features
X_train = train_baseline.copy()
X_test = test_baseline.copy()

# Add groupby statistics
X_train = pd.concat([X_train, train_groupby], axis=1)
X_test = pd.concat([X_test, test_groupby], axis=1)

# Add histogram bins
X_train = pd.concat([X_train, train_histogram], axis=1)
X_test = pd.concat([X_test, test_histogram], axis=1)

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")
print(f"Feature names (first 20): {list(X_train.columns)[:20]}")

# Check for any issues
print(f"\nNaN counts in training features:")
print(X_train.isnull().sum().sum())
print(f"NaN counts in test features:")
print(X_test.isnull().sum().sum())

Combining all features...


Final training features shape: (3994318, 313)
Final test features shape: (200000, 313)
Feature names (first 20): ['weight_capacity', 'weight_round_7', 'weight_round_8', 'weight_round_9', 'weight_round_10', 'weight_int_part', 'weight_dec_1', 'weight_dec_2', 'brand_encoded', 'material_encoded', 'size_encoded', 'laptop_compartment_encoded', 'waterproof_encoded', 'style_encoded', 'color_encoded', 'weight_capacity_kg_mean_price', 'weight_capacity_kg_std_price', 'weight_capacity_kg_count_price', 'weight_capacity_kg_min_price', 'weight_capacity_kg_max_price']

NaN counts in training features:


9040
NaN counts in test features:
385


## 5. Model Training with Feature Importance Monitoring

In [14]:
# 20-fold CV
print("Starting 20-fold CV...")
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

fold_scores = []
feature_importance_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\nFold {fold+1}/{n_folds}")
    
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # Create DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Parameters
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.05,
        'max_depth': 8,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42
    }
    
    # Train
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    
    # Predict
    val_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_scores.append(rmse)
    print(f"Fold {fold+1} RMSE: {rmse:.6f}")
    
    # Extract feature importance
    importance_dict = model.get_score(importance_type='weight')
    feature_importance_list.append(importance_dict)

# Print CV results
print(f"\n{'='*60}")
print(f"CV Results: {len(fold_scores)} folds")
print(f"Mean RMSE: {np.mean(fold_scores):.6f}")
print(f"Std RMSE: {np.std(fold_scores):.6f}")
print(f"All scores: {[f'{s:.6f}' for s in fold_scores]}")
print(f"{'='*60}")

Starting 20-fold CV...

Fold 1/20


Fold 1 RMSE: 38.634479

Fold 2/20


Fold 2 RMSE: 38.680188

Fold 3/20


Fold 3 RMSE: 38.584290

Fold 4/20


Fold 4 RMSE: 38.664701

Fold 5/20


Fold 5 RMSE: 38.594852

Fold 6/20


Fold 6 RMSE: 38.666184

Fold 7/20


Fold 7 RMSE: 38.668990

Fold 8/20


Fold 8 RMSE: 38.604579

Fold 9/20


Fold 9 RMSE: 38.658480

Fold 10/20


Fold 10 RMSE: 38.645248

Fold 11/20


Fold 11 RMSE: 38.765205

Fold 12/20


Fold 12 RMSE: 38.730496

Fold 13/20


Fold 13 RMSE: 38.669690

Fold 14/20


Fold 14 RMSE: 38.678289

Fold 15/20


Fold 15 RMSE: 38.608956

Fold 16/20


Fold 16 RMSE: 38.643278

Fold 17/20


Fold 17 RMSE: 38.608990

Fold 18/20


Fold 18 RMSE: 38.732391

Fold 19/20


Fold 19 RMSE: 38.712584

Fold 20/20


Fold 20 RMSE: 38.716040

CV Results: 20 folds
Mean RMSE: 38.663395
Std RMSE: 0.048650
All scores: ['38.634479', '38.680188', '38.584290', '38.664701', '38.594852', '38.666184', '38.668990', '38.604579', '38.658480', '38.645248', '38.765205', '38.730496', '38.669690', '38.678289', '38.608956', '38.643278', '38.608990', '38.732391', '38.712584', '38.716040']


## 6. Feature Importance Analysis

In [15]:
# Aggregate feature importance across folds
print("Analyzing feature importance...")

# Sum importance across all folds
total_importance = {}
for importance_dict in feature_importance_list:
    for feat, imp in importance_dict.items():
        total_importance[feat] = total_importance.get(feat, 0) + imp

# Sort by importance
sorted_importance = sorted(total_importance.items(), key=lambda x: x[1], reverse=True)

print(f"\nTop 20 features by importance:")
for i, (feat, imp) in enumerate(sorted_importance[:20]):
    print(f"{i+1:2d}. {feat}: {imp:.2f}")

# Analyze by feature type
print(f"\n{'='*60}")
print("Feature importance by category:")
print(f"{'='*60}")

categories = {
    'Baseline (weight)': [f for f, _ in sorted_importance if f.startswith('weight_') and not any(x in f for x in ['mean', 'std', 'count', 'min', 'max', 'median', 'hist'])],
    'Baseline (categorical)': [f for f, _ in sorted_importance if f.endswith('_encoded')],
    'Groupby statistics': [f for f, _ in sorted_importance if any(x in f for x in ['_mean_', '_std_', '_count_', '_min_', '_max_', '_median_'])],
    'Histogram bins': [f for f, _ in sorted_importance if '_hist_bin_' in f]
}

for category, features in categories.items():
    if features:
        total_imp = sum(total_importance[f] for f in features)
        print(f"{category:25s}: {len(features):3d} features, total importance = {total_imp:.2f}")
        # Show top 3 in each category
        top_features = sorted([(f, total_importance[f]) for f in features], key=lambda x: x[1], reverse=True)[:3]
        for feat, imp in top_features:
            print(f"  - {feat}: {imp:.2f}")
    else:
        print(f"{category:25s}: {len(features):3d} features")
    print()

# Check if all features have non-zero importance
all_features = set(X_train.columns)
features_with_importance = set(total_importance.keys())
zero_importance = all_features - features_with_importance

if zero_importance:
    print(f"WARNING: {len(zero_importance)} features have zero importance:")
    for feat in sorted(zero_importance):
        print(f"  - {feat}")
else:
    print(f"âœ“ All {len(all_features)} features have non-zero importance!")

Analyzing feature importance...

Top 20 features by importance:
 1. weight_capacity: 58096.00
 2. weight_dec_2: 35493.00
 3. weight_capacity_kg_mean_price: 35025.00
 4. weight_capacity_kg_std_price: 27833.00
 5. weight_capacity_kg_median_price: 27634.00
 6. weight_capacity_kg_count_price: 23756.00
 7. weight_capacity_kg_min_price: 18993.00
 8. weight_capacity_kg_hist_bin_49: 15794.00
 9. weight_capacity_kg_max_price: 14656.00
10. weight_capacity_kg_hist_bin_3: 13350.00
11. weight_capacity_kg_hist_bin_5: 13247.00
12. weight_capacity_kg_hist_bin_6: 13071.00
13. weight_capacity_kg_hist_bin_4: 13055.00
14. weight_capacity_kg_hist_bin_47: 12881.00
15. weight_round_7: 12828.00
16. weight_capacity_kg_hist_bin_45: 12762.00
17. weight_capacity_kg_hist_bin_2: 12562.00
18. weight_capacity_kg_hist_bin_43: 12558.00
19. weight_capacity_kg_hist_bin_7: 12494.00
20. weight_capacity_kg_hist_bin_44: 12468.00

Feature importance by category:
Baseline (weight)        :   8 features, total importance = 1234

## 7. Generate Predictions

In [16]:
# Train final model on all data
print("Training final model on all training data...")

dtrain_full = xgb.DMatrix(X_train, label=y)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=2000
)

# Predict on test set
dtest = xgb.DMatrix(X_test)
test_pred = final_model.predict(dtest)

print(f"Test predictions shape: {test_pred.shape}")
print(f"Test predictions range: {test_pred.min():.2f} - {test_pred.max():.2f}")

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_pred
})

submission_path = '/home/code/submission_candidates/candidate_005.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")
print(submission.head())

Training final model on all training data...


Test predictions shape: (200000,)
Test predictions range: 34.92 - 130.60

Submission saved to: /home/code/submission_candidates/candidate_005.csv
       id      Price
0  300000  82.253654
1  300001  83.090744
2  300002  88.163376
3  300003  78.337013
4  300004  78.822548


In [9]:
# Create features for training data
print("Creating features for training data...")

# Baseline features
train_baseline = create_cleaned_baseline_features(combined_train)
print(f"Training baseline features shape: {train_baseline.shape}")

# Groupby statistics
train_groupby = create_groupby_statistics(combined_train, target_col='Price')
print(f"Training groupby features shape: {train_groupby.shape}")

# Histogram bins (50 bins)
train_histogram = create_histogram_bins(combined_train, target_col='Price', n_bins=50)
print(f"Training histogram features shape: {train_histogram.shape}")

Creating features for training data...


Training baseline features shape: (3994318, 15)


Training groupby features shape: (3994318, 48)


Training histogram features shape: (3994318, 250)


In [10]:
# Create features for test data
print("Creating features for test data...")

# Baseline features
test_baseline = create_cleaned_baseline_features(test)
print(f"Test baseline features shape: {test_baseline.shape}")

# For test data, we need to pass the training data to compute statistics
# Let's create a simplified version for test data that uses full training stats

def create_groupby_statistics_test(df, training_df, target_col='Price'):
    """Create groupby statistics for test data using full training statistics"""
    features = pd.DataFrame(index=df.index)
    
    group_keys = [
        'Weight Capacity (kg)',
        'Brand', 
        'Material', 
        'Size',
        'Laptop Compartment',
        'Waterproof', 
        'Style',
        'Color'
    ]
    
    stats = ['mean', 'std', 'count', 'min', 'max', 'median']
    
    for key in group_keys:
        if key in df.columns and key in training_df.columns:
            clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
            
            # Compute statistics on full training data
            grouped = training_df.groupby(key)[target_col]
            stat_funcs = {
                'mean': grouped.mean(),
                'std': grouped.std(),
                'count': grouped.count(),
                'min': grouped.min(),
                'max': grouped.max(),
                'median': grouped.median()
            }
            
            # Map to test data
            for stat in stats:
                feature_name = f'{clean_key}_{stat}_price'
                features[feature_name] = df[key].map(stat_funcs[stat])
    
    # Fill NaN values
    features = features.fillna(features.mean())
    return features

# Create groupby statistics for test data
test_groupby = create_groupby_statistics_test(test, combined_train, target_col='Price')
print(f"Test groupby features shape: {test_groupby.shape}")

# Histogram bins for test data (pass training data)
test_histogram = create_histogram_bins(test, target_col='Price', n_bins=50, training_df=combined_train)
print(f"Test histogram features shape: {test_histogram.shape}")

Creating features for test data...


Test baseline features shape: (200000, 15)


Test groupby features shape: (200000, 48)


Test histogram features shape: (200000, 250)
