# Experiment 005: Cleaned Feature Names + Histogram Binning

## Objectives
1. Fix feature naming issues (remove special characters)
2. Add histogram binning (1st place technique - 50 bins)
3. Apply to multiple group keys
4. Monitor feature importance to validate features are being used

## Expected Improvements
- Cleaned names: Enable proper feature importance tracking
- Histogram bins: Capture full price distribution within groups
- Multiple keys: Brand, Material, Size, Color, Weight Capacity
- Target: Beat 38.660840 (exp_004) and close gap to 38.616280

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Read the three competition CSVs (same order as before: train, extra, test).
print("Loading data...")
train, training_extra, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/data/train.csv',
        '/home/data/training_extra.csv',
        '/home/data/test.csv',
    )
)

# Stack the extra training rows under the main training rows with a fresh
# 0..n-1 index so positional and label indexing agree downstream.
combined_train = pd.concat((train, training_extra), ignore_index=True)

print(f"Train shape: {train.shape}")
print(f"Training extra shape: {training_extra.shape}")
print(f"Combined train shape: {combined_train.shape}")
print(f"Test shape: {test.shape}")

# Regression target as a NumPy array, row-aligned with combined_train.
y = combined_train['Price'].to_numpy()
print(f"Target shape: {y.shape}")

Loading data...


Train shape: (300000, 11)
Training extra shape: (3694318, 11)
Combined train shape: (3994318, 11)
Test shape: (200000, 10)
Target shape: (3994318,)


## 1. Cleaned Baseline Features

Remove special characters from feature names to enable proper importance tracking.

In [2]:
def create_cleaned_baseline_features(df):
    """Create baseline features with cleaned names (no special characters).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The 'Weight Capacity (kg)' column and each categorical
        column are optional; features are only built for columns present.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``. Weight features: the raw value, roundings to
        7-10 decimals, the integer part, and the first one / first two
        decimal digits as integers. Categorical features: ``<name>_encoded``
        integer codes (-1 for missing values).

    Notes
    -----
    Category codes are derived from the values present in ``df`` alone, so
    two frames only share an encoding when they contain the same set of
    category values.
    """
    features = pd.DataFrame(index=df.index)

    # Weight Capacity features
    if 'Weight Capacity (kg)' in df.columns:
        weight = df['Weight Capacity (kg)']

        # Original
        features['weight_capacity'] = weight

        # Rounding (7-10 decimals)
        for n_decimals in (7, 8, 9, 10):
            features[f'weight_round_{n_decimals}'] = weight.round(n_decimals)

        # Digits before the decimal point (missing weights count as 0)
        features['weight_int_part'] = weight.fillna(0).astype(int)

        # Digits after the decimal point, taken from the string form.
        # Values without a '.' (missing weights, whole numbers rendered
        # without a fraction) fall back to '0'.
        # NOTE(review): assumes weights render in plain decimal notation;
        # a value printed in scientific notation would not parse here.
        decimals = weight.astype(str).str.split('.').str[1].fillna('0')
        features['weight_dec_1'] = decimals.str[:1].astype(int)
        # Right-pad to two digits so that x.5 becomes 50 rather than 5 and
        # no longer collides with x.05.
        features['weight_dec_2'] = decimals.str[:2].str.ljust(2, '0').astype(int)

    # Label encode categorical features
    categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

    for col in categorical_cols:
        if col in df.columns:
            # Clean column name for feature
            clean_name = col.lower().replace(' ', '_')
            features[f'{clean_name}_encoded'] = df[col].astype('category').cat.codes

    return features

# Build the baseline feature frames for the training rows and the test rows.
print("Creating cleaned baseline features...")
train_baseline, test_baseline = (
    create_cleaned_baseline_features(raw_frame)
    for raw_frame in (combined_train, test)
)

print(f"Baseline features shape: {train_baseline.shape}")
print(f"Baseline feature names: {list(train_baseline.columns)}")

Creating cleaned baseline features...


Baseline features shape: (3994318, 15)
Baseline feature names: ['weight_capacity', 'weight_round_7', 'weight_round_8', 'weight_round_9', 'weight_round_10', 'weight_int_part', 'weight_dec_1', 'weight_dec_2', 'brand_encoded', 'material_encoded', 'size_encoded', 'laptop_compartment_encoded', 'waterproof_encoded', 'style_encoded', 'color_encoded']


## 2. Groupby Statistics with Cleaned Names

Recreate groupby statistics from exp_004 but with cleaned feature names.

In [None]:
def _groupby_clean_key(key):
    """Turn a raw column name into a lower-case feature-name prefix."""
    return key.lower().replace(' ', '_').replace('(', '').replace(')', '')


def create_groupby_statistics(df, target_col='Price', train_df=None):
    """Create per-group target statistics with cleaned feature names.

    For every grouping column present, six statistics of the target
    (mean, std, count, min, max, median) are attached to each row as
    ``<clean_key>_<stat>_price``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to build features for.
    target_col : str
        Name of the target column.
    train_df : pandas.DataFrame, optional
        Labelled rows used to compute the statistics when ``df`` itself has
        no target column (i.e. test data).

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``.

        * ``df`` contains ``target_col``: out-of-fold statistics from a
          5-fold split (each row only sees statistics computed on the other
          four folds).
        * ``df`` lacks ``target_col`` and ``train_df`` is given: statistics
          computed on all of ``train_df`` and mapped onto ``df``.
        * ``df`` lacks ``target_col`` and ``train_df`` is None: an empty
          frame (kept for backward compatibility).

        Remaining NaN values (unseen groups, std of one-row groups) are
        replaced by the column mean.
    """
    features = pd.DataFrame(index=df.index)

    # Group keys to use
    group_keys = [
        'Weight Capacity (kg)',
        'Brand',
        'Material',
        'Size',
        'Laptop Compartment',
        'Waterproof',
        'Style',
        'Color'
    ]

    # Statistics to compute
    stats = ['mean', 'std', 'count', 'min', 'max', 'median']

    if target_col in df.columns:
        # Training data - out-of-fold statistics to prevent target leakage
        present_keys = [key for key in group_keys if key in df.columns]
        columns = {
            f'{_groupby_clean_key(key)}_{stat}_price': np.full(len(df), np.nan)
            for key in present_keys
            for stat in stats
        }

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for train_idx, val_idx in kf.split(df):
            df_train = df.iloc[train_idx]
            df_val = df.iloc[val_idx]

            for key in present_keys:
                clean_key = _groupby_clean_key(key)
                # One pass over the training fold yields all six statistics.
                table = df_train.groupby(key)[target_col].agg(stats)
                for stat in stats:
                    # val_idx is positional, so write by position; this stays
                    # correct whatever index labels df carries.
                    columns[f'{clean_key}_{stat}_price'][val_idx] = (
                        df_val[key].map(table[stat]).to_numpy()
                    )

        features = pd.DataFrame(columns, index=df.index)
        # Fill any remaining NaN values (should be minimal)
        features = features.fillna(features.mean())
    elif train_df is not None:
        # Test data - statistics from the full labelled data
        for key in group_keys:
            if key not in df.columns or key not in train_df.columns:
                continue
            clean_key = _groupby_clean_key(key)
            table = train_df.groupby(key)[target_col].agg(stats)
            for stat in stats:
                features[f'{clean_key}_{stat}_price'] = (
                    df[key].map(table[stat]).to_numpy()
                )
        features = features.fillna(features.mean())

    return features

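## 3. Histogram Binning

Split the target range into 50 equal-width bins and, for each group, use the proportion of its prices falling in each bin as features.

In [None]: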
def _fold_histogram_table(keys, prices, bin_edges):
    """Return ``(groups, table)`` of per-group target-bin proportions.

    ``groups`` is a ``pd.Index`` of the distinct non-null values in ``keys``;
    ``table[i]`` is the share of group ``i``'s prices in each bin (each row
    sums to 1). Rows with a null key or a null price are ignored.
    """
    n_bins = len(bin_edges) - 1
    usable = keys.notna().to_numpy() & prices.notna().to_numpy()
    codes, uniques = pd.factorize(keys.to_numpy()[usable])

    # Same convention as np.histogram with explicit edges: bins are
    # half-open [lo, hi) except the last one, which also includes its
    # right edge (hence the clip).
    bin_idx = np.searchsorted(bin_edges, prices.to_numpy()[usable], side='right') - 1
    bin_idx = np.clip(bin_idx, 0, n_bins - 1)

    # Count every (group, bin) pair in one pass.
    flat_counts = np.bincount(codes * n_bins + bin_idx, minlength=len(uniques) * n_bins)
    table = flat_counts.reshape(len(uniques), n_bins).astype(float)
    if len(uniques):
        # Every group has at least one row, so the row sums are positive.
        table /= table.sum(axis=1, keepdims=True)
    return pd.Index(uniques), table


def create_histogram_bins(df, target_col='Price', n_bins=50, is_test=False):
    """Create out-of-fold histogram features of the target within groups.

    For each grouping column present in ``df`` the target range is split into
    ``n_bins`` equal-width bins and every row receives the proportion of its
    group's target values in each bin, as ``<clean_key>_hist_bin_<i>``.

    Proportions are computed out of fold (5-fold split, shuffled, seed 42):
    a row only sees histograms built from the other four folds, with bin
    edges spanning that training part's target range. Rows whose group value
    is missing or absent from the training part get zeros in every bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled rows; must contain ``target_col``.
    target_col : str
        Name of the target column.
    n_bins : int
        Number of equal-width bins.
    is_test : bool
        Must be False; test rows are handled by
        ``create_histogram_bins_test``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, ``n_bins`` columns per grouping column present.

    Raises
    ------
    ValueError
        If ``is_test`` is True.
    """
    if is_test:
        # For test data, histograms must come from the full training data
        raise ValueError("For test data, use create_histogram_bins_test with training data")

    # Group keys to apply histogram binning
    group_keys = [
        'Weight Capacity (kg)',
        'Brand',
        'Material',
        'Size',
        'Color'
    ]
    present_keys = [key for key in group_keys if key in df.columns]

    names = []
    for key in present_keys:
        clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
        names.extend(f'{clean_key}_hist_bin_{i}' for i in range(n_bins))

    # One dense block for all features; rows are addressed by position so the
    # result does not depend on df's index labels.
    values = np.zeros((len(df), len(names)))

    n_folds = 5
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        print(f"Computing histogram bins for fold {fold+1}/{n_folds}...")

        df_train = df.iloc[train_idx]
        df_val = df.iloc[val_idx]

        # Bin edges depend only on the fold, not on the grouping column.
        fold_prices = df_train[target_col]
        bin_edges = np.linspace(fold_prices.min(), fold_prices.max(), n_bins + 1)

        for pos, key in enumerate(present_keys):
            groups, table = _fold_histogram_table(df_train[key], fold_prices, bin_edges)

            # Row of `table` for each validation row; -1 marks a group value
            # that is missing or was not seen in the training part.
            slots = groups.get_indexer(df_val[key])
            seen = slots >= 0
            cols = slice(pos * n_bins, (pos + 1) * n_bins)
            values[val_idx[seen], cols] = table[slots[seen]]

    return pd.DataFrame(values, index=df.index, columns=names)

def _train_histogram_table(keys, prices, bin_edges):
    """Return ``(groups, table)`` of per-group target-bin proportions.

    ``groups`` is a ``pd.Index`` of the distinct non-null values in ``keys``;
    ``table[i]`` is the share of group ``i``'s prices in each bin (each row
    sums to 1). Rows with a null key or a null price are ignored.
    """
    n_bins = len(bin_edges) - 1
    usable = keys.notna().to_numpy() & prices.notna().to_numpy()
    codes, uniques = pd.factorize(keys.to_numpy()[usable])

    # Same convention as np.histogram with explicit edges: bins are
    # half-open [lo, hi) except the last one, which also includes its
    # right edge (hence the clip).
    bin_idx = np.searchsorted(bin_edges, prices.to_numpy()[usable], side='right') - 1
    bin_idx = np.clip(bin_idx, 0, n_bins - 1)

    # Count every (group, bin) pair in one pass.
    flat_counts = np.bincount(codes * n_bins + bin_idx, minlength=len(uniques) * n_bins)
    table = flat_counts.reshape(len(uniques), n_bins).astype(float)
    if len(uniques):
        # Every group has at least one row, so the row sums are positive.
        table /= table.sum(axis=1, keepdims=True)
    return pd.Index(uniques), table


def create_histogram_bins_test(train_df, test_df, target_col='Price', n_bins=50):
    """Create histogram features for test rows from the full training data.

    The target range of ``train_df`` is split into ``n_bins`` equal-width
    bins. For each grouping column present in ``train_df`` every test row
    receives the proportion of its group's training targets in each bin, as
    ``<clean_key>_hist_bin_<i>``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled rows; must contain ``target_col``.
    test_df : pandas.DataFrame
        Rows to build features for.
    target_col : str
        Name of the target column in ``train_df``.
    n_bins : int
        Number of equal-width bins.

    Returns
    -------
    pandas.DataFrame
        Same index as ``test_df``; ``n_bins`` columns per grouping column
        present in ``train_df``, in grouping-column order.

        * Test rows whose group value is missing or never occurs in
          ``train_df`` get zeros in every bin (the same convention the
          training-side features use for unseen groups).
        * A grouping column that is absent from ``test_df``, or has no
          usable rows in ``train_df``, yields the uniform value
          ``1 / n_bins`` in every bin.
    """
    # Group keys to apply histogram binning
    group_keys = [
        'Weight Capacity (kg)',
        'Brand',
        'Material',
        'Size',
        'Color'
    ]

    print("Computing histogram bins for test data using full training data...")

    # Bin edges span the full training target range
    train_prices = train_df[target_col]
    bin_edges = np.linspace(train_prices.min(), train_prices.max(), n_bins + 1)

    present_keys = [key for key in group_keys if key in train_df.columns]
    names = []
    for key in present_keys:
        clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
        names.extend(f'{clean_key}_hist_bin_{i}' for i in range(n_bins))

    # Rows are addressed by position, so test_df's index labels are irrelevant.
    values = np.zeros((len(test_df), len(names)))

    for pos, key in enumerate(present_keys):
        cols = slice(pos * n_bins, (pos + 1) * n_bins)

        if key not in test_df.columns:
            # Nothing to look up: fall back to a uniform distribution
            values[:, cols] = 1.0 / n_bins
            continue

        groups, table = _train_histogram_table(train_df[key], train_prices, bin_edges)
        if len(groups) == 0:
            # No training histogram exists for this key at all
            values[:, cols] = 1.0 / n_bins
            continue

        # Row of `table` for each test row; -1 marks an unseen/missing group.
        slots = groups.get_indexer(test_df[key])
        seen = slots >= 0
        values[seen, cols] = table[slots[seen]]

    return pd.DataFrame(values, index=test_df.index, columns=names)

# Out-of-fold histograms for the training rows; full-training histograms for
# the test rows. Both use the same number of bins.
print("Creating histogram bins...")
hist_kwargs = {'n_bins': 50}
train_histogram = create_histogram_bins(combined_train, **hist_kwargs)
test_histogram = create_histogram_bins_test(combined_train, test, **hist_kwargs)

print(f"Histogram features shape: {train_histogram.shape}")
print(f"Sample histogram features: {list(train_histogram.columns)[:10]}")

## 4. Combine All Features

In [None]:
# Combine all feature sets
print("Combining all features...")

# Groupby statistics: no earlier cell assigns train_groupby / test_groupby,
# so build them here before they are concatenated.
groupby_stats = ['mean', 'std', 'count', 'min', 'max', 'median']
train_groupby = create_groupby_statistics(combined_train)

# Test rows have no target, so their statistics come from the full training
# data. The grouping columns are recovered from the training feature names,
# which keeps this cell in step with create_groupby_statistics.
test_groupby = pd.DataFrame(index=test.index)
for key in test.columns:
    if key not in combined_train.columns:
        continue
    clean_key = key.lower().replace(' ', '_').replace('(', '').replace(')', '')
    if f'{clean_key}_mean_price' not in train_groupby.columns:
        continue  # not a grouping column
    full_stats = combined_train.groupby(key)['Price'].agg(groupby_stats)
    for stat in groupby_stats:
        test_groupby[f'{clean_key}_{stat}_price'] = test[key].map(full_stats[stat])

# Same columns in the same order as training; group values unseen in
# training fall back to the training column mean.
test_groupby = test_groupby.reindex(columns=train_groupby.columns)
test_groupby = test_groupby.fillna(train_groupby.mean())

# Baseline features + groupby statistics + histogram bins
X_train = pd.concat([train_baseline, train_groupby, train_histogram], axis=1)
X_test = pd.concat([test_baseline, test_groupby, test_histogram], axis=1)

# The model expects identical feature names in identical order; selecting by
# the training columns fails loudly if a test feature is missing.
X_test = X_test[X_train.columns]

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")
print(f"Feature names (first 20): {list(X_train.columns)[:20]}")

# Check for any issues
print(f"\nNaN counts in training features:")
print(X_train.isnull().sum().sum())
print(f"NaN counts in test features:")
print(X_test.isnull().sum().sum())

## 5. Model Training with Feature Importance Monitoring

In [None]:
# 20-fold CV
print("Starting 20-fold CV...")
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Parameters (identical for every fold, so defined once)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

fold_scores = []
feature_importance_list = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\nFold {fold+1}/{n_folds}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Create DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Train with early stopping on the validation fold
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # Predict with the trees up to the best iteration, so the score does not
    # depend on which xgboost version's default is in effect.
    val_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_scores.append(fold_rmse)

    print(f"Fold {fold+1} RMSE: {fold_rmse:.6f}")

    # Get feature importance. xgb.train returns a Booster, which exposes
    # get_score directly (get_booster() only exists on the sklearn wrappers).
    importance = model.get_score(importance_type='gain')
    feature_importance_list.append(importance)

# Overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*60}")
print(f"CV RMSE: {cv_score:.6f} ± {cv_std:.6f}")
print(f"Individual folds: {[f'{s:.6f}' for s in fold_scores]}")
print(f"Improvement over exp_004: {38.660840 - cv_score:.6f}")
print(f"Gap to target: {cv_score - 38.616280:.6f}")
print(f"{'='*60}")

## 6. Feature Importance Analysis

In [None]:
# Add up each feature's gain over all CV folds, then report the ranking,
# per-category totals, and the features no fold ever split on.
print("Analyzing feature importance...")

total_importance = {}
for fold_gains in feature_importance_list:
    for feat_name, gain in fold_gains.items():
        if feat_name in total_importance:
            total_importance[feat_name] += gain
        else:
            total_importance[feat_name] = gain

# Highest total gain first
sorted_importance = sorted(total_importance.items(), key=lambda x: x[1], reverse=True)

print(f"\nTop 20 features by importance:")
for i, (feat, imp) in enumerate(sorted_importance[:20]):
    print(f"{i+1:2d}. {feat}: {imp:.2f}")

# Per-category totals
separator = f"{'='*60}"
print(f"\n{'='*60}")
print("Feature importance by category:")
print(separator)

ranked_names = [name for name, _ in sorted_importance]
derived_words = ['mean', 'std', 'count', 'min', 'max', 'median', 'hist']
stat_tags = ['_mean_', '_std_', '_count_', '_min_', '_max_', '_median_']

categories = {
    # Raw weight features: weight-prefixed, but not a statistic or histogram
    'Baseline (weight)': [
        f for f in ranked_names
        if f.startswith('weight_') and all(word not in f for word in derived_words)
    ],
    'Baseline (categorical)': [f for f in ranked_names if f.endswith('_encoded')],
    'Groupby statistics': [
        f for f in ranked_names if any(tag in f for tag in stat_tags)
    ],
    'Histogram bins': [f for f in ranked_names if 'hist_bin' in f],
}

for category, feats in categories.items():
    if not feats:
        print(f"{category:25s}:   0 features, total importance = 0.00")
        continue
    total_imp = sum(total_importance[f] for f in feats)
    print(f"{category:25s}: {len(feats):3d} features, total importance = {total_imp:.2f}")

print(separator)

# Features that never appear in any fold's importance dictionary
zero_importance = [f for f in X_train.columns if f not in total_importance]
print(f"\nFeatures with zero importance: {len(zero_importance)}")
if len(zero_importance) > 10:
    print("First 10 zero importance features:", zero_importance[:10])
else:
    print("Zero importance features:", zero_importance)

## 7. Generate Predictions

In [None]:
# Train final model on all data
import os

print("Training final model on all training data...")

dtrain_full = xgb.DMatrix(X_train, label=y)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# NOTE(review): this trains a fixed 2000 rounds with no validation set,
# whereas the CV folds stop early; the submitted model may therefore use
# more trees than the CV score reflects.
final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=2000
)

# Predict on test set
dtest = xgb.DMatrix(X_test)
test_pred = final_model.predict(dtest)

print(f"Test predictions shape: {test_pred.shape}")
print(f"Test predictions range: {test_pred.min():.2f} - {test_pred.max():.2f}")

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_pred
})

submission_path = '/home/code/submission_candidates/candidate_005.csv'
# Create the output directory first so a missing folder cannot discard the
# result of the full training run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")
print(submission.head())