# Experiment 004: Groupby Statistics Features

## Goal
Implement groupby-statistics features following the winning-solution pattern, which showed 0.71 correlation with the target versus <0.02 for simple transformations.

## Strategy
Based on analysis in evolver_loop4_analysis.ipynb:
- Groupby aggregations have 0.71 correlation with target
- Simple weight transformations have <0.02 correlation
- Winning pattern: `groupby(COL1)[COL2].agg(STAT)` where COL2 is the target

## Implementation
1. Groupby Weight Capacity: mean, std, count, min, max, median of Price
2. Groupby each categorical: mean, std, count, min, max, median of Price
3. Compute training-row statistics out-of-fold (K-fold) to prevent target leakage
4. Merge back to train/test based on the group key

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

# Report whether a CUDA device is visible and, if so, its name and total memory
import torch

has_gpu = torch.cuda.is_available()
print(f"GPU available: {has_gpu}")
if has_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 before being printed with a "GB" suffix
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.1f} GB")

GPU available: True
GPU: NVIDIA H100 80GB HBM3
GPU Memory: 85.0 GB


## Load Data

In [2]:
# Read the three CSV inputs
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
training_extra = pd.read_csv('/home/data/training_extra.csv')

# Stack the extra rows under the main training rows; ignore_index=True gives
# the combined frame a fresh 0..n-1 index
combined_train = pd.concat([train, training_extra], ignore_index=True)

# Report the size of each frame
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Training extra shape: {training_extra.shape}")
print(f"Combined train shape: {combined_train.shape}")

# Summary statistics of the target column
price_summary = combined_train['Price'].describe()
print("\nTarget stats:")
print(price_summary)

Train shape: (300000, 11)
Test shape: (200000, 10)
Training extra shape: (3694318, 11)
Combined train shape: (3994318, 11)

Target stats:


count    3.994318e+06
mean     8.136217e+01
std      3.893868e+01
min      1.500000e+01
25%      4.747002e+01
50%      8.098495e+01
75%      1.148550e+02
max      1.500000e+02
Name: Price, dtype: float64


## Create Baseline Features (from exp_003)

In [3]:
def create_baseline_features(df):
    """Build the exp_003 baseline feature frame from the weight-capacity column.

    The returned DataFrame shares ``df``'s index. If ``df`` has no
    'Weight Capacity (kg)' column the result has no columns. Otherwise it
    contains, in this order:

    - ``weight_original``: the raw value (missing values kept as NaN)
    - ``weight_round_7`` .. ``weight_round_10``: the value rounded to 7-10 decimals
    - ``weight_digit_1`` .. ``weight_digit_5``: the first five characters of the
      value's string form with the decimal point removed, right-padded with
      '0', each converted to float
    - ``weight_int`` / ``weight_frac``: integer part and the remainder

    Missing weights are replaced by 0 for the digit, integer and fractional
    features only.
    """
    out = pd.DataFrame(index=df.index)

    # Nothing to derive without the weight column
    if 'Weight Capacity (kg)' not in df.columns:
        return out

    capacity = df['Weight Capacity (kg)'].copy()
    out['weight_original'] = capacity

    # Rounded copies at 7, 8, 9 and 10 decimals
    for places in (7, 8, 9, 10):
        out[f'weight_round_{places}'] = np.round(capacity, decimals=places)

    # NaN -> 0 so the string / int conversions below are defined for every row
    capacity_filled = capacity.fillna(0)

    # String form without the decimal point, right-padded to at least 5 chars
    digit_text = (
        capacity_filled.astype(str)
        .str.replace('.', '', regex=False)
        .str.ljust(5, '0')
    )
    for pos in range(5):
        out[f'weight_digit_{pos + 1}'] = digit_text.str.get(pos).astype(float)

    # Integer part (astype(int) truncates) and what is left after removing it
    whole_part = capacity_filled.astype(int)
    out['weight_int'] = whole_part
    out['weight_frac'] = capacity_filled - whole_part

    return out

# Build the baseline feature matrices for the training rows and the test rows
X_baseline, X_test_baseline = (
    create_baseline_features(frame) for frame in (combined_train, test)
)

print(f"Baseline features shape: {X_baseline.shape}")
print(f"Baseline test features shape: {X_test_baseline.shape}")

Baseline features shape: (3994318, 12)
Baseline test features shape: (200000, 12)


## Label Encode Categoricals

In [None]:
# Label encode categorical features
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# One fitted LabelEncoder per encoded column, keyed by the original column name
label_encoders = {}
for col in categorical_cols:
    if col not in combined_train.columns:
        continue

    # Fill missing values BEFORE casting to str. Casting first can turn NaN into
    # the literal string 'nan', after which fillna('missing') has nothing to fill
    # and the 'missing' label is never used.
    train_values = combined_train[col].fillna('missing').astype(str)
    test_values = test[col].fillna('missing').astype(str)

    # Fit on train + test together so every category seen in either has a code
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    # Add the integer codes as new '<col>_encoded' columns on the baseline frames
    X_baseline[f'{col}_encoded'] = le.transform(train_values)
    X_test_baseline[f'{col}_encoded'] = le.transform(test_values)

    label_encoders[col] = le

# Count the encoders actually fitted (columns absent from the data are skipped)
print(f"Added {len(label_encoders)} label encoded features")
print(f"Total baseline features: {X_baseline.shape[1]}")

## Groupby Statistics - Core Implementation

In [None]:
def _lookup_group_stats(key_values, group_stats):
    """Return the rows of ``group_stats`` matching ``key_values``, in order, as a float array.

    ``key_values`` is a named Series of group keys; ``group_stats`` is a frame
    indexed by group key. Keys that have no row in ``group_stats`` yield NaN.
    """
    merged = key_values.to_frame().merge(
        group_stats,
        left_on=key_values.name,
        right_index=True,
        how='left'
    )
    return merged[group_stats.columns].to_numpy(dtype=float)


def create_groupby_features(train_df, test_df, target_col='Price', n_folds=5, group_keys=None):
    """
    Create groupby statistics of the target, computed out-of-fold for training rows.

    Pattern: groupby(COL1)[COL2].agg(STAT) where COL2 is the target.

    For every group key, six statistics of ``target_col`` are produced
    (mean, std, count, min, max, median) as columns named
    ``'<key>_<stat>_price'``.

    - Test rows use statistics computed on all of ``train_df``.
    - Training rows use K-fold out-of-fold statistics: each row's values come
      from the folds that do not contain that row, so a row never sees its
      own target.

    NOTE(review): this is out-of-fold encoding, not fully nested CV. When a
    later model CV re-splits the same rows, a training row's features may still
    include targets from that CV's validation rows - confirm whether that is
    acceptable for the reported score.

    Parameters
    ----------
    train_df : DataFrame containing the group-key columns and ``target_col``.
    test_df : DataFrame containing the group-key columns.
    target_col : name of the target column in ``train_df``.
    n_folds : number of KFold splits used for the out-of-fold statistics.
    group_keys : optional list of column names to group by. Defaults to
        'Weight Capacity (kg)' plus the module-level ``categorical_cols``.
        Keys missing from ``train_df`` are skipped.

    Returns
    -------
    (features_train, features_test) : DataFrames sharing the index of
    ``train_df`` / ``test_df`` respectively, with no NaN values.
    """
    if group_keys is None:
        group_keys = ['Weight Capacity (kg)'] + categorical_cols

    # Statistics to compute
    stats = ['mean', 'std', 'count', 'min', 'max', 'median']

    # Per-statistic fallback for cells with no group value (unseen key, or std
    # of a single-row group): the matching statistic of the whole target column,
    # and 0 for count. Filling everything with the mean price would put a price
    # into the count and std columns.
    fallback = train_df[target_col].agg(stats)
    fallback['count'] = 0

    # Fixed seed: the same fold assignment is produced for every key
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    train_blocks = []
    test_blocks = []

    for key in group_keys:
        print(f"Processing groupby features for: {key}")

        if key not in train_df.columns:
            continue

        col_names = [f'{key}_{stat}_price' for stat in stats]
        fill_values = {f'{key}_{stat}_price': fallback[stat] for stat in stats}

        # Test rows: statistics from the full training data
        full_stats = train_df.groupby(key)[target_col].agg(stats)
        full_stats.columns = col_names
        test_block = pd.DataFrame(
            _lookup_group_stats(test_df[key], full_stats),
            index=test_df.index,
            columns=col_names
        )

        # Training rows: out-of-fold statistics. KFold yields POSITIONAL indices,
        # so results are written into a plain array by position; label-based
        # .loc would only be correct for a default 0..n-1 index.
        oof = np.full((len(train_df), len(stats)), np.nan)
        for train_idx, val_idx in kf.split(train_df):
            fold_stats = train_df.iloc[train_idx].groupby(key)[target_col].agg(stats)
            fold_stats.columns = col_names
            oof[val_idx] = _lookup_group_stats(train_df[key].iloc[val_idx], fold_stats)
        train_block = pd.DataFrame(oof, index=train_df.index, columns=col_names)

        train_blocks.append(train_block.fillna(fill_values))
        test_blocks.append(test_block.fillna(fill_values))

    # Assemble once at the end instead of inserting columns one at a time
    if train_blocks:
        features_train = pd.concat(train_blocks, axis=1)
        features_test = pd.concat(test_blocks, axis=1)
    else:
        features_train = pd.DataFrame(index=train_df.index)
        features_test = pd.DataFrame(index=test_df.index)

    return features_train, features_test

# Compute the groupby statistics for the combined training rows and the test rows
print("Creating groupby statistics features...")
groupby_features_train, groupby_features_test = create_groupby_features(
    train_df=combined_train,
    test_df=test,
    target_col='Price',
    n_folds=5,
)

print(f"Groupby features train shape: {groupby_features_train.shape}")
print(f"Groupby features test shape: {groupby_features_test.shape}")

# Show the first ten generated column names
print("\nSample groupby features:")
print(list(groupby_features_train.columns[:10]))

## Combine All Features

In [None]:
# Join the baseline and groupby feature frames side by side (aligned on index)
X = pd.concat([X_baseline, groupby_features_train], axis='columns')
X_test = pd.concat([X_test_baseline, groupby_features_test], axis='columns')

print(f"Final training features shape: {X.shape}")
print(f"Final test features shape: {X_test.shape}")

# Column counts per feature group
n_baseline_cols = X_baseline.shape[1]
n_groupby_cols = groupby_features_train.shape[1]
print("\nFeature groups:")
print(f"- Baseline features: {n_baseline_cols}")
print(f"- Groupby features: {n_groupby_cols}")
print(f"- Total: {X.shape[1]}")

## Cross-Validation Setup

In [None]:
# Target vector for the model CV, taken from the combined training frame
y = combined_train['Price'].to_numpy()

# Shuffled 20-fold splitter with a fixed seed (consistent with previous experiments)
n_folds = 20
kf = KFold(shuffle=True, random_state=42, n_splits=n_folds)

print(f"Training data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"CV folds: {n_folds}")

## Train Model with Groupby Features

In [None]:
# XGBoost parameters (from winning solutions)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Cross-validation
fold_scores = []
best_iterations = []  # best boosting round (0-based) found by early stopping, per fold
oof_predictions = np.zeros(len(X))

print("Starting 20-fold CV with groupby features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Create DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Train model; early stopping watches the LAST entry in evals ('val')
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    
    # xgb.train returns the booster as of the last round trained, which can be
    # up to early_stopping_rounds past the best one. Restrict prediction to the
    # best round so the fold score reflects the early-stopped model.
    best_iterations.append(model.best_iteration)
    val_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    oof_predictions[val_idx] = val_pred
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_scores.append(rmse)
    
    print(f"Fold {fold+1:2d}/{n_folds} - RMSE: {rmse:.6f}")

# Overall CV score: mean and spread of the per-fold RMSEs
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*50}")
print(f"CV RMSE: {cv_score:.6f} ± {cv_std:.6f}")
print(f"{'='*50}")

# Compare to baseline (positive improvement = lower RMSE than exp_003)
baseline_score = 38.825723
improvement = baseline_score - cv_score
print(f"Improvement over exp_003 baseline: {improvement:.6f}")
print(f"Target: 38.616280 (gap: {cv_score - 38.616280:.6f})")

## Feature Importance Analysis

In [None]:
# Get feature importance from last fold
importance = model.get_score(importance_type='gain')
feature_names = X.columns.tolist()

# Create importance dataframe.
# The boosters are trained on DMatrix objects built from DataFrames, so
# get_score() keys are the column names themselves. Looking up 'f0', 'f1', ...
# would miss every key and report 0 for all features. The positional 'f{i}'
# form is kept only as a fallback for boosters trained without feature names.
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': [
        importance.get(name, importance.get(f'f{i}', 0))
        for i, name in enumerate(feature_names)
    ]
})

# Sort by importance
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 20 features by importance:")
print(importance_df.head(20))

# Check if groupby features are at the top (their names contain '_price')
groupby_feature_importance = importance_df[importance_df['feature'].str.contains('_price')]
print(f"\nGroupby feature importance (top 10):")
print(groupby_feature_importance.head(10))

# Check for zero-variance features (columns with at most one distinct value)
zero_var_features = X.columns[X.nunique() <= 1]
if len(zero_var_features) > 0:
    print(f"\n⚠️  Zero variance features: {zero_var_features.tolist()}")
else:
    print("\n✓ No zero variance features detected")

## Generate Predictions

In [None]:
# Train final model on full data
dtrain_full = xgb.DMatrix(X, label=y)

# There is no validation set here, so early stopping cannot be used. Training a
# fixed 2000 rounds would ignore the round count selected during CV, so reuse
# the best round found by early stopping in the last CV fold (`model`).
# Falls back to 2000 rounds if that booster carries no best_iteration.
last_fold_best = getattr(model, 'best_iteration', None)
final_num_boost_round = 2000 if last_fold_best is None else last_fold_best + 1
print(f"Final model boosting rounds: {final_num_boost_round}")

final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=final_num_boost_round,
    verbose_eval=False
)

# Predict on test
dtest = xgb.DMatrix(X_test)
test_predictions = final_model.predict(dtest)

# Clip predictions to the min/max target value seen in training
min_price = y.min()
max_price = y.max()
test_predictions = np.clip(test_predictions, min_price, max_price)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: {test_predictions.min():.2f} - {test_predictions.max():.2f}")

## Create Submission

In [None]:
# Load sample submission
sample_sub = pd.read_csv('/home/data/sample_submission.csv')

# Fail with a clear message if the prediction count does not match the template.
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission rows are in the same order as the test rows - confirm.
if len(sample_sub) != len(test_predictions):
    raise ValueError(
        f"Sample submission has {len(sample_sub)} rows but "
        f"{len(test_predictions)} predictions were produced"
    )

# Create submission
submission = sample_sub.copy()
submission['Price'] = test_predictions

# Save submission (create the output directory first; to_csv does not)
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Submission Price stats:")
print(submission['Price'].describe())

# Also save to candidates folder
candidate_path = '/home/code/submission_candidates/candidate_004.csv'
os.makedirs(os.path.dirname(candidate_path), exist_ok=True)
submission.to_csv(candidate_path, index=False)
print(f"\nCandidate also saved to: {candidate_path}")