# Enhanced Feature Engineering Experiment

Implementing comprehensive feature engineering based on winning solution strategies:
- Enhanced Weight Capacity features (50 bins, quantiles, rounding)
- Target encoding with nested CV (Color, Material, Brand)
- Categorical interactions (Brand_Size, Size_Color, Size_Style)
- Count encoding for all categoricals
- Hyperparameter tuning (lower LR, deeper trees)

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Check GPU
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Load data
print("Loading training data...")
train1 = pd.read_csv('/home/data/train.csv')
train2 = pd.read_csv('/home/data/training_extra.csv')
train = pd.concat([train1, train2], ignore_index=True)

print(f"Combined training shape: {train.shape}")

# Load test data
test = pd.read_csv('/home/data/test.csv')
print(f"Test shape: {test.shape}")

# Identify features
cat_features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
target_col = 'Price'

Loading training data...


Combined training shape: (3994318, 11)
Test shape: (200000, 10)


In [3]:
# Handle missing values
print("Handling missing values...")
for col in cat_features:
    train[col] = train[col].fillna('Missing')
    test[col] = test[col].fillna('Missing')

# Numerical missing values
train['Compartments'] = train['Compartments'].fillna(train['Compartments'].median())
test['Compartments'] = test['Compartments'].fillna(train['Compartments'].median())

train['Weight Capacity (kg)'] = train['Weight Capacity (kg)'].fillna(train['Weight Capacity (kg)'].median())
test['Weight Capacity (kg)'] = test['Weight Capacity (kg)'].fillna(train['Weight Capacity (kg)'].median())

print("Missing values handled.")

Handling missing values...


Missing values handled.


In [4]:
# 1. Enhanced Weight Capacity Features (Highest Priority)
print("Creating enhanced Weight Capacity features...")

def create_weight_capacity_features(df):
    df = df.copy()
    wc = df['Weight Capacity (kg)']
    
    # 50 uniform bins (best from analysis)
    df['weight_bin_50'] = pd.cut(wc, bins=50, labels=False, retbins=False)
    
    # Rounding to 7-10 decimal places (winning solution pattern)
    for dec in [7, 8, 9, 10]:
        df[f'weight_round_{dec}'] = wc.round(dec)
    
    # Digit extraction (1-5 digits)
    for k in range(1, 6):
        df[f'weight_digit_{k}'] = ((wc * 10**k) % 10).fillna(-1)
    
    # Basic components
    df['weight_int'] = wc.astype(int)
    df['weight_frac'] = wc - df['weight_int']
    
    # Quantile features from training data
    quantiles = [0.25, 0.5, 0.75, 0.9]
    for q in quantiles:
        df[f'weight_q{int(q*100)}'] = wc.quantile(q)
    
    return df

train = create_weight_capacity_features(train)
test = create_weight_capacity_features(test)

print(f"Weight capacity features created. New shape: {train.shape}")

Creating enhanced Weight Capacity features...


Weight capacity features created. New shape: (3994318, 27)


In [5]:
# 2. Count Encoding (Medium Priority) - Do this before target encoding
print("Creating count encoding features...")

def create_count_encoding(df, df_test, cat_cols):
    """Create count encoding for categorical features"""
    df = df.copy()
    df_test = df_test.copy()
    
    for col in cat_cols:
        # Compute counts from combined train+test
        combined = pd.concat([df[col], df_test[col]], ignore_index=True)
        counts = combined.value_counts()
        
        df[f'{col}_count'] = df[col].map(counts)
        df_test[f'{col}_count'] = df_test[col].map(counts)
    
    return df, df_test

train, test = create_count_encoding(train, test, cat_features)
print("Count encoding completed.")

Creating count encoding features...


Count encoding completed.


In [6]:
# 3. Categorical Interaction Features (High Priority)
print("Creating categorical interaction features...")

def create_interaction_features(df, df_test):
    """Create interaction features with strong signals"""
    df = df.copy()
    df_test = df_test.copy()
    
    # Brand_Size: Strongest signal (0.49)
    df['Brand_Size'] = df['Brand'].astype(str) + '_' + df['Size'].astype(str)
    df_test['Brand_Size'] = df_test['Brand'].astype(str) + '_' + df_test['Size'].astype(str)
    
    # Size_Color: Moderate signal (0.17)
    df['Size_Color'] = df['Size'].astype(str) + '_' + df['Color'].astype(str)
    df_test['Size_Color'] = df_test['Size'].astype(str) + '_' + df_test['Color'].astype(str)
    
    # Size_Style: Moderate signal (0.12)
    df['Size_Style'] = df['Size'].astype(str) + '_' + df['Style'].astype(str)
    df_test['Size_Style'] = df_test['Size'].astype(str) + '_' + df_test['Style'].astype(str)
    
    return df, df_test

train, test = create_interaction_features(train, test)

# Add interaction features to cat_features list for encoding
interaction_features = ['Brand_Size', 'Size_Color', 'Size_Style']
all_cat_features = cat_features + interaction_features

print(f"Interaction features created: {interaction_features}")
print(f"Total categorical features: {len(all_cat_features)}")

Creating categorical interaction features...


Interaction features created: ['Brand_Size', 'Size_Color', 'Size_Style']
Total categorical features: 10


In [7]:
# 4. Target Encoding with Nested CV (High Priority)
print("Implementing target encoding with nested CV...")

def target_encode(train_df, test_df, cat_cols, target_col, n_folds=5):
    """
    Target encoding with nested CV to prevent leakage
    """
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Global target mean for smoothing
    global_mean = train_df[target_col].mean()
    
    for col in cat_cols:
        print(f"Target encoding {col}...")
        
        # Initialize encoded columns
        train_df[f'{col}_target_enc'] = 0
        test_df[f'{col}_target_enc'] = 0
        
        # Outer CV for training data
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        
        # Store OOF predictions for training data
        oof_encodings = np.zeros(len(train_df))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
            X_tr, X_val = train_df.iloc[train_idx], train_df.iloc[val_idx]
            
            # Compute target statistics on training fold
            target_stats = X_tr.groupby(col)[target_col].agg(['mean', 'count'])
            
            # Apply smoothing: (count * mean + alpha * global_mean) / (count + alpha)
            alpha = 100  # Smoothing parameter
            smoothed_mean = (target_stats['count'] * target_stats['mean'] + alpha * global_mean) / (target_stats['count'] + alpha)
            
            # Map to validation fold
            oof_encodings[val_idx] = X_val[col].map(smoothed_mean).fillna(global_mean)
            
            # For test data, use all training data for encoding
            if fold == 0:  # Only need to compute once for test
                test_target_stats = train_df.groupby(col)[target_col].agg(['mean', 'count'])
                test_smoothed = (test_target_stats['count'] * test_target_stats['mean'] + alpha * global_mean) / (test_target_stats['count'] + alpha)
                test_df[f'{col}_target_enc'] = test_df[col].map(test_smoothed).fillna(global_mean)
        
        # Assign OOF encodings to training data
        train_df[f'{col}_target_enc'] = oof_encodings
    
    return train_df, test_df

# Apply target encoding to top features: Color, Material, Brand (from analysis)
target_encode_features = ['Color', 'Material', 'Brand']
train, test = target_encode(train, test, target_encode_features, target_col)

print("Target encoding completed.")

Implementing target encoding with nested CV...


Target encoding Color...


Target encoding Material...


Target encoding Brand...


Target encoding completed.


In [8]:
# 5. Label encode all categorical features (including interactions)
print("Label encoding all categorical features...")

le_dict = {}
for col in all_cat_features:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], ignore_index=True)
    le.fit(combined.astype(str))
    
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    
    le_dict[col] = le
    print(f"Encoded {col}: {len(le.classes_)} classes")

print("Label encoding completed.")

Label encoding all categorical features...


Encoded Brand: 6 classes


Encoded Material: 5 classes


Encoded Size: 4 classes


Encoded Laptop Compartment: 3 classes


Encoded Waterproof: 3 classes


Encoded Style: 4 classes


Encoded Color: 7 classes


Encoded Brand_Size: 24 classes


Encoded Size_Color: 28 classes


Encoded Size_Style: 16 classes
Label encoding completed.


In [9]:
# Prepare final feature matrix
feature_cols = [col for col in train.columns if col not in ['id', target_col]]
X = train[feature_cols]
y = train[target_col]
X_test = test[feature_cols]

print(f"Final training features shape: {X.shape}")
print(f"Final test features shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")

# Show feature types
print(f"\nFeature types:")
print(f"  Weight capacity features: {len([c for c in feature_cols if 'weight' in c])}")
print(f"  Count encoding features: {len([c for c in feature_cols if '_count' in c])}")
print(f"  Target encoding features: {len([c for c in feature_cols if '_target_enc' in c])}")
print(f"  Original categoricals: {len([c for c in feature_cols if c in all_cat_features])}")
print(f"  Numerical features: {len([c for c in feature_cols if c in ['Compartments', 'Weight Capacity (kg)']])}")

Final training features shape: (3994318, 38)
Final test features shape: (200000, 38)
Number of features: 38

Feature types:
  Weight capacity features: 16
  Count encoding features: 7
  Target encoding features: 3
  Original categoricals: 10
  Numerical features: 2


In [10]:
# 6. Hyperparameter tuning (Medium Priority)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',
    'device': 'cuda',
    'learning_rate': 0.05,  # Reduced from 0.1
    'max_depth': 8,         # Increased from 6
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_estimators': 2000,   # Increased from 1000
    'early_stopping_rounds': 100,  # Increased from 50
    'verbosity': 0
}

print("Optimized XGBoost parameters:")
for k, v in params.items():
    print(f"  {k}: {v}")

Optimized XGBoost parameters:
  objective: reg:squarederror
  eval_metric: rmse
  tree_method: gpu_hist
  device: cuda
  learning_rate: 0.05
  max_depth: 8
  subsample: 0.8
  colsample_bytree: 0.8
  random_state: 42
  n_estimators: 2000
  early_stopping_rounds: 100
  verbosity: 0


In [11]:
# 20-fold CV training
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

rmse_scores = []
oof_predictions = np.zeros(len(train))
test_predictions = np.zeros(len(test))

print(f"Starting {n_folds}-fold CV training with enhanced features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold + 1}/{n_folds}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train model
    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predictions
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    
    # Store OOF predictions
    oof_predictions[val_idx] = val_pred
    
    # Accumulate test predictions
    test_predictions += test_pred / n_folds
    
    # Calculate RMSE
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_scores.append(fold_rmse)
    print(f"  Fold RMSE: {fold_rmse:.6f}")

# Overall CV score
cv_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"\n{'='*50}")
print(f"Overall CV RMSE: {cv_rmse:.6f}")
print(f"Mean Fold RMSE: {np.mean(rmse_scores):.6f} ± {np.std(rmse_scores):.6f}")
print(f"Improvement over baseline: {38.781061 - cv_rmse:.6f}")
print(f"{'='*50}")

Starting 20-fold CV training with enhanced features...
Fold 1/20


  Fold RMSE: 38.768238
Fold 2/20


  Fold RMSE: 38.797014
Fold 3/20


  Fold RMSE: 38.702667
Fold 4/20


  Fold RMSE: 38.776457
Fold 5/20


  Fold RMSE: 38.703144
Fold 6/20


  Fold RMSE: 38.781465
Fold 7/20


  Fold RMSE: 38.790562
Fold 8/20


  Fold RMSE: 38.732050
Fold 9/20


  Fold RMSE: 38.798461
Fold 10/20


  Fold RMSE: 38.770447
Fold 11/20


  Fold RMSE: 38.882109
Fold 12/20


  Fold RMSE: 38.863405
Fold 13/20


  Fold RMSE: 38.797187
Fold 14/20


  Fold RMSE: 38.814783
Fold 15/20


  Fold RMSE: 38.731059
Fold 16/20


  Fold RMSE: 38.775820
Fold 17/20


  Fold RMSE: 38.725682
Fold 18/20


  Fold RMSE: 38.852151
Fold 19/20


  Fold RMSE: 38.831710
Fold 20/20


  Fold RMSE: 38.833186

Overall CV RMSE: 38.786412
Mean Fold RMSE: 38.786380 ± 0.049658
Improvement over baseline: -0.005351


In [None]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_predictions
})

# Clip predictions
submission['Price'] = submission['Price'].clip(lower=train[target_col].min(), upper=train[target_col].max())

print("Submission preview:")
print(submission.head())
print(f"\nSubmission statistics:")
print(submission[target_col].describe())

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")