# Experiment 003: Systematic Ablation Study

## Goal
Fix bugs from exp_001 and test features incrementally to identify what works vs. what hurts.

## Strategy
1. Start with the proven baseline: Weight Capacity features (original value, rounding, digits, integer/fractional parts) plus label-encoded categoricals
2. Add one feature group at a time
3. Measure CV impact for each group
4. Keep only feature groups that lower CV RMSE by more than 0.001

## Bug Fixes
- Remove quantile features (proven constant columns)
- Fix count encoding: compute from training data only
- Validate each feature before combining

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Report whether a CUDA device is visible and, if so, describe device 0
import torch

has_cuda = torch.cuda.is_available()
print(f"GPU available: {has_cuda}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 for display in GB
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## Load Data

In [None]:
# Read the three input tables from disk
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
training_extra = pd.read_csv('/home/data/training_extra.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Training extra shape: {training_extra.shape}")

# Stack both training tables row-wise under a fresh 0..n-1 index
combined_train = pd.concat(
    [train, training_extra],
    ignore_index=True,
)
print(f"Combined train shape: {combined_train.shape}")

# Summary statistics of the target column
print(f"\nTarget stats:")
price_summary = combined_train['Price'].describe()
print(price_summary)

## Feature Engineering - Baseline (Proven Features Only)

In [None]:
def create_baseline_features(df):
    """Build the baseline feature frame from the 'Weight Capacity (kg)' column.

    Args:
        df: DataFrame that may contain a 'Weight Capacity (kg)' column.

    Returns:
        A DataFrame sharing ``df``'s index. If the weight column is present it
        holds ``weight_original``, ``weight_round_7`` .. ``weight_round_10``,
        ``weight_digit_1`` .. ``weight_digit_5``, ``weight_int`` and
        ``weight_frac``; otherwise it has no columns. Missing weights produce
        NaN in every column instead of raising.
    """
    features = pd.DataFrame(index=df.index)

    if 'Weight Capacity (kg)' not in df.columns:
        return features

    weight = df['Weight Capacity (kg)'].copy()

    # Original value
    features['weight_original'] = weight

    # Rounding to 7-10 decimals
    for dec in range(7, 11):
        features[f'weight_round_{dec}'] = np.round(weight, decimals=dec)

    # First five characters of the decimal text with the point removed.
    # regex=False: '.' must be a literal dot, not a regex wildcard (the
    # default for `regex` differs between pandas versions).
    weight_str = weight.astype(str).str.replace('.', '', regex=False)
    for i in range(1, 6):
        # errors='coerce' maps non-digit characters (e.g. from 'nan', an
        # exponent 'e', or a sign) and too-short strings to NaN.
        digit = pd.to_numeric(weight_str.str[i - 1], errors='coerce')
        features[f'weight_digit_{i}'] = digit.astype(float)

    # Integer and fractional parts. np.trunc truncates toward zero like
    # astype(int) but keeps NaN, so rows with a missing weight do not raise;
    # as a consequence weight_int is stored as float.
    weight_whole = np.trunc(weight)
    features['weight_int'] = weight_whole
    features['weight_frac'] = weight - weight_whole

    return features

# Derive the baseline feature matrices for the training rows and the test rows
X_baseline = create_baseline_features(combined_train)
X_test_baseline = create_baseline_features(test)

print(f"Baseline features shape: {X_baseline.shape}")
print(f"Baseline test features shape: {X_test_baseline.shape}")

# List the generated column names
print(f"\nBaseline feature names:")
baseline_columns = list(X_baseline.columns)
print(baseline_columns)

## Label Encode Categoricals

In [None]:
# Label encode categorical features
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# One fitted encoder per column that is actually present in the training data
label_encoders = {}
for col in categorical_cols:
    if col not in combined_train.columns:
        continue

    # Fill missing values BEFORE casting to str: once NaN has been turned
    # into the text 'nan', fillna('missing') no longer has anything to fill.
    train_values = combined_train[col].fillna('missing').astype(str)
    test_values = test[col].fillna('missing').astype(str)

    # Fit on train + test values so transform never meets an unseen label
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    # Transform train and test
    X_baseline[f'{col}_encoded'] = le.transform(train_values)
    X_test_baseline[f'{col}_encoded'] = le.transform(test_values)

    label_encoders[col] = le

# Count the encoders actually fitted, not the candidate list, so skipped
# columns are not reported as added.
print(f"Added {len(label_encoders)} label encoded features")
print(f"Total baseline features: {X_baseline.shape[1]}")

## Cross-Validation Setup

In [None]:
# Feature matrix (a copy of the baseline frame) and target vector for CV
X = X_baseline.copy()
y = combined_train['Price'].to_numpy()

# Shuffled 20-fold splitter with a fixed seed
n_folds = 20
kf = KFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=42,
)

print(f"Training data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"CV folds: {n_folds}")

## Train Baseline Model

In [None]:
# XGBoost parameters (from winning solutions)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    # Train on the GPU only when one is visible; otherwise use the CPU
    # instead of requesting a device that is not there.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Cross-validation
fold_scores = []
best_iterations = []  # early-stopped round index of every fold
oof_predictions = np.zeros(len(X))

print(f"Starting {n_folds}-fold CV...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Train model. 'val' is listed last in evals because early stopping
    # monitors the last evaluation set.
    # NOTE(review): the same fold drives early stopping and scoring, so the
    # fold RMSE is likely slightly optimistic.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    best_iterations.append(model.best_iteration)

    # Predict with the trees up to and including the best iteration. The
    # range is spelled out so the score does not depend on whether the
    # installed xgboost applies best_iteration by default.
    val_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    oof_predictions[val_idx] = val_pred

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    fold_scores.append(rmse)

    print(f"Fold {fold+1:2d}/{n_folds} - RMSE: {rmse:.6f}")

# Overall CV score
cv_score = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"\nOverall CV RMSE: {cv_score:.6f}")
print(f"Mean fold RMSE: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
print(f"Fold RMSE range: {np.min(fold_scores):.6f} - {np.max(fold_scores):.6f}")
print(f"Mean best iteration: {np.mean(best_iterations):.0f}")

## Feature Importance Analysis

In [None]:
# Get feature importance from last fold
importance = model.get_score(importance_type='gain')
feature_names = X.columns.tolist()

# Create importance dataframe.
# The booster was trained on a DMatrix built from a DataFrame, so get_score
# keys are the real column names; looking up only 'f0', 'f1', ... would
# report 0 for every feature. The positional 'f{i}' key is kept as a fallback
# for boosters trained without feature names. Features that never appear in
# a split are absent from the dict and default to 0.
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': [
        importance.get(name, importance.get(f'f{i}', 0))
        for i, name in enumerate(feature_names)
    ]
})

# Sort by importance
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 15 features by importance:")
print(importance_df.head(15))

# Check for zero-variance features (at most one distinct non-missing value)
zero_var_features = X.columns[X.nunique() <= 1]
if len(zero_var_features) > 0:
    print(f"\n⚠️  Zero variance features: {zero_var_features.tolist()}")
else:
    print("\n✓ No zero variance features detected")

## Generate Predictions

In [None]:
# Train final model on full data
dtrain_full = xgb.DMatrix(X, label=y)

# There is no validation set here, so early stopping cannot run. Reuse the
# round count selected by early stopping in the last CV fold rather than
# always training the full 2000 rounds, which would fit a different (longer
# trained) model than the one the CV score describes. Fall back to 2000 if
# the CV model carries no best_iteration.
best_iteration = getattr(model, 'best_iteration', None)
final_rounds = 2000 if best_iteration is None else best_iteration + 1
print(f"Final model boosting rounds: {final_rounds}")

final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=final_rounds,
    verbose_eval=False
)

# Predict on test
dtest = xgb.DMatrix(X_test_baseline)
test_predictions = final_model.predict(dtest)

# Clip predictions to training range
min_price = y.min()
max_price = y.max()
test_predictions = np.clip(test_predictions, min_price, max_price)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: {test_predictions.min():.2f} - {test_predictions.max():.2f}")

## Create Submission

In [None]:
# Start from the sample submission so its existing columns and row order are kept
sample_sub = pd.read_csv('/home/data/sample_submission.csv')

# Copy of the sample with the Price column set to the model predictions
submission = sample_sub.assign(Price=test_predictions)

# Write the CSV without the DataFrame index
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Submission Price stats:")
price_summary = submission['Price'].describe()
print(price_summary)