# Experiment 006: Original Dataset + COMBO Features

**Strategy:** Implement the evolved strategy focusing on:
1. Original dataset features (orig_price, orig_price_r7, orig_price_r8, orig_price_r9)
2. COMBO/interaction features (NaN encoding, NaN × Weight Capacity, categorical × Weight Capacity)
3. Optimize groupby statistics (keep mean, count, median; add skew, kurtosis, percentiles)
4. Remove histogram bins (they hurt performance)
5. Hyperparameter refinement

**Expected improvement:** -0.148 RMSE total, targeting 38.512840 CV

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
submission = pd.read_csv('/home/data/sample_submission.csv')

# Load original dataset
original = pd.read_csv('/home/code/original_dataset/Noisy_Student_Bag_Price_Prediction_Dataset.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Original dataset shape: {original.shape}")
print(f"\nOriginal dataset columns: {original.columns.tolist()}")
print(f"\nOriginal dataset sample:")
print(original.head())

Train shape: (300000, 11)
Test shape: (200000, 10)
Original dataset shape: (52500, 10)

Original dataset columns: ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price']

Original dataset sample:
          Brand Material   Size  Compartments Laptop Compartment Waterproof  \
0      Jansport    Nylon  Small           2.0                 No        Yes   
1  Under Armour    Nylon  Large           4.0                Yes        Yes   
2          Nike    Nylon  Large           NaN                 No        Yes   
3          Nike    Nylon  Small           1.0                Yes         No   
4  Under Armour  Leather  Small           8.0                Yes         No   

       Style  Color  Weight Capacity (kg)       Price  
0   Backpack  Green             13.340058  143.445135  
1       Tote   Pink              5.918030   72.086319  
2  Messenger    Red             24.088386   29.699631  
3  Messenger   Pink         

## Step 1: Compute Original Dataset Features

Following the winning solution, compute:
- orig_price: mean Price by Weight Capacity
- orig_price_r7, orig_price_r8, orig_price_r9: mean Price by rounded Weight Capacity (7, 8, 9 decimals)

In [2]:
# Extract Weight Capacity from original dataset
# The original dataset has 'Weight Capacity (kg)' column
original['weight_capacity'] = original['Weight Capacity (kg)'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

# Compute orig_price: mean Price by Weight Capacity
orig_price = original.groupby('weight_capacity')['Price'].mean().reset_index()
orig_price.columns = ['weight_capacity', 'orig_price']

# Compute rounded versions
for decimals in [7, 8, 9]:
    col_name = f'weight_capacity_r{decimals}'
    orig_price[col_name] = orig_price['weight_capacity'].round(decimals)
    
    # Compute mean by rounded weight capacity
    temp = original.copy()
    temp['weight_rounded'] = temp['weight_capacity'].round(decimals)
    rounded_price = temp.groupby('weight_rounded')['Price'].mean().reset_index()
    rounded_price.columns = [col_name, f'orig_price_r{decimals}']
    
    # Merge back
    orig_price = orig_price.merge(rounded_price, on=col_name, how='left')

print("Original dataset features computed:")
print(orig_price.head(10))
print(f"\nShape: {orig_price.shape}")
print(f"NaN counts: {orig_price.isnull().sum().sum()}")

Original dataset features computed:
   weight_capacity  orig_price  weight_capacity_r7  orig_price_r7  \
0         5.000000   80.693646            5.000000      80.693646   
1         5.001061   93.862638            5.001061      93.862638   
2         5.004444  130.627948            5.004444     130.627948   
3         5.004837   76.920155            5.004837      76.920155   
4         5.005468  101.682464            5.005468     101.682464   
5         5.005485   66.103371            5.005485      66.103371   
6         5.006887   83.136924            5.006887      83.136924   
7         5.008382  133.934556            5.008382     133.934556   
8         5.009264  104.682819            5.009264     104.682819   
9         5.009461   75.432553            5.009461      75.432553   

   weight_capacity_r8  orig_price_r8  weight_capacity_r9  orig_price_r9  
0            5.000000      80.693646            5.000000      80.693646  
1            5.001061      93.862638            5.001061

## Step 2: Extract Weight Capacity from Current Dataset

In [3]:
# Extract weight capacity from current dataset
def extract_weight_capacity(df):
    """Extract numeric weight capacity from Weight Capacity column"""
    df = df.copy()
    df['weight_capacity'] = df['Weight Capacity (kg)'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)
    
    # Create rounded versions
    for decimals in [7, 8, 9]:
        df[f'weight_capacity_r{decimals}'] = df['weight_capacity'].round(decimals)
    
    return df

train = extract_weight_capacity(train)
test = extract_weight_capacity(test)

print("Weight capacity extracted:")
print(train[['Weight Capacity (kg)', 'weight_capacity', 'weight_capacity_r7', 'weight_capacity_r8', 'weight_capacity_r9']].head())

Weight capacity extracted:
   Weight Capacity (kg)  weight_capacity  weight_capacity_r7  \
0             11.611723        11.611723           11.611723   
1             27.078537        27.078537           27.078537   
2             16.643760        16.643760           16.643760   
3             12.937220        12.937220           12.937220   
4             17.749338        17.749338           17.749339   

   weight_capacity_r8  weight_capacity_r9  
0           11.611723           11.611723  
1           27.078537           27.078537  
2           16.643760           16.643760  
3           12.937220           12.937220  
4           17.749338           17.749338  


## Step 3: Merge Original Dataset Features

In [4]:
# Merge original dataset features
def merge_orig_features(df, orig_features):
    """Merge original dataset price features"""
    df = df.copy()
    
    # Merge by exact weight capacity
    df = df.merge(orig_features[['weight_capacity', 'orig_price']], on='weight_capacity', how='left')
    
    # Merge by rounded weight capacity
    for decimals in [7, 8, 9]:
        df = df.merge(
            orig_features[['weight_capacity_r' + str(decimals), 'orig_price_r' + str(decimals)]],
            on='weight_capacity_r' + str(decimals),
            how='left'
        )
    
    return df

train = merge_orig_features(train, orig_price)
test = merge_orig_features(test, orig_price)

print("Original features merged:")
print(train[['weight_capacity', 'orig_price', 'orig_price_r7', 'orig_price_r8', 'orig_price_r9']].head())
print(f"\nNaN counts in original features:")
print(train[['orig_price', 'orig_price_r7', 'orig_price_r8', 'orig_price_r9']].isnull().sum())

Original features merged:
   weight_capacity  orig_price  orig_price_r7  orig_price_r8  orig_price_r9
0        11.611723   39.770555      39.770555      39.770555      39.770555
1        27.078537         NaN            NaN            NaN            NaN
2        16.643760         NaN            NaN            NaN            NaN
3        12.937220         NaN            NaN            NaN            NaN
4        17.749338         NaN            NaN            NaN            NaN

NaN counts in original features:
orig_price       163393
orig_price_r7    148490
orig_price_r8    151921
orig_price_r9    154435
dtype: int64


## Step 4: Create COMBO/Interaction Features

Following winning solution pattern:
- NaNs: Base-2 encoding of all NaN patterns
- {col}_nan_wc: NaN status × Weight Capacity for each of 7 categorical columns
- {col}_wc: Factorized categorical × Weight Capacity for each of 7 columns

In [5]:
# Define categorical columns
cat_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

# Step 4a: Base-2 NaN encoding
def create_nan_encoding(df):
    """Create base-2 encoding of NaN patterns"""
    df = df.copy()
    
    # Create binary pattern for NaN status
    nan_pattern = 0
    for i, col in enumerate(cat_cols):
        is_nan = df[col].isna().astype(int)
        nan_pattern += is_nan * (2 ** i)
    
    df['NaNs'] = nan_pattern
    return df

train = create_nan_encoding(train)
test = create_nan_encoding(test)

print("NaN encoding created:")
print(train['NaNs'].value_counts().head())

NaN encoding created:


NaNs
0     246734
64      8370
1       8313
2       7277
32      7011
Name: count, dtype: int64


In [6]:
# Step 4b: {col}_nan_wc features (NaN status × Weight Capacity)
for col in cat_cols:
    train[f'{col}_nan_wc'] = train[col].isna().astype(int) * train['weight_capacity']
    test[f'{col}_nan_wc'] = test[col].isna().astype(int) * test['weight_capacity']

print("NaN × Weight Capacity features created")
print(f"New features: {[f'{col}_nan_wc' for col in cat_cols]}")

NaN × Weight Capacity features created
New features: ['Brand_nan_wc', 'Material_nan_wc', 'Size_nan_wc', 'Laptop Compartment_nan_wc', 'Waterproof_nan_wc', 'Style_nan_wc', 'Color_nan_wc']


In [7]:
# Step 4c: {col}_wc features (factorized categorical × Weight Capacity)
# First, factorize categoricals (handle NaNs)
for col in cat_cols:
    # Combine train and test for consistent factorization
    combined = pd.concat([train[col], test[col]], axis=0)
    
    # Factorize (NaN becomes -1, we add 1 to make it 0)
    codes, categories = pd.factorize(combined, sort=True)
    codes = codes + 1  # Shift so NaN becomes 0 instead of -1
    
    # Split back
    train[f'{col}_factorized'] = codes[:len(train)]
    test[f'{col}_factorized'] = codes[len(train):]
    
    # Create interaction with weight capacity
    train[f'{col}_wc'] = train[f'{col}_factorized'] * train['weight_capacity']
    test[f'{col}_wc'] = test[f'{col}_factorized'] * test['weight_capacity']

print("Categorical × Weight Capacity features created")
print(f"Sample: {train[['Brand_wc', 'Material_wc', 'Size_wc']].head()}")

Categorical × Weight Capacity features created
Sample:     Brand_wc  Material_wc    Size_wc
0  23.223446    23.223446  23.223446
1  54.157073    27.078537  81.235610
2  83.218800    33.287520  49.931280
3  38.811661    38.811661  38.811661
4  17.749338    17.749338  35.498677


In [11]:
from sklearn.model_selection import KFold
import scipy.stats as stats

# Define group keys (8 total: weight_capacity + 7 categoricals)
group_keys = ['weight_capacity'] + cat_cols

# Statistics to compute - simplify to avoid errors with small groups
stats_to_compute = ['mean', 'count', 'median']

def compute_groupby_stats(train_df, test_df, target_col='Price'):
    """Compute groupby statistics with simple approach"""
    
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    for group_key in group_keys:
        print(f"Computing stats for group key: {group_key}")
        
        # Initialize columns for each statistic
        for stat in stats_to_compute:
            train_df[f'{group_key}_{stat}_price'] = np.nan
            test_df[f'{group_key}_{stat}_price'] = np.nan
        
        # Compute statistics for each group
        if group_key == 'weight_capacity':
            # For numeric weight_capacity, use simple groupby
            grouped = train_df.groupby(group_key)[target_col].agg(['mean', 'count', 'median'])
            
            # Map back to train and test
            for stat in stats_to_compute:
                train_df[f'{group_key}_{stat}_price'] = train_df[group_key].map(grouped[stat])
                test_df[f'{group_key}_{stat}_price'] = test_df[group_key].map(grouped[stat])
        else:
            # For categorical columns
            # Factorize first to handle NaNs and get numeric codes
            combined = pd.concat([train_df[group_key], test_df[group_key]], axis=0)
            codes, _ = pd.factorize(combined, sort=True)
            codes = codes + 1  # Make NaN = 0
            
            train_df[f'{group_key}_code'] = codes[:len(train_df)]
            test_df[f'{group_key}_code'] = codes[len(train_df):]
            
            # Group by the code and compute statistics
            grouped = train_df.groupby(f'{group_key}_code')[target_col].agg(['mean', 'count', 'median'])
            
            # Map back
            for stat in stats_to_compute:
                train_df[f'{group_key}_{stat}_price'] = train_df[f'{group_key}_code'].map(grouped[stat])
                test_df[f'{group_key}_{stat}_price'] = test_df[f'{group_key}_code'].map(grouped[stat])
            
            # Clean up temporary code column
            train_df.drop(columns=[f'{group_key}_code'], inplace=True)
            test_df.drop(columns=[f'{group_key}_code'], inplace=True)
    
    return train_df, test_df

# Execute the groupby statistics computation
train, test = compute_groupby_stats(train, test)

print("Groupby statistics computed")
print(f"New features added: {len([col for col in train.columns if '_price' in col and col != 'Price'])}")
print(f"Sample features: {train.columns[-8:].tolist()}")

Computing stats for group key: weight_capacity
Computing stats for group key: Brand
Computing stats for group key: Material


Computing stats for group key: Size
Computing stats for group key: Laptop Compartment


Computing stats for group key: Waterproof
Computing stats for group key: Style


Computing stats for group key: Color
Groupby statistics computed
New features added: 28
Sample features: ['Waterproof_count_price', 'Waterproof_median_price', 'Style_mean_price', 'Style_count_price', 'Style_median_price', 'Color_mean_price', 'Color_count_price', 'Color_median_price']


In [None]:
# Execute the groupby statistics computation from cell 12
train, test = compute_groupby_stats(train, test)

print("Groupby statistics computed")
print(f"New features added: {len([col for col in train.columns if '_price' in col and col != 'Price'])}")
print(f"Sample features: {train.columns[-8:].tolist()}")

## Step 6: Prepare Final Feature Set

Remove histogram bins (not in winning solution) and prepare features for modeling.

In [None]:
# Get all feature columns (exclude target, ID, and raw categorical columns)
exclude_cols = ['Price', 'id', 'Weight Capacity', 'weight_capacity_r7', 'weight_capacity_r8', 'weight_capacity_r9']
exclude_cols += cat_cols  # Exclude raw categorical columns (they're object type)

feature_cols = [col for col in train.columns if col not in exclude_cols]

print(f"Total features before selection: {len(feature_cols)}")

# Count feature types
orig_features = [col for col in feature_cols if 'orig_price' in col]
combo_features = [col for col in feature_cols if '_nan_wc' in col or '_wc' in col or col == 'NaNs']
groupby_features = [col for col in feature_cols if '_price' in col and col != 'Price']
other_features = [col for col in feature_cols if col not in orig_features + combo_features + groupby_features]

print(f"\nFeature breakdown:")
print(f"- Original dataset features: {len(orig_features)}")
print(f"- COMBO/interaction features: {len(combo_features)}")
print(f"- Groupby statistics: {len(groupby_features)}")
print(f"- Other features: {len(other_features)}")
print(f"\nTotal selected features: {len(feature_cols)}")
print(f"\nSample features: {feature_cols[:10]}")

## Step 7: Model Training with Optimized Hyperparameters

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import time

# Prepare data
X_train = train[feature_cols].copy()
y_train = train['Price'].copy()
X_test = test[feature_cols].copy()

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Hyperparameters (optimized per strategy)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda',
    'learning_rate': 0.03,  # Reduced from 0.05
    'max_depth': 10,        # Increased from 8
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,       # Added L1 regularization
    'reg_lambda': 1.0,      # Added L2 regularization
    'random_state': 42
}

# Cross-validation setup
n_folds = 20
time_budget = 3600  # 1 hour in seconds
start_time = time.time()

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
rmse_scores = []
models = []

print(f"Starting {n_folds}-fold CV training...")
print(f"Time budget: {time_budget} seconds")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    # Check time budget
    elapsed = time.time() - start_time
    if elapsed > time_budget * 0.8:  # Stop if 80% of time used
        print(f"Stopping early due to time budget. Completed {fold} folds.")
        break
    
    print(f"Fold {fold+1}/{n_folds}...")
    
    # Split data
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Create DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Train model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    
    # Predict and evaluate
    val_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_scores.append(rmse)
    
    print(f"  Fold {fold+1} RMSE: {rmse:.6f}")
    
    # Store model
    models.append(model)

print(f"\nCV completed. Scores: {rmse_scores}")
print(f"Mean RMSE: {np.mean(rmse_scores):.6f} ± {np.std(rmse_scores):.6f}")

## Step 8: Generate Predictions and Submission

In [None]:
# Generate predictions using ensemble of all folds
print("Generating predictions...")

dtest = xgb.DMatrix(X_test)
predictions = np.zeros(len(X_test))

for model in models:
    predictions += model.predict(dtest)

# Average predictions
predictions /= len(models)

# Create submission - ensure we use the correct index
submission_df = pd.DataFrame({
    'id': test['id'],
    'Price': predictions
})

submission_df.to_csv('/home/code/submission_candidates/candidate_006.csv', index=False)

print(f"Submission saved to /home/code/submission_candidates/candidate_006.csv")
print(f"Predictions shape: {predictions.shape}")
print(f"Prediction range: [{predictions.min():.2f}, {predictions.max():.2f}]")
print(f"Prediction mean: {predictions.mean():.2f}")

# Feature importance analysis
print("\nTop 20 features by importance:")
importance_dict = models[0].get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': list(importance_dict.keys()),
    'importance': list(importance_dict.values())
})
importance_df = importance_df.sort_values('importance', ascending=False)
print(importance_df.head(20))