# Baseline XGBoost Model

Simple baseline following the competition strategy:
- Load train.csv + training_extra.csv
- Basic preprocessing (label encoding)
- Simple Weight Capacity features
- XGBoost with GPU acceleration
- 20-fold CV
- Generate submission

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Load data
print("Loading training data...")
train1 = pd.read_csv('/home/data/train.csv')
train2 = pd.read_csv('/home/data/training_extra.csv')
train = pd.concat([train1, train2], ignore_index=True)

print(f"Combined training shape: {train.shape}")
print(f"Columns: {list(train.columns)}")

# Load test data
test = pd.read_csv('/home/data/test.csv')
print(f"Test shape: {test.shape}")

Loading training data...


Combined training shape: (3994318, 11)
Columns: ['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price']
Test shape: (200000, 10)


In [3]:
# Basic EDA
print("Target statistics:")
print(train['Price'].describe())

print("\nMissing values in train:")
print(train.isnull().sum())

print("\nMissing values in test:")
print(test.isnull().sum())

Target statistics:


count    3.994318e+06
mean     8.136217e+01
std      3.893868e+01
min      1.500000e+01
25%      4.747002e+01
50%      8.098495e+01
75%      1.148550e+02
max      1.500000e+02
Name: Price, dtype: float64

Missing values in train:


id                           0
Brand                   126758
Material                110962
Size                     87785
Compartments                 0
Laptop Compartment       98533
Waterproof               94324
Style                   104180
Color                   133617
Weight Capacity (kg)      1808
Price                        0
dtype: int64

Missing values in test:
id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64


In [4]:
# Identify feature types
cat_features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
num_features = ['Compartments', 'Weight Capacity (kg)']

print(f"Categorical features: {cat_features}")
print(f"Numerical features: {num_features}")

Categorical features: ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
Numerical features: ['Compartments', 'Weight Capacity (kg)']


In [5]:
# Basic preprocessing - label encode categoricals and handle missing values
print("Label encoding categorical features and handling missing values...")
le_dict = {}

# Handle missing values first
for col in cat_features:
    # Fill missing values with a placeholder
    train[col] = train[col].fillna('Missing')
    test[col] = test[col].fillna('Missing')
    
    le = LabelEncoder()
    # Fit on combined train + test to handle unseen categories
    combined = pd.concat([train[col], test[col]], ignore_index=True)
    le.fit(combined.astype(str))
    
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    
    le_dict[col] = le
    print(f"Encoded {col}: {len(le.classes_)} classes")

# Handle numerical missing values
train['Compartments'] = train['Compartments'].fillna(train['Compartments'].median())
test['Compartments'] = test['Compartments'].fillna(train['Compartments'].median())

train['Weight Capacity (kg)'] = train['Weight Capacity (kg)'].fillna(train['Weight Capacity (kg)'].median())
test['Weight Capacity (kg)'] = test['Weight Capacity (kg)'].fillna(train['Weight Capacity (kg)'].median())

print("\nPreprocessing complete!")

Label encoding categorical features and handling missing values...


Encoded Brand: 6 classes


Encoded Material: 5 classes


Encoded Size: 4 classes


Encoded Laptop Compartment: 3 classes


Encoded Waterproof: 3 classes


Encoded Style: 4 classes


Encoded Color: 7 classes



Preprocessing complete!


In [6]:
# Create basic features from Weight Capacity (kg) - most important feature per winning solutions
print("Creating Weight Capacity features...")

def create_weight_capacity_features(df):
    """Create features from Weight Capacity (kg) - the most important feature"""
    df = df.copy()
    
    # Ensure no missing values
    df['Weight Capacity (kg)'] = df['Weight Capacity (kg)'].fillna(df['Weight Capacity (kg)'].median())
    
    # Round to different decimal places
    for dec in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
        df[f'weight_round_{dec}'] = df['Weight Capacity (kg)'].round(dec)
    
    # Extract digits
    for k in range(1, 6):
        df[f'weight_digit_{k}'] = ((df['Weight Capacity (kg)'] * 10**k) % 10).fillna(-1)
    
    # Basic stats - handle edge cases
    df['weight_int'] = df['Weight Capacity (kg)'].astype(int)
    df['weight_frac'] = df['Weight Capacity (kg)'] - df['weight_int']
    
    return df

train = create_weight_capacity_features(train)
test = create_weight_capacity_features(test)

print(f"New shape after feature engineering: {train.shape}")
print(f"New features: {[col for col in train.columns if 'weight' in col.lower()]}")

Creating Weight Capacity features...


New shape after feature engineering: (3994318, 28)
New features: ['Weight Capacity (kg)', 'weight_round_1', 'weight_round_2', 'weight_round_3', 'weight_round_4', 'weight_round_5', 'weight_round_6', 'weight_round_7', 'weight_round_8', 'weight_round_9', 'weight_round_10', 'weight_digit_1', 'weight_digit_2', 'weight_digit_3', 'weight_digit_4', 'weight_digit_5', 'weight_int', 'weight_frac']


In [7]:
# Prepare data for training
feature_cols = [col for col in train.columns if col not in ['id', 'Price']]
X = train[feature_cols]
y = train['Price']
X_test = test[feature_cols]

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")

Training features shape: (3994318, 26)
Test features shape: (200000, 26)
Number of features: 26


In [8]:
# XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',  # Use GPU
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 0
}

print("XGBoost parameters:")
for k, v in params.items():
    print(f"  {k}: {v}")

XGBoost parameters:
  objective: reg:squarederror
  eval_metric: rmse
  tree_method: gpu_hist
  device: cuda
  learning_rate: 0.1
  max_depth: 6
  subsample: 0.8
  colsample_bytree: 0.8
  random_state: 42
  n_estimators: 1000
  early_stopping_rounds: 50
  verbosity: 0


In [10]:
# 20-fold CV as specified in winning strategies
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

rmse_scores = []
oof_predictions = np.zeros(len(train))
test_predictions = np.zeros(len(test))

print(f"Starting {n_folds}-fold CV training...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold + 1}/{n_folds}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train model
    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predictions
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    
    # Store OOF predictions
    oof_predictions[val_idx] = val_pred
    
    # Accumulate test predictions
    test_predictions += test_pred / n_folds
    
    # Calculate RMSE
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_scores.append(fold_rmse)
    print(f"  Fold RMSE: {fold_rmse:.6f}")

# Overall CV score
cv_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"\nOverall CV RMSE: {cv_rmse:.6f}")
print(f"Mean Fold RMSE: {np.mean(rmse_scores):.6f} ± {np.std(rmse_scores):.6f}")

Starting 20-fold CV training...
Fold 1/20


  Fold RMSE: 38.766019
Fold 2/20


  Fold RMSE: 38.791439
Fold 3/20


  Fold RMSE: 38.697766
Fold 4/20


  Fold RMSE: 38.771409
Fold 5/20


  Fold RMSE: 38.699789
Fold 6/20


  Fold RMSE: 38.778115
Fold 7/20


  Fold RMSE: 38.790089
Fold 8/20


  Fold RMSE: 38.732571
Fold 9/20


  Fold RMSE: 38.786091
Fold 10/20


  Fold RMSE: 38.762629
Fold 11/20


  Fold RMSE: 38.871318
Fold 12/20


  Fold RMSE: 38.853336
Fold 13/20


  Fold RMSE: 38.789804
Fold 14/20


  Fold RMSE: 38.800621
Fold 15/20


  Fold RMSE: 38.731473
Fold 16/20


  Fold RMSE: 38.769511
Fold 17/20


  Fold RMSE: 38.720888
Fold 18/20


  Fold RMSE: 38.848251
Fold 19/20


  Fold RMSE: 38.832048
Fold 20/20


  Fold RMSE: 38.827451

Overall CV RMSE: 38.781061
Mean Fold RMSE: 38.781031 ± 0.048003


In [11]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_predictions
})

# Clip predictions to reasonable range (based on target distribution)
submission['Price'] = submission['Price'].clip(lower=train['Price'].min(), upper=train['Price'].max())

print("Submission preview:")
print(submission.head())
print(f"\nSubmission statistics:")
print(submission['Price'].describe())

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

Submission preview:
       id      Price
0  300000  79.434960
1  300001  84.390175
2  300002  87.350705
3  300003  77.083186
4  300004  83.603492

Submission statistics:
count    200000.000000
mean         81.362155
std           3.303805
min          31.472881
25%          79.578759
50%          81.418643
75%          83.254399
max         107.692375
Name: Price, dtype: float64



Submission saved to: /home/submission/submission.csv


In [None]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_predictions
})

# Clip predictions to reasonable range (based on target distribution)
submission['Price'] = submission['Price'].clip(lower=train['Price'].min(), upper=train['Price'].max())

print("Submission preview:")
print(submission.head())
print(f"\nSubmission statistics:")
print(submission['Price'].describe())

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")