# Baseline XGBoost Model

Simple baseline following the competition strategy:
- Load train.csv + training_extra.csv
- Basic preprocessing (label encoding)
- Simple Weight Capacity features
- XGBoost with GPU acceleration
- 20-fold CV
- Generate submission

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Diagnostic only: report whether PyTorch can see a CUDA device, and which one.
import torch

gpu_visible = torch.cuda.is_available()
print(f"GPU available: {gpu_visible}")
if gpu_visible:
    # Device 0 is the first visible CUDA device
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Read both labelled CSV files, then stack them into one training frame
print("Loading training data...")
train1, train2 = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/training_extra.csv')
)
# ignore_index gives the combined frame a fresh 0..n-1 row index
train = pd.concat([train1, train2], ignore_index=True)

print(f"Combined training shape: {train.shape}")
print(f"Columns: {train.columns.tolist()}")

# Read the unlabelled rows that predictions will be generated for
test = pd.read_csv('/home/data/test.csv')
print(f"Test shape: {test.shape}")

In [None]:
# Quick look at the data: target summary, then per-column null counts
print("Target statistics:")
print(train['Price'].describe())

# Same missing-value report for each frame; the header text selects the label
for header, frame in (
    ("\nMissing values in train:", train),
    ("\nMissing values in test:", test),
):
    print(header)
    print(frame.isnull().sum())

In [None]:
# Column groups used by the preprocessing steps below.
# Categorical columns are label-encoded; numerical columns are used as-is.
cat_features = [
    'Brand',
    'Material',
    'Size',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]
num_features = [
    'Compartments',
    'Weight Capacity (kg)',
]

print(f"Categorical features: {cat_features}")
print(f"Numerical features: {num_features}")

In [None]:
# Integer-encode every categorical column, one LabelEncoder per column.
# The fitted encoders are kept in le_dict, keyed by column name.
print("Label encoding categorical features...")
le_dict = {}

for col in cat_features:
    # Cast to str once per frame (any NaN is stringified too, so it gets a code)
    train_as_str = train[col].astype(str)
    test_as_str = test[col].astype(str)

    # Fit on train + test together so a label seen only in test still gets a code
    le = LabelEncoder()
    le.fit(pd.concat([train_as_str, test_as_str], ignore_index=True))

    train[col], test[col] = le.transform(train_as_str), le.transform(test_as_str)

    le_dict[col] = le
    print(f"Encoded {col}: {len(le.classes_)} classes")

print("\nPreprocessing complete!")

In [None]:
# Create basic features from 'Weight Capacity (kg)'. The original author notes
# this was the most important feature per winning solutions.
print("Creating Weight Capacity features...")

def create_weight_capacity_features(df):
    """Return a copy of *df* with features derived from 'Weight Capacity (kg)'.

    Added columns:
      * ``weight_round_{d}`` for d = 1..10: the weight rounded to d decimals.
      * ``weight_digit_{k}`` for k = 1..5: the k-th digit after the decimal
        point as an int8 in 0..9, or -1 where the weight is missing.
      * ``weight_int``: the weight truncated toward zero (float, NaN where the
        weight is missing).
      * ``weight_frac``: ``weight - weight_int`` (NaN where the weight is
        missing).

    The input frame is not modified.
    """
    df = df.copy()
    weight = df['Weight Capacity (kg)']

    # Round to different decimal places
    for dec in range(1, 11):
        df[f'weight_round_{dec}'] = weight.round(dec)

    # Extract digits: shift the k-th decimal into the units place, reduce
    # modulo 10, then truncate to an integer. Without the int cast the column
    # holds a fractional remainder (e.g. 2.5) rather than a digit.
    for k in range(1, 6):
        shifted = (weight * 10**k) % 10
        df[f'weight_digit_{k}'] = shifted.fillna(-1).astype('int8')

    # Integer / fractional split. np.trunc matches int() truncation but keeps
    # NaN as NaN, whereas astype(int) raises when the column has missing values.
    df['weight_int'] = np.trunc(weight)
    df['weight_frac'] = weight - df['weight_int']

    return df

# Apply the same feature builder to both frames so their columns stay aligned
train, test = (create_weight_capacity_features(frame) for frame in (train, test))

# Every column whose name mentions "weight" (case-insensitive), original included
weight_columns = [name for name in train.columns if 'weight' in name.lower()]
print(f"New shape after feature engineering: {train.shape}")
print(f"New features: {weight_columns}")

In [None]:
# Assemble model inputs: every train column except the row id and the target
non_feature_cols = {'id', 'Price'}
feature_cols = [name for name in train.columns if name not in non_feature_cols]

X, y = train[feature_cols], train['Price']
# Select the same columns, in the same order, from the test frame
X_test = test[feature_cols]

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")

In [None]:
# XGBoost parameters, passed as keyword arguments to xgb.XGBRegressor.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # GPU training in XGBoost >= 2.0 is selected with tree_method='hist' plus
    # device='cuda'. The older 'gpu_hist' alias is deprecated and redundant
    # once `device` is set.
    'tree_method': 'hist',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    # Upper bound on boosting rounds; early stopping usually ends training sooner
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'verbosity': 0,
}

print("XGBoost parameters:")
for k, v in params.items():
    print(f"  {k}: {v}")

In [None]:
# 20-fold CV as specified in winning strategies.
# Each fold trains a fresh model, fills in the out-of-fold (OOF) predictions for
# its validation rows, and contributes 1/n_folds of its test prediction to the
# averaged test output.
n_folds = 20
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

rmse_scores = []
oof_predictions = np.zeros(len(train))
test_predictions = np.zeros(len(test))

print(f"Starting {n_folds}-fold CV training...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold + 1}/{n_folds}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train model
    # NOTE(review): the validation fold is also the eval_set monitored during
    # fit, so if early stopping is active the fold scores may be slightly
    # optimistic - confirm this is acceptable for a baseline.
    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Predictions
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Store OOF predictions
    oof_predictions[val_idx] = val_pred

    # Accumulate test predictions (running mean over folds)
    test_predictions += test_pred / n_folds

    # Calculate RMSE
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_scores.append(fold_rmse)
    print(f"  Fold RMSE: {fold_rmse:.6f}")

# Overall CV score, computed once over all OOF predictions
cv_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"\nOverall CV RMSE: {cv_rmse:.6f}")
# The separator was mojibake ("Â±"); print the intended plus-minus sign.
print(f"Mean Fold RMSE: {np.mean(rmse_scores):.6f} ± {np.std(rmse_scores):.6f}")

In [None]:
# Create submission: one row per test id with the fold-averaged prediction
from pathlib import Path

submission = pd.DataFrame({
    'id': test['id'],
    'Price': test_predictions
})

# Clip predictions to the range of prices observed in training
submission['Price'] = submission['Price'].clip(lower=train['Price'].min(), upper=train['Price'].max())

print("Submission preview:")
print(submission.head())
print("\nSubmission statistics:")
print(submission['Price'].describe())

# Save submission
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")