# Baseline XGBoost Model

First baseline model using XGBoost with basic feature engineering.

Features to include:
- Original numerical features
- Log1p transformations
- Product features (Weight*Duration, Duration*Heart_Rate, Height*Weight)
- Ratio features (Weight/Height)
- BMI feature

Model: XGBoost with 500 trees, learning_rate=0.05, max_depth=6

In [12]:
# Feature engineering function
def create_features(df):
    """Create engineered features for the model"""
    df_new = df.copy()
    
    # Encode categorical features - values are 'M' and 'F'
    if 'Sex' in df_new.columns:
        df_new['Sex'] = df_new['Sex'].map({'M': 0, 'F': 1})
    
    # Original numerical features
    num_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    
    # Log1p transformations
    for col in num_features:
        df_new[f'{col}_log1p'] = np.log1p(df_new[col])
    
    # Product features (from winning solutions)
    df_new['Weight_Duration'] = df_new['Weight'] * df_new['Duration']
    df_new['Duration_Heart_Rate'] = df_new['Duration'] * df_new['Heart_Rate']
    df_new['Height_Weight'] = df_new['Height'] * df_new['Weight']
    
    # Ratio features
    df_new['Weight_Height'] = df_new['Weight'] / (df_new['Height'] + 1e-6)
    
    # BMI feature (Body Mass Index approximation)
    df_new['BMI'] = df_new['Weight'] / ((df_new['Height'] / 100) ** 2 + 1e-6)
    
    return df_new

# Create features for train and test
print("Creating features...")
train_feat = create_features(train_df)
test_feat = create_features(test_df)

# Define feature columns (exclude id and target)
feature_cols = [col for col in train_feat.columns if col not in ['id', 'Calories']]
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

Creating features...
Original Sex unique values: ['M' 'F']
After mapping Sex unique values: [nan]
Original Sex unique values: ['M' 'F']
After mapping Sex unique values: [nan]
Number of features: 18
Features: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Age_log1p', 'Height_log1p', 'Weight_log1p', 'Duration_log1p', 'Heart_Rate_log1p', 'Body_Temp_log1p', 'Weight_Duration', 'Duration_Heart_Rate', 'Height_Weight', 'Weight_Height', 'BMI']


In [13]:
# Prepare data
X = train_feat[feature_cols]
y = train_feat['Calories']
X_test = test_feat[feature_cols]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"\nData types:")
print(X.dtypes.head(20))
print(f"\nSex unique values in train: {train_feat['Sex'].unique()}")
print(f"Sex unique values in test: {test_feat['Sex'].unique()}")

X shape: (8000, 18)
y shape: (8000,)
X_test shape: (2000, 18)

Data types:
Sex                    float64
Age                    float64
Height                 float64
Weight                 float64
Duration               float64
Heart_Rate             float64
Body_Temp              float64
Age_log1p              float64
Height_log1p           float64
Weight_log1p           float64
Duration_log1p         float64
Heart_Rate_log1p       float64
Body_Temp_log1p        float64
Weight_Duration        float64
Duration_Heart_Rate    float64
Height_Weight          float64
Weight_Height          float64
BMI                    float64
dtype: object

Sex unique values in train: [nan]
Sex unique values in test: [nan]


In [14]:
# Prepare data
X = train_feat[feature_cols]
y = train_feat['Calories']
X_test = test_feat[feature_cols]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

X shape: (8000, 18)
y shape: (8000,)
X_test shape: (2000, 18)


In [None]:
# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
fold_scores = []

print("Starting cross-validation...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}/5")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create DMatrix objects
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Define model parameters
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': SEED
    }
    
    # Train model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=[(dtrain, 'train'), (dval, 'val')],
        verbose_eval=False
    )
    
    # Predict on validation set
    val_pred = model.predict(dval)
    oof_predictions[val_idx] = val_pred
    
    # Calculate RMSLE for this fold
    # Clip predictions to avoid negative values in log
    val_pred_clipped = np.clip(val_pred, 0, None)
    fold_score = np.sqrt(mean_squared_log_error(y_val, val_pred_clipped))
    fold_scores.append(fold_score)
    
    print(f"Fold {fold + 1} RMSLE: {fold_score:.6f}")
    
    # Predict on test set
    dtest = xgb.DMatrix(X_test)
    test_pred = model.predict(dtest)
    test_predictions += test_pred / 5  # Average across folds

# Calculate overall CV score
oof_predictions_clipped = np.clip(oof_predictions, 0, None)
cv_score = np.sqrt(mean_squared_log_error(y, oof_predictions_clipped))

print(f"\n{'='*50}")
print(f"Cross-validation RMSLE: {cv_score:.6f}")
print(f"Fold scores: {fold_scores}")
print(f"Mean ± Std: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
print(f"{'='*50}")

In [None]:
# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_predictions
})

# Clip predictions to training data range to prevent unrealistic values
train_min = train_df['Calories'].min()
train_max = train_df['Calories'].max()
submission['Calories'] = submission['Calories'].clip(train_min, train_max)

print(f"Submission shape: {submission.shape}")
print(f"Calories range in submission: [{submission['Calories'].min():.2f}, {submission['Calories'].max():.2f}]")
print(f"Calories range in training: [{train_min:.2f}, {train_max:.2f}]")

# Save submission
submission_path = '/home/code/submission_candidates/candidate_001_baseline_xgboost.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Save OOF predictions for ensemble
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'Calories': oof_predictions
})
oof_path = '/home/code/experiments/oof_001_baseline_xgboost.csv'
oof_df.to_csv(oof_path, index=False)
print(f"OOF predictions saved to: {oof_path}")