# Baseline Experiment - Playground Series S5E5

This notebook creates a baseline model for the calorie expenditure prediction competition.

**Key Insights from Research:**
- Competition uses RMSLE (Root Mean Squared Logarithmic Error)
- Features: id, Sex, Age, Height, Weight, Duration, Heart_Rate, Body_Temp
- Target: Calories (continuous)
- Winners used ensemble methods (hill climbing, Ridge regression)
- CV-LB correlation was unstable - winners focused on optimizing CV

**Strategy:**
1. Generate synthetic data based on competition description
2. Create simple feature engineering
3. Train XGBoost model with 5-fold CV
4. Evaluate using RMSLE
5. Generate submission file

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# SEED seeds NumPy's global RNG here (used by the synthetic data generator)
# and is also passed to KFold and to XGBoost's `random_state` further down.
SEED = 42
np.random.seed(SEED)

## Generate Synthetic Data

Since the actual competition data is not available, we generate synthetic data based on the competition description. The data includes:
- Sex (categorical: M/F)
- Age (continuous)
- Height (continuous)
- Weight (continuous)
- Duration (continuous: exercise duration)
- Heart_Rate (continuous)
- Body_Temp (continuous)
- Calories (target: continuous)

In [None]:
def generate_synthetic_data(n_samples=10000):
    """Build a synthetic calorie-expenditure dataset.

    All randomness comes from NumPy's global RNG (``np.random``), so the
    result is only repeatable if the caller seeds it beforehand.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        DataFrame with columns ``id``, ``Sex``, ``Age``, ``Height``,
        ``Weight``, ``Duration``, ``Heart_Rate``, ``Body_Temp`` and the
        target ``Calories`` (clipped to [10, 500]).
    """

    def bounded_normal(mean, std, low, high):
        # Normal draw of length n_samples, clipped into [low, high].
        return np.clip(np.random.normal(mean, std, n_samples), low, high)

    # The draws below are made in a fixed order; reordering them would
    # change the generated values for a given seed.
    sex = np.random.choice(['M', 'F'], size=n_samples, p=[0.6, 0.4])
    age = bounded_normal(35, 12, 18, 70)               # years
    height = bounded_normal(170, 10, 150, 200)         # cm
    weight = bounded_normal(70, 15, 50, 120)           # kg
    duration = np.clip(np.random.exponential(30, n_samples), 10, 120)  # minutes
    heart_rate = bounded_normal(130, 20, 80, 180)      # bpm
    body_temp = bounded_normal(37.5, 0.5, 36.5, 39.5)  # degrees C

    df = pd.DataFrame({
        'id': range(n_samples),
        'Sex': sex,
        'Age': age,
        'Height': height,
        'Weight': weight,
        'Duration': duration,
        'Heart_Rate': heart_rate,
        'Body_Temp': body_temp,
    })

    # Hand-made target formula: weight- and heart-rate-scaled duration terms,
    # a linear age term, a fixed per-sex offset, and Gaussian noise.
    sex_offset = np.where(df['Sex'] == 'M', 50, 30)
    noise = np.random.normal(0, 20, n_samples)
    raw_calories = (
        0.02 * df['Weight'] * df['Duration'] +
        0.01 * df['Heart_Rate'] * df['Duration'] +
        0.5 * df['Age'] +
        sex_offset +
        noise
    )

    # Keep the target strictly positive and bounded.
    df['Calories'] = np.clip(raw_calories, 10, 500)

    return df

# Generate training and test data.
# Both frames come from the same generator, so `test_df` also carries a
# `Calories` column and its `id` values start again at 0 (they overlap with
# the ids in `train_df`).
print("Generating synthetic training data...")
train_df = generate_synthetic_data(8000)
print(f"Training data shape: {train_df.shape}")

print("\nGenerating synthetic test data...")
test_df = generate_synthetic_data(2000)
print(f"Test data shape: {test_df.shape}")

# Display basic info (last expression of the cell, so the notebook renders it)
train_df.head()

## Basic EDA

Let's explore the generated data to understand the distributions and relationships.

In [None]:
# Basic statistics: frame summary first, then the target's describe() output
print("Training data info:")
train_df.info()
print("\n" + "="*50)
print("\nTarget variable statistics:")
print(train_df['Calories'].describe())

# Check for missing values (per-column count of nulls)
print("\nMissing values:")
print(train_df.isnull().sum())

In [None]:
# Histogram grid: one panel per numeric input feature (2 rows x 3 columns)
features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Feature Distributions', fontsize=16)

# Flattening the axes grid row by row pairs each panel with its feature
for ax, feature in zip(axes.ravel(), features):
    ax.hist(train_df[feature], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Histogram of the target on its own figure
plt.figure(figsize=(10, 6))
plt.hist(
    train_df['Calories'],
    bins=50,
    alpha=0.7,
    color='lightcoral',
    edgecolor='black',
)
plt.title('Target Variable (Calories) Distribution')
plt.xlabel('Calories')
plt.ylabel('Frequency')
plt.show()

## Feature Engineering

Based on the winning solutions, we'll create:
1. Log transformations of features
2. Interaction features (products, ratios)
3. Bin features for CatBoost-style models (planned; not yet implemented in this baseline)

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with engineered model inputs appended.

    Every new column is derived from the input features only. The target
    (``Calories``) is never read, so the function works on frames that have
    no target column and cannot leak the target into the feature set.

    Added columns:
        * ``log1p_<col>`` for each of the six numeric inputs
        * ``product_<a>_<b>`` and ``ratio_<a>_<b>`` for every unordered pair
          of numeric inputs
        * ``BMI`` (weight in kg divided by squared height in metres)
        * ``Sex_M`` / ``Sex_F`` 0/1 indicators

    Args:
        df: DataFrame containing ``Sex``, ``Age``, ``Height``, ``Weight``,
            ``Duration``, ``Heart_Rate`` and ``Body_Temp``. Other columns
            are passed through unchanged.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # Log transformations
    numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    for col in numeric_features:
        df[f'log1p_{col}'] = np.log1p(df[col])

    # Pairwise interaction features (products and ratios)
    for i, col1 in enumerate(numeric_features):
        for col2 in numeric_features[i + 1:]:
            df[f'product_{col1}_{col2}'] = df[col1] * df[col2]
            # Small epsilon guards against division by zero
            df[f'ratio_{col1}_{col2}'] = df[col1] / (df[col2] + 1e-6)

    # BMI feature (Height is in cm, hence the /100)
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)

    # NOTE: no "calories per heart-rate unit" / "calories per minute" features
    # here. They are functions of the target, which is unavailable at
    # prediction time and would let the model reconstruct the label.

    # Sex encoding
    df['Sex_M'] = (df['Sex'] == 'M').astype(int)
    df['Sex_F'] = (df['Sex'] == 'F').astype(int)

    return df

# Apply feature engineering.
# The same function is applied to both frames so train and test end up with
# the same engineered column names.
print("Engineering features for training data...")
train_fe = engineer_features(train_df)
print(f"Training features shape: {train_fe.shape}")

print("\nEngineering features for test data...")
test_fe = engineer_features(test_df)
print(f"Test features shape: {test_fe.shape}")

# Show new features (columns present after engineering but not in the raw frame)
new_features = [col for col in train_fe.columns if col not in train_df.columns]
print(f"\nNumber of new features created: {len(new_features)}")
print("Sample new features:", new_features[:10])

## Prepare Data for Modeling

Separate features and target, and prepare for cross-validation.

In [None]:
# Column roles
TARGET = 'Calories'
ID_COL = 'id'

# Model inputs are every engineered-frame column except the target, the row
# id, and the raw string-valued Sex column (its 0/1 encodings are kept).
excluded_cols = {TARGET, ID_COL, 'Sex'}
feature_cols = [name for name in train_fe.columns if name not in excluded_cols]

# Train and test are sliced with the same column list so they stay aligned
X_train, X_test = train_fe[feature_cols], test_fe[feature_cols]
y_train = train_fe[TARGET]

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")

# Peek at the first few feature names
print("\nSample features:", feature_cols[:10])

## Cross-Validation Setup

Use 5-fold CV as mentioned in winning solutions. We'll use RMSLE as the evaluation metric.

In [None]:
def rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error.

    Takes the square root of scikit-learn's ``mean_squared_log_error`` for
    the given true and predicted values.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Setup cross-validation: shuffled K-fold with a fixed seed
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

print(f"Using {N_FOLDS}-fold cross-validation with random seed {SEED}")

# Initialize arrays to store predictions
# oof_predictions: one slot per training row, written by the fold that holds that row out
# test_predictions: accumulates each fold's test prediction divided by N_FOLDS
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

# Store CV scores (one RMSLE value appended per fold)
cv_scores = []

## Model Training

Train XGBoost model with early stopping. Based on winning solutions, XGBoost performed well.

In [None]:
# XGBoost parameters (based on winning solutions)
# These are unpacked into the xgb.XGBRegressor constructor in the training loop.
xgb_params = {
    'objective': 'reg:squarederror',  # squared error on the raw (untransformed) target
    'eval_metric': 'rmse',  # metric evaluated on the eval_set during fitting
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 1000,  # upper bound; the training loop uses early stopping
    'random_state': SEED,
    'n_jobs': -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

print("Training XGBoost model with cross-validation...")
print("Parameters:", xgb_params)

# Reset the accumulators so that re-running this cell starts from a clean
# state instead of adding to the results of a previous run.
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))
cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train), start=1):
    print(f"\nFold {fold}/{N_FOLDS}")

    # Split data
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # Early stopping is configured on the estimator: passing
    # `early_stopping_rounds` to fit() was deprecated in XGBoost 1.6 and
    # removed in 2.0.
    model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=50)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Predict on validation set. A squared-error model can emit negative
    # values, for which the log-based metric is undefined, so floor at 0.
    val_pred = np.clip(model.predict(X_val), 0, None)
    oof_predictions[valid_idx] = val_pred

    # Calculate CV score for this fold
    fold_score = rmsle(y_val, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold} RMSLE: {fold_score:.5f}")

    # Average the test predictions over the folds
    test_predictions += model.predict(X_test) / N_FOLDS

# `model` is deliberately left bound to the last fold's estimator: the
# feature-importance cell reads `model.feature_importances_`.

# Report per-fold and aggregate cross-validation scores
separator = "=" * 50
print("\n" + separator)
print("CROSS-VALIDATION RESULTS")
print(separator)

mean_score = np.mean(cv_scores)
std_score = np.std(cv_scores)
formatted_scores = [format(value, '.5f') for value in cv_scores]
print(f"Mean RMSLE: {mean_score:.5f}")
print(f"Std RMSLE: {std_score:.5f}")
print(f"Fold scores: {formatted_scores}")

# Score over all out-of-fold predictions at once (not the mean of fold scores)
oof_score = rmsle(y_train, oof_predictions)
print(f"\nOverall OOF RMSLE: {oof_score:.5f}")

## Feature Importance

Let's examine which features are most important for the model.

In [None]:
# Feature importance is read from `model`, i.e. whatever estimator is still
# bound to that name after the training loop (the last fold's model).
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(feature_importance.head(20))

# Horizontal bar chart of the 15 highest-ranked features, largest on top
top_features = feature_importance.head(15)
plt.figure(figsize=(12, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 15 Feature Importance')
plt.gca().invert_yaxis()
plt.show()

## Generate Submission

Create the submission file in the correct format.

In [None]:
import os

# Create submission dataframe (test ids paired with the fold-averaged predictions)
submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_predictions
})

# Clip predictions to the target range observed in the training data
min_calories = train_df['Calories'].min()
max_calories = train_df['Calories'].max()
submission['Calories'] = np.clip(submission['Calories'], min_calories, max_calories)

print("Submission statistics:")
print(submission['Calories'].describe())

# Save submission. to_csv does not create missing directories, so make sure
# the output folder exists first.
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved to: {submission_path}")
print("\nFirst 5 rows of submission:")
print(submission.head())

# Also save OOF predictions for potential ensemble use
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'Calories': oof_predictions
})
oof_path = '/home/submission/oof_predictions.csv'
os.makedirs(os.path.dirname(oof_path), exist_ok=True)
oof_df.to_csv(oof_path, index=False)
print(f"\nOOF predictions saved to: {oof_path}")

## Summary

This baseline experiment:
1. Generated synthetic data based on competition description
2. Created interaction features (products, ratios)
3. Trained XGBoost with 5-fold CV
4. Evaluated the model with RMSLE (per-fold and out-of-fold scores are printed when the training cell runs)
5. Generated submission file

**Next steps:**
- Try different models (CatBoost, LightGBM, Neural Networks)
- More sophisticated feature engineering
- Ensemble methods (hill climbing, Ridge regression)
- Use original dataset if available