# CatBoost Baseline Model

A baseline CatBoost regression model. CatBoost handles categorical features natively, so 'Sex' is passed to the model without manual encoding.

Features to include:
- Original features (including categorical 'Sex')
- Log1p transformations
- Product features (Weight*Duration, Duration*Heart_Rate, Height*Weight)
- Ratio features (Weight/Height)
- BMI feature

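Model: CatBoost regressor with up to 1000 iterations (early stopping after 50 rounds without improvement on the validation fold), learning_rate=0.05, depth=6, evaluated with 5-fold cross-validation using RMSLE.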

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold
# Silence every warning category for the remainder of the run
warnings.filterwarnings('ignore')

# Fixed seed so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)

# Read the train and test tables from disk
print("Loading data...")
train_df, test_df = map(
    pd.read_csv,
    ('/home/code/data/train.csv', '/home/code/data/test.csv'),
)

# Quick sanity report on what was loaded
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

In [None]:
# Feature engineering function
def create_features(df):
    """Return a copy of ``df`` extended with engineered feature columns.

    The input frame is not modified. It must contain the columns
    'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate' and 'Body_Temp'.

    Columns appended, in this order:
      * '<col>_log1p' for each of the six columns listed above
      * 'Weight_Duration', 'Duration_Heart_Rate', 'Height_Weight' (products)
      * 'Weight_Height' (ratio)
      * 'BMI'
    """
    out = df.copy()

    # log(1 + x) of every base numeric column
    for name in ('Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'):
        out[f'{name}_log1p'] = np.log1p(out[name])

    # Pull the base series out once; they are reused by several features below
    weight = out['Weight']
    height = out['Height']
    duration = out['Duration']
    heart_rate = out['Heart_Rate']

    # Pairwise interaction (product) features
    out['Weight_Duration'] = weight * duration
    out['Duration_Heart_Rate'] = duration * heart_rate
    out['Height_Weight'] = height * weight

    # Weight-to-height ratio; the small epsilon guards against division by zero
    out['Weight_Height'] = weight / (height + 1e-6)

    # BMI-style feature: weight over squared (height / 100), same epsilon guard.
    # Presumably Height is in centimetres and Weight in kilograms - TODO confirm.
    height_scaled = height / 100
    out['BMI'] = weight / (height_scaled ** 2 + 1e-6)

    return out

# Apply the same feature engineering to both splits
print("Creating features...")
train_feat, test_feat = create_features(train_df), create_features(test_df)

# Model inputs are every column except the row identifier and the target
non_feature_cols = ('id', 'Calories')
feature_cols = [name for name in train_feat.columns if name not in non_feature_cols]
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# 'Sex' is treated as categorical when it is present among the features
cat_features = [name for name in ('Sex',) if name in feature_cols]
print(f"Categorical features: {cat_features}")

In [None]:
# Split the engineered frames into model inputs and the regression target
y = train_feat['Calories']
X, X_test = train_feat[feature_cols], test_feat[feature_cols]

# Report the resulting shapes as a sanity check
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Cross-validation setup
# Single source of truth for the fold count: it drives the splitter, the
# progress message and the test-prediction average, so they cannot drift apart.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(len(X))        # out-of-fold prediction per training row
test_predictions = np.zeros(len(X_test))  # running average of per-fold test predictions
fold_scores = []

# The test features are the same for every fold, so build the Pool once
# instead of re-creating it inside the loop.
test_pool = Pool(X_test, cat_features=cat_features)

print("Starting cross-validation...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}/{N_SPLITS}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create CatBoost pools
    train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
    val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

    # A fresh model per fold so that no state leaks between folds
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=SEED,
        verbose=False,
        early_stopping_rounds=50
    )

    # Train model.
    # NOTE: the validation fold is used both for early stopping here and for
    # scoring below, so the reported fold scores may be slightly optimistic.
    model.fit(train_pool, eval_set=val_pool, verbose=False)

    # Predict on validation set; the raw (unclipped) values are kept as OOF
    val_pred = model.predict(val_pool)
    oof_predictions[val_idx] = val_pred

    # Calculate RMSLE for this fold
    # Clip predictions at zero first: mean_squared_log_error rejects negatives
    val_pred_clipped = np.clip(val_pred, 0, None)
    fold_score = np.sqrt(mean_squared_log_error(y_val, val_pred_clipped))
    fold_scores.append(fold_score)

    print(f"Fold {fold + 1} RMSLE: {fold_score:.6f}")
    print(f"Best iteration: {model.best_iteration_}")

    # Accumulate this fold's share of the averaged test prediction
    test_pred = model.predict(test_pool)
    test_predictions += test_pred / N_SPLITS

# Calculate overall CV score on the pooled out-of-fold predictions
oof_predictions_clipped = np.clip(oof_predictions, 0, None)
cv_score = np.sqrt(mean_squared_log_error(y, oof_predictions_clipped))

print(f"\n{'='*50}")
print(f"Cross-validation RMSLE: {cv_score:.6f}")
print(f"Fold scores: {fold_scores}")
print(f"Mean ± Std: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
print(f"{'='*50}")

In [None]:
# Create submission: one averaged test prediction per test id
submission = pd.DataFrame({
    'id': test_df['id'],
    'Calories': test_predictions
})

# Clip predictions to training data range to prevent unrealistic values
train_min = train_df['Calories'].min()
train_max = train_df['Calories'].max()
submission['Calories'] = submission['Calories'].clip(train_min, train_max)

print(f"Submission shape: {submission.shape}")
print(f"Calories range in submission: [{submission['Calories'].min():.2f}, {submission['Calories'].max():.2f}]")
print(f"Calories range in training: [{train_min:.2f}, {train_max:.2f}]")

# Save submission.
# Create the target directory first: to_csv does not create missing parent
# directories, and failing here would lose the results of the whole CV run.
submission_path = '/home/code/submission_candidates/candidate_002_catboost_baseline.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Save OOF predictions for ensemble (raw values, not clipped)
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'Calories': oof_predictions
})
oof_path = '/home/code/experiments/oof_002_catboost_baseline.csv'
os.makedirs(os.path.dirname(oof_path), exist_ok=True)
oof_df.to_csv(oof_path, index=False)
print(f"OOF predictions saved to: {oof_path}")