In [1]:
!python -V

Python 3.11.11


# Import libraries, list the input files, and read the data

In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor

# Show every file available under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Competition data lives in a single folder; build both paths from it.
DATA_DIR = "/kaggle/input/playground-series-s5e5"
train_path = f"{DATA_DIR}/train.csv"
test_path = f"{DATA_DIR}/test.csv"

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Keep the test identifiers aside; they are needed when writing prediction files.
test_ids = test_df['id']

# Data Pre-Processing

In [None]:
# Columns whose per-sex minimum and maximum are attached to every row.
SEX_STAT_COLUMNS = ['Height', 'Weight', 'Heart_Rate', 'Body_Temp']


def add_row_features(df):
    """Return a copy of ``df`` with the row-wise engineered features added.

    The same transformation is applied to the train and the test frame, so it
    lives in one place instead of being written out twice.

    Steps, in order:
      * encode ``Sex`` as 0/1 (male=0, female=1) plus a temporary reversed
        encoding used only to build sex-specific interaction terms;
      * add ``BMI`` (Height is divided by 100 first, presumably cm -> m),
        ``Duration_HR``, ``Duration2_HR`` and ``Intensity``;
      * add ``<col>_x_Sex`` / ``<col>_x_Sex_Reversed`` interactions for
        Duration, Heart_Rate and Body_Temp (using the raw Body_Temp);
      * centre ``Body_Temp`` by subtracting 37.0;
      * drop the temporary ``Sex_Reversed`` column.

    Note: the function expects the raw string ``Sex`` column. Calling it on an
    already-encoded frame maps every value to NaN.
    """
    out = df.copy()

    out['Sex_Reversed'] = out['Sex'].map({'male': 1, 'female': 0})
    out['Sex'] = out['Sex'].map({'male': 0, 'female': 1})

    out['BMI'] = out['Weight'] / (out['Height'] / 100) ** 2
    out['Duration_HR'] = out['Duration'] * out['Heart_Rate']
    out['Duration2_HR'] = out['Duration'] ** 2 * out['Heart_Rate']
    out['Intensity'] = out['Heart_Rate'] / out['Duration']

    # Multiplying by each 0/1 encoding yields one column per sex.
    for f1 in ['Duration', 'Heart_Rate', 'Body_Temp']:
        for f2 in ['Sex', 'Sex_Reversed']:
            out[f'{f1}_x_{f2}'] = out[f1] * out[f2]

    # Centring happens after the interactions so they use the raw temperature.
    out['Body_Temp'] = out['Body_Temp'] - 37.0

    return out.drop(columns=['Sex_Reversed'])


train_df = add_row_features(train_df)
test_df = add_row_features(test_df)

# Per-sex min/max statistics are learned from the TRAINING data only and then
# attached to both frames. Computing them separately on the test frame (as the
# previous version did) gives the same feature a different meaning at
# prediction time.
sex_stats = train_df.groupby('Sex')[SEX_STAT_COLUMNS].agg(['min', 'max'])
sex_stats.columns = [f'Sex_{col}_{agg}' for col, agg in sex_stats.columns]
sex_stats = sex_stats.reset_index()

train_df = train_df.merge(sex_stats, on='Sex', how='left')
test_df = test_df.merge(sex_stats, on='Sex', how='left')

## Scaling Numeric Features

In [None]:
# Model inputs are every column except the target and the row identifier.
# 'id' carries no signal and its test values lie outside the training range,
# so feeding it to the scaler and the models only adds noise.
NON_FEATURE_COLUMNS = {'Calories', 'id'}
features = [col for col in train_df.columns if col not in NON_FEATURE_COLUMNS]
print(features)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the very same transformation to both frames.
scaler = StandardScaler().fit(train_df[features])

train_df[features] = scaler.transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Preview the first rows of the training frame after feature engineering and scaling.
train_df.head()

In [None]:
# Preview the last rows of the training frame after feature engineering and scaling.
train_df.tail()

In [None]:
# Preview the first rows of the test frame after feature engineering and scaling.
test_df.head()

In [None]:
# Preview the last rows of the test frame after feature engineering and scaling.
test_df.tail()

In [None]:
# Target vector.
y_train = train_df['Calories']

# Feature matrices; any missing value is replaced by 0 in both of them.
X_train = train_df[features].fillna(0)
X_test = test_df[features].fillna(0)

# Functions

## RMSLE Scorer

In [None]:
def rmsle_scorer(y_true, y_pred):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Predictions are floored at 0.001 before scoring so that negative model
    outputs do not make ``mean_squared_log_error`` reject the input.
    """
    floored_pred = np.maximum(y_pred, 0.001)
    msle = mean_squared_log_error(y_true, floored_pred)
    return np.sqrt(msle)

## KFold CV

In [None]:
def run_kfold_cv(X, y, model, model_name, n_splits, random_state=42, min_prediction=0.001):
    """Cross-validate ``model`` with shuffled K-fold splits and report RMSLE.

    A fresh, unfitted copy of ``model`` is trained on every fold, so the
    instance passed in is never fitted or modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned with ``X``.
    model : estimator
        Scikit-learn compatible regressor used as the template for each fold.
    model_name : str
        Label used in the printed messages and in the returned summary.
    n_splits : int
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffling.
    min_prediction : float, default 0.001
        Lower bound applied to every prediction before it is stored and
        scored; RMSLE is undefined for negative values.

    Returns
    -------
    dict
        Keys: 'Model', 'N_Splits', 'Average RMSLE', 'Std RMSLE',
        'Overall OOF RMSLE', 'Total CV Time (s)', 'Avg Fold Time (s)'.
        Metrics that could not be computed are NaN.
    """
    print(f"\n--- Starting {n_splits}-Fold Cross-Validation for {model_name} ---")
    # perf_counter is monotonic, unlike time.time, so durations cannot go negative.
    cv_start = time.perf_counter()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmsle_scores = []
    fold_times = []
    oof_predictions = np.zeros(X.shape[0])

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        fold_start = time.perf_counter()

        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # clone() builds an unfitted copy with identical hyperparameters and,
        # unlike model.__class__(**model.get_params()), also copes with
        # estimators whose get_params() contains nested "a__b" entries.
        model_fold = clone(model)
        model_fold.fit(X_train_fold, y_train_fold)

        # Floor the predictions without mutating the object returned by
        # predict(); the floor is monotone, matching rmsle_scorer.
        raw_preds = np.asarray(model_fold.predict(X_val_fold), dtype=float).ravel()
        val_preds = np.maximum(raw_preds, min_prediction)

        oof_predictions[val_idx] = val_preds

        try:
            fold_rmsle = np.sqrt(mean_squared_log_error(y_val_fold, val_preds))
            fold_rmsle_scores.append(fold_rmsle)
        except ValueError as e:
            # Raised e.g. for negative targets or NaN predictions; keep going
            # so the remaining folds are still evaluated.
            print(f"  Warning: Error calculating RMSLE for Fold {fold + 1} ({model_name}): {e}. Setting RMSLE to NaN.")
            fold_rmsle_scores.append(np.nan)

        fold_times.append(time.perf_counter() - fold_start)

    total_cv_time = time.perf_counter() - cv_start

    # Folds whose score could not be computed are excluded from the summary.
    valid_fold_rmsle_scores = [s for s in fold_rmsle_scores if not np.isnan(s)]

    mean_cv_rmsle = np.nan
    std_cv_rmsle = np.nan
    overall_oof_rmsle = np.nan

    print(f"\n--- {n_splits}-Fold CV Summary for {model_name} ---")
    if valid_fold_rmsle_scores:
        mean_cv_rmsle = np.mean(valid_fold_rmsle_scores)
        std_cv_rmsle = np.std(valid_fold_rmsle_scores)
        print(f"Average RMSLE: {mean_cv_rmsle:.4f} +/- {std_cv_rmsle:.4f}")
    else:
        print("RMSLE calculation failed for all folds.")

    # The pooled out-of-fold score is only meaningful for non-negative data.
    if y.min() >= 0 and oof_predictions.min() >= 0 and valid_fold_rmsle_scores:
        try:
            overall_oof_rmsle = np.sqrt(mean_squared_log_error(y, oof_predictions))
            print(f"Overall OOF RMSLE: {overall_oof_rmsle:.4f}")
        except ValueError as e:
            print(f"Error calculating Overall OOF RMSLE for {model_name}: {e}. Ensure target and predictions are non-negative.")

    return {
        'Model': model_name,
        'N_Splits': n_splits,
        'Average RMSLE': mean_cv_rmsle,
        'Std RMSLE': std_cv_rmsle,
        'Overall OOF RMSLE': overall_oof_rmsle,
        'Total CV Time (s)': total_cv_time,
        'Avg Fold Time (s)': np.mean(fold_times) if fold_times else np.nan
    }

# Hyperparameters

In [None]:
# Seed shared by the fold shuffling and by every model that accepts one.
RANDOM_STATE = 42

# Each model is cross-validated once per fold count listed here.
n_splits_list = [5, 10, 15]

# Models


In [None]:
# Optional GPU estimators from RAPIDS cuML.
# cuML provides LinearRegression and RandomForestRegressor; it has no
# `cuml.tree.DecisionTreeRegressor` and no `GradientBoostingRegressor`, so
# those two always use scikit-learn. The cuML classes are imported under
# their own aliases so that they never shadow the scikit-learn names.
USE_CUML = False  # set to True only in an environment where cuML is installed

if USE_CUML:
    from cuml.linear_model import LinearRegression as CumlLinearRegression
    from cuml.ensemble import RandomForestRegressor as CumlRandomForestRegressor

# Candidate regressors, keyed by the display name used in logs, file names and
# the final comparison table. XGBoost, LightGBM and CatBoost are configured to
# train on the GPU.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=RANDOM_STATE),
    "XGBoost Regressor": xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=RANDOM_STATE, tree_method='hist', device='cuda', n_jobs=-1),
    "LightGBM Regressor": lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, num_leaves=31, random_state=RANDOM_STATE, device='gpu', n_jobs=-1),
    "CatBoost Regressor": CatBoostRegressor(iterations=100, learning_rate=0.1, depth=3, random_state=RANDOM_STATE, verbose=0, thread_count=-1, task_type='GPU')
}

if USE_CUML:
    # Re-assigning existing keys keeps the original ordering of the dict.
    models["Linear Regression"] = CumlLinearRegression()
    models["Random Forest Regressor"] = CumlRandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)

# K-Fold CV, full training, and saving predictions to CSV

In [None]:
# One summary dict per (model, n_splits) pair; consumed by the comparison table.
all_cv_results = []

# Smallest value allowed in a prediction file: the metric is RMSLE, which is
# undefined for negative predictions.
MIN_PREDICTION = 0.001

for model_name, model_instance in models.items():
    # --- Cross-validate the model once per configured fold count ---
    print(f"\n{'='*80}\nRunning K-Fold Cross-Validation for: {model_name}\n{'='*80}")
    for n_splits_val in n_splits_list:
        results = run_kfold_cv(X_train, y_train, model_instance, model_name, n_splits_val, RANDOM_STATE)
        all_cv_results.append(results)

    # --- Refit on the full training set and predict the test set ---
    print(f"\n--- Training {model_name} on full X_train and predicting on X_test ---")
    # clone() returns an unfitted copy with the same hyperparameters and also
    # handles estimators whose get_params() contains nested "a__b" entries.
    final_model = clone(model_instance)

    fit_start = time.perf_counter()
    final_model.fit(X_train, y_train)
    print(f"Full training complete in {(time.perf_counter() - fit_start):.4f} seconds.")

    predict_start = time.perf_counter()
    raw_predictions = final_model.predict(X_test)
    print(f"Predictions made in {(time.perf_counter() - predict_start):.4f} seconds.")

    # Floor the predictions without mutating whatever object predict() returned.
    predictions_test = np.maximum(np.asarray(raw_predictions, dtype=float).ravel(), MIN_PREDICTION)
    print(predictions_test)

    # The prediction column carries the target's name ('Calories') so the file
    # can be uploaded as a competition submission as-is.
    submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_test})
    csv_filename = f'{model_name.replace(" ", "_")}_predictions.csv'
    submission_df.to_csv(csv_filename, index=False)
    print(f"Submission file '{csv_filename}' created successfully.")

    # Only some estimators (e.g. tree-based ones) expose feature_importances_.
    if hasattr(final_model, 'feature_importances_'):
        print(f"Feature Importances for {model_name}:")
        feature_importances_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': final_model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(feature_importances_df.to_string(index=False))
    print(f"{'-'*80}")  # Separator after each model's full training/prediction

# Final Comparison Table

In [None]:
# Banner line reused above and below the table.
banner = "=" * 120

print("\n" + banner)
print("                                Cross-Validation Summary Across Different Models and Folds                                ")
print(banner)

# One row per (model, fold count), ordered so the runs of a model sit together.
results_df = (
    pd.DataFrame(all_cv_results)
    .sort_values(by=['Model', 'N_Splits'])
    .reset_index(drop=True)
)

# Render every metric and timing column with four decimals.
four_decimals = '{:.4f}'.format
for column in [
    'Average RMSLE',
    'Std RMSLE',
    'Overall OOF RMSLE',
    'Total CV Time (s)',
    'Avg Fold Time (s)',
]:
    results_df[column] = results_df[column].map(four_decimals)

print(results_df.to_string(index=False))
print(banner)
