In [None]:
# Print the Python interpreter version available to the notebook's shell.
!python -V

# Import Libraries and Check Input

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Load Data

In [None]:
# Locations of the competition CSV files.
train_path = "/kaggle/input/playground-series-s5e5/train.csv"
test_path = "/kaggle/input/playground-series-s5e5/test.csv"

# Read both files with default parsing options.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Keep the test-set ids aside; they are used as the 'id' column of every submission file.
test_ids = test_df['id']

# Data Pre-Processing

## Encoding Categorical Variables

In [None]:
# Helper indicator with male = 1.0 / female = 0.0, derived from the raw string labels.
# Must run before 'Sex' itself is overwritten with its numeric encoding.
for _frame in (train_df, test_df):
    _frame['Sex_Reversed'] = _frame['Sex'].map({'male': 1.0, 'female': 0.0})

In [None]:
# Replace the string labels in 'Sex' with integers: male -> 0, female -> 1.
for _frame in (train_df, test_df):
    _frame['Sex'] = _frame['Sex'].map({'male': 0, 'female': 1})

## Feature Engineering

In [None]:
# BMI = Weight / (Height / 100)^2  (the /100 presumably converts cm to m — confirm units).
for _frame in (train_df, test_df):
    _frame['BMI'] = _frame['Weight'].div(_frame['Height'].div(100).pow(2))

In [None]:
# Interaction term: Duration multiplied by Heart_Rate.
for _frame in (train_df, test_df):
    _frame['Duration_HR'] = _frame['Duration'].mul(_frame['Heart_Rate'])

In [None]:
# Quadratic-in-duration interaction: Duration^2 multiplied by Heart_Rate.
for _frame in (train_df, test_df):
    _frame['Duration2_HR'] = _frame['Duration'].pow(2).mul(_frame['Heart_Rate'])

In [None]:
# Ratio feature: Heart_Rate divided by Duration.
for _frame in (train_df, test_df):
    _frame['Intensity'] = _frame['Heart_Rate'].div(_frame['Duration'])

In [None]:
# Multiply each workout signal by both sex indicators so the columns are
# non-zero for exactly one sex each. Columns are created in the same order
# (f1 outer, f2 inner) in both frames.
for _frame in (train_df, test_df):
    for f1 in ('Duration', 'Heart_Rate', 'Body_Temp'):
        for f2 in ('Sex', 'Sex_Reversed'):
            _frame[f'{f1}_x_{f2}'] = _frame[f1].mul(_frame[f2])

In [None]:
# Re-centre Body_Temp by subtracting 37.0 (presumably a reference temperature in °C — confirm).
# The interaction columns built earlier keep the unshifted values.
for _frame in (train_df, test_df):
    _frame['Body_Temp'] = _frame['Body_Temp'].sub(37.0)

In [None]:
# Per-sex min/max of selected measurements, attached to every row as
# Sex_<column>_<aggregate>. The statistics are computed once, on the training
# data, and merged into BOTH frames so a given feature carries the same
# reference value at training and prediction time (statistics derived from
# test_df would differ from the ones the models were fitted on).
sex_stat_columns = ['Height', 'Weight', 'Heart_Rate', 'Body_Temp']
sex_stats = train_df.groupby('Sex')[sex_stat_columns].agg(['min', 'max'])

# Flatten the (column, aggregate) MultiIndex into Sex_<column>_<aggregate>;
# the resulting column order is Height min/max, Weight min/max, ...
sex_stats.columns = [f'Sex_{col}_{agg}' for col, agg in sex_stats.columns]
sex_stats = sex_stats.reset_index()

# Left joins keep every original row, in its original order.
train_df = train_df.merge(sex_stats, on='Sex', how='left')
test_df = test_df.merge(sex_stats, on='Sex', how='left')

In [None]:
# 'Sex_Reversed' was only needed to build the interaction columns; remove it
# from both frames before the feature list is assembled.
train_df = train_df.drop(columns=['Sex_Reversed'])
test_df = test_df.drop(columns=['Sex_Reversed'])

In [None]:
# Preview the first rows of the engineered training frame.
train_df.head()

In [None]:
# Preview the first rows of the engineered test frame.
test_df.head()

## Scaling Numeric Features

In [None]:
from sklearn.preprocessing import StandardScaler
# Single scaler instance: fitted on the training features and reused for the test features.
scaler = StandardScaler()

In [None]:
# Model inputs are every column except the target and the row identifier.
# 'id' is only a submission key (it is kept separately in test_ids); leaving
# it in the list would scale it and hand it to every model as a predictor.
NON_FEATURE_COLUMNS = ('id', 'Calories')
features = [column for column in train_df.columns if column not in NON_FEATURE_COLUMNS]
print(features)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both frames.
scaler.fit(train_df[features])
train_df[features] = scaler.transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

# Model

# 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Design matrices shared by every model below; any missing feature values are
# replaced with 0.
X_train = train_df[features].fillna(0)
X_test = test_df[features].fillna(0)

# Regression target.
y_train = train_df['Calories']

## K Fold CV

In [None]:
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas as pd

In [None]:
# K-Fold cross-validation of a plain LinearRegression, scored with RMSLE
def run_kfold_cv(X, y, n_splits, random_state=42):
    """Cross-validate LinearRegression with shuffled K-Fold and report RMSLE.

    Parameters
    ----------
    X : feature table; must expose ``.shape`` and positional ``.iloc`` indexing.
    y : target values; must expose ``.iloc`` and ``.min()``.
    n_splits : number of folds passed to ``KFold``.
    random_state : seed for the shuffled split (default 42).

    Returns
    -------
    dict with keys 'N_Splits', 'Average RMSLE', 'Std RMSLE',
    'Overall OOF RMSLE', 'Total CV Time (s)' and 'Avg Fold Time (s)'.
    Metrics that could not be computed are NaN.

    Progress and summary lines are printed as a side effect.
    """
    print(f"\n--- Starting {n_splits}-Fold Cross-Validation ---")
    cv_started = time.time()

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_predictions = np.zeros(X.shape[0])  # out-of-fold prediction per row
    fold_rmsle_scores = []
    fold_times = []

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        fold_started = time.time()

        holdout_target = y.iloc[holdout_rows]

        fold_model = LinearRegression()
        fold_model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        holdout_preds = fold_model.predict(X.iloc[holdout_rows])
        # RMSLE cannot be computed on negative values: floor them at a small epsilon.
        holdout_preds[holdout_preds < 0] = 0.001
        oof_predictions[holdout_rows] = holdout_preds

        try:
            score = np.sqrt(mean_squared_log_error(holdout_target, holdout_preds))
        except ValueError as e:
            print(f"  Warning: Error calculating RMSLE for Fold {fold + 1}: {e}. Setting RMSLE to NaN.")
            score = np.nan
        fold_rmsle_scores.append(score)

        fold_times.append(time.time() - fold_started)

    total_cv_time = time.time() - cv_started

    # Folds whose metric failed are excluded from the summary statistics.
    usable_scores = [score for score in fold_rmsle_scores if not np.isnan(score)]

    print(f"\n--- {n_splits}-Fold CV Summary ---")
    if usable_scores:
        mean_cv_rmsle = np.mean(usable_scores)
        std_cv_rmsle = np.std(usable_scores)
        print(f"Average RMSLE: {mean_cv_rmsle:.4f} +/- {std_cv_rmsle:.4f}")
    else:
        mean_cv_rmsle = np.nan
        std_cv_rmsle = np.nan
        print(f"RMSLE calculation failed for all folds.")

    # RMSLE over all out-of-fold predictions at once, only when inputs are valid.
    overall_oof_rmsle = np.nan
    if y.min() >= 0 and oof_predictions.min() >= 0 and usable_scores:
        try:
            overall_oof_rmsle = np.sqrt(mean_squared_log_error(y, oof_predictions))
            print(f"Overall OOF RMSLE: {overall_oof_rmsle:.4f}")
        except ValueError as e:
            print(f"Error calculating Overall OOF RMSLE: {e}. Ensure target and predictions are non-negative.")

    avg_fold_time = np.mean(fold_times) if fold_times else np.nan
    return {
        'N_Splits': n_splits,
        'Average RMSLE': mean_cv_rmsle,
        'Std RMSLE': std_cv_rmsle,
        'Overall OOF RMSLE': overall_oof_rmsle,
        'Total CV Time (s)': total_cv_time,
        'Avg Fold Time (s)': avg_fold_time,
    }

In [None]:
# Fold counts to compare
n_splits_list = [5, 10, 15, 20, 50]

# One result dict per fold count, in the order listed above
all_cv_results = [run_kfold_cv(X_train, y_train, n_folds) for n_folds in n_splits_list]

In [None]:
# Render the collected CV results as a fixed-width text table.
rule = "=" * 85
print("\n" + rule)
print("             Cross-Validation Summary Across Different Folds             ")
print(rule)

results_df = pd.DataFrame(all_cv_results)

# Show every metric/timing column with four decimals (values become strings).
for metric_column in ['Average RMSLE', 'Std RMSLE', 'Overall OOF RMSLE',
                      'Total CV Time (s)', 'Avg Fold Time (s)']:
    results_df[metric_column] = results_df[metric_column].map('{:.4f}'.format)

print(results_df.to_string(index=False))
print(rule)

## Train

In [None]:
%%time
# Final linear model; it is fitted on the full training set in a later cell.
model = LinearRegression()

In [None]:
%%time
# Fit on all training rows (no hold-out); %%time reports how long the cell takes.
print("Training Linear Regression model...")
model.fit(X_train, y_train)
print("Training complete.")

## Predict

In [None]:
%%time
# Predict the target for every test row; values may be negative and are post-processed before saving.
predictions = model.predict(X_test)

## Handle Negative Predictions and Save CSV

In [None]:
# Floor negative predictions before writing the submission. RMSLE cannot be
# computed on negative values, and run_kfold_cv scores its folds with the same
# 0.001 floor, so the submitted values are post-processed exactly like the
# validated ones (negating them would submit values CV never scored).
negative_mask = predictions < 0
ctr_x = int(negative_mask.sum())  # how many predictions had to be floored
predictions[negative_mask] = 0.001

print(ctr_x)

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions})

submission_df.to_csv('linear_regression_submission.csv', index=False)
print("\nSubmission file 'linear_regression_submission.csv' created successfully.")

## Model Coefficients

In [None]:
# One line per feature: name and fitted coefficient, in feature-list order.
print("\nModel Coefficients:")
coefficient_lines = [f"{name}: {weight:.6f}" for name, weight in zip(features, model.coef_)]
for line in coefficient_lines:
    print(line)

# 2. Decision Tree Regressor

In [None]:
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor # Import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
import pandas as pd

# RMSLE metric with the (y_true, y_pred) signature expected by make_scorer
def rmsle_scorer(y_true, y_pred):
    """Return the root mean squared log error after flooring predictions at 0.001.

    Predictions below 0.001 (including negatives) are raised to 0.001 before
    the metric is computed; ``y_true`` is passed through unchanged.
    """
    floored_preds = np.maximum(y_pred, 0.001)
    msle = mean_squared_log_error(y_true, floored_preds)
    return np.sqrt(msle)

# Function to perform K-Fold Cross-Validation for a given model
def run_kfold_cv(X, y, model, model_name, n_splits, random_state=42):
    """Cross-validate ``model`` with shuffled K-Fold and report RMSLE.

    Parameters
    ----------
    X : feature table; must expose ``.shape`` and positional ``.iloc`` indexing.
    y : target values; must expose ``.iloc`` and ``.min()``.
    model : scikit-learn style estimator used as a template. It is never
        fitted itself; an unfitted copy is created for every fold.
    model_name : label used in printed messages and in the returned dict.
    n_splits : number of folds passed to ``KFold``.
    random_state : seed for the shuffled split (default 42).

    Returns
    -------
    dict with keys 'Model', 'N_Splits', 'Average RMSLE', 'Std RMSLE',
    'Overall OOF RMSLE', 'Total CV Time (s)' and 'Avg Fold Time (s)'.
    Metrics that could not be computed are NaN.

    Progress and summary lines are printed as a side effect.
    """
    # Local import: the surrounding cells do not import clone.
    from sklearn.base import clone

    print(f"\n--- Starting {n_splits}-Fold Cross-Validation for {model_name} ---")
    start_cv_time = time.time()  # Start timing for the entire CV process

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmsle_scores = []
    oof_predictions = np.zeros(X.shape[0])  # out-of-fold prediction per row
    fold_times = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        fold_start_time = time.time()  # Start timing for the current fold

        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # clone() builds an unfitted copy with the same parameters. Unlike
        # model.__class__(**model.get_params()) it also works for nested
        # estimators (pipelines, meta-estimators), whose deep parameters
        # ('step__param') are not valid constructor arguments.
        model_fold = clone(model)
        model_fold.fit(X_train_fold, y_train_fold)

        val_preds = model_fold.predict(X_val_fold)
        val_preds[val_preds < 0] = 0.001  # RMSLE needs non-negative predictions

        oof_predictions[val_idx] = val_preds

        try:
            fold_rmsle = np.sqrt(mean_squared_log_error(y_val_fold, val_preds))
            fold_rmsle_scores.append(fold_rmsle)
        except ValueError as e:
            print(f"  Warning: Error calculating RMSLE for Fold {fold + 1} ({model_name}): {e}. Setting RMSLE to NaN.")
            fold_rmsle_scores.append(np.nan)

        fold_times.append(time.time() - fold_start_time)

    total_cv_time = time.time() - start_cv_time

    # Folds whose metric failed are excluded from the summary statistics.
    valid_fold_rmsle_scores = [s for s in fold_rmsle_scores if not np.isnan(s)]

    mean_cv_rmsle = np.nan
    std_cv_rmsle = np.nan
    overall_oof_rmsle = np.nan

    print(f"\n--- {n_splits}-Fold CV Summary for {model_name} ---")
    if valid_fold_rmsle_scores:
        mean_cv_rmsle = np.mean(valid_fold_rmsle_scores)
        std_cv_rmsle = np.std(valid_fold_rmsle_scores)
        print(f"Average RMSLE: {mean_cv_rmsle:.4f} +/- {std_cv_rmsle:.4f}")
    else:
        print("RMSLE calculation failed for all folds.")

    # RMSLE over all out-of-fold predictions at once, only when inputs are valid.
    if y.min() >= 0 and oof_predictions.min() >= 0 and valid_fold_rmsle_scores:
        try:
            overall_oof_rmsle = np.sqrt(mean_squared_log_error(y, oof_predictions))
            print(f"Overall OOF RMSLE: {overall_oof_rmsle:.4f}")
        except ValueError as e:
            print(f"Error calculating Overall OOF RMSLE for {model_name}: {e}. Ensure target and predictions are non-negative.")

    return {
        'Model': model_name,
        'N_Splits': n_splits,
        'Average RMSLE': mean_cv_rmsle,
        'Std RMSLE': std_cv_rmsle,
        'Overall OOF RMSLE': overall_oof_rmsle,
        'Total CV Time (s)': total_cv_time,
        'Avg Fold Time (s)': np.mean(fold_times) if fold_times else np.nan
    }

# Estimators to compare (the tree is seeded for reproducibility)
model_lr = LinearRegression()
model_dt = DecisionTreeRegressor(random_state=42)

# Fold counts to try for each estimator
n_splits_list = [5, 10, 15, 20, 50]

# Accumulates one result dict per (model, fold count) run
all_cv_results = []

print("Running Cross-Validation for Linear Regression:")
all_cv_results.extend(
    run_kfold_cv(X_train, y_train, model_lr, "Linear Regression", n_folds)
    for n_folds in n_splits_list
)

print("\n" + "="*80)
print("Running Cross-Validation for Decision Tree Regressor:")
all_cv_results.extend(
    run_kfold_cv(X_train, y_train, model_dt, "Decision Tree Regressor", n_folds)
    for n_folds in n_splits_list
)

# Comparison table header
table_rule = "=" * 100
print("\n" + table_rule)
print("                      Cross-Validation Summary Across Different Models and Folds                       ")
print(table_rule)

# Tabulate, ordered by model name and then by fold count
results_df = (
    pd.DataFrame(all_cv_results)
    .sort_values(by=['Model', 'N_Splits'])
    .reset_index(drop=True)
)

# Show every metric/timing column with four decimals (values become strings)
for metric_column in ['Average RMSLE', 'Std RMSLE', 'Overall OOF RMSLE',
                      'Total CV Time (s)', 'Avg Fold Time (s)']:
    results_df[metric_column] = results_df[metric_column].map('{:.4f}'.format)

print(results_df.to_string(index=False))
print(table_rule)

In [None]:
from sklearn.tree import DecisionTreeRegressor


# NOTE: test_size is assigned but not referenced anywhere else in this cell.
test_size = 0.2
random_state = 42

# Seeded tree, otherwise default hyper-parameters.
model_dt = DecisionTreeRegressor(random_state=random_state)

# Fit on the full training set.
print("Training Decision Tree Regressor model...")
model_dt.fit(X_train, y_train)
print("Training complete.")

# Predict the target for the test rows.
print("Making predictions with Decision Tree Regressor...")
predictions_dt = model_dt.predict(X_test)
print("Predictions made.")

# Show the prediction array.
print("\nTest Data Predictions:")
print(predictions_dt)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_dt[predictions_dt < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_dt})

submission_df.to_csv('Decision_Tree_Regressor.csv', index=False)
print("\nSubmission file 'Decision_Tree_Regressor.csv' created successfully.")

# 3. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

n_estimators = 100
random_state = 42

# n_jobs=-1 requests all available CPU cores for fitting and predicting.
model_rf = RandomForestRegressor(
    n_estimators=n_estimators,
    random_state=random_state,
    n_jobs=-1,
)

# Fit on the full training set; the timed span includes the two status prints.
start_time = time.time()
print("Training Random Forest Regressor model...")
model_rf.fit(X_train, y_train)
print("Training complete.")
end_time = time.time()
fit_ms = (end_time - start_time) * 1000
print(f"Training Random Forest Regressor model.\tTime taken: {fit_ms:.3f} ms")

# Predict the test rows, timed the same way.
start_time = time.time()
print("Making predictions with Random Forest Regressor...")
predictions_rf = model_rf.predict(X_test)
print("Predictions made.")
end_time = time.time()
predict_ms = (end_time - start_time) * 1000
print(f"Making predictions with Random Forest Regressor.\tTime taken: {predict_ms:.3f} ms")

# Show the prediction array.
print("\nSample Test Data Predictions:")
print(predictions_rf)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_rf[predictions_rf < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_rf})

submission_df.to_csv('Random_Forest_Regressor.csv', index=False)
print("\nSubmission file 'Random_Forest_Regressor.csv' created successfully.")

In [None]:
# Raw importance scores of the fitted forest; presumably in the column order of X_train — confirm.
print(model_rf.feature_importances_)

# 4. Gradient Boosting Regressor (from scikit-learn)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters for scikit-learn's gradient boosting
n_estimators = 100
learning_rate = 0.1
max_depth = 3
random_state = 42

model_gbr = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
)

# Fit on the full training set and report wall-clock time
print("Training Gradient Boosting Regressor model...")
start_time = time.time()
model_gbr.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time
print(f"Training complete in {fit_seconds:.4f} seconds.")

# Predict the test rows and report wall-clock time
print("Making predictions with Gradient Boosting Regressor...")
start_time = time.time()
predictions_gbr = model_gbr.predict(X_test)
end_time = time.time()
predict_seconds = end_time - start_time
print(f"Predictions made in {predict_seconds:.4f} seconds.")

# Show the prediction array
print("\nSample Test Data Predictions (Gradient Boosting Regressor):")
print(predictions_gbr)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_gbr[predictions_gbr < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_gbr})

submission_df.to_csv('Gradient_Boosting_Regressor.csv', index=False)
print("\nSubmission file 'Gradient_Boosting_Regressor.csv' created successfully.")

# 4. XGBoost Regressor

In [None]:
import xgboost as xgb

# Hyper-parameters for XGBoost; n_jobs=-1 requests all available cores
n_estimators = 100
learning_rate = 0.1
max_depth = 3
random_state = 42
n_jobs = -1

model_xgb = xgb.XGBRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
    n_jobs=n_jobs,
)

# Fit on the full training set and report wall-clock time
print("Training XGBoost Regressor model...")
start_time = time.time()
model_xgb.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time
print(f"Training complete in {fit_seconds:.4f} seconds.")

# Predict the test rows and report wall-clock time
print("Making predictions with XGBoost Regressor...")
start_time = time.time()
predictions_xgb = model_xgb.predict(X_test)
end_time = time.time()
predict_seconds = end_time - start_time
print(f"Predictions made in {predict_seconds:.4f} seconds.")

# Show the prediction array
print("\nSample Test Data Predictions (XGBoost Regressor):")
print(predictions_xgb)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_xgb[predictions_xgb < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_xgb})

submission_df.to_csv('xgboost.csv', index=False)
print("\nSubmission file 'xgboost.csv' created successfully.")

# 5. LightGBM Regressor

In [None]:
import lightgbm as lgb

# Hyper-parameters for LightGBM; tree size is set through num_leaves
# (no max_depth is passed). n_jobs=-1 requests all available cores.
n_estimators = 100
learning_rate = 0.1
num_leaves = 31
random_state = 42
n_jobs = -1

model_lgb = lgb.LGBMRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    random_state=random_state,
    n_jobs=n_jobs,
)

# Fit on the full training set and report wall-clock time
print("Training LightGBM Regressor model...")
start_time = time.time()
model_lgb.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time
print(f"Training complete in {fit_seconds:.4f} seconds.")

# Predict the test rows and report wall-clock time
print("Making predictions with LightGBM Regressor...")
start_time = time.time()
predictions_lgb = model_lgb.predict(X_test)
end_time = time.time()
predict_seconds = end_time - start_time
print(f"Predictions made in {predict_seconds:.4f} seconds.")

# Show the prediction array
print("\nSample Test Data Predictions (LightGBM Regressor):")
print(predictions_lgb)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_lgb[predictions_lgb < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_lgb})

submission_df.to_csv('LightGBM.csv', index=False)
print("\nSubmission file 'LightGBM.csv' created successfully.")

# 6. CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor

# Hyper-parameters for CatBoost; verbose=0 keeps training output quiet and
# thread_count=-1 is passed through as the thread setting.
iterations = 100
learning_rate = 0.1
depth = 3
random_state = 42
verbose = 0
thread_count = -1

model_cat = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=depth,
    random_state=random_state,
    verbose=verbose,
    thread_count=thread_count,
)

# Fit on the full training set and report wall-clock time
print("Training CatBoost Regressor model...")
start_time = time.time()
model_cat.fit(X_train, y_train)
end_time = time.time()
fit_seconds = end_time - start_time
print(f"Training complete in {fit_seconds:.4f} seconds.")

# Predict the test rows and report wall-clock time
print("Making predictions with CatBoost Regressor...")
start_time = time.time()
predictions_cat = model_cat.predict(X_test)
end_time = time.time()
predict_seconds = end_time - start_time
print(f"Predictions made in {predict_seconds:.4f} seconds.")

# Show the prediction array
print("\nTest Data Predictions (CatBoost Regressor):")
print(predictions_cat)

## Save submission.csv

In [None]:
# Floor any negative prediction at 0.001 — the same post-processing
# run_kfold_cv applies before scoring RMSLE — instead of flipping its sign,
# so submitted values are treated exactly like the validated ones.
predictions_cat[predictions_cat < 0] = 0.001

submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_cat})

submission_df.to_csv('CatBoostRegressor.csv', index=False)
print("\nSubmission file 'CatBoostRegressor.csv' created successfully.")