In [None]:
# Print the interpreter version used for this run (shell escape).
!python -V

# Import lib and Check Input

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load Data

In [None]:
# Both competition CSVs sit in the same dataset folder.
_data_dir = "/kaggle/input/playground-series-s5e5"
train_path = _data_dir + "/train.csv"
test_path = _data_dir + "/test.csv"

In [None]:
# Read both splits in one expression; the order is (train, test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Observe Data

In [None]:
# Report (rows, columns) for each split.
train_shape, test_shape = train_df.shape, test_df.shape
print(f"Train data shape: {train_shape}")
print(f"Test data shape:  {test_shape}")

In [None]:
# Summary statistics of the training columns. A bare last expression lets the
# notebook render the frame as a formatted table instead of plain text.
train_df.describe()

In [None]:
# Summary statistics of the test columns. A bare last expression lets the
# notebook render the frame as a formatted table instead of plain text.
test_df.describe()

In [None]:
# First rows of the raw training data.
train_df.head()

In [None]:
# Last rows of the raw training data.
train_df.tail()

In [None]:
# First rows of the raw test data.
test_df.head()

In [None]:
# Last rows of the raw test data.
test_df.tail()

## Store test ids

In [None]:
# Keep the test 'id' column; it is reused as the 'id' column of every submission file.
test_ids = test_df['id']

In [None]:
# Display the stored ids.
test_ids

## Visualize Data

In [None]:
# Number of training rows per 'Sex' category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Sex', data=train_df, ax=ax)
ax.set_title('Sex Distribution')
plt.show()

In [None]:
# Bar height is seaborn's default aggregate (the mean) of 'Calories' within each
# 'Sex' category. The title names what is plotted; it was previously a copy of
# the count plot's title.
plt.figure(figsize=(10, 6))
sns.barplot(x='Sex', y='Calories', data=train_df)
plt.title('Average Calories by Sex')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Age' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Age'].dropna(), kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Height' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Height'].dropna(), kde=True, ax=ax)
ax.set_title('Height Distribution')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Weight' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Weight'].dropna(), kde=True, ax=ax)
ax.set_title('Weight Distribution')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Duration' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Duration'].dropna(), kde=True, ax=ax)
ax.set_title('Duration Distribution')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Heart_Rate' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Heart_Rate'].dropna(), kde=True, ax=ax)
ax.set_title('Heart_Rate Distribution')
plt.show()

In [None]:
# Histogram with KDE overlay of the non-null 'Body_Temp' values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Body_Temp'].dropna(), kde=True, ax=ax)
ax.set_title('Body_Temp Distribution')
plt.show()

# Data Pre-Processing

## Encoding Categorical Variables

In [None]:
# Encode 'Sex' as an integer code in both splits from one shared lookup table.
# NOTE: Series.map turns any value missing from the table into NaN, so running
# this cell a second time (when the column already holds 0/1) blanks the column.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_codes)

## Check Data

In [None]:
# Check the training rows after 'Sex' was encoded.
train_df.head()

In [None]:
# Check the test rows after 'Sex' was encoded.
test_df.head()

## Check For Missing Values

In [None]:
# NaN counts for 'Age' in each split (isna is the canonical name of isnull).
missing_age_values_train = train_df['Age'].isna().sum()
missing_age_values_test = test_df['Age'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Age' column of train.csv: {missing_age_values_train}")
print(f"Number of missing values in the 'Age' column of test.csv: {missing_age_values_test}")

In [None]:
# NaN counts for 'Height' in each split.
missing_values_train = train_df['Height'].isna().sum()
missing_values_test = test_df['Height'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Height' column of train.csv: {missing_values_train}")
print(f"Number of missing values in the 'Height' column of test.csv: {missing_values_test}")

In [None]:
# NaN counts for 'Weight' in each split.
missing_values_train = train_df['Weight'].isna().sum()
missing_values_test = test_df['Weight'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Weight' column of train.csv: {missing_values_train}")
print(f"Number of missing values in the 'Weight' column of test.csv: {missing_values_test}")

In [None]:
# NaN counts for 'Duration' in each split.
missing_values_train = train_df['Duration'].isna().sum()
missing_values_test = test_df['Duration'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Duration' column of train.csv: {missing_values_train}")
print(f"Number of missing values in the 'Duration' column of test.csv: {missing_values_test}")

In [None]:
# NaN counts for 'Heart_Rate' in each split.
missing_values_train = train_df['Heart_Rate'].isna().sum()
missing_values_test = test_df['Heart_Rate'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Heart_Rate' column of train.csv: {missing_values_train}")
print(f"Number of missing values in the 'Heart_Rate' column of test.csv: {missing_values_test}")

In [None]:
# NaN counts for 'Body_Temp' in each split.
missing_values_train = train_df['Body_Temp'].isna().sum()
missing_values_test = test_df['Body_Temp'].isna().sum()

# Report both counts.
print(f"Number of missing values in the 'Body_Temp' column of train.csv: {missing_values_train}")
print(f"Number of missing values in the 'Body_Temp' column of test.csv: {missing_values_test}")

In [None]:
# The target column is only read from the training frame here.
missing_values_train = train_df['Calories'].isna().sum()
print(f"Number of missing values in the 'Calories' column of train.csv: {missing_values_train}")

## Feature Engineering

In [None]:
# BMI = Weight / (Height / 100)^2, added to both splits.
# Presumably Height is in centimetres and Weight in kilograms; confirm against the dataset description.
for frame in (train_df, test_df):
    frame['BMI'] = frame['Weight'] / (frame['Height'] / 100) ** 2

In [None]:
# Interaction term: Duration multiplied by Heart_Rate, added to both splits.
for frame in (train_df, test_df):
    frame['Duration_HR'] = frame['Duration'] * frame['Heart_Rate']

Add a `Duration² × Heart_Rate` feature to capture calorie burn that may accelerate when both of these strongly correlated features are high at the same time.

In [None]:
# Squared-duration interaction: Duration^2 multiplied by Heart_Rate, added to both splits.
for frame in (train_df, test_df):
    frame['Duration2_HR'] = frame['Duration'] ** 2 * frame['Heart_Rate']

In [None]:
# Check the training rows with the engineered columns added.
train_df.head()

In [None]:
# Check the test rows with the engineered columns added.
test_df.head()

In [None]:
# Shift 'Body_Temp' by 37.0 in both splits.
# NOTE: the column is overwritten, so re-running this cell subtracts 37.0 again.
for frame in (train_df, test_df):
    frame['Body_Temp'] = frame['Body_Temp'].sub(37.0)

In [None]:
# Check the training rows after the 'Body_Temp' shift.
train_df.head()

In [None]:
# Check the test rows after the 'Body_Temp' shift.
test_df.head()

## Scaling Numeric Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: the raw measurements plus the engineered features.
# Listing them once keeps the train and test selections in sync.
scaled_columns = [
    'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
    'BMI', 'Duration_HR', 'Duration2_HR',
]

# Fit the scaler on the training split only, then apply the same statistics
# to the test split.
scaler = StandardScaler()
train_df[scaled_columns] = scaler.fit_transform(train_df[scaled_columns])
test_df[scaled_columns] = scaler.transform(test_df[scaled_columns])

# Model

# 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

## Features

In [None]:
# Model inputs: raw measurements first, engineered features last.
features = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
    'BMI',
    'Duration_HR',
    'Duration2_HR',
]

## Train Preparation

In [None]:
# Feature matrix and regression target taken from the training frame.
X_train = train_df.loc[:, features]
y_train = train_df.loc[:, 'Calories']

In [None]:
# Same feature columns, taken from the test frame.
X_test = test_df.loc[:, features]

In [None]:
# Replace any remaining NaN with 0 in both feature matrices.
X_train, X_test = (matrix.fillna(0) for matrix in (X_train, X_test))

## K Fold CV

In [None]:
from sklearn.model_selection import KFold  # For creating K-Fold cross-validation splits
from sklearn.metrics import mean_squared_log_error # For calculating Mean Squared Logarithmic Error

In [None]:
# Text lines collected by the cross-validation cells and printed together in the summary section.
kfold_summary = list()

## 5 Fold CV

In [None]:
# This KFold CV block assumes X_train and y_train are already defined and preprocessed
# as they are in your script before the final model training:
# - X_train: pandas DataFrame containing the selected features.
#            It is assumed to have been derived from a globally scaled train_df
#            and to have had NaNs filled (e.g., with X_train.fillna(0)).
# - y_train: pandas Series containing the target variable 'Calories'.
#            'Calories' must be non-negative for RMSLE. Your data exploration
#            (train_df.describe()) should confirm this (min value >= 0).

# --- START OF KFold CV CODE BLOCK (using RMSLE) ---

In [None]:
print("--- Preparing for K-Fold Cross-Validation with RMSLE ---")

# 1. Split settings, kept as module-level names.
n_splits, shuffle, random_state = 5, True, 42

# Shuffle the rows before splitting; the fixed seed makes fold membership
# the same on every run.
kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

In [None]:
# 2. Result containers for the cross-validation loop.
# Per-fold RMSLE values, appended in fold order.
fold_rmsle_scores = list()
# One out-of-fold prediction slot per training row, initialised to zero and
# filled in as each fold's validation rows are predicted.
oof_predictions = np.zeros(len(X_train))

In [None]:
print(f"--- Starting {n_splits}-Fold Cross-Validation for Linear Regression (evaluating with RMSLE) ---\n\n")

# 3. For every split: fit a fresh LinearRegression on the training part, predict
# the held-out part, make the predictions non-negative and score them with RMSLE.
#
# Caveat: X_train was scaled before splitting, so the scaler statistics include
# every fold's validation rows. For a stricter estimate, fit the scaler on
# X_train_fold inside the loop.
#
# Timing: each step is measured with time.perf_counter(), a monotonic clock
# intended for intervals, and the clock is stopped before anything is printed
# so console output is not counted.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n  Processing Fold {fold + 1}/{n_splits}...")

    # 3.1. Positional row selection for this fold (kf.split yields integer positions).
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    # 3.2. New, untrained model so nothing carries over between folds.
    start_time = time.perf_counter()
    model_fold = LinearRegression()
    end_time = time.perf_counter()
    print(f"Initialize a new model instance for each fold.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.3. Fit on this fold's training rows.
    start_time = time.perf_counter()
    model_fold.fit(X_train_fold, y_train_fold)
    end_time = time.perf_counter()
    print(f"    Model trained for Fold {fold + 1}.")
    print(f"Train the model on the current fold's training data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.4. Predict this fold's validation rows.
    start_time = time.perf_counter()
    val_preds = model_fold.predict(X_val_fold)
    end_time = time.perf_counter()
    print(f"    Predictions made for Fold {fold + 1}.")
    print(f"Make predictions on the current fold's validation data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.5. mean_squared_log_error rejects negative inputs, so negative
    # predictions have their sign flipped (the same rule is applied to the
    # final test predictions later in the notebook).
    negative_mask = val_preds < 0
    val_preds[negative_mask] = -val_preds[negative_mask]

    # 3.6. Store the adjusted predictions at the validation rows' positions.
    oof_predictions[val_idx] = val_preds

    # 3.7. RMSLE is the square root of MSLE. A failed fold is recorded as NaN
    # so the summary cell can filter it out.
    try:
        fold_rmsle = np.sqrt(mean_squared_log_error(y_val_fold, val_preds))
        fold_rmsle_scores.append(fold_rmsle)
        print(f"    Fold {fold + 1} RMSLE: {fold_rmsle:.4f}")
    except ValueError as e:
        print(f"    Error calculating RMSLE for Fold {fold + 1}: {e}")
        print(f"    Min y_val_fold: {y_val_fold.min()}, Min val_preds: {np.min(val_preds)}")
        fold_rmsle_scores.append(np.nan)

In [None]:
# 4. Summarise the per-fold scores. Folds whose RMSLE could not be computed were
# stored as NaN and are left out of the statistics.
valid_fold_rmsle_scores = [score for score in fold_rmsle_scores if not np.isnan(score)]

print("--- Cross-Validation Summary (RMSLE) ---")
if valid_fold_rmsle_scores:
    kfold_summary.append("--- Cross-Validation Summary (RMSLE) ---")
    mean_cv_rmsle = np.mean(valid_fold_rmsle_scores)  # average score across folds
    std_cv_rmsle = np.std(valid_fold_rmsle_scores)    # spread of the scores across folds

    txt1 = f"Average RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {mean_cv_rmsle:.4f}"
    txt2 = f"Standard Deviation of RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {std_cv_rmsle:.4f}"
    print(txt1)
    print(txt2)
    kfold_summary.extend([txt1, txt2])
else:
    print("RMSLE calculation failed for all folds.")


# Single RMSLE over all out-of-fold predictions. It is only computed when both
# the target and the predictions are non-negative and at least one fold scored.
y_min = y_train.min()
oof_min = oof_predictions.min()
if y_min >= 0 and oof_min >= 0 and valid_fold_rmsle_scores:
    try:
        overall_oof_rmsle = np.sqrt(mean_squared_log_error(y_train, oof_predictions))
    except ValueError as e:
        print(f"Error calculating Overall OOF RMSLE: {e}")
        print(f"Min y_train: {y_min}, Min oof_predictions: {oof_min}")
    else:
        txt3 = f"Overall OOF RMSLE (from concatenated fold predictions): {overall_oof_rmsle:.4f}"
        print(txt3)
        kfold_summary.append(txt3)
else:
    # Explain every reason the overall score was skipped.
    if y_min < 0:
        print("Cannot calculate Overall OOF RMSLE: y_train contains negative values.")
    if oof_min < 0:  # not expected after the sign flip in the loop
        print("Cannot calculate Overall OOF RMSLE: oof_predictions contain negative values.")
    if not valid_fold_rmsle_scores:
        print("Cannot calculate Overall OOF RMSLE: No valid fold scores were obtained.")

# --- END OF KFold CV CODE BLOCK (using RMSLE) ---

# Following this CV block, your script would typically proceed to train the final
# LinearRegression model on the *entire* X_train and y_train dataset.
# The CV results (especially mean_cv_rmsle) help you to gauge how well this
# final model is likely to perform on the actual test data, when evaluated with RMSLE.
# Remember to apply the same non-negative adjustment to your final test predictions
# if you were to evaluate them with RMSLE.

## 10 Fold CV

In [None]:
# 1. Split settings for the 10-fold run (same shuffle and seed as before).
n_splits, shuffle, random_state = 10, True, 42
print("--- Preparing for K-Fold Cross-Validation with RMSLE ---")
kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

In [None]:
# 2. Result containers: per-fold RMSLE values and one out-of-fold prediction
# slot per training row.
fold_rmsle_scores = []
oof_predictions = np.zeros(X_train.shape[0])

print(f"--- Starting {n_splits}-Fold Cross-Validation for Linear Regression (evaluating with RMSLE) ---\n\n")

# 3. For every split: fit a fresh LinearRegression, predict the held-out rows,
# make the predictions non-negative and score them with RMSLE.
# Each step is timed with time.perf_counter() (monotonic, meant for intervals),
# and the clock is stopped before printing so console output is not counted.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n  Processing Fold {fold + 1}/{n_splits}...")
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    # 3.2. New, untrained model so nothing carries over between folds.
    start_time = time.perf_counter()
    model_fold = LinearRegression()
    end_time = time.perf_counter()
    print(f"Initialize a new model instance for each fold.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.3. Fit on this fold's training rows.
    start_time = time.perf_counter()
    model_fold.fit(X_train_fold, y_train_fold)
    end_time = time.perf_counter()
    print(f"    Model trained for Fold {fold + 1}.")
    print(f"Train the model on the current fold's training data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.4. Predict this fold's validation rows.
    start_time = time.perf_counter()
    val_preds = model_fold.predict(X_val_fold)
    end_time = time.perf_counter()
    print(f"    Predictions made for Fold {fold + 1}.")
    print(f"Make predictions on the current fold's validation data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.5. mean_squared_log_error rejects negative inputs: flip the sign of
    # negative predictions.
    negative_mask = val_preds < 0
    val_preds[negative_mask] = -val_preds[negative_mask]

    # 3.6. Store the adjusted predictions at the validation rows' positions.
    oof_predictions[val_idx] = val_preds

    # 3.7. RMSLE for this fold; a failed fold is recorded as NaN.
    try:
        fold_rmsle = np.sqrt(mean_squared_log_error(y_val_fold, val_preds))
        fold_rmsle_scores.append(fold_rmsle)
        print(f"    Fold {fold + 1} RMSLE: {fold_rmsle:.4f}")
    except ValueError as e:
        print(f"    Error calculating RMSLE for Fold {fold + 1}: {e}")
        print(f"    Min y_val_fold: {y_val_fold.min()}, Min val_preds: {np.min(val_preds)}")
        fold_rmsle_scores.append(np.nan)

# 4. Summarise the per-fold scores, ignoring NaN entries. The header, mean and
# std lines are also appended to kfold_summary (as the 5-fold cell does) so the
# final summary shows which fold count each overall score belongs to.
valid_fold_rmsle_scores = [s for s in fold_rmsle_scores if not np.isnan(s)]
if valid_fold_rmsle_scores:
    mean_cv_rmsle = np.mean(valid_fold_rmsle_scores)
    std_cv_rmsle = np.std(valid_fold_rmsle_scores)
    txt1 = f"Average RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {mean_cv_rmsle:.4f}"
    txt2 = f"Standard Deviation of RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {std_cv_rmsle:.4f}"
    print("--- Cross-Validation Summary (RMSLE) ---")
    print(txt1)
    print(txt2)
    kfold_summary.append("--- Cross-Validation Summary (RMSLE) ---")
    kfold_summary.append(txt1)
    kfold_summary.append(txt2)
else:
    print("--- Cross-Validation Summary (RMSLE) ---")
    print("RMSLE calculation failed for all folds.")

# Single RMSLE over all out-of-fold predictions, computed only when the inputs
# are non-negative and at least one fold produced a score.
if y_train.min() >= 0 and oof_predictions.min() >= 0 and valid_fold_rmsle_scores:
    try:
        overall_oof_rmsle = np.sqrt(mean_squared_log_error(y_train, oof_predictions))
        print(f"Overall OOF RMSLE (from concatenated fold predictions): {overall_oof_rmsle:.4f}")

        txt3 = f"Overall OOF RMSLE (from concatenated fold predictions): {overall_oof_rmsle:.4f}"
        kfold_summary.append(txt3)
    except ValueError as e:
        print(f"Error calculating Overall OOF RMSLE: {e}")
        print(f"Min y_train: {y_train.min()}, Min oof_predictions: {oof_predictions.min()}")
else:
    if y_train.min() < 0:
        print("Cannot calculate Overall OOF RMSLE: y_train contains negative values.")
    if oof_predictions.min() < 0:  # not expected after the sign flip in the loop
        print("Cannot calculate Overall OOF RMSLE: oof_predictions contain negative values.")
    if not valid_fold_rmsle_scores:
        print("Cannot calculate Overall OOF RMSLE: No valid fold scores were obtained.")

# --- END OF KFold CV CODE BLOCK (using RMSLE) ---

## 15 Fold CV

In [None]:
# 1. Split settings for the 15-fold run (same shuffle and seed as before).
n_splits, shuffle, random_state = 15, True, 42
print("--- Preparing for K-Fold Cross-Validation with RMSLE ---")
kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

In [None]:
# 2. Result containers: per-fold RMSLE values and one out-of-fold prediction
# slot per training row.
fold_rmsle_scores = []
oof_predictions = np.zeros(X_train.shape[0])

print(f"--- Starting {n_splits}-Fold Cross-Validation for Linear Regression (evaluating with RMSLE) ---\n\n")

# 3. For every split: fit a fresh LinearRegression, predict the held-out rows,
# make the predictions non-negative and score them with RMSLE.
# Each step is timed with time.perf_counter() (monotonic, meant for intervals),
# and the clock is stopped before printing so console output is not counted.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n  Processing Fold {fold + 1}/{n_splits}...")
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    # 3.2. New, untrained model so nothing carries over between folds.
    start_time = time.perf_counter()
    model_fold = LinearRegression()
    end_time = time.perf_counter()
    print(f"Initialize a new model instance for each fold.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.3. Fit on this fold's training rows.
    start_time = time.perf_counter()
    model_fold.fit(X_train_fold, y_train_fold)
    end_time = time.perf_counter()
    print(f"    Model trained for Fold {fold + 1}.")
    print(f"Train the model on the current fold's training data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.4. Predict this fold's validation rows.
    start_time = time.perf_counter()
    val_preds = model_fold.predict(X_val_fold)
    end_time = time.perf_counter()
    print(f"    Predictions made for Fold {fold + 1}.")
    print(f"Make predictions on the current fold's validation data.\tTime taken: {(end_time - start_time) * 1000:.3f} ms")

    # 3.5. mean_squared_log_error rejects negative inputs: flip the sign of
    # negative predictions.
    negative_mask = val_preds < 0
    val_preds[negative_mask] = -val_preds[negative_mask]

    # 3.6. Store the adjusted predictions at the validation rows' positions.
    oof_predictions[val_idx] = val_preds

    # 3.7. RMSLE for this fold; a failed fold is recorded as NaN.
    try:
        fold_rmsle = np.sqrt(mean_squared_log_error(y_val_fold, val_preds))
        fold_rmsle_scores.append(fold_rmsle)
        print(f"    Fold {fold + 1} RMSLE: {fold_rmsle:.4f}")
    except ValueError as e:
        print(f"    Error calculating RMSLE for Fold {fold + 1}: {e}")
        print(f"    Min y_val_fold: {y_val_fold.min()}, Min val_preds: {np.min(val_preds)}")
        fold_rmsle_scores.append(np.nan)

# 4. Summarise the per-fold scores, ignoring NaN entries. The header, mean and
# std lines are also appended to kfold_summary (as the 5-fold cell does) so the
# final summary shows which fold count each overall score belongs to.
valid_fold_rmsle_scores = [s for s in fold_rmsle_scores if not np.isnan(s)]
if valid_fold_rmsle_scores:
    mean_cv_rmsle = np.mean(valid_fold_rmsle_scores)
    std_cv_rmsle = np.std(valid_fold_rmsle_scores)
    txt1 = f"Average RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {mean_cv_rmsle:.4f}"
    txt2 = f"Standard Deviation of RMSLE across {len(valid_fold_rmsle_scores)} valid folds: {std_cv_rmsle:.4f}"
    print("--- Cross-Validation Summary (RMSLE) ---")
    print(txt1)
    print(txt2)
    kfold_summary.append("--- Cross-Validation Summary (RMSLE) ---")
    kfold_summary.append(txt1)
    kfold_summary.append(txt2)
else:
    print("--- Cross-Validation Summary (RMSLE) ---")
    print("RMSLE calculation failed for all folds.")

# Single RMSLE over all out-of-fold predictions, computed only when the inputs
# are non-negative and at least one fold produced a score.
if y_train.min() >= 0 and oof_predictions.min() >= 0 and valid_fold_rmsle_scores:
    try:
        overall_oof_rmsle = np.sqrt(mean_squared_log_error(y_train, oof_predictions))
        print(f"Overall OOF RMSLE (from concatenated fold predictions): {overall_oof_rmsle:.4f}")

        txt3 = f"Overall OOF RMSLE (from concatenated fold predictions): {overall_oof_rmsle:.4f}"
        kfold_summary.append(txt3)
    except ValueError as e:
        print(f"Error calculating Overall OOF RMSLE: {e}")
        print(f"Min y_train: {y_train.min()}, Min oof_predictions: {oof_predictions.min()}")
else:
    if y_train.min() < 0:
        print("Cannot calculate Overall OOF RMSLE: y_train contains negative values.")
    if oof_predictions.min() < 0:  # not expected after the sign flip in the loop
        print("Cannot calculate Overall OOF RMSLE: oof_predictions contain negative values.")
    if not valid_fold_rmsle_scores:
        print("Cannot calculate Overall OOF RMSLE: No valid fold scores were obtained.")

# --- END OF KFold CV CODE BLOCK (using RMSLE) ---

## KFold CV Summary

In [None]:
# Replay every summary line collected by the cross-validation cells, in insertion order.
for summary_line in kfold_summary:
    print(summary_line)

## Initialize model

In [None]:
%%time
# Fresh, untrained estimator for the final fit on the full training set.
model = LinearRegression()

## Train

In [None]:
%%time
# Fit on all training rows; the %%time magic reports how long the cell took.
print("Training Linear Regression model...")
model.fit(X_train, y_train)
print("Training complete.")

## Predict

In [None]:
%%time
# Predict 'Calories' for every test row with the fitted model.
predictions = model.predict(X_test)

## Handle negative predictions

In [None]:
# Flip the sign of every negative prediction (the same rule the cross-validation
# loops apply), using a boolean mask instead of a per-element Python loop.
negative_mask = predictions < 0
predictions[negative_mask] = -predictions[negative_mask]

## Save CSV

In [None]:
# Two-column submission frame: the saved test ids, then the predictions as 'Calories'.
submission_df = pd.DataFrame({'id': test_ids}).assign(Calories=predictions)

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
submission_df.to_csv('linear_regression_submission.csv', index=False)
print("\nSubmission file 'linear_regression_submission.csv' created successfully.")

## Model Coefficients

In [None]:
print("\nModel Coefficients:")
# Pair each feature name with its fitted weight. Both follow the column order of `features`.
for name, weight in zip(features, model.coef_):
    print(f"{name}: {weight:.6f}")

In [None]:
import time

# Scratch check of the timing pattern: measure how long one print() call takes, ten times over.
for i in range(10):
    start_time = time.time()
    print(i)
    end_time = time.time()
    elapsed_ms = (end_time - start_time) * 1000
    print(f"Time taken: {elapsed_ms:.3f} ms")

# 2. Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Settings for this section (test_size is assigned but not used in this cell).
test_size, random_state = 0.2, 42

# Seeded tree with otherwise default hyper-parameters.
model_dt = DecisionTreeRegressor(random_state=random_state)

# Fit on the full training set.
print("Training Decision Tree Regressor model...")
model_dt.fit(X_train, y_train)
print("Training complete.")

# Predict the test rows.
print("Making predictions with Decision Tree Regressor...")
predictions_dt = model_dt.predict(X_test)
print("Predictions made.")

# Show the prediction array (numpy abbreviates long arrays when printing).
print("\nTest Data Predictions:")
print(predictions_dt)

## Save submission.csv

In [None]:
# Flip the sign of any negative prediction before writing the file.
negative_mask = predictions_dt < 0
predictions_dt[negative_mask] = -predictions_dt[negative_mask]

# Submission frame: saved test ids plus the adjusted predictions.
submission_df = pd.DataFrame({'id': test_ids}).assign(Calories=predictions_dt)

submission_df.to_csv('Decision_Tree_Regressor.csv', index=False)
print("\nSubmission file 'Decision_Tree_Regressor.csv' created successfully.")

# 3. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest size and seed.
n_estimators, random_state = 100, 42

# n_jobs=-1 lets scikit-learn use every available CPU core.
model_rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)

# Fit on the full training set and report the elapsed wall-clock time.
start_time = time.time()
print("Training Random Forest Regressor model...")
model_rf.fit(X_train, y_train)
print("Training complete.")
end_time = time.time()
fit_ms = (end_time - start_time) * 1000
print(f"Training Random Forest Regressor model.\tTime taken: {fit_ms:.3f} ms")

# Predict the test rows and report the elapsed wall-clock time.
start_time = time.time()
print("Making predictions with Random Forest Regressor...")
predictions_rf = model_rf.predict(X_test)
print("Predictions made.")
end_time = time.time()
predict_ms = (end_time - start_time) * 1000
print(f"Making predictions with Random Forest Regressor.\tTime taken: {predict_ms:.3f} ms")

# Show the prediction array (numpy abbreviates long arrays when printing).
print("\nSample Test Data Predictions:")
print(predictions_rf)

## Save submission.csv

In [None]:
# Flip the sign of any negative prediction before writing the file.
negative_mask = predictions_rf < 0
predictions_rf[negative_mask] = -predictions_rf[negative_mask]

# Submission frame: saved test ids plus the adjusted predictions.
submission_df = pd.DataFrame({'id': test_ids}).assign(Calories=predictions_rf)

submission_df.to_csv('Random_Forest_Regressor.csv', index=False)
print("\nSubmission file 'Random_Forest_Regressor.csv' created successfully.")

In [None]:
# Importance scores of the fitted forest, one per input column.
# Presumably in the column order of X_train (the `features` list); confirm before labelling them.
print(model_rf.feature_importances_)

# 4. Gradient Boosting Regressor (from scikit-learn)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyperparameters for scikit-learn's gradient boosting, kept as notebook-level names.
n_estimators = 100
learning_rate = 0.1
max_depth = 3
random_state = 42

model_gbr = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
)

# Fit on the training data.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, unlike time.time(), which follows the adjustable wall clock.
print("Training Gradient Boosting Regressor model...")
start_time = time.perf_counter()
model_gbr.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Training complete in {end_time - start_time:.4f} seconds.")

# Predict on the test data, timed the same way.
print("Making predictions with Gradient Boosting Regressor...")
start_time = time.perf_counter()
predictions_gbr = model_gbr.predict(X_test)
end_time = time.perf_counter()
print(f"Predictions made in {end_time - start_time:.4f} seconds.")

# Show the predictions (NumPy abbreviates long arrays when printing).
print("\nSample Test Data Predictions (Gradient Boosting Regressor):")
print(predictions_gbr)

## Save Gradient Boosting submission (Gradient_Boosting_Regressor.csv)

In [None]:
# Flip the sign of any negative prediction so every submitted value is non-negative.
# predictions_gbr is the array returned by model_gbr.predict, so a boolean mask performs
# the same in-place update as an element-by-element Python loop, in one vectorized step.
# NOTE(review): negatives are mirrored rather than clipped to 0 - confirm this is intended.
negative_mask = predictions_gbr < 0
predictions_gbr[negative_mask] = -predictions_gbr[negative_mask]

# Pair each test id with its prediction and write the submission file.
submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_gbr})

submission_df.to_csv('Gradient_Boosting_Regressor.csv', index=False)
print("\nSubmission file 'Gradient_Boosting_Regressor.csv' created successfully.")

# 4. XGBoost Regressor

In [None]:
import xgboost as xgb

# XGBoost regressor configuration.
# Key parameters for potential optimization:
#   n_estimators:  number of boosting rounds.
#   learning_rate: step size shrinkage used in each update to prevent overfitting.
#   max_depth:     maximum depth of a tree.
#   n_jobs:        number of parallel threads; -1 uses all available cores.
#   tree_method:   tree construction algorithm ('auto', 'exact', 'approx', 'hist');
#                  'hist' is often faster for large datasets (not set here, so the
#                  library default applies).
n_estimators = 100
learning_rate = 0.1
max_depth = 3
random_state = 42
n_jobs = -1
model_xgb = xgb.XGBRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    random_state=random_state,
    n_jobs=n_jobs,
)

# Fit on the training data.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, unlike time.time(), which follows the adjustable wall clock.
print("Training XGBoost Regressor model...")
start_time = time.perf_counter()
model_xgb.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Training complete in {end_time - start_time:.4f} seconds.")

# Predict on the test data, timed the same way.
print("Making predictions with XGBoost Regressor...")
start_time = time.perf_counter()
predictions_xgb = model_xgb.predict(X_test)
end_time = time.perf_counter()
print(f"Predictions made in {end_time - start_time:.4f} seconds.")

# Show the predictions (NumPy abbreviates long arrays when printing).
print("\nSample Test Data Predictions (XGBoost Regressor):")
print(predictions_xgb)

## Save XGBoost submission (xgboost.csv)

In [None]:
# Flip the sign of any negative prediction so every submitted value is non-negative.
# predictions_xgb is the array returned by model_xgb.predict, so a boolean mask performs
# the same in-place update as an element-by-element Python loop, in one vectorized step.
# NOTE(review): negatives are mirrored rather than clipped to 0 - confirm this is intended.
negative_mask = predictions_xgb < 0
predictions_xgb[negative_mask] = -predictions_xgb[negative_mask]

# Pair each test id with its prediction and write the submission file.
submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_xgb})

submission_df.to_csv('xgboost.csv', index=False)
print("\nSubmission file 'xgboost.csv' created successfully.")

# 5. LightGBM Regressor

In [None]:
import lightgbm as lgb

# LightGBM regressor configuration.
# LightGBM is often faster than XGBoost and scikit-learn's GBR, especially on large datasets.
# Key parameters for potential optimization:
#   n_estimators:  number of boosting rounds.
#   learning_rate: step size shrinkage.
#   num_leaves:    maximum number of leaves in one tree (main complexity parameter).
#   n_jobs:        number of parallel threads; -1 uses all available cores.
# max_depth is intentionally not passed here; tree complexity is set via num_leaves.
n_estimators = 100
learning_rate = 0.1
num_leaves = 31
random_state = 42
n_jobs = -1

model_lgb = lgb.LGBMRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    random_state=random_state,
    n_jobs=n_jobs,
)

# Fit on the training data.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, unlike time.time(), which follows the adjustable wall clock.
print("Training LightGBM Regressor model...")
start_time = time.perf_counter()
model_lgb.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Training complete in {end_time - start_time:.4f} seconds.")

# Predict on the test data, timed the same way.
print("Making predictions with LightGBM Regressor...")
start_time = time.perf_counter()
predictions_lgb = model_lgb.predict(X_test)
end_time = time.perf_counter()
print(f"Predictions made in {end_time - start_time:.4f} seconds.")

# Show the predictions (NumPy abbreviates long arrays when printing).
print("\nSample Test Data Predictions (LightGBM Regressor):")
print(predictions_lgb)

## Save LightGBM submission (LightGBM.csv)

In [None]:
# Flip the sign of any negative prediction so every submitted value is non-negative.
# predictions_lgb is the array returned by model_lgb.predict, so a boolean mask performs
# the same in-place update as an element-by-element Python loop, in one vectorized step.
# NOTE(review): negatives are mirrored rather than clipped to 0 - confirm this is intended.
negative_mask = predictions_lgb < 0
predictions_lgb[negative_mask] = -predictions_lgb[negative_mask]

# Pair each test id with its prediction and write the submission file.
submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_lgb})

submission_df.to_csv('LightGBM.csv', index=False)
print("\nSubmission file 'LightGBM.csv' created successfully.")

# 6. CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor configuration.
# CatBoost is known for handling categorical features well and often provides good out-of-the-box results.
# Key parameters for potential optimization:
#   iterations:    number of boosting iterations (trees).
#   learning_rate: step size shrinkage.
#   depth:         depth of the trees.
#   l2_leaf_reg:   L2 regularization term on weights (not set here, so the library default applies).
#   verbose:       amount of output during training (0 for less output).
#   thread_count:  number of parallel threads; -1 uses all available cores.
iterations = 100
learning_rate = 0.1
depth = 3
random_state = 42
verbose = 0
thread_count = -1

model_cat = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=depth,
    random_state=random_state,
    verbose=verbose,  # 0 keeps the training log quiet
    thread_count=thread_count,
)

# Fit on the training data.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, unlike time.time(), which follows the adjustable wall clock.
print("Training CatBoost Regressor model...")
start_time = time.perf_counter()
model_cat.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Training complete in {end_time - start_time:.4f} seconds.")

# Predict on the test data, timed the same way.
print("Making predictions with CatBoost Regressor...")
start_time = time.perf_counter()
predictions_cat = model_cat.predict(X_test)
end_time = time.perf_counter()
print(f"Predictions made in {end_time - start_time:.4f} seconds.")

# Show the predictions (NumPy abbreviates long arrays when printing).
print("\nTest Data Predictions (CatBoost Regressor):")
print(predictions_cat)

## Save CatBoost submission (CatBoostRegressor.csv)

In [None]:
# Flip the sign of any negative prediction so every submitted value is non-negative.
# predictions_cat is the array returned by model_cat.predict, so a boolean mask performs
# the same in-place update as an element-by-element Python loop, in one vectorized step.
# NOTE(review): negatives are mirrored rather than clipped to 0 - confirm this is intended.
negative_mask = predictions_cat < 0
predictions_cat[negative_mask] = -predictions_cat[negative_mask]

# Pair each test id with its prediction and write the submission file.
submission_df = pd.DataFrame({'id': test_ids, 'Calories': predictions_cat})

submission_df.to_csv('CatBoostRegressor.csv', index=False)
print("\nSubmission file 'CatBoostRegressor.csv' created successfully.")