In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib

In [None]:
# Read the competition's train and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e5/train.csv",
        "/kaggle/input/playground-series-s5e5/test.csv",
    )
)

In [None]:
# Structural overview of the training frame: size, schema, summary
# statistics and a sample of rows.
print(f"Dataset Shape: {train_df.shape}")

print("\nData Info:")
train_df.info()

# Each heading is printed immediately before the table it describes.
overview_tables = (
    ("\nNumerical Features Summary:", train_df.describe()),
    ("\nFirst 10 Rows of the Dataset:", train_df.head(10)),
)
for heading, table in overview_tables:
    print(heading)
    display(table)

## **Dataset Overview:**

* Shape: 750,000 rows, 9 columns.
* Features:
  * id: Unique identifier (int64, likely not useful for modeling).
  * Sex: Categorical (object, male/female).
  * Age: Numerical (int64, range: 20-79).
  * Height: Numerical (float64, range: 126-222 cm).
  * Weight: Numerical (float64, range: 36-132 kg).
  * Duration: Numerical (float64, range: 1-30 minutes, likely exercise duration).
  * Heart_Rate: Numerical (float64, range: 67-128 bpm).
  * Body_Temp: Numerical (float64, range: 37.1-41.5°C).
* Target: Calories (float64, range: 1-314 kcal, mean: 88.28).
* Data Types: 6 float64, 2 int64, 1 object (Sex).
* Missing Values: None (all columns have 750,000 non-null entries).

## Let's get some plots:

In [None]:
# Global look-and-feel for every matplotlib/seaborn figure below.
sns.set_style("whitegrid")
plt.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})
# Continuous coolwarm colormap object.
palette = sns.color_palette("coolwarm", as_cmap=True)

In [None]:
# The row identifier carries no signal for EDA, so remove it from the
# training frame (test_df keeps its 'id' column at this point).
train_df = train_df.drop(columns=['id'])

In [None]:
# Column groups used by the EDA plots below.
cat_cols = ['Sex']
num_cols = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
    'Calories',
]

In [None]:
# 1. Distribution Histograms for Numerical Features
# One panel per numerical column on a 3x3 grid; only as many panels as there
# are columns get created.
hist_fig = plt.figure(figsize=(15, 10))
for i, col in enumerate(num_cols, start=1):
    hist_ax = hist_fig.add_subplot(3, 3, i)
    sns.histplot(train_df[col], kde=True, color='dodgerblue', bins=30, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}', fontweight='bold')
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Count')
hist_fig.tight_layout()
hist_fig.savefig('numerical_distributions.png')
plt.show()

In [None]:
# 2. Count Plot for Sex
# Bar chart of how many rows fall in each Sex category.
sex_fig, sex_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Sex', data=train_df, palette='coolwarm', ax=sex_ax)
sex_ax.set_title('Distribution of Sex', fontweight='bold')
sex_ax.set_xlabel('Sex')
sex_ax.set_ylabel('Count')
sex_fig.savefig('sex_distribution.png')
plt.show()

In [None]:
# 3. Correlation Heatmap
# Pairwise correlations between the numerical columns, annotated to two
# decimals.
corr_matrix = train_df[num_cols].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    square=True,
    cbar_kws={'label': 'Correlation'},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap of Numerical Features', fontweight='bold')
heatmap_fig.savefig('correlation_heatmap.png')
plt.show()

In [None]:
# 4. Box Plots for Outlier Detection
# Three side-by-side box plots, one per column of interest.
box_fig, box_axes = plt.subplots(1, 3, figsize=(15, 5))
for box_ax, col in zip(box_axes, ['Calories', 'Heart_Rate', 'Duration']):
    sns.boxplot(y=train_df[col], color='lightcoral', ax=box_ax)
    box_ax.set_title(f'Box Plot of {col}', fontweight='bold')
    box_ax.set_ylabel(col)
box_fig.tight_layout()
box_fig.savefig('box_plots.png')
plt.show()

In [None]:
# 5. Scatter Plots (Calories vs. Key Features)
# Left panel: Calories against Duration, marker size driven by Heart_Rate.
# Right panel: Calories against Heart_Rate, marker size driven by Duration.
scatter_fig, (duration_ax, heart_rate_ax) = plt.subplots(1, 2, figsize=(12, 4))

sns.scatterplot(
    x='Duration', y='Calories', hue='Sex', size='Heart_Rate',
    data=train_df, palette='coolwarm', alpha=0.6, ax=duration_ax,
)
duration_ax.set_title('Calories vs. Duration', fontweight='bold')

sns.scatterplot(
    x='Heart_Rate', y='Calories', hue='Sex', size='Duration',
    data=train_df, palette='coolwarm', alpha=0.6, ax=heart_rate_ax,
)
heart_rate_ax.set_title('Calories vs. Heart Rate', fontweight='bold')

scatter_fig.tight_layout()
scatter_fig.savefig('scatter_plots.png')
plt.show()

In [None]:
# 6. Pair Plot (Subset of Features)
# Pairwise scatter panels with KDE curves on the diagonal for the target and
# three predictors.
subset_cols = ['Calories', 'Duration', 'Heart_Rate', 'Weight']
pair_frame = train_df[subset_cols]
point_style = {'alpha': 0.5, 'color': 'dodgerblue'}
sns.pairplot(pair_frame, diag_kind='kde', plot_kws=point_style)
# y > 1 places the super-title above the top of the figure area.
plt.suptitle('Pair Plot of Key Features', fontweight='bold', y=1.02)
plt.savefig('pair_plot.png')
plt.show()

In [None]:
import plotly.express as px

# 7. Interactive Plotly Scatter Plot
# Columns shown in the hover box, in the order the hover template indexes
# them (customdata[0] .. customdata[3]).
hover_cols = ['Sex', 'Age', 'Weight', 'Body_Temp']

fig = px.scatter(
    train_df,
    x='Duration',
    y='Calories',
    color='Sex',
    size='Heart_Rate',
    # color='Sex' splits the figure into one trace per category. Passing the
    # hover columns through custom_data lets plotly express slice them per
    # trace, so every point carries the values of its own row. Assigning the
    # whole frame's array to every trace afterwards would pair point i of a
    # trace with row i of the unsplit frame.
    custom_data=hover_cols,
    title='Calories vs. Duration by Sex and Heart Rate',
    color_discrete_map={'male': '#1f77b4', 'female': '#ff7f0e'},  # Professional colors (blue, orange)
    opacity=0.6,
    size_max=15  # Control max bubble size for clarity
)

# Customizing layout for professional look
fig.update_layout(
    title_font=dict(size=18, family='Arial', weight='bold'),
    xaxis_title='Duration (minutes)',
    yaxis_title='Calories (kcal)',
    font=dict(family='Arial', size=12),
    showlegend=True,
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    legend_title='Sex'
)

# Adding hover template for better readability. Only the template is set
# here; the per-trace customdata built by px.scatter is left untouched.
fig.update_traces(
    hovertemplate=(
        '<b>Duration</b>: %{x} min<br>' +
        '<b>Calories</b>: %{y} kcal<br>' +
        '<b>Sex</b>: %{customdata[0]}<br>' +
        '<b>Age</b>: %{customdata[1]}<br>' +
        '<b>Weight</b>: %{customdata[2]} kg<br>' +
        '<b>Body Temp</b>: %{customdata[3]} °C<br>' +
        '<b>Heart Rate</b>: %{marker.size} bpm'
    )
)

# Saving it as HTML and display
fig.write_html('interactive_scatter.html')
fig.show()

print("Interactive scatter plot saved as 'interactive_scatter.html'")

In [None]:
# Print summary insights
# Fixed takeaway notes, one per line, under a heading.
print("\nEDA Insights:")
eda_insights = (
    "- Numerical features: Most distributions are slightly skewed (e.g., Calories, Duration).",
    "- Sex distribution: Check balance; may influence calorie burn.",
    "- Correlations: Duration, Heart_Rate likely strong predictors of Calories.",
    "- Outliers: Calories, Heart_Rate show potential outliers; consider capping.",
    "- Relationships: Scatter plots suggest linear trends between Calories and Duration/Heart_Rate.",
)
for insight in eda_insights:
    print(insight)

## Let's get to modelling:

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Verify input columns
required_cols = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
for df, name in [(train_df, 'train'), (test_df, 'test')]:
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in {name}_df: {missing}")
print("Input columns verified.")

# Encode Sex (male -> 0, female -> 1) identically in both frames.
# Series.map turns any value outside the mapping into NaN, which also happens
# if this cell is run a second time on already-encoded data. Fail with a clear
# message instead of letting NaNs flow into the features.
sex_codes = {'male': 0, 'female': 1}
for df, name in [(train_df, 'train'), (test_df, 'test')]:
    encoded_sex = df['Sex'].map(sex_codes)
    if encoded_sex.isna().any():
        unmapped = df.loc[encoded_sex.isna(), 'Sex'].unique().tolist()
        raise ValueError(f"Unexpected Sex values in {name}_df: {unmapped}")
    df['Sex'] = encoded_sex.astype('int64')

# Handling Outliers based on realistic figures
# Feature bounds are applied to train AND test so the model is scored on
# inputs preprocessed the same way it was trained on.
feature_bounds = {
    'Heart_Rate': (50, 200),
    'Body_Temp': (36, 42),
    'Duration': (0, 60),
}
for df in (train_df, test_df):
    for col, (lower, upper) in feature_bounds.items():
        df[col] = df[col].clip(lower, upper)

# The target exists only in the training frame.
train_df['Calories'] = train_df['Calories'].clip(0, 500)

### Feature Engineering

In [None]:
# Feature Engineering
# Adds the same derived columns to both frames, in place.
for df in [train_df, test_df]:
    # Sex is expected to be numerically encoded already (male=0, female=1).
    is_male = df['Sex'] == 0

    # Physical Features
    # Height is divided by 100 before squaring; the lower clip guards against
    # a zero denominator.
    df['BMI'] = df['Weight'] / (df['Height'].clip(lower=1) / 100) ** 2
    # Sex-specific offset: +5 for male rows, -161 for female rows.
    df['BMR'] = np.where(
        is_male,
        10 * df['Weight'] + 6.25 * df['Height'] - 5 * df['Age'] + 5,
        10 * df['Weight'] + 6.25 * df['Height'] - 5 * df['Age'] - 161  # Female
    )
    # Exercise Intensity
    df['Intensity'] = df['Duration'] * df['Heart_Rate']
    df['METs'] = df['Heart_Rate'] / 10
    # Health Features
    df['Temp_Anomaly'] = df['Body_Temp'] - 37
    df['Age_Group'] = pd.cut(
        df['Age'],
        bins=[20, 30, 40, 50, 60, 80],
        labels=['20-30', '31-40', '41-50', '51-60', '61-80'],
        include_lowest=True
    )
    # Physics-Based Feature (Keytel Equation)
    # The Keytel et al. (2005) regression has separate coefficient sets for
    # men and women; using the male set for every row misstates the estimate
    # for female rows. Each branch is an energy rate that is then multiplied
    # by Duration and divided by 4.184.
    keytel_male = (
        0.6309 * df['Heart_Rate'] + 0.1988 * df['Weight'] + 0.2017 * df['Age'] - 55.0969
    )
    keytel_female = (
        0.4472 * df['Heart_Rate'] - 0.1263 * df['Weight'] + 0.074 * df['Age'] - 20.4022
    )
    df['Keytel_Estimate'] = (
        np.where(is_male, keytel_male, keytel_female) * df['Duration'] / 4.184
    )
    # New Interaction Features
    df['HR_Duration'] = df['Heart_Rate'] * df['Duration']
    df['Weight_BMI'] = df['Weight'] * df['BMI']
    df['METs_Duration'] = df['METs'] * df['Duration']

In [None]:
# Verify engineered columns
# Abort early if any derived column failed to materialise in either frame.
engineered_cols = [
    'BMI', 'BMR', 'Intensity', 'METs',
    'Temp_Anomaly', 'HR_Duration', 'Weight_BMI', 'METs_Duration',
]
frames_by_name = {'train': train_df, 'test': test_df}
for name, df in frames_by_name.items():
    missing = [col for col in engineered_cols if col not in df.columns]
    if missing:
        raise ValueError(f"Engineered columns missing in {name}_df: {missing}")

print("Train columns after feature engineering:", list(train_df.columns))
print("Test columns after feature engineering:", list(test_df.columns))

In [None]:
# Encode Age_Group (one-hot encoding)
# The same call is applied to each frame; drop_first omits the first
# category's indicator column.
train_df, test_df = (
    pd.get_dummies(frame, columns=['Age_Group'], drop_first=True)
    for frame in (train_df, test_df)
)

In [None]:
# Make test_df carry exactly the training feature columns, in training order.
expected_cols = train_df.columns.drop('Calories')
missing_cols = [col for col in expected_cols if col not in test_df.columns]
# Any column present only in train is added to test as a constant 0
# (nothing happens when the list is empty).
for absent_col in missing_cols:
    test_df[absent_col] = 0
# Selecting by expected_cols also discards test-only columns.
test_df = test_df.loc[:, expected_cols]

In [None]:
# Verify column alignment
# Names AND order must match between the two frames.
train_feature_names = expected_cols.tolist()
test_feature_names = test_df.columns.tolist()
print("\nTrain columns (excl. Calories):", train_feature_names)
print("Test columns:", test_feature_names)
if test_feature_names != train_feature_names:
    raise ValueError("Test columns do not match train columns.")

In [None]:
# Scale Numerical Features
# Statistics are learned from the training frame only and then applied to
# both frames.
scaler = StandardScaler()
num_cols = [
    'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
    'BMI', 'BMR', 'Intensity', 'METs', 'Temp_Anomaly', 'HR_Duration',
    'Weight_BMI', 'METs_Duration'
]

# Every column to be scaled must exist in both frames before fitting.
for name, df in (('train', train_df), ('test', test_df)):
    missing = [col for col in num_cols if col not in df.columns]
    if missing:
        raise ValueError(f"Scaling columns missing in {name}_df: {missing}")

scaler.fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [None]:
# Polynomial Features for Age and Heart_Rate
# Degree-2 expansion without the bias column; the generated columns replace
# the two source columns and are prefixed with 'poly_'.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_cols = ['Age', 'Heart_Rate']

train_poly_values = poly.fit_transform(train_df[poly_cols])
test_poly_values = poly.transform(test_df[poly_cols])
poly_names = [f'poly_{c}' for c in poly.get_feature_names_out(poly_cols)]

# pd.concat(axis=1) aligns rows by index label. Reusing each source frame's
# index keeps the new columns on the rows they were computed from even when
# that index is not a default 0..n-1 range. A fresh RangeIndex would
# otherwise misalign the rows and pad them with NaN.
train_poly = pd.DataFrame(train_poly_values, columns=poly_names, index=train_df.index)
test_poly = pd.DataFrame(test_poly_values, columns=poly_names, index=test_df.index)

train_df = pd.concat([train_df.drop(poly_cols, axis=1), train_poly], axis=1)
test_df = pd.concat([test_df.drop(poly_cols, axis=1), test_poly], axis=1)

In [None]:
# Ensure all columns are numerical
# Every column except the target is cast to float64, one column at a time.
for frame in (train_df, test_df):
    for feature in frame.columns.drop('Calories', errors='ignore'):
        frame[feature] = frame[feature].astype('float64')

# Verify dtypes
print("\nTrain dtypes:\n", train_df.dtypes)
print("\nTest dtypes:\n", test_df.dtypes)

In [None]:
# Verify no missing values
# Per-column null counts for each frame.
train_null_counts = train_df.isna().sum()
test_null_counts = test_df.isna().sum()
print("\nTrain Missing Values:\n", train_null_counts)
print("\nTest Missing Values:\n", test_null_counts)

# Display shapes
print(f"\nTrain Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

In [None]:
# Save preprocessed data
# Both frames go to CSV in the working directory (row index omitted); the
# fitted scaler is pickled alongside them.
csv_outputs = (
    (train_df, 'preprocessed_train.csv'),
    (test_df, 'preprocessed_test.csv'),
)
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name, index=False)
joblib.dump(scaler, 'scaler.pkl')

print("Preprocessed data and scaler saved.")

### Let's get the regressor working:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

In [None]:
# Load preprocessed data
# Read back the two CSVs written by the preprocessing stage.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('preprocessed_train.csv', 'preprocessed_test.csv')
)

# Load test IDs
# The preprocessed test frame no longer has 'id', so take it from the raw file.
raw_test_path = "/kaggle/input/playground-series-s5e5/test.csv"
test_ids = pd.read_csv(raw_test_path)['id']

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

In [None]:
# Split features and target
y = train_df['Calories']
X = train_df.drop(columns='Calories')

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# XGBoost with Optuna Tuning
def objective(trial):
    """Score one Optuna trial.

    Samples five XGBoost hyper-parameters from the trial, fits a regressor on
    the module-level X_train / y_train, and returns the mean squared error of
    its predictions on X_val / y_val.
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)

    candidate = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42,
    )
    candidate.fit(X_train, y_train)
    return mean_squared_error(y_val, candidate.predict(X_val))

# Run the hyper-parameter search: 50 trials minimising the objective's return
# value. The sampler is seeded so a fresh Restart & Run All proposes the same
# sequence of trials, matching the fixed seed (42) used by the split and the
# models.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
best_params = study.best_params
print("Best XGBoost Parameters:", best_params)

In [None]:
# Train XGBoost with best parameters
# Fit on the training split only, then score on the validation split.
xgb_model = XGBRegressor(**best_params, random_state=42).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)

xgb_mse, xgb_mae, xgb_r2 = (
    metric(y_val, y_pred_xgb)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"XGBoost - MSE: {xgb_mse:.2f}, MAE: {xgb_mae:.2f}, R²: {xgb_r2:.2f}")

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_validate

# Cross-Validation to Check Overfitting
xgb_model = XGBRegressor(**best_params, random_state=42, n_jobs=-1)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_validate scores every metric from a single fit per fold (5 fits).
# Calling cross_val_score once per metric would refit the model on the same
# folds three times (15 fits).
cv_scores = cross_validate(
    xgb_model,
    X,
    y,
    cv=kf,
    scoring=('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'),
)
# The error scorers are negated by scikit-learn; flip the sign back.
cv_mse = -cv_scores['test_neg_mean_squared_error']
cv_mae = -cv_scores['test_neg_mean_absolute_error']
cv_r2 = cv_scores['test_r2']

print("\n5-Fold Cross-Validation Metrics:")
print(f"MSE: {cv_mse.mean():.2f} (+/- {cv_mse.std() * 2:.2f})")
print(f"MAE: {cv_mae.mean():.2f} (+/- {cv_mae.std() * 2:.2f})")
print(f"R²: {cv_r2.mean():.2f} (+/- {cv_r2.std() * 2:.2f})")

In [None]:
# Train final model on full training data
# X is the complete feature frame that X_train and X_val were split from, so
# after this call xgb_model has been fitted on the validation rows as well.
xgb_model.fit(X, y)

In [None]:
# Validate on hold-out set (for reference)
# xgb_model has been refit on all of X, which contains the X_val rows, so
# scoring it on X_val would report in-sample error. A separate model with the
# same hyper-parameters is fitted on the training split only, so these metrics
# and anything derived from y_pred_val reflect rows it has not seen.
holdout_model = XGBRegressor(**best_params, random_state=42, n_jobs=-1)
holdout_model.fit(X_train, y_train)

y_pred_val = holdout_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred_val)
mae = mean_absolute_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)
print(f"\nValidation Metrics (Hold-Out):")
print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

In [None]:
# Residual Plot to Inspect Errors
# Residual = actual minus predicted, plotted against the actual value; the
# dashed red line marks zero error.
residuals = y_val - y_pred_val
resid_fig, resid_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_val, y=residuals, ax=resid_ax)
resid_ax.axhline(0, color='red', linestyle='--')
resid_ax.set_xlabel('Actual Calories (kcal)')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title('Residual Plot', fontweight='bold')
resid_fig.savefig('residual_plot.png')
plt.show()

In [None]:
# Predict on test set
raw_test_predictions = xgb_model.predict(test_df)

# Clip predictions to reasonable range (based on train data)
# Same 0-500 bounds that were applied to the training target.
test_predictions = np.clip(raw_test_predictions, 0, 500)

In [None]:
# SHAP Feature Importance
# Explain the fitted tree model on the validation features and draw the
# bar-style summary. show=False keeps the figure open so it can be titled and
# saved before being displayed.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_val)
shap.summary_plot(
    shap_values,
    X_val,
    plot_type="bar",
    show=False,
)
plt.title('SHAP Feature Importance', fontweight='bold')
plt.savefig('shap_feature_importance.png')
plt.show()

In [None]:
# Pair each test id with its predicted Calories and write the submission CSV
# (row index omitted).
submission = pd.DataFrame({'id': test_ids}).assign(Calories=test_predictions)
submission.to_csv('submission.csv', index=False)

In [None]:
# Confirm the file was written and show the first five rows as the cell output.
print("\nSubmission file created: 'submission.csv'")
submission.head(5)