In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Locations of the training and test CSV files
train_path = "D:\\0-projects\\1- calorie prediction\\Data\\train.csv"
test_path = "D:\\0-projects\\1- calorie prediction\\Data\\test.csv"

# Read the training file first, then the test file
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Fit the Sex label mapping on the training column and reuse the same
# fitted encoder for the test column.
le = LabelEncoder()
train_sex_codes = le.fit_transform(train_df['Sex'])
test_sex_codes = le.transform(test_df['Sex'])
train_df['Sex'] = train_sex_codes
test_df['Sex'] = test_sex_codes

# Feature Engineering
def add_features(df):
    """Return a new DataFrame with the engineered columns appended.

    The input frame is left untouched, so re-running the calling cell (or
    calling this on a frame that is still needed in its raw form) cannot
    silently change earlier results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Weight``, ``Height``, ``Duration`` and
        ``Heart_Rate``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns:
        ``BMI`` = Weight / (Height / 100) ** 2 and
        ``Duration_x_HR`` = Duration * Heart_Rate.
    """
    # Height is divided by 100 before squaring (presumably cm -> m;
    # confirm against the data description).
    height_scaled = df['Height'] / 100
    return df.assign(
        BMI=df['Weight'] / height_scaled ** 2,
        Duration_x_HR=df['Duration'] * df['Heart_Rate'],
    )

# Append the engineered columns to both frames
train_df, test_df = add_features(train_df), add_features(test_df)

# Target on the log(1 + x) scale, which avoids log(0)
y = np.log1p(train_df['Calories'])

# Predictors: every column except the row id and the target
X = train_df.drop(columns=['id', 'Calories'])

# XGBoost regressor with fixed hyperparameters and a fixed seed
model = XGBRegressor(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

# Custom scorer for cross_val_score
def rmsle_scorer(y_true, y_pred):
    """Root mean squared log error for targets/predictions on the log1p scale.

    Both arguments are mapped back with ``np.expm1`` before being passed to
    ``mean_squared_log_error`` (the model in this notebook is fitted on
    ``np.log1p(Calories)``).

    Parameters
    ----------
    y_true : array-like
        Targets on the log1p scale.
    y_pred : array-like
        Predictions on the log1p scale.

    Returns
    -------
    float
        Square root of the mean squared log error of the back-transformed
        values.
    """
    actual = np.expm1(y_true)
    # A prediction below 0 on the log1p scale back-transforms to a negative
    # number, which mean_squared_log_error rejects with a ValueError. Floor
    # the predictions at 0 so one stray value cannot invalidate a whole fold.
    # Targets are deliberately left unclipped so bad labels still surface.
    predicted = np.maximum(np.expm1(y_pred), 0)
    return np.sqrt(mean_squared_log_error(actual, predicted))

# Wrap the metric as a scorer; with greater_is_better=False the scores that
# come back are negated, hence the sign flips when reporting below.
rmsle_cv = make_scorer(rmsle_scorer, greater_is_better=False)

# Shuffled 5-fold cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring=rmsle_cv)
print(f"Cross-Validated RMSLE (mean): {-np.mean(cv_scores):.5f}")
print(f"Fold-wise RMSLE: {-cv_scores}")

# Refit on every training row before predicting the test set
model.fit(X, y)

# Test predictors are all columns except the id; predictions are produced on
# the log1p scale and mapped back with expm1.
X_test = test_df.drop(columns=['id'])
test_preds = np.expm1(model.predict(X_test))

# Write the submission file
submission_file = "D:\\0-projects\\1- calorie prediction\\submission_cv_feateng.csv"
submission = pd.DataFrame({'id': test_df['id'], 'Calories': test_preds})
submission.to_csv(submission_file, index=False)
print(f"✅ Submission saved to: {submission_file}")


Cross-Validated RMSLE (mean): 0.06137
Fold-wise RMSLE: [0.06144635 0.06176392 0.06119345 0.06129091 0.06116859]
✅ Submission saved to: D:\0-projects\1- calorie prediction\submission_cv_feateng.csv


In [2]:
pip install lightgbm


Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 3.3 MB/s eta 0:00:01
   ------------------ --------------------- 0.7/1.5 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 11.5 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install optuna


Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/d9/dd/0b593d1a5ee431b33a1fdf4ddb5911c312ed3bb598ef9e17457af2ee7b34/optuna-4.3.0-py3-none-any.whl.metadata
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Obtaining dependency information for alembic>=1.5.0 from https://files.pythonhosted.org/packages/31/59/565286efff3692c5716c212202af61466480f6357c4ae3089d4453bff1f3/alembic-1.16.1-py3-none-any.whl.metadata
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Obtaining dependency information for colorlog from https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl.metadata
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Obtaining dependency information for Mako from https://files.python

In [4]:
# Full pipeline with Optuna tuning, feature engineering, log-transform, and model ensembling

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import optuna

# Locations of the training and test CSV files
train_path = "D:\\0-projects\\1- calorie prediction\\Data\\train.csv"
test_path = "D:\\0-projects\\1- calorie prediction\\Data\\test.csv"

# Read the training file first, then the test file
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Fit the 'Sex' label mapping on the training column and reuse the same
# fitted encoder for the test column.
le = LabelEncoder()
train_sex_codes = le.fit_transform(train_df['Sex'])
test_sex_codes = le.transform(test_df['Sex'])
train_df['Sex'] = train_sex_codes
test_df['Sex'] = test_sex_codes

# Feature engineering
def add_features(df):
    """Return a new DataFrame with the engineered columns appended.

    The input frame is left untouched, so re-running the calling cell (or
    calling this on a frame that is still needed in its raw form) cannot
    silently change earlier results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Weight``, ``Height``, ``Duration`` and
        ``Heart_Rate``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns:
        ``BMI`` = Weight / (Height / 100) ** 2 and
        ``Duration_x_HR`` = Duration * Heart_Rate.
    """
    # Height is divided by 100 before squaring (presumably cm -> m;
    # confirm against the data description).
    height_scaled = df['Height'] / 100
    return df.assign(
        BMI=df['Weight'] / height_scaled ** 2,
        Duration_x_HR=df['Duration'] * df['Heart_Rate'],
    )

# Append the engineered columns to both frames
train_df, test_df = add_features(train_df), add_features(test_df)

# Target on the log1p scale, and the predictors (all columns but id/Calories)
y = np.log1p(train_df['Calories'])
X = train_df.drop(columns=['id', 'Calories'])

# Hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Optuna Tuning for XGBoost ---
def objective(trial):
    """Optuna objective: hold-out RMSLE of one XGBoost configuration.

    Draws five hyperparameters from ``trial``, fits an ``XGBRegressor`` with
    ``random_state=42`` on ``X_train``/``y_train`` and scores it on
    ``X_val``/``y_val`` (all four are read from the enclosing scope).

    Returns
    -------
    float
        RMSLE computed after mapping both the validation targets and the
        predictions back through ``np.expm1``.
    """
    # Suggestions are requested in the same fixed order on every trial.
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    candidate = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42,
    )
    candidate.fit(X_train, y_train)

    val_log_preds = candidate.predict(X_val)
    squared_log_error = mean_squared_log_error(np.expm1(y_val), np.expm1(val_log_preds))
    return np.sqrt(squared_log_error)

# Seed the sampler so the search proposes the same trials on every run.
# With the default unseeded sampler, each Restart & Run All picks a different
# set of "best" parameters and the downstream scores cannot be reproduced.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

# best_params contains only the values suggested through `trial`
best_params = study.best_params
print("Best Parameters from Optuna:", best_params)

# Train best XGBoost
# best_params holds only the hyperparameters Optuna sampled. The objective
# additionally fixed random_state=42, so it is passed again here; without it
# the validated model is not the configuration that was actually tuned.
xgb_model = XGBRegressor(**{**best_params, "random_state": 42})
xgb_model.fit(X_train, y_train)
xgb_preds = np.expm1(xgb_model.predict(X_val))

# Train LightGBM
lgbm_model = LGBMRegressor(random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_preds = np.expm1(lgbm_model.predict(X_val))

# Train RandomForest
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = np.expm1(rf_model.predict(X_val))

# Ensemble: unweighted mean of the three models on the original Calories scale
final_preds = (xgb_preds + lgbm_preds + rf_preds) / 3
# NOTE(review): X_val/y_val were also used to select the XGBoost
# hyperparameters, so this figure is likely optimistic for that model.
final_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val), final_preds))
print(f"Ensemble RMSLE on Validation Set: {final_rmsle:.5f}")

# Refit each of the three models on the complete training set
xgb_model.fit(X, y)
lgbm_model.fit(X, y)
rf_model.fit(X, y)

# Test predictors are all columns except the id
X_test = test_df.drop(columns=['id'])

# Each model predicts on the log1p scale; map back to Calories with expm1
xgb_test = np.expm1(xgb_model.predict(X_test))
lgbm_test = np.expm1(lgbm_model.predict(X_test))
rf_test = np.expm1(rf_model.predict(X_test))

# Unweighted mean of the three back-transformed predictions
final_test_preds = (xgb_test + lgbm_test + rf_test) / 3

# Write the submission file
submission_file = "D:\\0-projects\\1- calorie prediction\\submission_ensemble_optuna.csv"
submission = pd.DataFrame({'id': test_df['id'], 'Calories': final_test_preds})
submission.to_csv(submission_file, index=False)
print(f"✅ Final ensemble submission saved to: {submission_file}")



[I 2025-05-29 16:23:58,438] A new study created in memory with name: no-name-698c9713-d4b2-4fce-9a37-d071843a9f25
[I 2025-05-29 16:24:09,175] Trial 0 finished with value: 0.05994733463569154 and parameters: {'n_estimators': 250, 'max_depth': 10, 'learning_rate': 0.05302647493479643, 'subsample': 0.693744345176625, 'colsample_bytree': 0.8807753968651582}. Best is trial 0 with value: 0.05994733463569154.
[I 2025-05-29 16:24:12,720] Trial 1 finished with value: 0.06187030351448925 and parameters: {'n_estimators': 194, 'max_depth': 4, 'learning_rate': 0.13985337903947376, 'subsample': 0.9393808692275227, 'colsample_bytree': 0.8313515385425629}. Best is trial 0 with value: 0.05994733463569154.
[I 2025-05-29 16:24:18,169] Trial 2 finished with value: 0.0621593637143689 and parameters: {'n_estimators': 263, 'max_depth': 4, 'learning_rate': 0.05105581105185839, 'subsample': 0.7535740267797422, 'colsample_bytree': 0.8626057299689619}. Best is trial 0 with value: 0.05994733463569154.
[I 2025-05-

Best Parameters from Optuna: {'n_estimators': 173, 'max_depth': 9, 'learning_rate': 0.04882426604751229, 'subsample': 0.9288261932315841, 'colsample_bytree': 0.6753133616826141}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 9
[LightGBM] [Info] Start training from score 4.141163
Ensemble RMSLE on Validation Set: 0.05941
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 869
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 9
[LightGBM] [Info] Start