# Demand Forecasting with LightGBM & Optuna

This notebook trains a LightGBM model to predict demand, optimized using Optuna.

In [2]:
!pip install optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import pickle

%matplotlib inline

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [7]:
# Load Data
df = pd.read_csv('./Dynamic-Pricing/data/sales_data.csv')

# Feature Engineering
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.weekday

# Encode Categoricals
cat_cols = ['Store ID', 'Product ID', 'Category', 'Region', 'Weather Condition', 'Seasonality', 'Promotion']
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# Define Features and Target
# We include Inventory Level as per RPD
features = ['Store ID', 'Product ID', 'Category', 'Region', 'Inventory Level',
            'Price', 'Discount', 'Weather Condition', 'Promotion',
            'Competitor Pricing', 'Seasonality', 'Epidemic', 'Month', 'Day', 'Weekday']
target = 'Demand'

X = df[features]
y = df[target]

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Optuna Hyperparameter Tuning

In [9]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),  # Reduced upper bound
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 7),      # Reduced upper bound
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100), # Increased lower bound
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 1e-8, 0.1, log=True),
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]) # Early stopping added
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

# Run Optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30) # Increased trials for better search with more params

print('Best trial:', study.best_trial.params)

[I 2025-11-27 06:57:38,895] A new study created in memory with name: no-name-1d0690bb-1156-46ca-acf2-23677cf9ca9d
[I 2025-11-27 06:57:48,070] Trial 0 finished with value: 10.522913102435256 and parameters: {'num_leaves': 53, 'learning_rate': 0.1159936482320049, 'n_estimators': 617, 'max_depth': 7, 'min_child_samples': 69, 'subsample': 0.7005390311587113, 'colsample_bytree': 0.7696175298615422, 'lambda_l1': 5.190352095688252e-05, 'lambda_l2': 7.05857565018527e-08, 'min_gain_to_split': 6.195526215332507e-05}. Best is trial 0 with value: 10.522913102435256.
[I 2025-11-27 06:57:53,502] Trial 1 finished with value: 14.549120244729606 and parameters: {'num_leaves': 33, 'learning_rate': 0.22412995796839252, 'n_estimators': 730, 'max_depth': 4, 'min_child_samples': 20, 'subsample': 0.95465708538808, 'colsample_bytree': 0.5693318210599521, 'lambda_l1': 1.929419130595072e-07, 'lambda_l2': 5.743500777748627e-07, 'min_gain_to_split': 0.060532490583730815}. Best is trial 0 with value: 10.5229131024

Best trial: {'num_leaves': 62, 'learning_rate': 0.19262964685474476, 'n_estimators': 991, 'max_depth': 7, 'min_child_samples': 28, 'subsample': 0.7989217603734662, 'colsample_bytree': 0.7845896519997099, 'lambda_l1': 0.002042592105482397, 'lambda_l2': 0.00039961411906077417, 'min_gain_to_split': 0.0010962485070004965}


In [10]:
# def objective(trial):
#     params = {
#         'objective': 'regression',
#         'metric': 'rmse',
#         'verbosity': -1,
#         'boosting_type': 'gbdt',
#         'num_leaves': trial.suggest_int('num_leaves', 20, 150),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 5, 15),
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#     }

#     model = lgb.LGBMRegressor(**params)
#     model.fit(X_train, y_train)
#     preds = model.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, preds))
#     return rmse

# # Run Optimization
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20) # 20 trials for speed, increase for better results

# print('Best trial:', study.best_trial.params)

## Train Best Model

In [12]:
best_params = study.best_trial.params
best_params['objective'] = 'regression'
best_params['metric'] = 'rmse'

final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
y_pred_train = final_model.predict(X_train)

print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test R2:", r2_score(y_test, y_pred))

print("\nTrain RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))
print("Train R2:", r2_score(y_train, y_pred_train))

Test RMSE: 7.243309139613637
Test MAE: 4.772434998443257
Test R2: 0.976243987765607

Train RMSE: 3.7729675441230794
Train R2: 0.9935439437501788


In [11]:
best_params = study.best_trial.params
best_params['objective'] = 'regression'
best_params['metric'] = 'rmse'

final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)

print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

RMSE: 7.243309139613637
MAE: 4.772434998443257
R2: 0.976243987765607


## Save Model and Encoders

In [None]:
with open('demand_model_lgbm.pkl', 'wb') as f:
    pickle.dump(final_model, f)

with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(le_dict, f)

print("Model and encoders saved.")

Model and encoders saved.


In [4]:
!git clone https://github.com/PraveenDevamane/Dynamic-Pricing.git

Cloning into 'Dynamic-Pricing'...
remote: Enumerating objects: 103, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 103 (delta 19), reused 86 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (103/103), 9.80 MiB | 18.27 MiB/s, done.
Resolving deltas: 100% (19/19), done.
