In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

In [8]:
df = pd.read_csv("../data/walmart_ml_data.csv")
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,Quarter,Trend,Rolling_mean_5,Rolling_mean_10,Lag_1,Lag_2
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,2010,2,5,1,1,1125378.184,1159822.43,1643690.9,1643690.9
1,10,2010-02-05,2193048.75,0,54.34,2.962,126.442065,9.765,2010,2,5,1,2,1125378.184,1159822.43,1643690.9,1643690.9
2,37,2010-02-05,536006.73,0,45.97,2.572,209.852966,8.554,2010,2,5,1,3,1125378.184,1159822.43,2193048.75,1643690.9
3,17,2010-02-05,789036.02,0,23.11,2.666,126.442065,6.548,2010,2,5,1,4,1125378.184,1159822.43,536006.73,2193048.75
4,30,2010-02-05,465108.52,0,39.05,2.572,210.752605,8.324,2010,2,5,1,5,1125378.184,1159822.43,789036.02,536006.73


In [9]:
X = df.drop(columns=['Date', 'Weekly_Sales'])
y = df['Weekly_Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)

print("Train Test Split Completed!")
print(f"Training Set: {len(X_train)} records")
print(f"Testing Set:  {len(X_test)} records")
print(f"Total Records: {len(df)}")

Train Test Split Completed!
Training Set: 4826 records
Testing Set:  1609 records
Total Records: 6435


In [10]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

def evaluate_log_and_real(y_true_log, y_pred_log):
    # log-space metrics (model-centric)
    mae_log = mean_absolute_error(y_true_log, y_pred_log)
    rmse_log = root_mean_squared_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    # real-space metrics (business-centric)
    y_true_real = np.expm1(y_true_log)
    y_pred_real = np.expm1(y_pred_log)

    mae_real = mean_absolute_error(y_true_real, y_pred_real)
    rmse_real = root_mean_squared_error(y_true_real, y_pred_real)

    return mae_log, rmse_log, mae_real, rmse_real, r2

def evaluate(y_true, y_pred):
    MAE = mean_absolute_error(y_true, y_pred)
    RMSE = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return MAE, RMSE, r2

## Bulk training multiple models

Models trained are
1. Linear Regression
2. Light GBM Regressor
3. XGB Regressor
4. Random Forest Regressor

In [11]:
models = {
    "LinearRegression": LinearRegression(),
    "LightGBM": LGBMRegressor(force_row_wise=True),
    "XGB Regressor": XGBRFRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}

for i in range(len(list(models.values()))):
    model = list(models.values())[i]

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae, train_rmse, train_r2 = evaluate(y_train, y_train_pred)
    test_mae, test_rmse, test_t2 = evaluate(y_test, y_test_pred)

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- R2 Score: {:.4f}".format(train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- R2 Score: {:.4f}".format(test_t2))
    
    print('='*35)
    print('\n')


LinearRegression
Model performance for Training set
- Root Mean Squared Error: 433883.7255
- Mean Absolute Error: 349396.9955
- R2 Score: 0.4273
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 416855.3518
- Mean Absolute Error: 341187.2975
- R2 Score: 0.3960


[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 4826, number of used features: 15
[LightGBM] [Info] Start training from score 1049130.508026
LightGBM
Model performance for Training set
- Root Mean Squared Error: 75688.8635
- Mean Absolute Error: 52552.8273
- R2 Score: 0.9826
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 178286.4563
- Mean Absolute Error: 109903.0954
- R2 Score: 0.8895


XGB Regressor
Model performance for Training set
- Root Mean Squared Error: 262383.7743
- Mean Absolute Error: 181571.2011
- R2 Score: 0.7906
----------------------------------
Model performance for Test set
- Roo

### Inference from bulk training results

Linear Regression performs the worst with R2 score of 0.42 on test set. The evidently shows it is not able to model the non linear relationships in the data.

Light BGM performs the best on both train and test wih R2 score of **0.98** (train) and **0.88** (test). It generalizes well on unseen data with no evidence of major overfitting. This result is before hyperparameter tuning of the model.

## Hyperparameter Tuning of Light GBM

In [15]:
import optuna
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.metrics import mean_squared_error
import joblib

cv = TimeSeriesSplit(n_splits=5)

def objective(trial):
    parameters = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    model =LGBMRegressor(**parameters)

    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1)

    return -scores.mean()

sampler = TPESampler(seed=42)
pruner = MedianPruner(n_warmup_steps=5)
study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=10, timeout=None, show_progress_bar=True)

print("Best value:", study.best_value)
print("Best params:", study.best_params)

best_model = LGBMRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Test RMSE:", rmse)
print("Test R2:", r2)

# save study and model
joblib.dump(study, "../runs/optuna_lgb_study.pkl")
joblib.dump(best_model, "../models/lgb_best_model.pkl")



[I 2026-01-05 16:21:10,158] A new study created in memory with name: no-name-16b0c12a-c9d4-4600-a761-b966c8d2ea5b
Best trial: 0. Best value: 250454:  10%|█         | 1/10 [00:10<01:37, 10.78s/it]

[I 2026-01-05 16:21:20,936] Trial 0 finished with value: 250454.27644742123 and parameters: {'n_estimators': 812, 'num_leaves': 123, 'max_bin': 393}. Best is trial 0 with value: 250454.27644742123.


Best trial: 1. Best value: 239537:  20%|██        | 2/10 [00:18<01:09,  8.70s/it]

[I 2026-01-05 16:21:28,176] Trial 1 finished with value: 239537.26407791692 and parameters: {'n_estimators': 1238, 'num_leaves': 33, 'max_bin': 162}. Best is trial 1 with value: 239537.26407791692.


Best trial: 1. Best value: 239537:  30%|███       | 3/10 [00:23<00:51,  7.37s/it]

[I 2026-01-05 16:21:33,979] Trial 2 finished with value: 257058.0092764317 and parameters: {'n_estimators': 210, 'num_leaves': 113, 'max_bin': 341}. Best is trial 1 with value: 239537.26407791692.


Best trial: 3. Best value: 227357:  40%|████      | 4/10 [00:29<00:39,  6.62s/it]

[I 2026-01-05 16:21:39,435] Trial 3 finished with value: 227357.22964475994 and parameters: {'n_estimators': 1446, 'num_leaves': 18, 'max_bin': 488}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357:  50%|█████     | 5/10 [00:33<00:29,  5.91s/it]

[I 2026-01-05 16:21:44,096] Trial 4 finished with value: 239147.6077863528 and parameters: {'n_estimators': 1682, 'num_leaves': 39, 'max_bin': 172}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357:  60%|██████    | 6/10 [00:35<00:17,  4.44s/it]

[I 2026-01-05 16:21:45,671] Trial 5 finished with value: 259969.60638597785 and parameters: {'n_estimators': 448, 'num_leaves': 50, 'max_bin': 310}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357:  70%|███████   | 7/10 [00:38<00:12,  4.04s/it]

[I 2026-01-05 16:21:48,902] Trial 6 finished with value: 253961.89497752613 and parameters: {'n_estimators': 921, 'num_leaves': 48, 'max_bin': 345}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357:  80%|████████  | 8/10 [00:39<00:06,  3.13s/it]

[I 2026-01-05 16:21:50,084] Trial 7 finished with value: 252520.53366302437 and parameters: {'n_estimators': 365, 'num_leaves': 49, 'max_bin': 246}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357:  90%|█████████ | 9/10 [00:44<00:03,  3.73s/it]

[I 2026-01-05 16:21:55,128] Trial 8 finished with value: 244954.71365207463 and parameters: {'n_estimators': 966, 'num_leaves': 104, 'max_bin': 180}. Best is trial 3 with value: 227357.22964475994.


Best trial: 3. Best value: 227357: 100%|██████████| 10/10 [00:49<00:00,  4.94s/it]


[I 2026-01-05 16:21:59,517] Trial 9 finished with value: 243157.8969265549 and parameters: {'n_estimators': 1077, 'num_leaves': 82, 'max_bin': 118}. Best is trial 3 with value: 227357.22964475994.
Best value: 227357.22964475994
Best params: {'n_estimators': 1446, 'num_leaves': 18, 'max_bin': 488}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4275
[LightGBM] [Info] Number of data points in the train set: 4826, number of used features: 15
[LightGBM] [Info] Start training from score 1049130.508026
Test RMSE: 166302.53733724545
Test R2: 0.9038688294928182


['lgb_best_model.pkl']

This concludes, Linear Regression performs the worst while the Light GBM and Random Forest performs the best