# Setup

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import dagshub

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Initialise the DagsHub integration with MLflow enabled, then select the
# experiment that every run started in this notebook is recorded under.
dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)
mlflow.set_experiment("LightGBM_Training")

# Load the train and test tables, parsing the "Date" column as datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=["Date"])
    for csv_path in ("data/train_merged.csv", "data/test_merged.csv")
)

# Feature Engineering

In [2]:
def create_date_features(df):
    """Return a copy of ``df`` with calendar features derived from ``Date``.

    The input frame is left untouched, so re-running the calling cell (or
    inspecting the raw frame afterwards) never sees half-applied state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 ``Date`` column (the ``.dt`` accessor is
        used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``Year``, ``Month``,
        ``Week`` (ISO week number), ``Day``, ``IsMonthStart``, ``IsMonthEnd``
        (0/1 flags) and ``DayOfWeek`` (Monday=0 ... Sunday=6). Columns that
        already exist under those names are overwritten.
    """
    dates = df["Date"].dt
    # assign() builds a new DataFrame instead of writing into the caller's one.
    return df.assign(
        Year=dates.year,
        Month=dates.month,
        # isocalendar() yields a nullable UInt32 column; cast to a plain int.
        Week=dates.isocalendar().week.astype(int),
        Day=dates.day,
        IsMonthStart=dates.is_month_start.astype(int),
        IsMonthEnd=dates.is_month_end.astype(int),
        DayOfWeek=dates.weekday,
    )

# Derive the calendar columns for both splits with the same helper so the
# train and test frames end up with matching feature names.
train, test = (create_date_features(frame) for frame in (train, test))

# Feature selection

In [3]:
# Separate predictors from the target; errors="ignore" skips any listed
# column that is not present in the training frame.
drop_cols = ['Date', 'Weekly_Sales']
y = train["Weekly_Sales"]
X = train.drop(columns=drop_cols, errors="ignore")

# Absolute correlation of every numeric column with the target, strongest first.
corr = (
    train.corr(numeric_only=True)["Weekly_Sales"]
    .abs()
    .sort_values(ascending=False)
)

# Keep predictors whose absolute correlation with the target exceeds 0.02.
# Non-numeric columns never appear in `corr`, so they are dropped here too.
keep_features = [
    name
    for name, strength in corr.items()
    if name != "Weekly_Sales" and name in X.columns and strength > 0.02
]

# Restrict both matrices to the same columns, in the same order.
X = X[keep_features]
test_X = test[keep_features]


In [7]:
# Write the test feature matrix (same columns as X) to CSV without the index.
# NOTE(review): this cell's execution count (7) is out of sequence with its
# neighbours (3 and 4) -- confirm the notebook survives Restart & Run All.
test_X.to_csv("data/test_lightGBM.csv", index=False)

# Training and logging helper

In [4]:
from lightgbm import LGBMRegressor

def train_and_log_model(model, model_name, params):
    """Fit ``model`` on a hold-out split and record the run in MLflow.

    Reads the notebook-level ``X`` and ``y``, holds out the final 20% of rows
    (in their existing order, no shuffling) for validation, fits on the rest,
    and logs the hyperparameters, validation MAE and fitted model to a new
    MLflow run. The model is wrapped in a single-step sklearn ``Pipeline`` and
    registered as ``LightGBM_Walmart_<model_name>``.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``. It must already
        be configured; ``params`` is not applied to it here.
    model_name : str
        Used as the MLflow run name, the artifact path, and the suffix of the
        registered model name.
    params : dict
        Hyperparameters to record on the run (logging only).

    Returns
    -------
    None
        The validation MAE is logged to MLflow and printed.
    """
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        # One batched call instead of one tracking-server request per key.
        if params:
            mlflow.log_params(params)

        # NOTE(review): shuffle=False only gives a time-based hold-out if the
        # rows of X are already in chronological order -- confirm upstream.
        X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.2)

        model.fit(X_train, y_train)

        preds = model.predict(X_val)
        mae = mean_absolute_error(y_val, preds)
        mlflow.log_metric("MAE", mae)

        # Wrap in a Pipeline so the logged artifact is an sklearn pipeline.
        pipeline = Pipeline(steps=[("model", model)])
        registered_model_name = f"LightGBM_Walmart_{model_name}"
        mlflow.sklearn.log_model(
            pipeline,
            artifact_path=model_name,
            registered_model_name=registered_model_name
        )

        print(f"Model MAE: {mae:.2f}")


# Training 

In [5]:
# Six hyperparameter configurations, each trained and logged as its own
# MLflow run / registered model (LightGBM_V1 ... LightGBM_V6).

# V1: baseline.
params1 = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
)

# V2: deeper trees, lower learning rate, column and row subsampling.
# NOTE(review): LightGBM's docs say row subsampling also needs a non-zero
# subsample_freq / bagging_freq; none is set here, so `subsample` probably
# has no effect -- confirm against the LightGBM parameter reference.
params2 = dict(
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=10,
    num_leaves=60,
    colsample_bytree=0.8,
    subsample=0.8,
)

# V3: adds L1/L2 regularisation and a larger minimum leaf size.
params3 = dict(
    n_estimators=1200,
    learning_rate=0.03,
    max_depth=8,
    num_leaves=50,
    reg_alpha=0.5,
    reg_lambda=1.0,
    min_child_samples=30,
)

# V4: many deep trees with feature/row bagging every 5 iterations.
params4 = dict(
    n_estimators=1800,
    learning_rate=0.025,
    max_depth=12,
    num_leaves=70,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_alpha=0.1,
    reg_lambda=0.5,
    min_child_samples=20,
    random_state=42,
)

# V5: fewer, shallower trees with a higher learning rate and no regularisation.
params5 = dict(
    n_estimators=900,
    learning_rate=0.07,
    max_depth=6,
    num_leaves=25,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=3,
    reg_alpha=0,
    reg_lambda=0,
    min_child_samples=40,
    random_state=42,
)

# V6: deepest trees, lowest learning rate, feature subsampling only
# (bagging is switched off via bagging_freq=0).
params6 = dict(
    n_estimators=1400,
    learning_rate=0.015,
    max_depth=15,
    num_leaves=80,
    feature_fraction=0.7,
    bagging_fraction=1.0,
    bagging_freq=0,
    reg_alpha=0.3,
    reg_lambda=1.5,
    min_child_samples=25,
    random_state=42,
)

# Build each regressor from its parameter set, then train and log it. The
# same dict is passed to the helper so the logged values match the model.
model1 = LGBMRegressor(**params1)
train_and_log_model(model1, "LightGBM_V1", params1)

model2 = LGBMRegressor(**params2)
train_and_log_model(model2, "LightGBM_V2", params2)

model3 = LGBMRegressor(**params3)
train_and_log_model(model3, "LightGBM_V3", params3)

model4 = LGBMRegressor(**params4)
train_and_log_model(model4, "LightGBM_V4", params4)

model5 = LGBMRegressor(**params5)
train_and_log_model(model5, "LightGBM_V5", params5)

model6 = LGBMRegressor(**params6)
train_and_log_model(model6, "LightGBM_V6", params6)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Registered model 'LightGBM_Walmart_LightGBM_V1' already exists. Creating a new version of this model...
2025/07/16 13:02:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V1, version 3
Created version '3' of model 'LightGBM_Walmart_LightGBM_V1'.


Model MAE: 6159.55
🏃 View run LightGBM_V1 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/97628900516e4f29b7cca756a683e4f3
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Registered model 'LightGBM_Walmart_LightGBM_V2' already exists. Creating a new version of this model...
2025/07/16 13:03:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V2, version 3
Created version '3' of model 'LightGBM_Walmart_LightGBM_V2'.


Model MAE: 5345.37
🏃 View run LightGBM_V2 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/2fe3a263b86a4d3a9fcd161b3c96a6d6
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Registered model 'LightGBM_Walmart_LightGBM_V3' already exists. Creating a new version of this model...
2025/07/16 13:03:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V3, version 3
Created version '3' of model 'LightGBM_Walmart_LightGBM_V3'.


Model MAE: 6110.37
🏃 View run LightGBM_V3 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/4d8ae03129794168ba9b9f70610d3609
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Registered model 'LightGBM_Walmart_LightGBM_V4' already exists. Creating a new version of this model...
2025/07/16 13:04:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V4, version 2
Created version '2' of model 'LightGBM_Walmart_LightGBM_V4'.


Model MAE: 5224.12
🏃 View run LightGBM_V4 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/605aa527e98049a7af67217d46cebf78
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Successfully registered model 'LightGBM_Walmart_LightGBM_V5'.
2025/07/16 13:04:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V5, version 1
Created version '1' of model 'LightGBM_Walmart_LightGBM_V5'.


Model MAE: 6058.77
🏃 View run LightGBM_V5 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/e527fce0aaa64e7583a5c260f71ef07a
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 12
[LightGBM] [Info] Start training from score 16782.304486


Successfully registered model 'LightGBM_Walmart_LightGBM_V6'.
2025/07/16 13:05:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LightGBM_Walmart_LightGBM_V6, version 1
Created version '1' of model 'LightGBM_Walmart_LightGBM_V6'.


Model MAE: 5008.50
🏃 View run LightGBM_V6 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0/runs/d4b3848a632147a482bd34fdf5868516
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/0
