In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np


from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import joblib

from utils.preprocess import add_time_features, add_season

# Read the raw store/item sales history (the "date" column is parsed to datetime)
# and pass it through the project's preprocessing helpers as one pipeline.
# Judging by the preview output of this cell, the helpers contribute the
# year/month/day/dayofweek/month_sin/month_cos and season columns.
df = (
    pd.read_csv("../datasets/main/store_item_demand.csv", parse_dates=["date"])
    .pipe(add_time_features)
    .pipe(add_season)
)

# Preview the first rows; as the cell's last expression it is rendered richly.
df.head()


Unnamed: 0,date,store,item,sales,year,month,day,dayofweek,month_sin,month_cos,season
0,2013-01-01,1,1,13,2013,1,1,1,0.5,0.866025,Winter
1,2013-01-02,1,1,11,2013,1,2,2,0.5,0.866025,Winter
2,2013-01-03,1,1,14,2013,1,3,3,0.5,0.866025,Winter
3,2013-01-04,1,1,13,2013,1,4,4,0.5,0.866025,Winter
4,2013-01-05,1,1,10,2013,1,5,5,0.5,0.866025,Winter


In [None]:

# Cell 2 — Feature selection
#
# Requires Cell 1 to have run: `df` must already carry the columns produced by
# add_time_features / add_season.

# Model inputs, grouped by the dtype they are cast to further down. They were
# chosen because they are available from the Streamlit inputs and capture
# seasonality.
int_features = ["store", "item", "year", "month", "dayofweek"]  # ids + calendar parts
float_features = ["month_sin", "month_cos"]                     # cyclic month encoding
feature_cols = int_features + float_features

# Target
target_col = "sales"

# Quick sanity check: partition the requested features by presence in df
present = [name for name in feature_cols if name in df.columns]
absent = [name for name in feature_cols if name not in df.columns]
print("Available features in df:", present)
print("Missing features (if any):", absent)

# Build X / y with compact numeric dtypes (int32 for ids/calendar parts,
# float32 for the cyclic features and the target)
feature_dtypes = {
    **dict.fromkeys(int_features, "int32"),
    **dict.fromkeys(float_features, "float32"),
}
X = df[feature_cols].copy().astype(feature_dtypes)
y = df[target_col].copy().astype("float32")

# Quick summary
print("\nX shape:", X.shape)
print("y shape:", y.shape)
print("\nSample of features:")
display(X.head())

# Cardinality of the id columns (important for model design)
store_ids = sorted(X["store"].unique())
item_ids = sorted(X["item"].unique())
print("\nUnique stores:", X['store'].nunique(), " — example stores:", store_ids[:10])
print("Unique items:", X['item'].nunique(), " — example items:", item_ids[:10])


Available features in df: ['store', 'item', 'year', 'month', 'dayofweek', 'month_sin', 'month_cos']
Missing features (if any): []

X shape: (913000, 7)
y shape: (913000,)

Sample of features:


Unnamed: 0,store,item,year,month,dayofweek,month_sin,month_cos
0,1,1,2013,1,1,0.5,0.866025
1,1,1,2013,1,2,0.5,0.866025
2,1,1,2013,1,3,0.5,0.866025
3,1,1,2013,1,4,0.5,0.866025
4,1,1,2013,1,5,0.5,0.866025



Unique stores: 10  — example stores: [np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7), np.int32(8), np.int32(9), np.int32(10)]
Unique items: 50  — example items: [np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7), np.int32(8), np.int32(9), np.int32(10)]


In [None]:

# Cell 3 — Train/Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): shuffle=True assigns rows to train/test at random, regardless of
# the date each row belongs to, so test rows are interleaved in time with the
# training rows. If the model is meant to forecast future dates, a chronological
# hold-out would give a more honest error estimate — confirm the intended use.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition
for label, part in (
    ("Training X shape:", X_train),
    ("Testing X shape:", X_test),
    ("Training y shape:", y_train),
    ("Testing y shape:", y_test),
):
    print(label, part.shape)


Training X shape: (730400, 7)
Testing X shape: (182600, 7)
Training y shape: (730400,)
Testing y shape: (182600,)


In [None]:

# Cell 4 — Train XGBoost Model

from xgboost import XGBRegressor

# Baseline hyper-parameters, gathered in one mapping so they are easy to tweak
xgb_params = dict(
    objective='reg:squarederror',  # squared-error regression
    n_estimators=300,              # number of boosted trees
    learning_rate=0.05,            # shrinkage applied to each tree
    max_depth=8,                   # maximum depth of a tree
    subsample=0.8,                 # row sampling ratio
    colsample_bytree=0.8,          # feature sampling ratio per tree
    random_state=42,               # reproducible sampling
    n_jobs=-1,                     # use all CPU cores
)
xgb_model = XGBRegressor(**xgb_params)

print("Training XGBoost model... this may take ~20–40 seconds depending on CPU.")
xgb_model.fit(X_train, y_train)

print("XGBoost training completed successfully!")


Training XGBoost model... this may take ~20–40 seconds depending on CPU.
XGBoost training completed successfully!


In [None]:

# Cell 5 — Evaluate the trained model on the held-out test split

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions
y_pred = xgb_model.predict(X_test)

# Metrics: RMSE is the square root of the MSE, so it is in the target's units;
# R² is the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# One print call, one line per metric
print(
    "Model Evaluation Metrics:",
    f"MAE  (Mean Absolute Error):       {mae:.4f}",
    f"RMSE (Root Mean Squared Error):   {rmse:.4f}",
    f"R² Score (Regression Accuracy):   {r2:.4f}",
    sep="\n",
)


Model Evaluation Metrics:
MAE  (Mean Absolute Error):       5.8175
RMSE (Root Mean Squared Error):   7.5700
R² Score (Regression Accuracy):   0.9308


In [None]:

# Cell 6 — Save XGBoost Model

from pathlib import Path

# Path to save the model
model_path = "../models/xgb_demand_model.json"

# save_model does not create missing directories, so make sure the target
# folder exists first; on a fresh checkout without ../models the save would
# otherwise fail. exist_ok=True keeps the cell safe to re-run.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Save model in XGBoost native JSON format
xgb_model.save_model(model_path)

print(f"Model saved successfully to: {model_path}")


Model saved successfully to: ../models/xgb_demand_model.json


In [None]:

# Cell 7 — Fixed Prediction Pipeline (Booster-based)


import xgboost as xgb
from utils.preprocess import add_time_features
import pandas as pd
import numpy as np

def load_model(path="../models/xgb_demand_model.json"):
    """Load a saved XGBoost model file into a raw ``Booster``.

    Args:
        path: Location of the model file. Defaults to the JSON file this
            notebook writes in its save step.

    Returns:
        xgb.Booster: A booster populated from ``path``.
    """
    booster = xgb.Booster()
    booster.load_model(path)
    return booster

def prepare_input(store, item, date):
    """Build the single-row feature frame for one (store, item, date) query.

    Args:
        store: Store identifier.
        item: Item identifier.
        date: Anything ``pd.to_datetime`` accepts (e.g. ``"2017-01-01"``).

    Returns:
        tuple: ``(features, feature_cols)`` — a one-row float32 DataFrame whose
        columns follow ``feature_cols``, and that list of column names.
    """
    # Same columns, in the same order, as the training feature list
    feature_cols = [
        "store", "item", "year", "month", "dayofweek", "month_sin", "month_cos",
    ]

    query = pd.DataFrame(
        {"date": [pd.to_datetime(date)], "store": [store], "item": [item]}
    )
    # add_time_features is expected to derive the calendar columns selected
    # just below from "date".
    enriched = add_time_features(query)

    features = enriched[feature_cols].astype("float32")
    return features, feature_cols

def predict_demand(model, store, item, date):
    """Predict sales for one store/item on one date.

    Args:
        model: Booster-style model whose ``predict`` accepts an ``xgb.DMatrix``
            (e.g. the object returned by ``load_model``).
        store: Store identifier, forwarded to ``prepare_input``.
        item: Item identifier, forwarded to ``prepare_input``.
        date: Query date, forwarded to ``prepare_input``.

    Returns:
        float: The prediction for the single query row, rounded to 2 decimals.
    """
    # Only the feature frame is needed here; the column list is discarded
    features, _ = prepare_input(store, item, date)

    # The model's predict is fed a DMatrix, so wrap the frame first
    scores = model.predict(xgb.DMatrix(features))
    return round(float(scores[0]), 2)

# Status message emitted once this cell's helper definitions have been executed.
print("XGBoost Booster pipeline ready.")


XGBoost Booster pipeline ready.


In [None]:

# Cell 8 — Test Prediction Pipeline

# Reload the booster from disk so the saved file itself is what gets exercised
model = load_model("../models/xgb_demand_model.json")

# Single example query: store, item and date
test_store, test_item, test_date = 1, 1, "2017-01-01"

pred = predict_demand(model, test_store, test_item, test_date)

# Show the query followed by the model's answer
print(f"Test Prediction for Store {test_store}, Item {test_item}, Date {test_date}:")
print("Predicted Sales =", pred)


Test Prediction for Store 1, Item 1, Date 2017-01-01:
Predicted Sales = 18.67


In [None]:

# Cell 10 — End-to-End System Test

# NOTE: this import rebinds load_model / predict_demand to the versions packaged
# in utils.model_utils, replacing the notebook-local definitions from Cell 7.
from utils.model_utils import load_model, predict_demand

# No path argument: rely on the packaged loader's default model location
model = load_model()

print("Testing final pipeline...")

# Arguments are (model, store, item, date)
final_prediction = predict_demand(model, 3, 12, "2017-01-15")
print("Prediction:", final_prediction)
print("Notebook 02 Completed Successfully!")


Testing final pipeline...
Prediction: 72.26
Notebook 02 Completed Successfully!
