In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from itertools import product
import matplotlib.pyplot as plt
import joblib as jl
import os

# Feature Engineering (lag & seasonality creation)
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from prophet import Prophet

# Modelling
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# _______LightGBM_________
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

#_________Shap Values_________
import shap

In [3]:
# Load the processed feature table that was persisted with joblib and preview it.
# NOTE(review): this is an absolute, machine-specific path, and joblib/pickle
# files must only be loaded from trusted sources.
processed_data_path = "C:/Users/LENOVO/Desktop/Sales Forecasing/data/processed/df_final_result.pkl"
df_final_result = jl.load(processed_data_path)
df_final_result.head(3)

Unnamed: 0,week_id,shop_id,item_category_id,date,item_category_name,item_cnt_day,item_price,Revenue,B_lag1,B_lag2,...,EWMA2,EWMA3,EWMA4,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
0,2013-W00,25,23,2013-01-05,Games - XBOX 360,53.0,1952.7375,106697.0,0.0,0.0,...,106697.0,106697.0,106697.0,5,1,1,2013,5,5,1
1,2013-W00,25,30,2013-01-03,PC Games - Standard Editions,198.0,454.997792,92870.5,0.0,0.0,...,92870.5,92870.5,92870.5,3,1,1,2013,3,3,1
2,2013-W00,25,40,2013-01-02,Cinema - DVD,520.0,250.888889,144028.0,0.0,0.0,...,144028.0,144028.0,144028.0,2,1,1,2013,2,2,1


In [9]:
# Cast the week-of-year column to a plain integer dtype.
week_of_year = df_final_result["weekofyear"]
df_final_result["weekofyear"] = week_of_year.astype(int)

In [4]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (arrays, lists, Series or scalars). Inputs
        are converted to float NumPy arrays, so they are compared by position
        (any pandas index is ignored).

    Returns
    -------
    float
        SMAPE in the range [0, 200]; NaN when the inputs are empty.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 from `out`, which avoids the 0/0 RuntimeWarning entirely.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )

    if diff.size == 0:
        # Mean of nothing is undefined; return NaN without emitting a warning.
        return float("nan")
    return 100 * np.mean(diff)

In [10]:
# Splitting the data chronologically: the cut-off is the 85% quantile of the
# distinct dates, normalized to midnight.
non_feature_columns = [
    "date",
    "week_id",
    "Revenue",
    "item_category_name",
    "EWMA1",
    "EWMA2",
    "EWMA3",
    "EWMA4",
]

distinct_dates = df_final_result["date"].drop_duplicates()
cut_off_date = pd.to_datetime(distinct_dates.quantile(0.85).normalize())

# Rows on or before the cut-off train the model; later rows are held out.
on_or_before_cut_off = df_final_result["date"] <= cut_off_date
after_cut_off = df_final_result["date"] > cut_off_date
train = df_final_result[on_or_before_cut_off]
test = df_final_result[after_cut_off]

# Same column exclusions on both sides; "Revenue" is the target.
X_train, y_train = train.drop(columns=non_feature_columns), train["Revenue"]
X_test, y_test = test.drop(columns=non_feature_columns), test["Revenue"]

In [12]:


# Filled by the record_evaluation callback with the per-iteration validation metric.
eval_results = {}

# Model
model = lgb.LGBMRegressor(
    objective="regression",
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Train
# NOTE(review): the held-out test split is also used as the early-stopping
# validation set, so the metrics printed below are likely optimistic.
# Consider carving a separate validation split out of the training data.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_names=["valid"],
    eval_metric="l2",
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50),
        lgb.record_evaluation(eval_results),
    ],
)

# Plot with plotly: filled line (like a progressive fill)
eval_l2 = eval_results["valid"]["l2"]
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=list(range(len(eval_l2))),
        y=eval_l2,
        fill="tozeroy",
        mode="lines",
        name="Validation L2 Loss",
        line=dict(color="royalblue"),
    )
)

fig.update_layout(
    title="LightGBM Evaluation - Progressively Filled Curve",
    xaxis_title="Iteration",
    yaxis_title="L2 Loss",
    template="plotly_white",
)

fig.show()

# Prediction & evaluation metrics on the held-out split
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
r2_score1 = r2_score(y_test, y_pred)
# ".4f" sets the precision; the previous ":2f" only set a minimum width of 2.
print(f"R2_score: {r2_score1:.4f}")
smape_score = smape(y_test.values, y_pred)
print(f"SMAPE: {smape_score:.2f}%")

Training until validation scores don't improve for 50 rounds
[50]	valid's l2: 8.55031e+07
[100]	valid's l2: 5.07022e+07
[150]	valid's l2: 4.72917e+07
[200]	valid's l2: 4.72563e+07
Early stopping, best iteration is:
[169]	valid's l2: 4.64063e+07


RMSE: 6812.21
MAE: 4056.81
R2_socre:0.885760
SMAPE: 10.15%


## SHAP Values

In [16]:
# Compute the SHAP values for the held-out rows; the columns of shap_df
# correspond to the features in X_test.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap_df = pd.DataFrame(shap_values, columns=X_test.columns)

# Global importance = mean of the ABSOLUTE SHAP values per feature, as the
# title and axis label state. A signed mean lets positive and negative
# contributions cancel and under-reports influential features.
shap_importance = shap_df.abs().mean().sort_values(ascending=False)
fig = px.bar(
    shap_importance,
    orientation="h",
    title="SHAP Feature Importance (Mean Absolute Value)",
    labels={"value": "Mean |SHAP value|", "index": "Feature"},
)
fig.update_layout(template="plotly_white")
fig.show()

In [20]:
# Splitting the data, this time keeping only a hand-picked subset of features
selected_cols = [
    "B_lag1",
    #"B_lag3",
    #"shop_id",
    "item_category_id",
    "item_cnt_day",
    "Rolling_mean2",
    "Rolling_mean1",
    "Rolling_mean3",
    "item_price",
    "dayofyear",
    #"dayofweek",
    "weekofyear",
]

# Chronological cut-off: the 85% quantile of the distinct dates, normalized to midnight.
cut_off_date = pd.to_datetime(
    (df_final_result["date"].drop_duplicates().quantile(0.85)).normalize()
)
train = df_final_result[df_final_result["date"] <= cut_off_date]
test = df_final_result[df_final_result["date"] > cut_off_date]
X_train = train[selected_cols]
y_train = train["Revenue"]
X_test = test[selected_cols]
y_test = test["Revenue"]

# Filled by the record_evaluation callback with the per-iteration validation metric.
eval_results = {}

# Model
model = lgb.LGBMRegressor(
    objective="regression",
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

# Train
# NOTE(review): the held-out test split is also used as the early-stopping
# validation set, so the metrics printed below are likely optimistic.
# Consider carving a separate validation split out of the training data.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_names=["valid"],
    eval_metric="l2",
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50),
        lgb.record_evaluation(eval_results),
    ],
)

# Plot with plotly: filled line (like a progressive fill)
eval_l2 = eval_results["valid"]["l2"]
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=list(range(len(eval_l2))),
        y=eval_l2,
        fill="tozeroy",
        mode="lines",
        name="Validation L2 Loss",
        line=dict(color="royalblue"),
    )
)

fig.update_layout(
    title="LightGBM Evaluation - Progressively Filled Curve",
    xaxis_title="Iteration",
    yaxis_title="L2 Loss",
    template="plotly_white",
)

fig.show()

# Prediction & evaluation metrics on the held-out split
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
r2_score1 = r2_score(y_test, y_pred)
# Labelled like the other metrics so the printed block is self-explanatory.
print(f"R2_score: {r2_score1:.4f}")
smape_score = smape(y_test.values, y_pred)
print(f"SMAPE: {smape_score:.2f}%")

Training until validation scores don't improve for 50 rounds
[50]	valid's l2: 8.58618e+07
[100]	valid's l2: 4.73728e+07
[150]	valid's l2: 4.38011e+07
[200]	valid's l2: 4.11045e+07
[250]	valid's l2: 4.03838e+07
[300]	valid's l2: 3.98004e+07
[350]	valid's l2: 3.95414e+07
[400]	valid's l2: 3.87438e+07
[450]	valid's l2: 3.8746e+07
Early stopping, best iteration is:
[446]	valid's l2: 3.84854e+07


RMSE: 6203.66
MAE: 3723.39
0.9052594345496503
SMAPE: 9.41%


In [None]:

# Compute the SHAP values for the held-out rows; the columns of shap_df
# correspond to the features in X_test.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap_df = pd.DataFrame(shap_values, columns=X_test.columns)

# Global importance = mean of the ABSOLUTE SHAP values per feature, as the
# title and axis label state. A signed mean lets positive and negative
# contributions cancel and under-reports influential features.
shap_importance = shap_df.abs().mean().sort_values(ascending=False)
fig = px.bar(
    shap_importance,
    orientation="h",
    title="SHAP Feature Importance (Mean Absolute Value)",
    labels={"value": "Mean |SHAP value|", "index": "Feature"},
)
fig.update_layout(template="plotly_white")
fig.show()

In [22]:
# Selecting data to plot: identifying columns plus the actual target, with the
# model predictions attached positionally (y_pred was produced from X_test,
# which is built from the same `test` rows).
plot_data = test[
    [
        "week_id",
        "shop_id",
        "item_category_id",
        "date",
        "item_category_name",
        "Revenue",
    ]
].copy()
plot_data["y_pred"] = y_pred.copy()

shops = plot_data["shop_id"].unique()
categories = plot_data["item_category_name"].unique()
# Plotting the results: one actual-vs-predicted figure per shop/category pair
for shop, cat in product(shops, categories):
    plt_data = plot_data[
        (plot_data["shop_id"] == shop)
        & (plot_data["item_category_name"] == cat)
    ]
    if plt_data.empty:
        # No test rows for this shop/category pair: skip instead of
        # rendering an empty figure.
        continue
    # Collapse to one point per week: first date of the week, mean actual
    # and mean predicted revenue.
    plt_data_grouped = plt_data.groupby("week_id", as_index=False).agg(
        {"date": "first", "Revenue": "mean", "y_pred": "mean"}
    )
    fig = px.line(plt_data_grouped, x="date", y="Revenue")
    fig.update_traces(name="Revenue")  # Correct way to set the name
    fig.add_scatter(
        x=plt_data_grouped["date"],
        y=plt_data_grouped["y_pred"],
        name="PREDICTED Revenue",
    )
    fig.update_layout(
        title=f"the shop: {shop} and category: {cat}",
        xaxis_title="date",
        yaxis_title="Revenue",
    )
    fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [27]:
# Save the model for deployment, together with the feature names it expects.
# The names are read from the fitted model itself so that features.pkl always
# matches the columns the persisted model was trained on (the current `model`
# was fitted on `selected_cols`, not on every non-excluded column).
feature_names = list(model.feature_name_)

# joblib is imported as `jl` in the imports cell; the bare name `joblib` is
# not defined in this notebook.
jl.dump(model, "lightGBM_model.pkl")
jl.dump(feature_names, "features.pkl")

['features.pkl']