In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from itertools import product
import matplotlib.pyplot as plt
import joblib as jl
import os

# Feature Engineering (lag & seasonality creation)
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from prophet import Prophet

# Modelling
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# _______LightGBM_________
import lightgbm as lgb

# ______XgBoost___________
import xgboost as xgb

In [7]:
df_final=jl.load("C:/Users/LENOVO/Desktop/Sales Forecasing/data/processed/df_final.pkl")
df_final.head(3)

Unnamed: 0,week_id,shop_id,item_category_id,date,item_category_name,item_cnt_day,item_price,Revenue
0,2013-W00,25,23,2013-01-05,Games - XBOX 360,53.0,1952.7375,106697.0
1,2013-W00,25,30,2013-01-03,PC Games - Standard Editions,198.0,454.997792,92870.5
2,2013-W00,25,40,2013-01-02,Cinema - DVD,520.0,250.888889,144028.0


In [11]:
# Basic lag
df_final["B_lag1"] = (
    df_final.groupby(["shop_id", "item_category_id"], as_index=False)[
        "Revenue"
    ]
    .shift(1)
    .fillna(0)
)
df_final["B_lag2"] = (
    df_final.groupby(["shop_id", "item_category_id"], as_index=False)[
        "Revenue"
    ]
    .shift(2)
    .fillna(0)
)
df_final["B_lag3"] = (
    df_final.groupby(["shop_id", "item_category_id"], as_index=False)[
        "Revenue"
    ]
    .shift(3)
    .fillna(0)
)
df_final.head(10)

Unnamed: 0,week_id,shop_id,item_category_id,date,item_category_name,item_cnt_day,item_price,Revenue,B_lag1,B_lag2,B_lag3
0,2013-W00,25,23,2013-01-05,Games - XBOX 360,53.0,1952.7375,106697.0,0.0,0.0,0.0
1,2013-W00,25,30,2013-01-03,PC Games - Standard Editions,198.0,454.997792,92870.5,0.0,0.0,0.0
2,2013-W00,25,40,2013-01-02,Cinema - DVD,520.0,250.888889,144028.0,0.0,0.0,0.0
3,2013-W00,25,55,2013-01-02,Music - CD of local production,145.0,302.689394,43942.0,0.0,0.0,0.0
4,2013-W00,28,23,2013-01-04,Games - XBOX 360,62.0,1897.60034,118336.5,0.0,0.0,0.0
5,2013-W00,28,30,2013-01-05,PC Games - Standard Editions,262.0,424.004849,121664.2,0.0,0.0,0.0
6,2013-W00,28,40,2013-01-04,Cinema - DVD,687.0,275.606157,212991.91,0.0,0.0,0.0
7,2013-W00,28,55,2013-01-02,Music - CD of local production,212.0,275.645503,59374.0,0.0,0.0,0.0
8,2013-W00,31,23,2013-01-03,Games - XBOX 360,43.0,1837.466622,81161.14,0.0,0.0,0.0
9,2013-W00,31,30,2013-01-03,PC Games - Standard Editions,231.0,434.289659,109668.24,0.0,0.0,0.0


In [13]:
shop_id = 25
category_id = 40

df_plot = df_final[
    (df_final["shop_id"] == shop_id)
    & (df_final["item_category_id"] == category_id)
].copy()


df_plot = df_plot.sort_values(by="week_id")

# Create line chart
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["Revenue"],
        mode="lines+markers",
        name="Revenue",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["B_lag1"],
        mode="lines+markers",
        name="Lag 1",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["B_lag2"],
        mode="lines+markers",
        name="Lag 2",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["B_lag3"],
        mode="lines+markers",
        name="Lag 3",
    )
)

fig.update_layout(
    title=f"Revenue and Lag Features (Shop {shop_id}, Category {category_id})",
    xaxis_title="Week",
    yaxis_title="Revenue",
    template="plotly_dark",
    width=1000,
)

fig.show()

Rolling Features ( mean and STD)

Useful to capture trend and volatility will tell the the model if the trend is rising, or failling And Will tell the model if the it's stable or volatile

In [15]:
df_final["Rolling_mean1"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(1).rolling(window=3).mean().fillna(0))
df_final["Rolling_mean2"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(2).rolling(window=3).mean().fillna(0))
df_final["Rolling_mean3"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(3).rolling(window=3).mean().fillna(0))
df_final["Rolling_std1"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(1).rolling(window=3).std().fillna(0))
df_final["Rolling_std2"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(2).rolling(window=3).std().fillna(0))
df_final["Rolling_std3"] = df_final.groupby(
    ["shop_id", "item_category_id"], as_index=False
)["Revenue"].transform(lambda x: x.shift(3).rolling(window=3).std().fillna(0))
df_final.head(1)

Unnamed: 0,week_id,shop_id,item_category_id,date,item_category_name,item_cnt_day,item_price,Revenue,B_lag1,B_lag2,B_lag3,Rolling_mean1,Rolling_mean2,Rolling_mean3,Rolling_std1,Rolling_std2,Rolling_std3
0,2013-W00,25,23,2013-01-05,Games - XBOX 360,53.0,1952.7375,106697.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
shop_id = 25
category_id = 40

df_plot = df_final[
    (df_final["shop_id"] == shop_id)
    & (df_final["item_category_id"] == category_id)
].copy()


df_plot = df_plot.sort_values(by="week_id")

# Create line chart
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["Revenue"],
        mode="lines+markers",
        name="Revenue",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["Rolling_mean1"],
        mode="lines+markers",
        name="Rolling_mean1",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["Rolling_mean2"],
        mode="lines+markers",
        name="Rolling_mean2",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_plot["week_id"],
        y=df_plot["Rolling_mean3"],
        mode="lines+markers",
        name="Rolling_mean3",
    )
)

fig.update_layout(
    title=f"Revenue and Lag Features (Shop {shop_id}, Category {category_id})",
    xaxis_title="Week",
    yaxis_title="Revenue",
    template="plotly_dark",
    width=1000,
)

fig.show()

In [19]:
df_final1 = df_final.copy()
weeks = [1, 2, 3, 4]
for w in weeks:
    df_final[f"EWMA{w}"] = df_final.groupby(
        ["shop_id", "item_category_id"], as_index=False
    )["Revenue"].transform(lambda x: x.ewm(span=w, adjust=False).mean())
df_final.head(1)

Unnamed: 0,week_id,shop_id,item_category_id,date,item_category_name,item_cnt_day,item_price,Revenue,B_lag1,B_lag2,...,Rolling_mean1,Rolling_mean2,Rolling_mean3,Rolling_std1,Rolling_std2,Rolling_std3,EWMA1,EWMA2,EWMA3,EWMA4
0,2013-W00,25,23,2013-01-05,Games - XBOX 360,53.0,1952.7375,106697.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,106697.0,106697.0,106697.0,106697.0


In [20]:
cols_to_plot = ["EWMA1", "EWMA2", "EWMA3", "EWMA4"]
shop_id = 25
category_id = 40
df_plotting = df_final[
    (df_final["shop_id"] == shop_id)
    & (df_final["item_category_id"] == category_id)
].copy()
df_plotting = df_plotting.sort_values(by="week_id", ascending=True)
fig = go.Figure()
for col in cols_to_plot:
    fig.add_trace(
        go.Scatter(
            x=df_plotting["week_id"],
            y=df_plotting[col],
            mode="lines",
            name=f"{col}",
        )
    )
    fig.update_layout(
        title=f"Revenue and Lag Features (Shop {shop_id}, Category {category_id})",
        xaxis_title="Week",
        yaxis_title="Revenue",
        # template="plotly_dark",
        width=1000,
    )
fig.add_trace(
    go.Scatter(
        x=df_plotting["week_id"],
        y=df_plotting["Revenue"],
        mode="lines",
        name=f"Revenue",
        line=dict(color="black", dash="dash"),
    )
)
fig.show()

# Adding Seasonality, Trend and holiday variables - Prophet

In [22]:
# Holidays that potentially have in impact on sales --> Revenue

holidays = pd.DataFrame(
    [
        # 🌍 Global Holidays & Shopping Events
        {
            "holiday": "new_year",
            "ds": "2013-01-01",
            "lower_window": -3,
            "upper_window": 5,
        },
        {
            "holiday": "valentines_day",
            "ds": "2013-02-14",
            "lower_window": -7,
            "upper_window": 2,
        },
        {
            "holiday": "easter",
            "ds": "2013-03-31",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "black_friday",
            "ds": "2013-11-29",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "cyber_monday",
            "ds": "2013-12-02",
            "lower_window": -2,
            "upper_window": 5,
        },
        {
            "holiday": "christmas",
            "ds": "2013-12-25",
            "lower_window": -14,
            "upper_window": 7,
        },
        {
            "holiday": "new_year",
            "ds": "2014-01-01",
            "lower_window": -3,
            "upper_window": 5,
        },
        {
            "holiday": "valentines_day",
            "ds": "2014-02-14",
            "lower_window": -7,
            "upper_window": 2,
        },
        {
            "holiday": "easter",
            "ds": "2014-04-20",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "black_friday",
            "ds": "2014-11-28",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "cyber_monday",
            "ds": "2014-12-01",
            "lower_window": -2,
            "upper_window": 5,
        },
        {
            "holiday": "christmas",
            "ds": "2014-12-25",
            "lower_window": -14,
            "upper_window": 7,
        },
        {
            "holiday": "new_year",
            "ds": "2015-01-01",
            "lower_window": -3,
            "upper_window": 5,
        },
        {
            "holiday": "valentines_day",
            "ds": "2015-02-14",
            "lower_window": -7,
            "upper_window": 2,
        },
        {
            "holiday": "easter",
            "ds": "2015-04-05",
            "lower_window": -7,
            "upper_window": 7,
        },
        # 🎮 Gaming Expos
        {
            "holiday": "e3",
            "ds": "2013-06-11",
            "lower_window": -14,
            "upper_window": 7,
        },
        {
            "holiday": "gamescom",
            "ds": "2013-08-21",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tgs",
            "ds": "2013-09-19",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "e3",
            "ds": "2014-06-10",
            "lower_window": -14,
            "upper_window": 7,
        },
        {
            "holiday": "gamescom",
            "ds": "2014-08-13",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tgs",
            "ds": "2014-09-18",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "e3",
            "ds": "2015-06-16",
            "lower_window": -14,
            "upper_window": 7,
        },
        {
            "holiday": "gamescom",
            "ds": "2015-08-05",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tgs",
            "ds": "2015-09-17",
            "lower_window": -7,
            "upper_window": 7,
        },
        # 🎵 Music Festivals & Awards
        {
            "holiday": "coachella",
            "ds": "2013-04-12",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tomorrowland",
            "ds": "2013-07-26",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "grammys",
            "ds": "2013-02-10",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "coachella",
            "ds": "2014-04-11",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tomorrowland",
            "ds": "2014-07-18",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "grammys",
            "ds": "2014-01-26",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "coachella",
            "ds": "2015-04-10",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "tomorrowland",
            "ds": "2015-07-24",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "grammys",
            "ds": "2015-02-08",
            "lower_window": -7,
            "upper_window": 7,
        },
        # 🎬 Film Festivals
        {
            "holiday": "sundance",
            "ds": "2013-01-17",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "cannes",
            "ds": "2013-05-15",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "venice",
            "ds": "2013-08-28",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "sundance",
            "ds": "2014-01-16",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "cannes",
            "ds": "2014-05-14",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "venice",
            "ds": "2014-08-27",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "sundance",
            "ds": "2015-01-22",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "cannes",
            "ds": "2015-05-13",
            "lower_window": -7,
            "upper_window": 7,
        },
        {
            "holiday": "venice",
            "ds": "2015-09-02",
            "lower_window": -7,
            "upper_window": 7,
        },
        # 🎭 Comic Cons
        {
            "holiday": "sdcc",
            "ds": "2013-07-18",
            "lower_window": -14,
            "upper_window": 14,
        },
        {
            "holiday": "nycc",
            "ds": "2013-10-10",
            "lower_window": -14,
            "upper_window": 14,
        },
        {
            "holiday": "sdcc",
            "ds": "2014-07-24",
            "lower_window": -14,
            "upper_window": 14,
        },
        {
            "holiday": "nycc",
            "ds": "2014-10-09",
            "lower_window": -14,
            "upper_window": 14,
        },
        {
            "holiday": "sdcc",
            "ds": "2015-07-09",
            "lower_window": -14,
            "upper_window": 14,
        },
        {
            "holiday": "nycc",
            "ds": "2015-10-08",
            "lower_window": -14,
            "upper_window": 14,
        },
        # 🇫🇷 French Public Holidays
        {
            "holiday": "bastille_day",
            "ds": "2013-07-14",
            "lower_window": -3,
            "upper_window": 3,
        },
        {
            "holiday": "all_saints",
            "ds": "2013-11-01",
            "lower_window": -3,
            "upper_window": 3,
        },
        {
            "holiday": "bastille_day",
            "ds": "2014-07-14",
            "lower_window": -3,
            "upper_window": 3,
        },
        {
            "holiday": "all_saints",
            "ds": "2014-11-01",
            "lower_window": -3,
            "upper_window": 3,
        },
        {
            "holiday": "bastille_day",
            "ds": "2015-07-14",
            "lower_window": -3,
            "upper_window": 3,
        },
        {
            "holiday": "all_saints",
            "ds": "2015-11-01",
            "lower_window": -3,
            "upper_window": 3,
        },
    ]
)

# Convert date strings to datetime objects
holidays["ds"] = pd.to_datetime(holidays["ds"])

# Fill NaN values for windows (for any events that might have been missed)
holidays["lower_window"] = holidays["lower_window"].fillna(0)
holidays["upper_window"] = holidays["upper_window"].fillna(0)



In [23]:
def create_seasonality_trend(
    df_prophet,
    special_event_df,
    changepoint_prior_scale=0.1,
    n_changepoints=25,
    fourier_order=10,
):
    m = Prophet(
        yearly_seasonality=False,
        weekly_seasonality=True,
        changepoint_prior_scale=changepoint_prior_scale,
        n_changepoints=n_changepoints,
        holidays=special_event_df,
    )

    m.add_seasonality(
        name="yearly", period=365.25, fourier_order=fourier_order
    )
    m.fit(df_prophet)
    future = m.make_future_dataframe(periods=0)
    forecast = m.predict(future)
    forecast.loc[:, "ds"] = pd.to_datetime(forecast["ds"])
    # seaonality
    df_seasonality = (
        forecast[["ds", "yearly"]]
        .copy()
        .rename(columns={"yearly": "seasonality"})
    )
    df_prophet = pd.merge(df_prophet, df_seasonality, on="ds", how="inner")
    # Holiday
    df_holiday = forecast[["ds", "holidays"]]
    df_prophet = pd.merge(df_prophet, df_holiday, on="ds", how="inner")
    # Trend
    df_trend = forecast[["ds", "trend"]]
    df_prophet = pd.merge(df_prophet, df_trend, on="ds", how="inner")
    # test

    print("r2_score")
    print(r2_score(df_prophet["y"], forecast["yhat"]))
    # Plot
    fig = m.plot(forecast)
    plt.gca().set_facecolor("white")

    plt.setp(fig.axes, facecolor="white")
    plt.setp(fig.axes[0].lines, color="black")
    plt.setp(fig.axes[0].collections, edgecolor="black", facecolor="none")

    fig_comp = m.plot_components(forecast)
    plt.setp(fig_comp.axes, facecolor="white")
    plt.setp(fig_comp.axes[0].lines, color="black")
    plt.setp(fig_comp.axes[0].collections, edgecolor="black", facecolor="none")
    plt.setp(fig_comp.axes[1].lines, color="black")
    plt.setp(fig_comp.axes[1].collections, edgecolor="black", facecolor="none")
    plt.setp(fig_comp.axes[2].lines, color="black")
    plt.setp(fig_comp.axes[2].collections, edgecolor="black", facecolor="none")
    plt.setp(fig_comp.axes[3].lines, color="black")
    plt.setp(fig_comp.axes[3].collections, edgecolor="black", facecolor="none")
    plt.show()

    return df_prophet

In [None]:
df_final_result = df_final.copy()
df_final_result2 = df_final_result.copy()
df_final_result["date"] = pd.to_datetime(df_final_result["date"])
df_final_result["dayofweek"] = df_final_result["date"].dt.dayofweek
df_final_result["quarter"] = df_final_result["date"].dt.quarter
df_final_result["month"] = df_final_result["date"].dt.month
df_final_result["year"] = df_final_result["date"].dt.year
df_final_result["dayofyear"] = df_final_result["date"].dt.dayofyear
df_final_result["dayofmonth"] = df_final_result["date"].dt.day
df_final_result["weekofyear"] = df_final_result["date"].dt.isocalendar().week
df_final_result.head(3)