In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path

In [3]:
parent_dir = Path.cwd().parents[0]
sys.path.append(os.path.abspath(parent_dir))

In [4]:
from utils.DataPreprocessPipeline import DataPreprocessPipeline

preprocessor = DataPreprocessPipeline(num_lags=3, rolling_window_size=3)

In [5]:
train_df = pd.read_csv('../../data/training_data.csv')

In [6]:
train_df_gb = train_df.groupby(['date_id', 'item_dept', 'store'])[['item_qty', 'net_sales']].sum().reset_index()
train_df_gb

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
0,1/1/2022,Beverages,ABC,1137.000,199615.300000
1,1/1/2022,Beverages,XYZ,894.000,179595.350000
2,1/1/2022,Grocery,ABC,2129.540,410870.334000
3,1/1/2022,Grocery,XYZ,3024.884,559652.290000
4,1/1/2022,Household,ABC,718.000,213757.000000
...,...,...,...,...,...
547,12/9/2021,Beverages,XYZ,1104.000,255970.701020
548,12/9/2021,Grocery,ABC,2315.716,493643.693020
549,12/9/2021,Grocery,XYZ,3219.296,588795.674994
550,12/9/2021,Household,ABC,1006.000,252415.845990


In [7]:
train_df_preprocessed = preprocessor.transform(train_df)
train_df_preprocessed.head()

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,rolling_mean_item_qty_3,rolling_std_item_qty_3,...,rolling_std_net_sales_3,cumsum_net_sales,cummean_net_sales,expanding_min_net_sales,expanding_max_net_sales,diff_net_sales,diff_net_sales_7,day_of_week,isWeekend,Is_Holiday
42,2021-11-08,Beverages,ABC,974.0,246984.682,937.0,884.0,991.0,937.333333,53.500779,...,18717.981669,1703852.0,243407.457717,199145.59601,277904.838,-23294.307,57.0,0,0,0
43,2021-11-08,Beverages,XYZ,677.0,201386.081,1043.0,1086.0,910.0,1013.0,91.755109,...,36459.734951,1675697.0,239385.355431,187931.76902,289993.0,27566.92,-102.0,0,0,0
44,2021-11-08,Grocery,ABC,2326.638,438841.43801,2646.82,2614.006,2654.582,2638.469333,21.538411,...,34985.300857,3473914.0,496273.373853,397946.965024,582206.592985,-43148.15703,-33.761,0,0,0
45,2021-11-08,Grocery,XYZ,2879.414,498037.791,3608.962,3625.864,3287.828,3507.551333,190.473559,...,52868.205452,4010397.0,572913.907431,491450.723,661655.00001,38798.438006,-107.646,0,0,0
46,2021-11-08,Household,ABC,883.0,240294.89201,998.0,1152.0,986.0,1045.333333,92.570694,...,25552.215188,1775416.0,253630.794154,209100.513996,292020.462024,-50544.119984,-160.0,0,0,0


In [8]:
train_df_preprocessed.columns

Index(['date_id', 'item_dept', 'store', 'item_qty', 'net_sales',
       'lag_item_qty_1', 'lag_item_qty_2', 'lag_item_qty_3',
       'rolling_mean_item_qty_3', 'rolling_std_item_qty_3', 'cumsum_item_qty',
       'cummean_item_qty', 'expanding_min_item_qty', 'expanding_max_item_qty',
       'diff_item_qty', 'diff_item_qty_7', 'lag_net_sales_1',
       'lag_net_sales_2', 'lag_net_sales_3', 'rolling_mean_net_sales_3',
       'rolling_std_net_sales_3', 'cumsum_net_sales', 'cummean_net_sales',
       'expanding_min_net_sales', 'expanding_max_net_sales', 'diff_net_sales',
       'diff_net_sales_7', 'day_of_week', 'isWeekend', 'Is_Holiday'],
      dtype='object')

In [9]:
from utils.model_helpers import create_X_and_targets_sales_qty, generate_forecasting_df
X, y_sales, y_item_qty = create_X_and_targets_sales_qty(train_df_preprocessed)

In [10]:
from utils.SalesItemQtyModel import SalesItemQtyModel
sales_item_qty_model = SalesItemQtyModel()

In [11]:
X.columns

Index(['lag_item_qty_1', 'lag_item_qty_2', 'lag_item_qty_3',
       'rolling_mean_item_qty_3', 'rolling_std_item_qty_3', 'cumsum_item_qty',
       'cummean_item_qty', 'expanding_min_item_qty', 'expanding_max_item_qty',
       'diff_item_qty', 'diff_item_qty_7', 'lag_net_sales_1',
       'lag_net_sales_2', 'lag_net_sales_3', 'rolling_mean_net_sales_3',
       'rolling_std_net_sales_3', 'cumsum_net_sales', 'cummean_net_sales',
       'expanding_min_net_sales', 'expanding_max_net_sales', 'diff_net_sales',
       'diff_net_sales_7', 'day_of_week', 'isWeekend', 'Is_Holiday'],
      dtype='object')

In [12]:
sales_item_qty_model.fit(X=X, y_sales=y_sales, y_item_qty=y_item_qty)

In [13]:
generate_forecasting_df(start_date='2022-02-01', end_date='2022-02-28', depts_list=train_df_gb['item_dept'].unique(),
                        stores_list=train_df_gb['store'].unique(), historical_df=train_df_gb,
                        dual_model=sales_item_qty_model, preprocessor=preprocessor)

Unnamed: 0,date_id,item_dept,store,net_sales,item_qty
0,2022-02-01,Beverages,ABC,148290.310359,603.216433
1,2022-02-01,Beverages,XYZ,110461.512193,447.655699
2,2022-02-01,Grocery,ABC,162918.708720,1069.535819
3,2022-02-01,Grocery,XYZ,122342.196362,1332.420339
4,2022-02-01,Household,ABC,125132.827043,378.458050
...,...,...,...,...,...
163,2022-02-28,Beverages,XYZ,106641.774179,398.169779
164,2022-02-28,Grocery,ABC,305157.568520,1562.803274
165,2022-02-28,Grocery,XYZ,406276.324933,2310.732320
166,2022-02-28,Household,ABC,146559.298814,476.337123


In [10]:
sales_item_qty_model.score_sales(X, y_sales)

0.140231419119676

In [10]:
sales_item_qty_model

In [7]:
a, b, c = dp_pipeline.get_processed_data()
a

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.000,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.000,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.000,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.000,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.000,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
614093,12/2/2021,Household,1.0,152.212,ABC,122245,5808307.0
614094,12/2/2021,Grocery,1.0,97.720,ABC,925,5808307.0
614095,12/2/2021,Beverages,1.0,381.107,ABC,11379,5808307.0
614096,12/3/2021,Household,1.0,115.000,ABC,112999,5808318.0


In [8]:
from utils.model_class import ModelHelpers
model_helper = ModelHelpers()


In [10]:
results = model_helper.finalized_train_predict(c)

      date_id  item_dept store  item_qty     net_sales  lag_item_qty_1  \
42 2021-11-08  Beverages   ABC   974.000  246984.68200         937.000   
43 2021-11-08  Beverages   XYZ   677.000  201386.08100        1043.000   
44 2021-11-08    Grocery   ABC  2326.638  438841.43801        2646.820   
45 2021-11-08    Grocery   XYZ  2879.414  498037.79100        3608.962   
46 2021-11-08  Household   ABC   883.000  240294.89201         998.000   

    rolling_mean_item_qty_2  rolling_std_item_qty_2  cumsum_item_qty  \
42                  910.500               37.476659         6962.000   
43                 1064.500               30.405592         6495.000   
44                 2630.413               23.203002        18100.925   
45                 3617.413               11.951519        23689.611   
46                 1075.000              108.894444         7114.000   

    cummean_item_qty  ...  rolling_std_net_sales_2  cumsum_net_sales  \
42        994.571429  ...             16471.562443

AttributeError: 'NoneType' object has no attribute 'tail'

In [10]:
train

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.000,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.000,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.000,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.000,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.000,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
614093,12/2/2021,Household,1.0,152.212,ABC,122245,5808307.0
614094,12/2/2021,Grocery,1.0,97.720,ABC,925,5808307.0
614095,12/2/2021,Beverages,1.0,381.107,ABC,11379,5808307.0
614096,12/3/2021,Household,1.0,115.000,ABC,112999,5808318.0


In [11]:
test

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,2/1/2022,Beverages,2.0,480.0,XYZ,112360,1495518.0
1,2/1/2022,Beverages,1.0,202.0,XYZ,111195,1495518.0
2,2/1/2022,Household,1.0,165.0,XYZ,41212,1495572.0
3,2/1/2022,Household,2.0,480.0,XYZ,123476,1495572.0
4,2/1/2022,Grocery,2.0,660.0,XYZ,106668,1495572.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [12]:
full

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,rolling_mean_item_qty_2,rolling_std_item_qty_2,cumsum_item_qty,cummean_item_qty,...,rolling_std_net_sales_2,cumsum_net_sales,cummean_net_sales,expanding_min_net_sales,expanding_max_net_sales,diff_net_sales,diff_net_sales_7,day_of_week,isWeekend,Is_Holiday
42,2021-11-08,Beverages,ABC,974.000,246984.682000,937.000,910.5000,37.476659,6962.000,994.571429,...,16471.562443,1.703852e+06,243407.457717,199145.596010,2.779048e+05,-23294.307000,57.000,0,0,0
43,2021-11-08,Beverages,XYZ,677.000,201386.081000,1043.000,1064.5000,30.405592,6495.000,927.857143,...,19492.756068,1.675697e+06,239385.355431,187931.769020,2.899930e+05,27566.920000,-102.000,0,0,0
44,2021-11-08,Grocery,ABC,2326.638,438841.438010,2646.820,2630.4130,23.203002,18100.925,2585.846429,...,30510.354432,3.473914e+06,496273.373853,397946.965024,5.822066e+05,-43148.157030,-33.761,0,0,0
45,2021-11-08,Grocery,XYZ,2879.414,498037.791000,3608.962,3617.4130,11.951519,23689.611,3384.230143,...,27434.638613,4.010397e+06,572913.907431,491450.723000,6.616550e+05,38798.438006,-107.646,0,0,0
46,2021-11-08,Household,ABC,883.000,240294.892010,998.000,1075.0000,108.894444,7114.000,1016.285714,...,35740.089990,1.775416e+06,253630.794154,209100.513996,2.920205e+05,-50544.119984,-160.000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,1751.000,1438.5000,441.941738,130083.000,1093.134454,...,184830.662026,3.410990e+07,286637.847235,149946.090000,5.813210e+05,261390.028980,114.000,0,0,0
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,3752.998,3619.8855,188.249503,325810.103,2737.900025,...,43250.321795,6.772851e+07,569147.160637,367440.183000,2.035445e+06,61165.191660,868.876,0,0,0
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,7599.386,6427.0400,1657.947613,426823.943,3586.755824,...,432172.532716,7.845215e+07,659261.805316,443513.481000,1.596598e+06,611184.257052,1239.090,0,0,0
718,2022-02-28,Household,ABC,1520.000,394323.614974,1642.000,1575.0000,94.752309,131131.000,1101.941176,...,38077.996442,3.497245e+07,293886.126171,181968.999000,4.530025e+05,53850.418996,275.000,0,0,0


In [3]:
train_df = pd.read_csv('../../data/training_data.csv')
test_df = pd.read_csv('../../data/test_data.csv')

In [4]:
from utils.data_preprocess import merge_and_preprocess
full_df = merge_and_preprocess(training=train_df, num_lags=3, rolling_window_size=3)

In [15]:
from utils.model_helpers import create_training_testing
train, test = create_training_testing(df = full_df, test_date_start='2022-01-15')

In [16]:
train.keys()

dict_keys(['train_features', 'train_net_sales', 'train_item_qty'])

In [17]:
test['train_features']

Unnamed: 0,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,lag_net_sales_3,rolling_mean_item_qty_3,rolling_std_item_qty_3,rolling_mean_net_sales_3,rolling_std_net_sales_3,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
450,866.000,1414.000,950.000,192949.53800,333398.25000,196014.50000,1076.666667,295.142903,240787.429333,80217.962929,...,1,0,756.000,1748.000,168349.26000,5.329314e+05,-548.000,-408.000,-140448.71200,-408.000
451,1078.000,855.000,837.000,270562.06000,181156.09000,178756.44796,923.333333,134.247284,210158.199320,52325.035683,...,1,0,645.000,1761.000,149946.09000,4.503627e+05,223.000,-284.000,89405.97000,-284.000
452,2187.497,2705.301,2204.186,421828.53600,577075.99500,432843.68600,2365.661333,294.254920,477249.405667,86627.618782,...,1,0,1884.134,7787.289,371493.67100,2.035445e+06,-517.804,-957.375,-155247.45900,-957.375
453,2859.478,3058.627,2899.544,504860.73800,554760.78400,509834.51500,2939.216333,105.335187,523152.012333,27486.732238,...,1,0,2396.285,5963.381,443513.48100,1.161787e+06,-199.149,-290.475,-49900.04600,-290.475
454,799.000,1146.000,949.000,194229.00000,319568.00000,272431.00000,964.666667,174.029691,262076.000000,63307.864827,...,1,0,718.000,1606.000,194229.00000,4.518945e+05,-347.000,-292.000,-125339.00000,-292.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,1048.000,1255.000,1336.000,239877.76400,337437.00000,513057.00000,1213.000000,148.522726,363457.254667,138435.956258,...,0,0,645.000,1761.000,149946.09000,5.130570e+05,-207.000,22.000,-97559.23600,22.000
548,2869.215,2587.436,2358.758,585570.03698,551861.60102,464186.11400,2605.136333,255.688411,533872.584000,62659.538296,...,0,0,1779.437,7787.289,367440.18300,2.035445e+06,281.779,286.094,33708.43596,286.094
549,4040.207,3539.002,3162.792,756342.46200,667129.62600,583466.77600,3580.667000,440.188880,668979.621333,86452.689747,...,0,0,2396.285,5963.381,443513.48100,1.161787e+06,501.205,434.602,89212.83600,434.602
550,1170.000,1236.000,1062.000,323308.26797,349645.83788,290746.99899,1156.000000,87.840765,321233.701613,29504.172160,...,0,0,718.000,1606.000,181968.99900,4.518945e+05,-66.000,180.000,-26337.56991,180.000


In [13]:
train['train_item_qty']

42      974.000
43      677.000
44     2326.638
45     2879.414
46      883.000
         ...   
547     791.000
548    2603.874
549    2996.909
550    1115.000
551    1141.000
Name: item_qty, Length: 510, dtype: float64

In [10]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 510 entries, 42 to 551
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date_id                   510 non-null    datetime64[ns]
 1   item_dept                 510 non-null    object        
 2   store                     510 non-null    object        
 3   item_qty                  510 non-null    float64       
 4   net_sales                 510 non-null    float64       
 5   lag_item_qty_1            510 non-null    float64       
 6   lag_item_qty_2            510 non-null    float64       
 7   lag_item_qty_3            510 non-null    float64       
 8   lag_net_sales_1           510 non-null    float64       
 9   lag_net_sales_2           510 non-null    float64       
 10  lag_net_sales_3           510 non-null    float64       
 11  rolling_mean_item_qty_3   510 non-null    float64       
 12  rolling_std_item_qty_3    

In [8]:
full_df.tail(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
532,2022-01-28,Household,ABC,1062.0,290746.99899,991.0,1155.0,1128.0,261352.99999,319838.0,...,0,0,718.0,1606.0,181968.999,451894.5,-164.0,107.0,-58485.00001,107.0
533,2022-01-28,Household,XYZ,1051.0,255565.0,1091.0,1103.0,1187.0,266484.029,289925.04,...,0,0,729.0,1646.0,187888.59709,382967.8,-12.0,230.0,-23441.011,230.0
534,2022-01-29,Beverages,ABC,966.0,213541.73,1319.0,1253.0,1095.0,397226.5,395367.0,...,1,0,756.0,1748.0,168349.26,532931.4,66.0,-291.0,1859.5,-291.0
535,2022-01-29,Beverages,XYZ,1255.0,337437.0,1336.0,1170.0,1123.0,513057.0,384973.0,...,1,0,645.0,1761.0,149946.09,513057.0,166.0,331.0,128084.0,331.0
536,2022-01-29,Grocery,ABC,2587.436,551861.60102,2358.758,2602.145,2612.746,464186.114,557922.715,...,1,0,1779.437,7787.289,367440.183,2035445.0,-243.387,320.002,-93736.601,320.002
537,2022-01-29,Grocery,XYZ,3539.002,667129.626,3162.792,3012.134,2913.409,583466.776,556695.964,...,1,0,2396.285,5963.381,443513.481,1161787.0,150.658,176.971,26770.812,176.971
538,2022-01-29,Household,ABC,1236.0,349645.83788,1062.0,991.0,1155.0,290746.99899,261352.99999,...,1,0,718.0,1606.0,181968.999,451894.5,71.0,46.0,29393.999,46.0
539,2022-01-29,Household,XYZ,1205.0,311037.0,1051.0,1091.0,1103.0,255565.0,266484.029,...,1,0,729.0,1646.0,187888.59709,382967.8,-40.0,80.0,-10919.029,80.0
540,2022-01-30,Beverages,ABC,923.0,196153.95,966.0,1319.0,1253.0,213541.73,397226.5,...,1,0,756.0,1748.0,168349.26,532931.4,-353.0,55.0,-183684.77,55.0
541,2022-01-30,Beverages,XYZ,1048.0,239877.764,1255.0,1336.0,1170.0,337437.0,513057.0,...,1,0,645.0,1761.0,149946.09,513057.0,-81.0,-230.0,-175620.0,-230.0


In [3]:
train_df.head(20)

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
5,11/1/2021,Household,1.0,745.0,XYZ,115596,1475475.0
6,11/1/2021,Grocery,1.0,175.0,XYZ,98654,1475475.0
7,11/1/2021,Grocery,3.0,120.0,XYZ,8373,1475475.0
8,11/1/2021,Grocery,1.0,194.0,XYZ,106669,1475521.0
9,11/1/2021,Household,1.0,137.0,XYZ,75008,1475521.0


In [4]:
test_df.head(20)

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,2/1/2022,Beverages,2.0,480.0,XYZ,112360,1495518.0
1,2/1/2022,Beverages,1.0,202.0,XYZ,111195,1495518.0
2,2/1/2022,Household,1.0,165.0,XYZ,41212,1495572.0
3,2/1/2022,Household,2.0,480.0,XYZ,123476,1495572.0
4,2/1/2022,Grocery,2.0,660.0,XYZ,106668,1495572.0
5,2/1/2022,Household,2.0,330.0,XYZ,45968,1495572.0
6,2/1/2022,Household,1.0,420.0,XYZ,122126,1495572.0
7,2/1/2022,Beverages,2.0,640.0,XYZ,112362,1495572.0
8,2/1/2022,Grocery,3.0,420.0,XYZ,111530,1495580.0
9,2/1/2022,Household,1.0,530.0,XYZ,104754,1495580.0


In [6]:
from utils import data_preprocess


In [7]:
train_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0


In [8]:
test_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,2/1/2022,Beverages,2.0,480.0,XYZ,112360,1495518.0
1,2/1/2022,Beverages,1.0,202.0,XYZ,111195,1495518.0
2,2/1/2022,Household,1.0,165.0,XYZ,41212,1495572.0
3,2/1/2022,Household,2.0,480.0,XYZ,123476,1495572.0
4,2/1/2022,Grocery,2.0,660.0,XYZ,106668,1495572.0


In [9]:
full_df = pd.concat([train_df, test_df])
full_df

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [10]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 861722 entries, 0 to 247623
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date_id      861722 non-null  object 
 1   item_dept    861722 non-null  object 
 2   item_qty     861722 non-null  float64
 3   net_sales    861722 non-null  float64
 4   store        861722 non-null  object 
 5   item         861722 non-null  int64  
 6   invoice_num  830607 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 52.6+ MB


In [11]:
full_df

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [12]:
preprocessed_full_df = data_preprocess.preprocess_data(df=full_df, num_lags=3, rolling_window_size = 3)

In [13]:
preprocessed_full_df.head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
42,2021-11-08,Beverages,ABC,974.0,246984.682,937.0,884.0,991.0,218668.693,241963.0,...,0,0,837.0,1256.0,199145.59601,277904.838,53.0,57.0,-23294.307,57.0
43,2021-11-08,Beverages,XYZ,677.0,201386.081,1043.0,1086.0,910.0,289993.0,262426.08,...,0,0,779.0,1086.0,187931.76902,289993.0,-43.0,-102.0,27566.92,-102.0
44,2021-11-08,Grocery,ABC,2326.638,438841.43801,2646.82,2614.006,2654.582,505512.74297,548660.9,...,0,0,2155.128,3021.684,397946.965024,582206.592985,32.814,-33.761,-43148.15703,-33.761
45,2021-11-08,Grocery,XYZ,2879.414,498037.791,3608.962,3625.864,3287.828,661655.00001,622856.562004,...,0,0,2987.06,3625.864,491450.723,661655.00001,-16.902,-107.646,38798.438006,-107.646
46,2021-11-08,Household,ABC,883.0,240294.89201,998.0,1152.0,986.0,233781.880016,284326.0,...,0,0,819.0,1195.0,209100.513996,292020.462024,-154.0,-160.0,-50544.119984,-160.0
47,2021-11-08,Household,XYZ,944.0,237517.072,1252.0,1315.0,963.0,283841.0,305140.577002,...,0,0,948.0,1315.0,222311.011,305140.577002,-63.0,-180.0,-21299.577002,-180.0
48,2021-11-09,Beverages,ABC,815.0,199577.0,974.0,937.0,884.0,246984.682,218668.693,...,0,0,837.0,1256.0,199145.59601,277904.838,37.0,-325.0,28315.989,-325.0
49,2021-11-09,Beverages,XYZ,971.0,263813.854,677.0,1043.0,1086.0,201386.081,289993.0,...,0,0,677.0,1086.0,187931.76902,289993.0,-366.0,49.0,-88606.919,49.0
50,2021-11-09,Grocery,ABC,2489.046,497413.33899,2326.638,2646.82,2614.006,438841.43801,505512.74297,...,0,0,2155.128,3021.684,397946.965024,582206.592985,-320.182,-532.638,-66671.30496,-532.638
51,2021-11-09,Grocery,XYZ,3608.112,622938.069,2879.414,3608.962,3625.864,498037.791,661655.00001,...,0,0,2879.414,3625.864,491450.723,661655.00001,-729.548,-15.604,-163617.20901,-15.604


In [14]:
main_cols = ['date_id', 'item_dept', 'store', 'item_qty', 'net_sales']

In [15]:
preprocessed_full_df[main_cols].head(10)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
42,2021-11-08,Beverages,ABC,974.0,246984.682
43,2021-11-08,Beverages,XYZ,677.0,201386.081
44,2021-11-08,Grocery,ABC,2326.638,438841.43801
45,2021-11-08,Grocery,XYZ,2879.414,498037.791
46,2021-11-08,Household,ABC,883.0,240294.89201
47,2021-11-08,Household,XYZ,944.0,237517.072
48,2021-11-09,Beverages,ABC,815.0,199577.0
49,2021-11-09,Beverages,XYZ,971.0,263813.854
50,2021-11-09,Grocery,ABC,2489.046,497413.33899
51,2021-11-09,Grocery,XYZ,3608.112,622938.069


In [16]:
depts = preprocessed_full_df['item_dept'].unique()
stores = preprocessed_full_df['store'].unique()
print(depts)
print(stores)

['Beverages' 'Grocery' 'Household']
['ABC' 'XYZ']


In [14]:
('2022', '2023')[1]

'2023'

In [11]:
feb_dates = pd.date_range(start='2022-02-01', end='2022-02-28')

In [12]:
feb_dates

DatetimeIndex(['2022-02-01', '2022-02-02', '2022-02-03', '2022-02-04',
               '2022-02-05', '2022-02-06', '2022-02-07', '2022-02-08',
               '2022-02-09', '2022-02-10', '2022-02-11', '2022-02-12',
               '2022-02-13', '2022-02-14', '2022-02-15', '2022-02-16',
               '2022-02-17', '2022-02-18', '2022-02-19', '2022-02-20',
               '2022-02-21', '2022-02-22', '2022-02-23', '2022-02-24',
               '2022-02-25', '2022-02-26', '2022-02-27', '2022-02-28'],
              dtype='datetime64[ns]', freq='D')

In [18]:
preprocessed_full_df[preprocessed_full_df['date_id'].isin(feb_dates)].head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
552,2022-02-01,Beverages,ABC,1672.0,427627.8291,887.0,923.0,966.0,204687.95101,196153.95,...,0,0,756.0,1748.0,168349.26,532931.4,-36.0,798.0,8534.00101,798.0
553,2022-02-01,Beverages,XYZ,1152.0,268295.03401,791.0,1048.0,1255.0,179122.36501,239877.764,...,0,0,645.0,1761.0,149946.09,513057.0,-257.0,266.0,-60755.39899,266.0
554,2022-02-01,Grocery,ABC,3482.128,724896.269996,2603.874,2869.215,2587.436,542643.931,585570.03698,...,0,0,1779.437,7787.289,367440.183,2035445.0,-265.341,753.007,-42926.10598,753.007
555,2022-02-01,Grocery,XYZ,4131.394,777378.278006,2996.909,4040.207,3539.002,563043.734,756342.462,...,0,0,2396.285,5963.381,443513.481,1161787.0,-1043.298,469.89,-193298.728,469.89
556,2022-02-01,Household,ABC,1363.0,364019.0,1115.0,1170.0,1236.0,314800.26999,323308.26797,...,0,0,718.0,1606.0,181968.999,451894.5,-55.0,235.0,-8507.99798,235.0
557,2022-02-01,Household,XYZ,1340.0,343766.40399,1141.0,1298.0,1205.0,269010.675,312510.762,...,0,0,729.0,1646.0,187888.59709,382967.8,-157.0,153.0,-43500.087,153.0
558,2022-02-02,Beverages,ABC,1652.0,443960.459,1672.0,887.0,923.0,427627.8291,204687.95101,...,0,0,756.0,1748.0,168349.26,532931.4,785.0,557.0,222939.87809,557.0
559,2022-02-02,Beverages,XYZ,1392.0,379689.474,1152.0,791.0,1048.0,268295.03401,179122.36501,...,0,0,645.0,1761.0,149946.09,513057.0,361.0,269.0,89172.669,269.0
560,2022-02-02,Grocery,ABC,3256.287,636533.876996,3482.128,2603.874,2869.215,724896.269996,542643.931,...,0,0,1779.437,7787.289,367440.183,2035445.0,878.254,643.541,182252.338996,643.541
561,2022-02-02,Grocery,XYZ,4251.348,808260.989004,4131.394,2996.909,4040.207,777378.278006,563043.734,...,0,0,2396.285,5963.381,443513.481,1161787.0,1134.485,1337.939,214334.544006,1337.939


In [19]:
from itertools import product
combinations = list(product(feb_dates, depts, stores))

In [20]:
production_df = pd.DataFrame(combinations, columns=['date_id', 'item_dept', 'store'])

In [21]:
production_df

Unnamed: 0,date_id,item_dept,store
0,2022-02-01,Beverages,ABC
1,2022-02-01,Beverages,XYZ
2,2022-02-01,Grocery,ABC
3,2022-02-01,Grocery,XYZ
4,2022-02-01,Household,ABC
...,...,...,...
163,2022-02-28,Beverages,XYZ
164,2022-02-28,Grocery,ABC
165,2022-02-28,Grocery,XYZ
166,2022-02-28,Household,ABC


In [22]:
preprocessed_full_df[preprocessed_full_df['date_id'] == '2022-02-01']

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
552,2022-02-01,Beverages,ABC,1672.0,427627.8291,887.0,923.0,966.0,204687.95101,196153.95,...,0,0,756.0,1748.0,168349.26,532931.4,-36.0,798.0,8534.00101,798.0
553,2022-02-01,Beverages,XYZ,1152.0,268295.03401,791.0,1048.0,1255.0,179122.36501,239877.764,...,0,0,645.0,1761.0,149946.09,513057.0,-257.0,266.0,-60755.39899,266.0
554,2022-02-01,Grocery,ABC,3482.128,724896.269996,2603.874,2869.215,2587.436,542643.931,585570.03698,...,0,0,1779.437,7787.289,367440.183,2035445.0,-265.341,753.007,-42926.10598,753.007
555,2022-02-01,Grocery,XYZ,4131.394,777378.278006,2996.909,4040.207,3539.002,563043.734,756342.462,...,0,0,2396.285,5963.381,443513.481,1161787.0,-1043.298,469.89,-193298.728,469.89
556,2022-02-01,Household,ABC,1363.0,364019.0,1115.0,1170.0,1236.0,314800.26999,323308.26797,...,0,0,718.0,1606.0,181968.999,451894.5,-55.0,235.0,-8507.99798,235.0
557,2022-02-01,Household,XYZ,1340.0,343766.40399,1141.0,1298.0,1205.0,269010.675,312510.762,...,0,0,729.0,1646.0,187888.59709,382967.8,-157.0,153.0,-43500.087,153.0


In [23]:
prod_df_one_date = production_df[production_df['date_id'] == '2022-02-01']
prod_df_one_date

Unnamed: 0,date_id,item_dept,store
0,2022-02-01,Beverages,ABC
1,2022-02-01,Beverages,XYZ
2,2022-02-01,Grocery,ABC
3,2022-02-01,Grocery,XYZ
4,2022-02-01,Household,ABC
5,2022-02-01,Household,XYZ


In [24]:
preprocessed_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 678 entries, 42 to 719
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date_id                   678 non-null    datetime64[ns]
 1   item_dept                 678 non-null    object        
 2   store                     678 non-null    object        
 3   item_qty                  678 non-null    float64       
 4   net_sales                 678 non-null    float64       
 5   lag_item_qty_1            678 non-null    float64       
 6   lag_item_qty_2            678 non-null    float64       
 7   lag_item_qty_3            678 non-null    float64       
 8   lag_net_sales_1           678 non-null    float64       
 9   lag_net_sales_2           678 non-null    float64       
 10  lag_net_sales_3           678 non-null    float64       
 11  rolling_mean_item_qty_3   678 non-null    float64       
 12  rolling_std_item_qty_3    

In [25]:
preprocessed_full_df[['date_id']].describe()

Unnamed: 0,date_id
count,678
mean,2022-01-03 00:00:00
min,2021-11-08 00:00:00
25%,2021-12-06 00:00:00
50%,2022-01-03 00:00:00
75%,2022-01-31 00:00:00
max,2022-02-28 00:00:00


In [26]:
#separate train and test based on dates (before feb -> train | feb -> test)
train_processed = preprocessed_full_df[preprocessed_full_df['date_id'] < '2022-02-01']
test_processed = preprocessed_full_df[preprocessed_full_df['date_id'] >= '2022-02-01']

In [27]:
production_df['date_id'].unique()

<DatetimeArray>
['2022-02-01 00:00:00', '2022-02-02 00:00:00', '2022-02-03 00:00:00',
 '2022-02-04 00:00:00', '2022-02-05 00:00:00', '2022-02-06 00:00:00',
 '2022-02-07 00:00:00', '2022-02-08 00:00:00', '2022-02-09 00:00:00',
 '2022-02-10 00:00:00', '2022-02-11 00:00:00', '2022-02-12 00:00:00',
 '2022-02-13 00:00:00', '2022-02-14 00:00:00', '2022-02-15 00:00:00',
 '2022-02-16 00:00:00', '2022-02-17 00:00:00', '2022-02-18 00:00:00',
 '2022-02-19 00:00:00', '2022-02-20 00:00:00', '2022-02-21 00:00:00',
 '2022-02-22 00:00:00', '2022-02-23 00:00:00', '2022-02-24 00:00:00',
 '2022-02-25 00:00:00', '2022-02-26 00:00:00', '2022-02-27 00:00:00',
 '2022-02-28 00:00:00']
Length: 28, dtype: datetime64[ns]

In [28]:
historical_df = train_processed.copy()[['date_id', 'store', 'item_dept', 'net_sales', 'item_qty']].sort_values(by = ['date_id','item_dept', 'store'])


In [29]:
from utils.model_helpers import recursive_forecasting
recursive_forecasting(historical_df=historical_df, stores_list=stores,
                      depts_list=depts, production_df=production_df)

Unnamed: 0,date_id,item_dept,store,net_sales,item_qty
0,2022-02-01,Beverages,ABC,0.0,0.0
1,2022-02-01,Beverages,XYZ,1.0,1.0
2,2022-02-01,Grocery,ABC,2.0,2.0
3,2022-02-01,Grocery,XYZ,3.0,3.0
4,2022-02-01,Household,ABC,4.0,4.0
...,...,...,...,...,...
163,2022-02-28,Beverages,XYZ,163.0,163.0
164,2022-02-28,Grocery,ABC,164.0,164.0
165,2022-02-28,Grocery,XYZ,165.0,165.0
166,2022-02-28,Household,ABC,166.0,166.0


In [None]:
combined_df = None
for current_date in production_df['date_id'].unique():
    for dept in depts:
        for store in stores:
            print(current_date)

            relevant_historical_data = historical_df[(historical_df['store'] == store) & (historical_df['item_dept'] == dept)]
            
            current_day_data = production_df[
                (production_df['date_id'] == current_date) &
                (production_df['store'] == store) &
                (production_df['item_dept'] == dept)
            ]

            combined_df = pd.concat([relevant_historical_data, current_day_data], ignore_index=True).tail(7)

            combined_df_processed = data_preprocess.preprocess_data(df=combined_df, num_lags=3, rolling_window_size = 3)

            current_day_data['net_sales'] = 2
            current_day_data['item_qty'] = 2

            production_df.loc[
                (production_df['date_id'] == current_date) & 
                (production_df['store'] == store) & 
                (production_df['item_dept'] == dept), 
                ['net_sales', 'item_qty']
            ] = current_day_data[['net_sales', 'item_qty']].values

            historical_df = pd.concat([historical_df, current_day_data], ignore_index=True)
            
            #print(combined_df)
            # break
        # break
    # break




In [66]:
production_df

Unnamed: 0,date_id,item_dept,store,net_sales,item_qty
0,2022-02-01,Beverages,ABC,2.0,2.0
1,2022-02-01,Beverages,XYZ,2.0,2.0
2,2022-02-01,Grocery,ABC,2.0,2.0
3,2022-02-01,Grocery,XYZ,2.0,2.0
4,2022-02-01,Household,ABC,2.0,2.0
...,...,...,...,...,...
163,2022-02-28,Beverages,XYZ,2.0,2.0
164,2022-02-28,Grocery,ABC,2.0,2.0
165,2022-02-28,Grocery,XYZ,2.0,2.0
166,2022-02-28,Household,ABC,2.0,2.0


In [67]:
historical_df

Unnamed: 0,date_id,store,item_dept,net_sales,item_qty
0,2021-11-01,ABC,Beverages,254547.79901,917.000
1,2021-11-01,XYZ,Beverages,187931.76902,779.000
2,2021-11-01,ABC,Grocery,437286.62402,2360.399
3,2021-11-01,XYZ,Grocery,491450.72300,2987.060
4,2021-11-01,ABC,Household,247336.26601,1043.000
...,...,...,...,...,...
715,2022-02-28,XYZ,Beverages,2.00000,2.000
716,2022-02-28,ABC,Grocery,2.00000,2.000
717,2022-02-28,XYZ,Grocery,2.00000,2.000
718,2022-02-28,ABC,Household,2.00000,2.000


In [31]:
combined_df

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
86,2022-01-26,Beverages,ABC,1095.0,382074.0,874.0,992.0,868.0,253524.0,308841.5,...,0.0,0.0,756.0,1748.0,168349.26,532931.43602,-118.0,-142.0,-55317.5,-142.0
87,2022-01-27,Beverages,ABC,1253.0,395367.0,1095.0,874.0,992.0,382074.0,253524.0,...,0.0,0.0,756.0,1748.0,168349.26,532931.43602,221.0,220.0,128550.0,220.0
88,2022-01-28,Beverages,ABC,1319.0,397226.5,1253.0,1095.0,874.0,395367.0,382074.0,...,0.0,0.0,756.0,1748.0,168349.26,532931.43602,158.0,158.0,13293.0,158.0
89,2022-01-29,Beverages,ABC,966.0,213541.73,1319.0,1253.0,1095.0,397226.5,395367.0,...,1.0,0.0,756.0,1748.0,168349.26,532931.43602,66.0,-291.0,1859.5,-291.0
90,2022-01-30,Beverages,ABC,923.0,196153.95,966.0,1319.0,1253.0,213541.73,397226.5,...,1.0,0.0,756.0,1748.0,168349.26,532931.43602,-353.0,55.0,-183684.77,55.0
91,2022-01-31,Beverages,ABC,887.0,204687.95101,923.0,966.0,1319.0,196153.95,213541.73,...,0.0,0.0,756.0,1748.0,168349.26,532931.43602,-43.0,-105.0,-17387.78,-105.0
92,2022-02-01,Beverages,ABC,,,,,,,,...,,,,,,,,,,


In [14]:
train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 552 entries, 0 to 551
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date_id                   552 non-null    datetime64[ns]
 1   item_dept                 552 non-null    object        
 2   store                     552 non-null    object        
 3   item_qty                  552 non-null    float64       
 4   net_sales                 552 non-null    float64       
 5   lag_item_qty_1            546 non-null    float64       
 6   lag_item_qty_2            540 non-null    float64       
 7   lag_item_qty_3            534 non-null    float64       
 8   lag_net_sales_1           546 non-null    float64       
 9   lag_net_sales_2           540 non-null    float64       
 10  lag_net_sales_3           534 non-null    float64       
 11  rolling_mean_item_qty_3   534 non-null    float64       
 12  rolling_std_item_qty_3    5

In [15]:
test_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 168 entries, 552 to 719
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date_id                   168 non-null    datetime64[ns]
 1   item_dept                 168 non-null    object        
 2   store                     168 non-null    object        
 3   item_qty                  168 non-null    float64       
 4   net_sales                 168 non-null    float64       
 5   lag_item_qty_1            168 non-null    float64       
 6   lag_item_qty_2            168 non-null    float64       
 7   lag_item_qty_3            168 non-null    float64       
 8   lag_net_sales_1           168 non-null    float64       
 9   lag_net_sales_2           168 non-null    float64       
 10  lag_net_sales_3           168 non-null    float64       
 11  rolling_mean_item_qty_3   168 non-null    float64       
 12  rolling_std_item_qty_3   

In [16]:
test_processed.head(30)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
552,2022-02-01,Beverages,ABC,1672.0,427627.8291,887.0,923.0,966.0,204687.95101,196153.95,...,0,0,756.0,1748.0,168349.26,532931.4,-36.0,798.0,8534.00101,798.0
553,2022-02-01,Beverages,XYZ,1152.0,268295.03401,791.0,1048.0,1255.0,179122.36501,239877.764,...,0,0,645.0,1761.0,149946.09,513057.0,-257.0,266.0,-60755.39899,266.0
554,2022-02-01,Grocery,ABC,3482.128,724896.269996,2603.874,2869.215,2587.436,542643.931,585570.03698,...,0,0,1779.437,7787.289,367440.183,2035445.0,-265.341,753.007,-42926.10598,753.007
555,2022-02-01,Grocery,XYZ,4131.394,777378.278006,2996.909,4040.207,3539.002,563043.734,756342.462,...,0,0,2396.285,5963.381,443513.481,1161787.0,-1043.298,469.89,-193298.728,469.89
556,2022-02-01,Household,ABC,1363.0,364019.0,1115.0,1170.0,1236.0,314800.26999,323308.26797,...,0,0,718.0,1606.0,181968.999,451894.5,-55.0,235.0,-8507.99798,235.0
557,2022-02-01,Household,XYZ,1340.0,343766.40399,1141.0,1298.0,1205.0,269010.675,312510.762,...,0,0,729.0,1646.0,187888.59709,382967.8,-157.0,153.0,-43500.087,153.0
558,2022-02-02,Beverages,ABC,1652.0,443960.459,1672.0,887.0,923.0,427627.8291,204687.95101,...,0,0,756.0,1748.0,168349.26,532931.4,785.0,557.0,222939.87809,557.0
559,2022-02-02,Beverages,XYZ,1392.0,379689.474,1152.0,791.0,1048.0,268295.03401,179122.36501,...,0,0,645.0,1761.0,149946.09,513057.0,361.0,269.0,89172.669,269.0
560,2022-02-02,Grocery,ABC,3256.287,636533.876996,3482.128,2603.874,2869.215,724896.269996,542643.931,...,0,0,1779.437,7787.289,367440.183,2035445.0,878.254,643.541,182252.338996,643.541
561,2022-02-02,Grocery,XYZ,4251.348,808260.989004,4131.394,2996.909,4040.207,777378.278006,563043.734,...,0,0,2396.285,5963.381,443513.481,1161787.0,1134.485,1337.939,214334.544006,1337.939


## Prep Data for Modeling

In [17]:
train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 552 entries, 0 to 551
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date_id                   552 non-null    datetime64[ns]
 1   item_dept                 552 non-null    object        
 2   store                     552 non-null    object        
 3   item_qty                  552 non-null    float64       
 4   net_sales                 552 non-null    float64       
 5   lag_item_qty_1            546 non-null    float64       
 6   lag_item_qty_2            540 non-null    float64       
 7   lag_item_qty_3            534 non-null    float64       
 8   lag_net_sales_1           546 non-null    float64       
 9   lag_net_sales_2           540 non-null    float64       
 10  lag_net_sales_3           534 non-null    float64       
 11  rolling_mean_item_qty_3   534 non-null    float64       
 12  rolling_std_item_qty_3    5

In [18]:
train_processed_non_null = train_processed.dropna(axis = 0)
train_processed_non_null

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,lag_net_sales_1,lag_net_sales_2,...,isWeekend,Is_Holiday,expanding_min_item_qty,expanding_max_item_qty,expanding_min_net_sales,expanding_max_net_sales,diff_item_qty,diff_item_qty_7,diff_net_sales,diff_net_sales_7
42,2021-11-08,Beverages,ABC,974.000,246984.68200,937.000,884.000,991.000,218668.693000,241963.000000,...,0,0,837.000,1256.000,199145.596010,2.779048e+05,53.000,57.000,-23294.307000,57.000
43,2021-11-08,Beverages,XYZ,677.000,201386.08100,1043.000,1086.000,910.000,289993.000000,262426.080000,...,0,0,779.000,1086.000,187931.769020,2.899930e+05,-43.000,-102.000,27566.920000,-102.000
44,2021-11-08,Grocery,ABC,2326.638,438841.43801,2646.820,2614.006,2654.582,505512.742970,548660.900000,...,0,0,2155.128,3021.684,397946.965024,5.822066e+05,32.814,-33.761,-43148.157030,-33.761
45,2021-11-08,Grocery,XYZ,2879.414,498037.79100,3608.962,3625.864,3287.828,661655.000010,622856.562004,...,0,0,2987.060,3625.864,491450.723000,6.616550e+05,-16.902,-107.646,38798.438006,-107.646
46,2021-11-08,Household,ABC,883.000,240294.89201,998.000,1152.000,986.000,233781.880016,284326.000000,...,0,0,819.000,1195.000,209100.513996,2.920205e+05,-154.000,-160.000,-50544.119984,-160.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2022-01-31,Beverages,XYZ,791.000,179122.36501,1048.000,1255.000,1336.000,239877.764000,337437.000000,...,0,0,645.000,1761.000,149946.090000,5.130570e+05,-207.000,22.000,-97559.236000,22.000
548,2022-01-31,Grocery,ABC,2603.874,542643.93100,2869.215,2587.436,2358.758,585570.036980,551861.601020,...,0,0,1779.437,7787.289,367440.183000,2.035445e+06,281.779,286.094,33708.435960,286.094
549,2022-01-31,Grocery,XYZ,2996.909,563043.73400,4040.207,3539.002,3162.792,756342.462000,667129.626000,...,0,0,2396.285,5963.381,443513.481000,1.161787e+06,501.205,434.602,89212.836000,434.602
550,2022-01-31,Household,ABC,1115.000,314800.26999,1170.000,1236.000,1062.000,323308.267970,349645.837880,...,0,0,718.000,1606.000,181968.999000,4.518945e+05,-66.000,180.000,-26337.569910,180.000


In [19]:
train_processed_non_null.columns

Index(['date_id', 'item_dept', 'store', 'item_qty', 'net_sales',
       'lag_item_qty_1', 'lag_item_qty_2', 'lag_item_qty_3', 'lag_net_sales_1',
       'lag_net_sales_2', 'lag_net_sales_3', 'rolling_mean_item_qty_3',
       'rolling_std_item_qty_3', 'rolling_mean_net_sales_3',
       'rolling_std_net_sales_3', 'cumsum_item_qty', 'cummean_item_qty',
       'cumsum_net_sales', 'cummean_net_sales', 'day_of_week', 'isWeekend',
       'Is_Holiday', 'expanding_min_item_qty', 'expanding_max_item_qty',
       'expanding_min_net_sales', 'expanding_max_net_sales', 'diff_item_qty',
       'diff_item_qty_7', 'diff_net_sales', 'diff_net_sales_7'],
      dtype='object')

In [20]:
cols_drop = ['date_id', 'item_dept', 'store', 'item_qty', 'net_sales']
X_train = train_processed_non_null.drop(columns=cols_drop)
X_test = test_processed.drop(columns=cols_drop)

y_train_sales = train_processed_non_null['net_sales'].values
y_test_sales = test_processed['net_sales'].values

y_train_item_qty = train_processed_non_null['item_qty'].values
y_test_item_qty = test_processed['item_qty'].values

In [27]:
from xgboost import XGBRegressor
xgb_sales = XGBRegressor(random_state = 0)
xgb_qty = XGBRegressor(radom_state = 0)


In [28]:
xgb_sales.fit(X_train, y_train_sales)
xgb_qty.fit(X_train, y_train_item_qty)

Parameters: { "radom_state" } are not used.



In [29]:
y_pred_sales = xgb_sales.predict(X_test)
y_pred_qty = xgb_qty.predict(X_test)

In [25]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
print(mean_absolute_percentage_error(y_test_sales, y_pred_sales))

0.1662188580813835


In [30]:
print(mean_absolute_percentage_error(y_test_item_qty, y_pred_qty))

0.1781680194157962


In [26]:
print(mean_absolute_error(y_test_sales, y_pred_sales))

86906.23141655054


In [31]:
print(mean_absolute_error(y_test_item_qty, y_pred_qty))

392.8042492966425


# Other Workings

In [18]:
# Preprocessing
full_df['date_id'] = pd.to_datetime(full_df['date_id'])
full_df_gb = full_df.groupby(['date_id', 'item_dept', 'store'])[['item_qty', 'net_sales']].sum().reset_index()

In [19]:
full_df_gb.describe()

Unnamed: 0,date_id,item_qty,net_sales
count,720,720.0,720.0
mean,2021-12-30 12:00:00,1818.321844,402001.6
min,2021-11-01 00:00:00,645.0,149946.1
25%,2021-11-30 18:00:00,1027.75,255959.8
50%,2021-12-30 12:00:00,1280.0,336784.6
75%,2022-01-29 06:00:00,2604.1745,522575.4
max,2022-02-28 00:00:00,7787.289,2035445.0
std,,1090.859874,194157.1


In [20]:
full_df_gb.head(18)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
0,2021-11-01,Beverages,ABC,917.0,254547.79901
1,2021-11-01,Beverages,XYZ,779.0,187931.76902
2,2021-11-01,Grocery,ABC,2360.399,437286.62402
3,2021-11-01,Grocery,XYZ,2987.06,491450.723
4,2021-11-01,Household,ABC,1043.0,247336.26601
5,2021-11-01,Household,XYZ,1124.0,247156.009988
6,2021-11-02,Beverages,ABC,1140.0,277904.838
7,2021-11-02,Beverages,XYZ,922.0,251330.13
8,2021-11-02,Grocery,ABC,3021.684,582206.592985
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002


In [21]:
full_df_gb.head(18)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
0,2021-11-01,Beverages,ABC,917.0,254547.79901
1,2021-11-01,Beverages,XYZ,779.0,187931.76902
2,2021-11-01,Grocery,ABC,2360.399,437286.62402
3,2021-11-01,Grocery,XYZ,2987.06,491450.723
4,2021-11-01,Household,ABC,1043.0,247336.26601
5,2021-11-01,Household,XYZ,1124.0,247156.009988
6,2021-11-02,Beverages,ABC,1140.0,277904.838
7,2021-11-02,Beverages,XYZ,922.0,251330.13
8,2021-11-02,Grocery,ABC,3021.684,582206.592985
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002


In [22]:
full_df_gb_lags = data_preprocess.create_lag_features(df=full_df_gb, feature_name='item_qty', num_lags=2)

In [23]:
data_preprocess.create_expanding_window_features(df=full_df_gb_lags, feature_name='item_qty').head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,expanding_min_item_qty,expanding_max_item_qty
0,2021-11-01,Beverages,ABC,917.0,254547.79901,,,,
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,,,,
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,,,,
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,,,,
4,2021-11-01,Household,ABC,1043.0,247336.26601,,,,
5,2021-11-01,Household,XYZ,1124.0,247156.009988,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,917.0,,917.0,917.0
7,2021-11-02,Beverages,XYZ,922.0,251330.13,779.0,,779.0,779.0
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,2360.399,,2360.399,2360.399
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,2987.06,,2987.06,2987.06


In [24]:
# Feature Engineering
full_df_gb['day_of_week'] = full_df_gb['date_id'].dt.dayofweek
full_df_gb['month'] = full_df_gb['date_id'].dt.month

In [25]:
full_df_gb

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month
0,2021-11-01,Beverages,ABC,917.000,254547.799010,0,11
1,2021-11-01,Beverages,XYZ,779.000,187931.769020,0,11
2,2021-11-01,Grocery,ABC,2360.399,437286.624020,0,11
3,2021-11-01,Grocery,XYZ,2987.060,491450.723000,0,11
4,2021-11-01,Household,ABC,1043.000,247336.266010,0,11
...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,0,2
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,0,2
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,0,2
718,2022-02-28,Household,ABC,1520.000,394323.614974,0,2


In [26]:
# Lag features
full_df_gb['lag_qty_1'] = full_df_gb.groupby(['item_dept', 'store'])['item_qty'].shift(1)
full_df_gb['lag_sales_1'] = full_df_gb.groupby(['item_dept', 'store'])['net_sales'].shift(1)

In [27]:
full_df_gb.head(12)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,0,11,,
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,0,11,,
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,0,11,,
4,2021-11-01,Household,ABC,1043.0,247336.26601,0,11,,
5,2021-11-01,Household,XYZ,1124.0,247156.009988,0,11,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901
7,2021-11-02,Beverages,XYZ,922.0,251330.13,1,11,779.0,187931.76902
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,1,11,2360.399,437286.62402
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,1,11,2987.06,491450.723


In [28]:
full_df_gb['expanding_min_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cummin()
full_df_gb['expanding_max_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cummax()
full_df_gb['expanding_prod'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cumprod()

In [29]:
full_df_gb['cumsum_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cumsum()
full_df_gb['cummean_qty'] = full_df_gb['cumsum_qty'] / full_df_gb.groupby(['store', 'item_dept'])['lag_qty_1'].cumcount()


In [30]:
df_process[f'diff_{feature_name}_7'] = df_process.groupby(['item_dept', 'store'])['item_qty'].diff(7)

NameError: name 'df_process' is not defined

In [None]:
full_df_gb[(full_df_gb['store'] == 'ABC') & (full_df_gb['item_dept'] == 'Beverages')].head(10)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,expanding_min_qty,cumsum_qty,cummean_qty,expanding_max_qty,expanding_prod
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,,,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901,917.0,917.0,917.0,917.0,917.0
12,2021-11-03,Beverages,ABC,837.0,199145.59601,2,11,1140.0,277904.838,917.0,2057.0,1028.5,1140.0,1045380.0
18,2021-11-04,Beverages,ABC,1256.0,255926.913,3,11,837.0,199145.59601,837.0,2894.0,964.666667,1140.0,874983100.0
24,2021-11-05,Beverages,ABC,991.0,255695.365,4,11,1256.0,255926.913,837.0,4150.0,1037.5,1256.0,1098979000000.0
30,2021-11-06,Beverages,ABC,884.0,241963.0,5,11,991.0,255695.365,837.0,5141.0,1028.2,1256.0,1089088000000000.0
36,2021-11-07,Beverages,ABC,937.0,218668.693,6,11,884.0,241963.0,837.0,6025.0,1004.166667,1256.0,9.627537e+17
42,2021-11-08,Beverages,ABC,974.0,246984.682,0,11,937.0,218668.693,837.0,6962.0,994.571429,1256.0,9.021002e+20
48,2021-11-09,Beverages,ABC,815.0,199577.0,1,11,974.0,246984.682,837.0,7936.0,992.0,1256.0,8.786456e+23
54,2021-11-10,Beverages,ABC,924.0,238515.243,2,11,815.0,199577.0,815.0,8751.0,972.333333,1256.0,7.160962e+26


In [None]:
# Lag features
full_df_gb['lag_qty_2'] = full_df_gb.groupby(['item_dept', 'store'])['item_qty'].shift(2)
full_df_gb['lag_sales_2'] = full_df_gb.groupby(['item_dept', 'store'])['net_sales'].shift(2)

In [None]:
full_df_gb[(full_df_gb['item_dept'] == 'Beverages') & (full_df_gb['store'] == 'ABC')]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
0,2021-11-01,Beverages,ABC,917.0,254547.799010,0,11,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838000,1,11,917.0,254547.799010,,
12,2021-11-03,Beverages,ABC,837.0,199145.596010,2,11,1140.0,277904.838000,917.0,254547.79901
18,2021-11-04,Beverages,ABC,1256.0,255926.913000,3,11,837.0,199145.596010,1140.0,277904.83800
24,2021-11-05,Beverages,ABC,991.0,255695.365000,4,11,1256.0,255926.913000,837.0,199145.59601
...,...,...,...,...,...,...,...,...,...,...,...
690,2022-02-24,Beverages,ABC,1531.0,562260.630000,3,2,1094.0,250334.291980,1464.0,409491.97300
696,2022-02-25,Beverages,ABC,1358.0,331291.918000,4,2,1531.0,562260.630000,1094.0,250334.29198
702,2022-02-26,Beverages,ABC,1267.0,346495.176030,5,2,1358.0,331291.918000,1531.0,562260.63000
708,2022-02-27,Beverages,ABC,1535.0,448158.400996,6,2,1267.0,346495.176030,1358.0,331291.91800


In [None]:
full_df_gb[['date_id','item_dept','lag_qty_1', 'lag_qty_2', 'lag_sales_1', 'lag_sales_2', 'item_qty', 'net_sales']].dropna().head(20)

Unnamed: 0,date_id,item_dept,lag_qty_1,lag_qty_2,lag_sales_1,lag_sales_2,item_qty,net_sales
12,2021-11-03,Beverages,1140.0,917.0,277904.838,254547.79901,837.0,199145.59601
13,2021-11-03,Beverages,922.0,779.0,251330.13,187931.76902,841.0,228560.219
14,2021-11-03,Grocery,3021.684,2360.399,582206.592985,437286.62402,2155.128,397946.965024
15,2021-11-03,Grocery,3623.716,2987.06,592195.489002,491450.723,3288.1,574684.181
16,2021-11-03,Household,1195.0,1043.0,292020.462024,247336.26601,819.0,209100.513996
17,2021-11-03,Household,1196.0,1124.0,262403.349,247156.009988,1108.0,249315.458
18,2021-11-04,Beverages,837.0,1140.0,199145.59601,277904.838,1256.0,255926.913
19,2021-11-04,Beverages,841.0,922.0,228560.219,251330.13,914.0,237710.29
20,2021-11-04,Grocery,2155.128,3021.684,397946.965024,582206.592985,2648.306,522916.128
21,2021-11-04,Grocery,3288.1,3623.716,574684.181,592195.489002,3268.081,510482.668


In [None]:
full_df_gb[(full_df_gb['item_dept'] == 'Beverages') & (full_df_gb['store'] == 'ABC')]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
0,2021-11-01,Beverages,ABC,917.0,254547.799010,0,11,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838000,1,11,917.0,254547.799010,,
12,2021-11-03,Beverages,ABC,837.0,199145.596010,2,11,1140.0,277904.838000,917.0,254547.79901
18,2021-11-04,Beverages,ABC,1256.0,255926.913000,3,11,837.0,199145.596010,1140.0,277904.83800
24,2021-11-05,Beverages,ABC,991.0,255695.365000,4,11,1256.0,255926.913000,837.0,199145.59601
...,...,...,...,...,...,...,...,...,...,...,...
690,2022-02-24,Beverages,ABC,1531.0,562260.630000,3,2,1094.0,250334.291980,1464.0,409491.97300
696,2022-02-25,Beverages,ABC,1358.0,331291.918000,4,2,1531.0,562260.630000,1094.0,250334.29198
702,2022-02-26,Beverages,ABC,1267.0,346495.176030,5,2,1358.0,331291.918000,1531.0,562260.63000
708,2022-02-27,Beverages,ABC,1535.0,448158.400996,6,2,1267.0,346495.176030,1358.0,331291.91800


In [None]:
full_df_gb.columns

Index(['date_id', 'item_dept', 'store', 'item_qty', 'net_sales', 'day_of_week',
       'month', 'lag_qty_1', 'lag_sales_1', 'lag_qty_2', 'lag_sales_2'],
      dtype='object')

In [None]:
#rolling window creation
# Rolling mean and standard deviation for item_qty
window_size  = 3
full_df_gb[f'rolling_mean_qty_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].transform(lambda x: x.rolling(window=window_size).mean())
full_df_gb[f'rolling_std_qty_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].transform(lambda x: x.rolling(window=window_size).std())

# Rolling mean and standard deviation for net_sales
full_df_gb[f'rolling_mean_sales_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_sales_1'].transform(lambda x: x.rolling(window=window_size).mean())
full_df_gb[f'rolling_std_sales_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_sales_1'].transform(lambda x: x.rolling(window=window_size).std())


In [None]:
full_df_gb.head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,,,,,,,,917.0,917.0,False
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,0,11,,,,,,,,,779.0,1028.5,False
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,0,11,,,,,,,,,2360.399,964.666667,False
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,0,11,,,,,,,,,2987.06,1037.5,False
4,2021-11-01,Household,ABC,1043.0,247336.26601,0,11,,,,,,,,,1043.0,1028.2,False
5,2021-11-01,Household,XYZ,1124.0,247156.009988,0,11,,,,,,,,,1124.0,1004.166667,False
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901,,,,,,,2057.0,994.571429,False
7,2021-11-02,Beverages,XYZ,922.0,251330.13,1,11,779.0,187931.76902,,,,,,,1701.0,992.0,False
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,1,11,2360.399,437286.62402,,,,,,,5382.083,972.333333,False
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,1,11,2987.06,491450.723,,,,,,,6610.776,967.5,False


In [None]:
full_df_gb

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
0,2021-11-01,Beverages,ABC,917.000,254547.799010,0,11,,,,,,,,,917.000,917.000000,0
1,2021-11-01,Beverages,XYZ,779.000,187931.769020,0,11,,,,,,,,,779.000,1028.500000,0
2,2021-11-01,Grocery,ABC,2360.399,437286.624020,0,11,,,,,,,,,2360.399,964.666667,0
3,2021-11-01,Grocery,XYZ,2987.060,491450.723000,0,11,,,,,,,,,2987.060,1037.500000,0
4,2021-11-01,Household,ABC,1043.000,247336.266010,0,11,,,,,,,,,1043.000,1028.200000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,0,2,1751.000,5.310318e+05,1126.000,269641.753000,1398.000000,320.276443,3.989992e+05,130715.547043,131339.000,1155.275862,0
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,0,2,3752.998,7.867650e+05,3486.773,725599.851278,3844.351000,410.941869,7.933261e+05,71233.808433,329358.371,1158.324786,0
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,0,2,7599.386,1.596598e+06,5254.694,985413.796996,5700.358333,1720.056359,1.129335e+06,414486.318350,431678.357,1165.364407,0
718,2022-02-28,Household,ABC,1520.000,394323.614974,0,2,1642.000,4.441275e+05,1508.000,390277.067990,1633.333333,121.232559,4.291357e+05,33943.864780,132651.000,1174.344538,0


In [None]:
import holidays

us_holidays = holidays.US(years=[2021, 2022])
full_df_gb['Is_Holiday'] = full_df_gb['date_id'].apply(lambda x: x in us_holidays).astype(int)


<holidays.registry.EntityLoader at 0x7fd16d740d40>

In [None]:
full_df_gb[full_df_gb['Is_Holiday'].isin([1])]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
60,2021-11-11,Beverages,ABC,1017.0,265286.0,3,11,924.0,238515.2,815.0,199577.0,904.333333,81.303956,228359.0,25283.08551,10692.0,1113.721311,1
61,2021-11-11,Beverages,XYZ,1104.0,297145.0,3,11,934.0,259906.5,971.0,263813.854,860.666667,160.132237,241702.1,34969.354609,10181.0,1114.096774,1
62,2021-11-11,Grocery,ABC,2745.732,519793.1,3,11,2876.364,522461.8,2489.046,497413.33899,2564.016,282.42706,486238.9,42915.544141,28538.705,1110.492063,1
63,2021-11-11,Grocery,XYZ,3559.718,627332.1,3,11,3666.634,635110.8,3608.112,622938.069,3384.72,438.585022,585362.2,75869.704518,37403.489,1108.09375,1
64,2021-11-11,Household,ABC,1482.0,342064.0,3,11,1160.0,304131.3,930.0,229917.0,991.0,148.232925,258114.4,40188.190384,11569.0,1103.384615,1
65,2021-11-11,Household,XYZ,1120.0,259689.0,3,11,1082.0,252174.6,1118.0,277746.993,1048.0,91.8477,255812.9,20360.238738,12170.0,1098.560606,1
144,2021-11-25,Beverages,ABC,1302.0,400375.2,3,11,1062.0,347726.5,1202.0,300005.609,1070.0,128.187363,298609.0,49830.502146,24685.0,935.8,1
145,2021-11-25,Beverages,XYZ,1014.0,274148.7,3,11,663.0,192973.5,834.0,217515.70898,739.666667,86.858122,199286.9,16033.213692,23395.0,938.884615,1
146,2021-11-25,Grocery,ABC,3301.456,651393.6,3,11,2093.942,399369.8,2105.91,439004.82804,2154.051333,93.938615,428618.3,25681.901818,62698.211,950.888889,1
147,2021-11-25,Grocery,XYZ,3250.023,573458.0,3,11,2991.714,509016.8,2830.034,503178.39703,2774.166,250.204591,485236.2,36250.680355,82817.855,961.464286,1


In [None]:
#Sales related features -> lag features, rolling window (average across certain time period)
#Item related features -> lag features, rolling windows, most item sold in last days
#time related features -> 

In [None]:
X = train_df[['store', 'item_dept','lag_qty_1', 'lag_sales_1', 'lag_qty_2', 'lag_sales_2']].dropna() #
y_item_qty = train_df[['item_qty']]
y_net_sales = train_df[['net_sales']]

# model.fit(X, y)

In [None]:
X.head(12)

Unnamed: 0,store,item_dept,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
12,ABC,Beverages,1140.0,277904.838,917.0,254547.79901
13,XYZ,Beverages,922.0,251330.13,779.0,187931.76902
14,ABC,Grocery,3021.684,582206.592985,2360.399,437286.62402
15,XYZ,Grocery,3623.716,592195.489002,2987.06,491450.723
16,ABC,Household,1195.0,292020.462024,1043.0,247336.26601
17,XYZ,Household,1196.0,262403.349,1124.0,247156.009988
18,ABC,Beverages,837.0,199145.59601,1140.0,277904.838
19,XYZ,Beverages,841.0,228560.219,922.0,251330.13
20,ABC,Grocery,2155.128,397946.965024,3021.684,582206.592985
21,XYZ,Grocery,3288.1,574684.181,3623.716,592195.489002


In [None]:
X[(X['store'] == 'ABC') & (X['item_dept'] == 'Beverages')]

Unnamed: 0,store,item_dept,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
12,ABC,Beverages,1140.0,277904.83800,917.0,254547.79901
18,ABC,Beverages,837.0,199145.59601,1140.0,277904.83800
24,ABC,Beverages,1256.0,255926.91300,837.0,199145.59601
30,ABC,Beverages,991.0,255695.36500,1256.0,255926.91300
36,ABC,Beverages,884.0,241963.00000,991.0,255695.36500
...,...,...,...,...,...,...
522,ABC,Beverages,1095.0,382074.00000,874.0,253524.00000
528,ABC,Beverages,1253.0,395367.00000,1095.0,382074.00000
534,ABC,Beverages,1319.0,397226.50000,1253.0,395367.00000
540,ABC,Beverages,966.0,213541.73000,1319.0,397226.50000
