In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [2]:
train_df = pd.read_csv('../../data/training_data.csv')
test_df = pd.read_csv('../../data/test_data.csv')

In [3]:
sys.path.append(os.path.abspath('..'))
from utils import data_preprocess


In [4]:
train_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0


In [5]:
test_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,2/1/2022,Beverages,2.0,480.0,XYZ,112360,1495518.0
1,2/1/2022,Beverages,1.0,202.0,XYZ,111195,1495518.0
2,2/1/2022,Household,1.0,165.0,XYZ,41212,1495572.0
3,2/1/2022,Household,2.0,480.0,XYZ,123476,1495572.0
4,2/1/2022,Grocery,2.0,660.0,XYZ,106668,1495572.0


In [6]:
full_df = pd.concat([train_df, test_df])
full_df

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [7]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 861722 entries, 0 to 247623
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date_id      861722 non-null  object 
 1   item_dept    861722 non-null  object 
 2   item_qty     861722 non-null  float64
 3   net_sales    861722 non-null  float64
 4   store        861722 non-null  object 
 5   item         861722 non-null  int64  
 6   invoice_num  830607 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 52.6+ MB


In [14]:
full_df

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [15]:
data_preprocess.preprocess_data(df=full_df)[['date_id','item_dept', 'store', 'lag_item_qty_1', 'rolling_mean_item_qty_2', 'cumsum_item_qty', 'cummean_item_qty', 'diff_item_qty', 'item_qty']].head(30)

Unnamed: 0,date_id,item_dept,store,lag_item_qty_1,rolling_mean_item_qty_2,cumsum_item_qty,cummean_item_qty,diff_item_qty,item_qty
0,2021-11-01,Beverages,ABC,,,,,,917.0
1,2021-11-01,Beverages,XYZ,,,,,,779.0
2,2021-11-01,Grocery,ABC,,,,,,2360.399
3,2021-11-01,Grocery,XYZ,,,,,,2987.06
4,2021-11-01,Household,ABC,,,,,,1043.0
5,2021-11-01,Household,XYZ,,,,,,1124.0
6,2021-11-02,Beverages,ABC,917.0,,917.0,917.0,,1140.0
7,2021-11-02,Beverages,XYZ,779.0,,779.0,779.0,,922.0
8,2021-11-02,Grocery,ABC,2360.399,,2360.399,2360.399,,3021.684
9,2021-11-02,Grocery,XYZ,2987.06,,2987.06,2987.06,,3623.716


In [76]:
# Preprocessing
full_df['date_id'] = pd.to_datetime(full_df['date_id'])
full_df_gb = full_df.groupby(['date_id', 'item_dept', 'store'])[['item_qty', 'net_sales']].sum().reset_index()

In [77]:
full_df_gb.describe()

Unnamed: 0,date_id,item_qty,net_sales
count,720,720.0,720.0
mean,2021-12-30 12:00:00,1818.321844,402001.6
min,2021-11-01 00:00:00,645.0,149946.1
25%,2021-11-30 18:00:00,1027.75,255959.8
50%,2021-12-30 12:00:00,1280.0,336784.6
75%,2022-01-29 06:00:00,2604.1745,522575.4
max,2022-02-28 00:00:00,7787.289,2035445.0
std,,1090.859874,194157.1


In [78]:
full_df_gb.head(18)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
0,2021-11-01,Beverages,ABC,917.0,254547.79901
1,2021-11-01,Beverages,XYZ,779.0,187931.76902
2,2021-11-01,Grocery,ABC,2360.399,437286.62402
3,2021-11-01,Grocery,XYZ,2987.06,491450.723
4,2021-11-01,Household,ABC,1043.0,247336.26601
5,2021-11-01,Household,XYZ,1124.0,247156.009988
6,2021-11-02,Beverages,ABC,1140.0,277904.838
7,2021-11-02,Beverages,XYZ,922.0,251330.13
8,2021-11-02,Grocery,ABC,3021.684,582206.592985
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002


In [22]:
full_df_gb.head(18)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales
0,2021-11-01,Beverages,ABC,917.0,254547.79901
1,2021-11-01,Beverages,XYZ,779.0,187931.76902
2,2021-11-01,Grocery,ABC,2360.399,437286.62402
3,2021-11-01,Grocery,XYZ,2987.06,491450.723
4,2021-11-01,Household,ABC,1043.0,247336.26601
5,2021-11-01,Household,XYZ,1124.0,247156.009988
6,2021-11-02,Beverages,ABC,1140.0,277904.838
7,2021-11-02,Beverages,XYZ,922.0,251330.13
8,2021-11-02,Grocery,ABC,3021.684,582206.592985
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002


In [88]:
full_df_gb_lags = data_preprocess.create_lag_features(df=full_df_gb, feature_name='item_qty', num_lags=2)

In [93]:
data_preprocess.create_expanding_window_features(df=full_df_gb_lags, feature_name='item_qty').head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,expanding_min_item_qty,expanding_max_item_qty
0,2021-11-01,Beverages,ABC,917.0,254547.79901,,,,
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,,,,
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,,,,
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,,,,
4,2021-11-01,Household,ABC,1043.0,247336.26601,,,,
5,2021-11-01,Household,XYZ,1124.0,247156.009988,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,917.0,,917.0,917.0
7,2021-11-02,Beverages,XYZ,922.0,251330.13,779.0,,779.0,779.0
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,2360.399,,2360.399,2360.399
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,2987.06,,2987.06,2987.06


In [23]:
# Feature Engineering
full_df_gb['day_of_week'] = full_df_gb['date_id'].dt.dayofweek
full_df_gb['month'] = full_df_gb['date_id'].dt.month

In [24]:
full_df_gb

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month
0,2021-11-01,Beverages,ABC,917.000,254547.799010,0,11
1,2021-11-01,Beverages,XYZ,779.000,187931.769020,0,11
2,2021-11-01,Grocery,ABC,2360.399,437286.624020,0,11
3,2021-11-01,Grocery,XYZ,2987.060,491450.723000,0,11
4,2021-11-01,Household,ABC,1043.000,247336.266010,0,11
...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,0,2
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,0,2
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,0,2
718,2022-02-28,Household,ABC,1520.000,394323.614974,0,2


In [25]:
# Lag features
full_df_gb['lag_qty_1'] = full_df_gb.groupby(['item_dept', 'store'])['item_qty'].shift(1)
full_df_gb['lag_sales_1'] = full_df_gb.groupby(['item_dept', 'store'])['net_sales'].shift(1)

In [26]:
full_df_gb.head(12)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,0,11,,
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,0,11,,
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,0,11,,
4,2021-11-01,Household,ABC,1043.0,247336.26601,0,11,,
5,2021-11-01,Household,XYZ,1124.0,247156.009988,0,11,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901
7,2021-11-02,Beverages,XYZ,922.0,251330.13,1,11,779.0,187931.76902
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,1,11,2360.399,437286.62402
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,1,11,2987.06,491450.723


In [40]:
full_df_gb['expanding_min_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cummin()
full_df_gb['expanding_max_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cummax()
full_df_gb['expanding_prod'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cumprod()

In [67]:
full_df_gb['cumsum_qty'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].cumsum()
full_df_gb['cummean_qty'] = full_df_gb['cumsum_qty'] / full_df_gb.groupby(['store', 'item_dept'])['lag_qty_1'].cumcount()


In [None]:
df_process[f'diff_{feature_name}_7'] = df_process.groupby(['item_dept', 'store'])['item_qty'].diff(7)

In [68]:
full_df_gb[(full_df_gb['store'] == 'ABC') & (full_df_gb['item_dept'] == 'Beverages')].head(10)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,expanding_min_qty,cumsum_qty,cummean_qty,expanding_max_qty,expanding_prod
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,,,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901,917.0,917.0,917.0,917.0,917.0
12,2021-11-03,Beverages,ABC,837.0,199145.59601,2,11,1140.0,277904.838,917.0,2057.0,1028.5,1140.0,1045380.0
18,2021-11-04,Beverages,ABC,1256.0,255926.913,3,11,837.0,199145.59601,837.0,2894.0,964.666667,1140.0,874983100.0
24,2021-11-05,Beverages,ABC,991.0,255695.365,4,11,1256.0,255926.913,837.0,4150.0,1037.5,1256.0,1098979000000.0
30,2021-11-06,Beverages,ABC,884.0,241963.0,5,11,991.0,255695.365,837.0,5141.0,1028.2,1256.0,1089088000000000.0
36,2021-11-07,Beverages,ABC,937.0,218668.693,6,11,884.0,241963.0,837.0,6025.0,1004.166667,1256.0,9.627537e+17
42,2021-11-08,Beverages,ABC,974.0,246984.682,0,11,937.0,218668.693,837.0,6962.0,994.571429,1256.0,9.021002e+20
48,2021-11-09,Beverages,ABC,815.0,199577.0,1,11,974.0,246984.682,837.0,7936.0,992.0,1256.0,8.786456e+23
54,2021-11-10,Beverages,ABC,924.0,238515.243,2,11,815.0,199577.0,815.0,8751.0,972.333333,1256.0,7.160962e+26


In [15]:
# Lag features
full_df_gb['lag_qty_2'] = full_df_gb.groupby(['item_dept', 'store'])['item_qty'].shift(2)
full_df_gb['lag_sales_2'] = full_df_gb.groupby(['item_dept', 'store'])['net_sales'].shift(2)

In [16]:
full_df_gb[(full_df_gb['item_dept'] == 'Beverages') & (full_df_gb['store'] == 'ABC')]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
0,2021-11-01,Beverages,ABC,917.0,254547.799010,0,11,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838000,1,11,917.0,254547.799010,,
12,2021-11-03,Beverages,ABC,837.0,199145.596010,2,11,1140.0,277904.838000,917.0,254547.79901
18,2021-11-04,Beverages,ABC,1256.0,255926.913000,3,11,837.0,199145.596010,1140.0,277904.83800
24,2021-11-05,Beverages,ABC,991.0,255695.365000,4,11,1256.0,255926.913000,837.0,199145.59601
...,...,...,...,...,...,...,...,...,...,...,...
690,2022-02-24,Beverages,ABC,1531.0,562260.630000,3,2,1094.0,250334.291980,1464.0,409491.97300
696,2022-02-25,Beverages,ABC,1358.0,331291.918000,4,2,1531.0,562260.630000,1094.0,250334.29198
702,2022-02-26,Beverages,ABC,1267.0,346495.176030,5,2,1358.0,331291.918000,1531.0,562260.63000
708,2022-02-27,Beverages,ABC,1535.0,448158.400996,6,2,1267.0,346495.176030,1358.0,331291.91800


In [17]:
full_df_gb[['date_id','item_dept','lag_qty_1', 'lag_qty_2', 'lag_sales_1', 'lag_sales_2', 'item_qty', 'net_sales']].dropna().head(20)

Unnamed: 0,date_id,item_dept,lag_qty_1,lag_qty_2,lag_sales_1,lag_sales_2,item_qty,net_sales
12,2021-11-03,Beverages,1140.0,917.0,277904.838,254547.79901,837.0,199145.59601
13,2021-11-03,Beverages,922.0,779.0,251330.13,187931.76902,841.0,228560.219
14,2021-11-03,Grocery,3021.684,2360.399,582206.592985,437286.62402,2155.128,397946.965024
15,2021-11-03,Grocery,3623.716,2987.06,592195.489002,491450.723,3288.1,574684.181
16,2021-11-03,Household,1195.0,1043.0,292020.462024,247336.26601,819.0,209100.513996
17,2021-11-03,Household,1196.0,1124.0,262403.349,247156.009988,1108.0,249315.458
18,2021-11-04,Beverages,837.0,1140.0,199145.59601,277904.838,1256.0,255926.913
19,2021-11-04,Beverages,841.0,922.0,228560.219,251330.13,914.0,237710.29
20,2021-11-04,Grocery,2155.128,3021.684,397946.965024,582206.592985,2648.306,522916.128
21,2021-11-04,Grocery,3288.1,3623.716,574684.181,592195.489002,3268.081,510482.668


In [18]:
full_df_gb[(full_df_gb['item_dept'] == 'Beverages') & (full_df_gb['store'] == 'ABC')]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
0,2021-11-01,Beverages,ABC,917.0,254547.799010,0,11,,,,
6,2021-11-02,Beverages,ABC,1140.0,277904.838000,1,11,917.0,254547.799010,,
12,2021-11-03,Beverages,ABC,837.0,199145.596010,2,11,1140.0,277904.838000,917.0,254547.79901
18,2021-11-04,Beverages,ABC,1256.0,255926.913000,3,11,837.0,199145.596010,1140.0,277904.83800
24,2021-11-05,Beverages,ABC,991.0,255695.365000,4,11,1256.0,255926.913000,837.0,199145.59601
...,...,...,...,...,...,...,...,...,...,...,...
690,2022-02-24,Beverages,ABC,1531.0,562260.630000,3,2,1094.0,250334.291980,1464.0,409491.97300
696,2022-02-25,Beverages,ABC,1358.0,331291.918000,4,2,1531.0,562260.630000,1094.0,250334.29198
702,2022-02-26,Beverages,ABC,1267.0,346495.176030,5,2,1358.0,331291.918000,1531.0,562260.63000
708,2022-02-27,Beverages,ABC,1535.0,448158.400996,6,2,1267.0,346495.176030,1358.0,331291.91800


In [19]:
full_df_gb.columns

Index(['date_id', 'item_dept', 'store', 'item_qty', 'net_sales', 'day_of_week',
       'month', 'lag_qty_1', 'lag_sales_1', 'lag_qty_2', 'lag_sales_2'],
      dtype='object')

In [24]:
#rolling window creation
# Rolling mean and standard deviation for item_qty
window_size  = 3
full_df_gb[f'rolling_mean_qty_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].transform(lambda x: x.rolling(window=window_size).mean())
full_df_gb[f'rolling_std_qty_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_qty_1'].transform(lambda x: x.rolling(window=window_size).std())

# Rolling mean and standard deviation for net_sales
full_df_gb[f'rolling_mean_sales_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_sales_1'].transform(lambda x: x.rolling(window=window_size).mean())
full_df_gb[f'rolling_std_sales_{window_size}'] = full_df_gb.groupby(['item_dept', 'store'])['lag_sales_1'].transform(lambda x: x.rolling(window=window_size).std())


In [33]:
full_df_gb.head(20)

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
0,2021-11-01,Beverages,ABC,917.0,254547.79901,0,11,,,,,,,,,917.0,917.0,False
1,2021-11-01,Beverages,XYZ,779.0,187931.76902,0,11,,,,,,,,,779.0,1028.5,False
2,2021-11-01,Grocery,ABC,2360.399,437286.62402,0,11,,,,,,,,,2360.399,964.666667,False
3,2021-11-01,Grocery,XYZ,2987.06,491450.723,0,11,,,,,,,,,2987.06,1037.5,False
4,2021-11-01,Household,ABC,1043.0,247336.26601,0,11,,,,,,,,,1043.0,1028.2,False
5,2021-11-01,Household,XYZ,1124.0,247156.009988,0,11,,,,,,,,,1124.0,1004.166667,False
6,2021-11-02,Beverages,ABC,1140.0,277904.838,1,11,917.0,254547.79901,,,,,,,2057.0,994.571429,False
7,2021-11-02,Beverages,XYZ,922.0,251330.13,1,11,779.0,187931.76902,,,,,,,1701.0,992.0,False
8,2021-11-02,Grocery,ABC,3021.684,582206.592985,1,11,2360.399,437286.62402,,,,,,,5382.083,972.333333,False
9,2021-11-02,Grocery,XYZ,3623.716,592195.489002,1,11,2987.06,491450.723,,,,,,,6610.776,967.5,False


In [35]:
full_df_gb

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
0,2021-11-01,Beverages,ABC,917.000,254547.799010,0,11,,,,,,,,,917.000,917.000000,0
1,2021-11-01,Beverages,XYZ,779.000,187931.769020,0,11,,,,,,,,,779.000,1028.500000,0
2,2021-11-01,Grocery,ABC,2360.399,437286.624020,0,11,,,,,,,,,2360.399,964.666667,0
3,2021-11-01,Grocery,XYZ,2987.060,491450.723000,0,11,,,,,,,,,2987.060,1037.500000,0
4,2021-11-01,Household,ABC,1043.000,247336.266010,0,11,,,,,,,,,1043.000,1028.200000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,0,2,1751.000,5.310318e+05,1126.000,269641.753000,1398.000000,320.276443,3.989992e+05,130715.547043,131339.000,1155.275862,0
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,0,2,3752.998,7.867650e+05,3486.773,725599.851278,3844.351000,410.941869,7.933261e+05,71233.808433,329358.371,1158.324786,0
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,0,2,7599.386,1.596598e+06,5254.694,985413.796996,5700.358333,1720.056359,1.129335e+06,414486.318350,431678.357,1165.364407,0
718,2022-02-28,Household,ABC,1520.000,394323.614974,0,2,1642.000,4.441275e+05,1508.000,390277.067990,1633.333333,121.232559,4.291357e+05,33943.864780,132651.000,1174.344538,0


In [40]:
import holidays

us_holidays = holidays.US(years=[2021, 2022])
full_df_gb['Is_Holiday'] = full_df_gb['date_id'].apply(lambda x: x in us_holidays).astype(int)


<holidays.registry.EntityLoader at 0x7fd16d740d40>

In [41]:
full_df_gb[full_df_gb['Is_Holiday'].isin([1])]

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,day_of_week,month,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2,rolling_mean_qty_3,rolling_std_qty_3,rolling_mean_sales_3,rolling_std_sales_3,cumsum_qty,cummean_qty,Is_Holiday
60,2021-11-11,Beverages,ABC,1017.0,265286.0,3,11,924.0,238515.2,815.0,199577.0,904.333333,81.303956,228359.0,25283.08551,10692.0,1113.721311,1
61,2021-11-11,Beverages,XYZ,1104.0,297145.0,3,11,934.0,259906.5,971.0,263813.854,860.666667,160.132237,241702.1,34969.354609,10181.0,1114.096774,1
62,2021-11-11,Grocery,ABC,2745.732,519793.1,3,11,2876.364,522461.8,2489.046,497413.33899,2564.016,282.42706,486238.9,42915.544141,28538.705,1110.492063,1
63,2021-11-11,Grocery,XYZ,3559.718,627332.1,3,11,3666.634,635110.8,3608.112,622938.069,3384.72,438.585022,585362.2,75869.704518,37403.489,1108.09375,1
64,2021-11-11,Household,ABC,1482.0,342064.0,3,11,1160.0,304131.3,930.0,229917.0,991.0,148.232925,258114.4,40188.190384,11569.0,1103.384615,1
65,2021-11-11,Household,XYZ,1120.0,259689.0,3,11,1082.0,252174.6,1118.0,277746.993,1048.0,91.8477,255812.9,20360.238738,12170.0,1098.560606,1
144,2021-11-25,Beverages,ABC,1302.0,400375.2,3,11,1062.0,347726.5,1202.0,300005.609,1070.0,128.187363,298609.0,49830.502146,24685.0,935.8,1
145,2021-11-25,Beverages,XYZ,1014.0,274148.7,3,11,663.0,192973.5,834.0,217515.70898,739.666667,86.858122,199286.9,16033.213692,23395.0,938.884615,1
146,2021-11-25,Grocery,ABC,3301.456,651393.6,3,11,2093.942,399369.8,2105.91,439004.82804,2154.051333,93.938615,428618.3,25681.901818,62698.211,950.888889,1
147,2021-11-25,Grocery,XYZ,3250.023,573458.0,3,11,2991.714,509016.8,2830.034,503178.39703,2774.166,250.204591,485236.2,36250.680355,82817.855,961.464286,1


In [None]:
#Sales related features -> lag features, rolling window (average across certain time period)
#Item related features -> lag features, rolling windows, most item sold in last days
#time related features -> 

In [53]:
X = train_df[['store', 'item_dept','lag_qty_1', 'lag_sales_1', 'lag_qty_2', 'lag_sales_2']].dropna() #
y_item_qty = train_df[['item_qty']]
y_net_sales = train_df[['net_sales']]

# model.fit(X, y)

In [55]:
X.head(12)

Unnamed: 0,store,item_dept,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
12,ABC,Beverages,1140.0,277904.838,917.0,254547.79901
13,XYZ,Beverages,922.0,251330.13,779.0,187931.76902
14,ABC,Grocery,3021.684,582206.592985,2360.399,437286.62402
15,XYZ,Grocery,3623.716,592195.489002,2987.06,491450.723
16,ABC,Household,1195.0,292020.462024,1043.0,247336.26601
17,XYZ,Household,1196.0,262403.349,1124.0,247156.009988
18,ABC,Beverages,837.0,199145.59601,1140.0,277904.838
19,XYZ,Beverages,841.0,228560.219,922.0,251330.13
20,ABC,Grocery,2155.128,397946.965024,3021.684,582206.592985
21,XYZ,Grocery,3288.1,574684.181,3623.716,592195.489002


In [56]:
X[(X['store'] == 'ABC') & (X['item_dept'] == 'Beverages')]

Unnamed: 0,store,item_dept,lag_qty_1,lag_sales_1,lag_qty_2,lag_sales_2
12,ABC,Beverages,1140.0,277904.83800,917.0,254547.79901
18,ABC,Beverages,837.0,199145.59601,1140.0,277904.83800
24,ABC,Beverages,1256.0,255926.91300,837.0,199145.59601
30,ABC,Beverages,991.0,255695.36500,1256.0,255926.91300
36,ABC,Beverages,884.0,241963.00000,991.0,255695.36500
...,...,...,...,...,...,...
522,ABC,Beverages,1095.0,382074.00000,874.0,253524.00000
528,ABC,Beverages,1253.0,395367.00000,1095.0,382074.00000
534,ABC,Beverages,1319.0,397226.50000,1253.0,395367.00000
540,ABC,Beverages,966.0,213541.73000,1319.0,397226.50000
