# Final Notebook: Data Preprocessing, Modelling, and Inference

In [1]:
import numpy as np
import pandas as pd
import sys
import os

In [2]:
# Make the directory above the notebook's working directory importable so the
# local `utils` package (imported in later cells) can be resolved.
from pathlib import Path

parent_dir = Path.cwd().parents[0]
parent_dir_path = os.path.abspath(parent_dir)

# Register the directory only once: re-running this cell must not pile up
# duplicate entries on sys.path.
if parent_dir_path not in sys.path:
    sys.path.append(parent_dir_path)

### Loading Train and Test Data

In [3]:
# Read the raw training and test extracts (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/training_data.csv', '../../data/test_data.csv')
)

In [4]:
# Build the shared preprocessing pipeline, configured with 3 lags and a rolling window of 3.
from utils.DataPreprocessPipeline import DataPreprocessPipeline

pipeline_settings = {'num_lags': 3, 'rolling_window_size': 3}
preprocessor = DataPreprocessPipeline(**pipeline_settings)

In [5]:
# Stack the training rows on top of the test rows so both are preprocessed in one pass
# and the time-series features for the test period can be built from the preceding rows.
# Row labels are kept as-is (ignore_index=False is the pandas default).
full_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=False)

In [6]:
# Display the combined frame as the cell output.
full_df

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0
...,...,...,...,...,...,...,...
247619,2/28/2022,Grocery,5.0,500.0,XYZ,114873,
247620,2/28/2022,Household,4.0,2361.0,XYZ,117497,
247621,2/28/2022,Grocery,2.0,480.0,XYZ,77298,
247622,2/28/2022,Beverages,1.0,1900.0,XYZ,1444,


In [7]:
# Run the preprocessing pipeline over the combined train+test frame.
# Per the output displayed below, the result has one row per (date_id, item_dept, store)
# with lag, rolling, cumulative, expanding, diff and calendar features added.
# NOTE(review): the first displayed date is 2021-11-08 while the raw data starts on 11/1/2021;
# presumably the earliest rows are dropped because their lag/diff features are undefined - confirm.
full_df_processed = preprocessor.transform(full_df)

In [8]:
# Display the engineered feature frame as the cell output.
full_df_processed

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,rolling_mean_item_qty_3,rolling_std_item_qty_3,...,rolling_std_net_sales_3,cumsum_net_sales,cummean_net_sales,expanding_min_net_sales,expanding_max_net_sales,diff_net_sales,diff_net_sales_7,day_of_week,isWeekend,Is_Holiday
42,2021-11-08,Beverages,ABC,974.000,246984.682000,937.000,884.000,991.000,937.333333,53.500779,...,18717.981669,1.703852e+06,243407.457717,199145.596010,2.779048e+05,-23294.307000,57.000,0,0,0
43,2021-11-08,Beverages,XYZ,677.000,201386.081000,1043.000,1086.000,910.000,1013.000000,91.755109,...,36459.734951,1.675697e+06,239385.355431,187931.769020,2.899930e+05,27566.920000,-102.000,0,0,0
44,2021-11-08,Grocery,ABC,2326.638,438841.438010,2646.820,2614.006,2654.582,2638.469333,21.538411,...,34985.300857,3.473914e+06,496273.373853,397946.965024,5.822066e+05,-43148.157030,-33.761,0,0,0
45,2021-11-08,Grocery,XYZ,2879.414,498037.791000,3608.962,3625.864,3287.828,3507.551333,190.473559,...,52868.205452,4.010397e+06,572913.907431,491450.723000,6.616550e+05,38798.438006,-107.646,0,0,0
46,2021-11-08,Household,ABC,883.000,240294.892010,998.000,1152.000,986.000,1045.333333,92.570694,...,25552.215188,1.775416e+06,253630.794154,209100.513996,2.920205e+05,-50544.119984,-160.000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2022-02-28,Beverages,XYZ,1256.000,301840.902984,1751.000,1126.000,1317.000,1398.000000,320.276443,...,130715.547043,3.410990e+07,286637.847235,149946.090000,5.813210e+05,261390.028980,114.000,0,0,0
716,2022-02-28,Grocery,ABC,3548.268,700557.436867,3752.998,3486.773,4293.282,3844.351000,410.941869,...,71233.808433,6.772851e+07,569147.160637,367440.183000,2.035445e+06,61165.191660,868.876,0,0,0
717,2022-02-28,Grocery,XYZ,4854.414,950084.212008,7599.386,5254.694,4246.995,5700.358333,1720.056359,...,414486.318350,7.845215e+07,659261.805316,443513.481000,1.596598e+06,611184.257052,1239.090,0,0,0
718,2022-02-28,Household,ABC,1520.000,394323.614974,1642.000,1508.000,1750.000,1633.333333,121.232559,...,33943.864780,3.497245e+07,293886.126171,181968.999000,4.530025e+05,53850.418996,275.000,0,0,0


In [9]:
# Split the preprocessed frame into a training part and a held-out part, using
# 2022-02-01 as the start date of the held-out period.
from utils.model_helpers import create_training_testing

train_test_split_result = create_training_testing(full_df_processed, test_date_start='2022-02-01')
train_dict, test_dict = train_test_split_result

In [10]:
# Unpack the feature matrix and the two targets (net sales, item quantity) from each split.
# Both dictionaries are read with the same 'train_*' key names, including the held-out one.
split_keys = ('train_features', 'train_net_sales', 'train_item_qty')

train_X, train_y_sales, train_y_item_qty = (train_dict[key] for key in split_keys)
test_X, test_y_sales, test_y_item_qty = (test_dict[key] for key in split_keys)

### Modelling

In [9]:
from utils.SalesItemQtyModel import SalesItemQtyModel

In [10]:
# Two separate regressors are needed: one predicts net sales, the other item quantity.
from catboost import CatBoostRegressor

# Hyper-parameters chosen from hyper-parameter tuning.
# The count-like settings (border_count, depth, iterations, one_hot_max_size) were
# recorded as floats (e.g. 450.0); they are written as ints here because CatBoost
# documents these parameters as integers.
params = {
    'bagging_temperature': 0.4772142988105587,
    'border_count': 200,
    'depth': 5,
    'iterations': 450,
    'l2_leaf_reg': 7.43439511907274,
    'learning_rate': 0.22947940228640154,
    'one_hot_max_size': 124,
    'random_strength': 5.316275052028734,
}

# Both targets share the same tuned configuration; verbose=0 silences per-iteration logging.
sales_model = CatBoostRegressor(**params, verbose=0)
item_qty_model = CatBoostRegressor(**params, verbose=0)

In [11]:
# Wrap the two regressors in a single object that trains and predicts both targets.
sales_item_qty_model = SalesItemQtyModel(model_sales=sales_model, model_item_qty=item_qty_model)

In [12]:
# Fit the combined model on the training features, passing the net-sales and
# item-quantity targets separately.
sales_item_qty_model.fit(X = train_X, y_sales = train_y_sales, y_item_qty = train_y_item_qty)

### Model Testing

In [13]:
# Report MAPE for both targets on the training split and on the held-out split.
# The underlying regressors are passed directly; these are the same objects that were
# handed to SalesItemQtyModel above.
# NOTE(review): this presumably relies on the wrapper's fit() having trained them in place - confirm.
from utils.model_helpers import get_results
get_results(train_dict=train_dict, valid_dict=test_dict, model_sales=sales_model, model_item_qty=item_qty_model)

Train Set Results...

MAPE for predicting Sales: 0.017803302824813017
MAPE for predicting Item Qty: 0.010763977959730046

Test Set Results...

MAPE for predicting Sales: 0.16504476285341552
MAPE for predicting Item Qty: 0.138895247681839



### Production Level Forecasting

In [14]:
# Production-style forecast: generate the February 2022 predictions as they would be made
# in a real scenario, i.e. by forecasting from the training set only.

In [19]:
# Aggregate the raw training rows with the pipeline's grouping step, then collect the
# distinct departments and stores that need a forecast.
# NOTE(review): _groupby_df is a private method of the pipeline; consider exposing a public wrapper.
train_df_gb = preprocessor._groupby_df(train_df)
depts_list, stores_list = (train_df_gb[column].unique() for column in ('item_dept', 'store'))

In [20]:
# Forecast every department/store combination for 2022-02-01 through 2022-02-28,
# using the grouped training data as history together with the fitted dual model.
from utils.model_helpers import generate_forecasting_df

feb_forecasted_df = generate_forecasting_df(
    start_date='2022-02-01',
    end_date='2022-02-28',
    depts_list=depts_list,
    stores_list=stores_list,
    historical_df=train_df_gb,
    dual_model=sales_item_qty_model,
    preprocessor=preprocessor,
)

In [21]:
# Display the forecast frame (net_sales and item_qty per date_id, item_dept and store).
feb_forecasted_df

Unnamed: 0,date_id,item_dept,store,net_sales,item_qty
0,2022-02-01,Beverages,ABC,242399.069722,897.340114
1,2022-02-01,Beverages,XYZ,207773.181986,837.324206
2,2022-02-01,Grocery,ABC,673400.218517,3173.605688
3,2022-02-01,Grocery,XYZ,772626.697048,3693.125385
4,2022-02-01,Household,ABC,279385.159003,1048.739539
...,...,...,...,...,...
163,2022-02-28,Beverages,XYZ,236301.279538,947.176919
164,2022-02-28,Grocery,ABC,660499.364685,3203.510399
165,2022-02-28,Grocery,XYZ,684497.508376,3461.218363
166,2022-02-28,Household,ABC,260820.851910,1021.760238


In [22]:
# Sanity check: show the shapes of both held-out target vectors before scoring the forecast against them.
test_y_sales.shape, test_y_item_qty.shape

((168,), (168,))

In [23]:
# MAPE of the production-style net-sales forecast against the held-out actuals.
# NOTE(review): the two inputs are compared in row order, so this assumes feb_forecasted_df
# is ordered the same way as the held-out targets - TODO confirm.
from sklearn.metrics import mean_absolute_percentage_error

mean_absolute_percentage_error(y_true=test_y_sales, y_pred=feb_forecasted_df['net_sales'])

0.21373960318008395

In [24]:
# MAPE of the production-style item-quantity forecast against the held-out actuals
# (same row-order assumption as for the net-sales score).
mean_absolute_percentage_error(y_true=test_y_item_qty, y_pred=feb_forecasted_df['item_qty'])

0.20363917970502896