# Final Notebook: Data Preprocessing, Modelling, and Inference

In [1]:
import numpy as np
import pandas as pd
import sys
import os

In [2]:
# Add the parent of the current working directory to sys.path so that
# project-local modules can be imported from this notebook.
# NOTE(review): presumably the `utils` package imported in later cells lives in that parent directory — confirm.
# NOTE(review): re-running this cell appends the same entry to sys.path again.
from pathlib import Path
parent_dir = Path.cwd().parents[0]
sys.path.append(os.path.abspath(parent_dir))

### Loading Train and Test Data

In [3]:
# Read the training and test splits from CSV.
# The paths are relative, so they resolve against the notebook's working directory.
train_df = pd.read_csv('../../data/training_data.csv')
test_df = pd.read_csv('../../data/test_data.csv')

In [4]:
# Instantiate the preprocessor object from the project's DataPreprocessPipeline class.
# NOTE(review): num_lags and rolling_window_size presumably set how many lag features and
# what rolling-window length the pipeline builds — confirm against DataPreprocessPipeline.
from utils.DataPreprocessPipeline import DataPreprocessPipeline
preprocessor = DataPreprocessPipeline(num_lags=3, rolling_window_size=3)

In [5]:
# Merge train_df and test_df so both are preprocessed together
# (so that the time-series features are created across the combined data).
# NOTE(review): pd.concat is called without ignore_index, so each frame keeps its own index
# labels and they may repeat in full_df — confirm the preprocessor does not need a unique index.
full_df = pd.concat([train_df, test_df])

In [6]:
# Run the preprocessing pipeline over the combined train + test frame.
full_df_processed = preprocessor.transform(full_df)

In [7]:
# Create the train and test sets from the preprocessed data; the helper returns one dict per split.
# NOTE(review): presumably rows dated on or after test_date_start go to the test set — confirm in create_training_testing.
from utils.model_helpers import create_training_testing
train_dict, test_dict = create_training_testing(full_df_processed, test_date_start='2022-02-01')

In [8]:
# Pull the feature matrix and the two targets out of each split dict,
# pairing the train and test value of every quantity on one line.
# NOTE(review): test_dict is read with the same 'train_*' keys as train_dict; presumably
# create_training_testing uses identical key names for both dicts — confirm.
train_X, test_X = train_dict['train_features'], test_dict['train_features']
train_y_sales, test_y_sales = train_dict['train_net_sales'], test_dict['train_net_sales']
train_y_item_qty, test_y_item_qty = train_dict['train_item_qty'], test_dict['train_item_qty']

### Modelling

In [9]:
from utils.SalesItemQtyModel import SalesItemQtyModel

In [10]:
# Two separate regressors are needed: one to predict sales and one to predict item_qty.
from catboost import CatBoostRegressor

# Hyperparameters chosen from hyperparameter tuning.
# border_count, depth, iterations and one_hot_max_size are integer-typed CatBoost
# parameters, so they are written as ints here rather than as floats (e.g. 450.0).
params = {
    'bagging_temperature': 0.4772142988105587,
    'border_count': 200,
    'depth': 5,
    'iterations': 450,
    'l2_leaf_reg': 7.43439511907274,
    'learning_rate': 0.22947940228640154,
    'one_hot_max_size': 124,
    'random_strength': 5.316275052028734,
}

# Both models share the same hyperparameters; verbose = 0 is passed to each constructor.
sales_model = CatBoostRegressor(**params, verbose = 0)
item_qty_model = CatBoostRegressor(**params, verbose = 0)

In [11]:
# Wrap the two regressors in a single SalesItemQtyModel, which is used to train and
# predict both sales and item quantity.
sales_item_qty_model = SalesItemQtyModel(
    model_sales=sales_model,
    model_item_qty=item_qty_model
)

In [12]:
# Fit the combined model on the training features, supplying both targets
# (net sales and item quantity).
sales_item_qty_model.fit(X = train_X, y_sales = train_y_sales, y_item_qty = train_y_item_qty)

### Model Testing

In [13]:
# Report the MAPE for sales and item quantity on the train set and on the test set.
# NOTE(review): the two CatBoost regressors are passed directly rather than the
# SalesItemQtyModel wrapper; this presumably relies on the wrapper's fit() having trained
# these same objects — confirm.
from utils.model_helpers import get_results
get_results(train_dict=train_dict, valid_dict=test_dict, model_sales=sales_model, model_item_qty=item_qty_model)

Train Set Results...

MAPE for predicting Sales: 0.017803302824813017
MAPE for predicting Item Qty: 0.010763977959730046

Test Set Results...

MAPE for predicting Sales: 0.16504476285341552
MAPE for predicting Item Qty: 0.138895247681839



### Production Level Forecasting

In [14]:
# Generate the predictions as they would be produced in a real scenario, i.e. by forecasting from the training set only.

In [19]:
# Apply the preprocessor's _groupby_df step to the raw training data, then collect the
# distinct departments and stores found in the result.
# NOTE(review): _groupby_df is an underscore-prefixed (private) method; consider exposing
# a public equivalent on the preprocessor.
train_df_gb = preprocessor._groupby_df(train_df)
depts_list = train_df_gb['item_dept'].unique()
stores_list = train_df_gb['store'].unique()

In [20]:
# Build the forecast for 2022-02-01 through 2022-02-28 over the given departments and
# stores, passing only the grouped training data as history.
# NOTE(review): presumably yields one row per date/department/store combination — confirm
# in generate_forecasting_df.
from utils.model_helpers import generate_forecasting_df
feb_forecasted_df = generate_forecasting_df(start_date='2022-02-01', end_date='2022-02-28', depts_list=depts_list,
                                            stores_list=stores_list,historical_df=train_df_gb, dual_model=sales_item_qty_model,
                                            preprocessor=preprocessor)

In [21]:
# Display the forecast frame.
feb_forecasted_df

Unnamed: 0,date_id,item_dept,store,net_sales,item_qty
0,2022-02-01,Beverages,ABC,242399.069722,897.340114
1,2022-02-01,Beverages,XYZ,207773.181986,837.324206
2,2022-02-01,Grocery,ABC,673400.218517,3173.605688
3,2022-02-01,Grocery,XYZ,772626.697048,3693.125385
4,2022-02-01,Household,ABC,279385.159003,1048.739539
...,...,...,...,...,...
163,2022-02-28,Beverages,XYZ,236301.279538,947.176919
164,2022-02-28,Grocery,ABC,660499.364685,3203.510399
165,2022-02-28,Grocery,XYZ,684497.508376,3461.218363
166,2022-02-28,Household,ABC,260820.851910,1021.760238


In [22]:
# Check how many test targets there are for sales and for item quantity.
test_y_sales.shape, test_y_item_qty.shape

((168,), (168,))

In [23]:
# MAPE of the forecasted net sales against the test-set net sales.
# NOTE(review): the two inputs are compared element by element, so this assumes
# test_y_sales and feb_forecasted_df are in the same row order — confirm.
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(test_y_sales, feb_forecasted_df['net_sales'])

0.21373960318008395

In [24]:
# MAPE of the forecasted item quantity against the test-set item quantity.
# NOTE(review): the two inputs are compared element by element, so this assumes
# test_y_item_qty and feb_forecasted_df are in the same row order — confirm.
mean_absolute_percentage_error(test_y_item_qty, feb_forecasted_df['item_qty'])

0.20363917970502896