
# 🏪 Store Sales Time Series Forecasting 📈

### 1. Introduction
#### 1.1. Problem Statement
#### 1.2. Data Description
#### 1.3. Objective
### 2. Data Exploration
#### 2.1. Importing Libraries
#### 2.2. Loading Data
#### 2.3. Data Exploration
##### 2.3.1. Univariate Analysis
##### 2.3.2. Bivariate Analysis
##### 2.3.3. Multivariate Analysis
### 3. Data Cleaning
### 4. Feature Engineering
### 5. Data Preprocessing
#### 5.1. Data Transformation
### 6. Model Building
#### 6.1. Baseline Model
### 7. Model Evaluation


We have already covered the data exploration and data cleaning parts in the previous notebook. In this notebook, we will be performing the following tasks:
* Feature Engineering
* Data Preprocessing

In [1]:
# Loading libraries
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import datetime
import os
import warnings

# Use seaborn's 'notebook' plotting context with fonts scaled by 1.5.
sns.set_context('notebook', font_scale=1.5)
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the merged train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../input/store_sales_time_series_forecasting/train_merged.csv',
        '../../input/store_sales_time_series_forecasting/test_merged.csv',
    )
)

In [3]:
train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,type_local,type_national,type_reg,dcoilwtico,is_national_holiday
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999
1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999
2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999
3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999
4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999


In [4]:
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,city,state,type,cluster,type_local,type_national,type_reg,dcoilwtico
0,3000888,2017-08-16,1,AUTOMOTIVE,0,Quito,Pichincha,D,13,Work Day,Work Day,Work Day,46.8
1,3000889,2017-08-16,1,BABY CARE,0,Quito,Pichincha,D,13,Work Day,Work Day,Work Day,46.8
2,3000890,2017-08-16,1,BEAUTY,2,Quito,Pichincha,D,13,Work Day,Work Day,Work Day,46.8
3,3000891,2017-08-16,1,BEVERAGES,20,Quito,Pichincha,D,13,Work Day,Work Day,Work Day,46.8
4,3000892,2017-08-16,1,BOOKS,0,Quito,Pichincha,D,13,Work Day,Work Day,Work Day,46.8


## 4. Feature Engineering

* Let's create some lag features for the target variable `sales`:
    * sales_1_year_ago
    * sales_1_month_ago
    * sales_2_week_ago
    * sales_1_week_ago

We need to convert **date** column to **datetime** format before creating lag features. 

In [5]:
# Parse the date strings so that date arithmetic and the .dt accessor work.
train['date'] = pd.to_datetime(train['date'])

In [6]:
# Lag-feature helper: for every (date, store_nbr, family) row, look up the sales
# recorded for the same store and product family `no_of_days` days earlier.

def get_lag_feature(data, no_of_days, return_Series=False):
    """Attach the sales observed `no_of_days` days before each row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'date' (datetime-like, so that a timedelta
        can be added to it), 'store_nbr', 'family' and 'sales'. Each
        (date, store_nbr, family) combination must appear at most once.
    no_of_days : int
        Size of the lag in days.
    return_Series : bool, default False
        If True, return only the lagged sales column ('sales_y').

    Returns
    -------
    pandas.Series or pandas.DataFrame
        With ``return_Series=True`` the lagged sales; otherwise a frame with
        the columns date, store_nbr, family, sales_x (current sales),
        PAST_DATE (date the lagged value was taken from, NaT if none) and
        sales_y (lagged sales). Missing sales values are replaced by 0.
        The result has one row per row of `data` and carries `data`'s index,
        so it can be assigned straight back as a new column.

    Raises
    ------
    pandas.errors.MergeError
        If (date, store_nbr, family) is not unique in `data`; a duplicated
        key would otherwise silently multiply rows and misalign the result.
    """
    key_cols = ['store_nbr', 'family']
    current = data[['date'] + key_cols + ['sales']]

    # Shift every observation forward by the lag: a row dated D becomes the
    # "past" value for the row dated D + no_of_days.
    past = data[['date'] + key_cols + ['sales']].rename(columns={'date': 'PAST_DATE'})
    past = past.assign(NEW_DATE=past['PAST_DATE'] + timedelta(days=no_of_days))
    past = past[['NEW_DATE', 'PAST_DATE'] + key_cols + ['sales']]

    final = current.merge(
        past,
        how='left',
        left_on=['date'] + key_cols,
        right_on=['NEW_DATE'] + key_cols,
        validate='many_to_one',
    )
    final = final.drop(columns=['NEW_DATE'])

    # Only the sales columns are zero-filled; PAST_DATE stays a proper
    # datetime column (NaT where no earlier observation exists).
    sales_cols = ['sales_x', 'sales_y']
    final[sales_cols] = final[sales_cols].fillna(0)

    # A validated left merge keeps the left row order and count, so the
    # caller's index can be restored for index-aligned column assignment.
    final.index = data.index

    if return_Series:
        return final['sales_y']
    return final

In [7]:
# Lag feature: sales of the same store and product family 365 days earlier
# (0 where the training data has no row for that earlier date).
train['sales_1_year_ago'] = get_lag_feature(train, no_of_days=365, return_Series= True)

In [8]:
# Lag feature: sales of the same store and product family 30 days earlier
# (0 where the training data has no row for that earlier date).
train['sales_1_month_ago'] = get_lag_feature(train, no_of_days=30, return_Series= True)

In [9]:
# Lag feature: sales of the same store and product family 14 days (2 weeks) earlier
# (0 where the training data has no row for that earlier date).
train['sales_2_week_ago'] = get_lag_feature(train, no_of_days=14, return_Series= True)

In [10]:
# Lag feature: sales of the same store and product family 7 days earlier
# (0 where the training data has no row for that earlier date).
train['sales_1_week_ago'] = get_lag_feature(train, no_of_days=7, return_Series= True)

In [11]:
train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,type_local,type_national,type_reg,dcoilwtico,is_national_holiday,sales_1_year_ago,sales_1_month_ago,sales_2_week_ago,sales_1_week_ago
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999,0.0,0.0,0.0,0.0
1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999,0.0,0.0,0.0,0.0
2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999,0.0,0.0,0.0,0.0
3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999,0.0,0.0,0.0,0.0
4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,Work Day,Holiday,Work Day,93.14,99999,0.0,0.0,0.0,0.0


* Let's create a feature **average_in_2_months** which will be the average of daily sales for the last 2 months.

In [17]:
# Add a helper column holding the date 63 days before each row's date;
# get_average_sales uses it as the start of its look-back window.
train['2_MONTH_BEFORE'] = train.date - timedelta(days=63)
# Unused alternative: derive one cut-off from the latest date in the data.
# current_date = train['date'].max()
# two_months_ago = current_date - pd.DateOffset(months=2)
# two_months_ago

In [18]:
# Public tqdm entry point; `tqdm._tqdm_notebook` is a private, deprecated module.
from tqdm.notebook import tqdm as tqdm_notebook

# Registers DataFrame.progress_apply / Series.progress_apply.
tqdm_notebook.pandas()


def get_average_sales(x):
    """Return the mean sales of a row's store/family over its look-back window.

    Parameters
    ----------
    x : pandas.Series
        One row of `train`; must provide 'date', '2_MONTH_BEFORE',
        'store_nbr' and 'family'.

    Returns
    -------
    float
        Mean of `sales` over the rows of the global `train` frame that share
        the row's store_nbr and family and whose date lies in
        [x['2_MONTH_BEFORE'], x['date']). NaN when the window is empty.

    Notes
    -----
    The upper bound excludes the row's own date and everything after it, so
    the feature never contains the value being predicted or future sales.
    Each call scans the whole of `train`, so applying it to every row is
    quadratic in the number of rows.
    """
    in_window = (
        (train['date'] >= x['2_MONTH_BEFORE'])
        & (train['date'] < x['date'])
        & (train['store_nbr'] == x['store_nbr'])
        & (train['family'] == x['family'])
    )
    return train.loc[in_window, 'sales'].mean()

In [20]:
# train['average_in_2_months'] = train.progress_apply(get_average_sales,axis=1)


In [21]:
# fill the null values with 0
# train.average_in_2_months.fillna(0,inplace=True)

In [22]:
# Drop the temporary window-start helper column.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run after the column has already been removed.
train = train.drop(columns=['2_MONTH_BEFORE'], errors='ignore')

In [25]:
train.dtypes

date                   datetime64[ns]
store_nbr                       int64
family                         object
sales                         float64
onpromotion                     int64
city                           object
state                          object
type                           object
cluster                         int64
type_local                     object
type_national                  object
type_reg                       object
dcoilwtico                    float64
is_national_holiday             int64
sales_1_year_ago              float64
sales_1_month_ago             float64
sales_2_week_ago              float64
sales_1_week_ago              float64
dtype: object

* Time-related features

In [56]:
# Calendar features derived from the parsed `date` column of the training data.
train["year"] = train.date.dt.year
train["month"] = train.date.dt.month
train["quarter"] = train.date.dt.quarter
# Series.dt.weekofyear is deprecated (removed in pandas 2.0); isocalendar().week
# is its replacement. It returns UInt32, so cast back to a plain int64 column.
train["week_of_year"] = train.date.dt.isocalendar().week.astype("int64")
train["day_of_year"] = train.date.dt.dayofyear
train["day_of_month"] = train.date.dt.day
train["day_of_week"] = train.date.dt.weekday
# weekday is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday.
train["is_weekend"] = train.date.dt.weekday >= 5
# True on the last day of a month and on the 15th (vectorised, no row-wise apply).
train["is_wage_day"] = train.date.dt.is_month_end | (train.date.dt.day == 15)
train["is_year_end"] = train.date.dt.is_year_end

In [57]:
train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,type_local,...,year,month,quarter,week_of_year,day_of_year,day_of_month,day_of_week,is_weekend,is_wage_day,is_year_end
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,Work Day,...,2013,1,1,1,1,1,1,False,False,False
1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,Work Day,...,2013,1,1,1,1,1,1,False,False,False
2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,Work Day,...,2013,1,1,1,1,1,1,False,False,False
3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,Work Day,...,2013,1,1,1,1,1,1,False,False,False
4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,Work Day,...,2013,1,1,1,1,1,1,False,False,False


In [73]:
train.type_local.value_counts()

Work Day      2989008
Holiday          8085
Additional       3696
Transfer           99
Name: type_local, dtype: int64

In [68]:
# 1 when the local calendar entry is a day off ('Holiday', 'Additional' or
# 'Transfer'), otherwise 0. isin() never yields missing values, so no fillna is needed.
train['is_local_holiday'] = train.type_local.isin(['Holiday', 'Additional', 'Transfer']).astype('int64')
train.is_local_holiday.value_counts()

0    2989008
1      11880
Name: is_local_holiday, dtype: int64

In [71]:
train.type_reg.value_counts()

Work Day    2999865
Holiday        1023
Name: type_reg, dtype: int64

In [74]:
# 1 when the regional calendar entry is exactly 'Holiday', otherwise 0.
# An exact comparison is used because `x in ('Holiday')` tests against a plain
# string: it matches any substring and raises TypeError on a missing (NaN) value.
train['is_reg_holiday'] = train.type_reg.eq('Holiday').astype('int64')
train.is_reg_holiday.value_counts()

0    2999865
1       1023
Name: is_reg_holiday, dtype: int64

In [79]:
# Copy of `train` without the raw date and the four lag-feature columns.
train_final = train.drop(
    [
        'date',
        'sales_1_year_ago',
        'sales_1_month_ago',
        'sales_2_week_ago',
        'sales_1_week_ago',
    ],
    axis='columns',
)

In [80]:
train_final.columns

Index(['store_nbr', 'family', 'sales', 'onpromotion', 'city', 'state', 'type',
       'cluster', 'type_local', 'type_national', 'type_reg', 'dcoilwtico',
       'is_national_holiday', 'year', 'month', 'quarter', 'week_of_year',
       'day_of_year', 'day_of_month', 'day_of_week', 'is_weekend',
       'is_wage_day', 'is_year_end', 'is_local_holiday', 'is_reg_holiday'],
      dtype='object')

In [81]:
train_final.shape

(3000888, 25)

In [62]:
# Parse the dates, then derive the same calendar features as for the training data.
test["date"] = pd.to_datetime(test["date"])
test["year"] = test.date.dt.year
test["month"] = test.date.dt.month
test["quarter"] = test.date.dt.quarter
# Series.dt.weekofyear is deprecated (removed in pandas 2.0); isocalendar().week
# is its replacement. It returns UInt32, so cast back to a plain int64 column.
test["week_of_year"] = test.date.dt.isocalendar().week.astype("int64")
test["day_of_year"] = test.date.dt.dayofyear
test["day_of_month"] = test.date.dt.day
test["day_of_week"] = test.date.dt.weekday
# weekday is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday.
test["is_weekend"] = test.date.dt.weekday >= 5
# True on the last day of a month and on the 15th (vectorised, no row-wise apply).
test["is_wage_day"] = test.date.dt.is_month_end | (test.date.dt.day == 15)
test["is_year_end"] = test.date.dt.is_year_end

In [82]:
# Holiday indicator columns for the test data (1 = day off, 0 = otherwise).
# isin()/eq() never yield missing values, so no fillna is needed.
test['is_local_holiday'] = test.type_local.isin(['Holiday', 'Additional', 'Transfer']).astype('int64')
print(test.is_local_holiday.value_counts())

# Exact comparison: `x in ('Holiday')` would test against a plain string,
# matching substrings and raising TypeError on a missing (NaN) value.
test['is_reg_holiday'] = test.type_reg.eq('Holiday').astype('int64')
print(test.is_reg_holiday.value_counts())

# NOTE(review): in train, is_national_holiday is read from train_merged.csv rather
# than derived here (the preview shows the value 99999) - confirm both frames use
# the same encoding before modelling.
test['is_national_holiday'] = test.type_national.isin(['Holiday', 'Additional', 'Transfer', 'Event', 'Bridge']).astype('int64')
print(test.is_national_holiday.value_counts())

0    28446
1       66
Name: is_local_holiday, dtype: int64
0    28512
Name: is_reg_holiday, dtype: int64
0    28512
Name: is_national_holiday, dtype: int64


In [83]:
test.columns

Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'city', 'state',
       'type', 'cluster', 'type_local', 'type_national', 'type_reg',
       'dcoilwtico', 'year', 'month', 'quarter', 'week_of_year', 'day_of_year',
       'day_of_month', 'day_of_week', 'is_weekend', 'is_wage_day',
       'is_year_end', 'is_local_holiday', 'is_reg_holiday',
       'is_national_holiday'],
      dtype='object')

In [84]:
# Copy of `test` without the raw date column; display its dimensions.
test_final = test.drop('date', axis='columns')
test_final.shape

(28512, 25)

In [85]:
# Write the feature-enriched frames to CSV without the index column.
# NOTE(review): the full `test` / `train` frames are saved here, not the
# `test_final` / `train_final` variants built above - confirm this is intended.
test.to_csv('../../input/store_sales_time_series_forecasting/test_with_features.csv', index=False)
train.to_csv('../../input/store_sales_time_series_forecasting/train_with_features.csv', index=False)