# Data Preprocessing


In [3]:

import warnings
warnings.filterwarnings("ignore")

# loading packages
# basic + dates 
import numpy as np
import pandas as pd
from pandas import datetime

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline

# time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import json
import os
import logging

import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

In [4]:
sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [5]:
train_data= pd.read_csv("D:/work/week3/train.csv")
train_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [6]:
test_data= pd.read_csv("D:/work/week3/test.csv")
test_data.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [7]:
store_data = pd.read_csv("D:/work/week3/store.csv")
store_data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [8]:
# check missing values
store_data.isnull().sum()

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [9]:
# train data
train_data.head(5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [10]:
# check missing values for train data
train_data.isnull().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

In [13]:
train_store=pd.merge(train_data,store_data,how='inner',on='Store')
train_store.fillna(0,inplace=True)
train_store.sample()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
884867,971,2,2015-01-13,9359,1015,1,1,0,0,c,a,1140.0,5.0,2011.0,1,14.0,2012.0,"Mar,Jun,Sept,Dec"


In [14]:
train_store['Date']=pd.to_datetime(train_store['Date'])
train_store=train_store.sort_values('Date')
train_store['Day']=train_store['Date'].dt.day
train_store['Month']=train_store['Date'].dt.month
train_store['Year']=train_store['Date'].dt.year
train_store['WeekOfYear'] = train_store['Date'].dt.weekofyear

# Machine Learning

Preprocessing

In [15]:
#drop Date and Open columns
#drop rows where Sales=0
train_store = train_store.drop(['Date','Open','PromoInterval'],axis=1)
train_store = train_store.loc[~(train_store['Sales'] == 0)]

In [16]:
train_store['DayOfWeek']=train_store['DayOfWeek'].apply(str)
train_store['Year']=train_store['Year'].apply(str)
train_store['Promo']=train_store['Promo'].apply(str)
train_store['Promo2']=train_store['Promo2'].apply(str)
train_store['SchoolHoliday']=train_store['SchoolHoliday'].apply(str)
train_store['StateHoliday']=train_store['StateHoliday'].apply(str)
train_store['WeekOfYear']=train_store['WeekOfYear'].apply(str)

In [17]:

train_store.isnull().sum()

Store                        0
DayOfWeek                    0
Sales                        0
Customers                    0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
Day                          0
Month                        0
Year                         0
WeekOfYear                   0
dtype: int64

In [18]:

train_store.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844338 entries, 621467 to 0
Data columns (total 19 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      844338 non-null  int64  
 1   DayOfWeek                  844338 non-null  object 
 2   Sales                      844338 non-null  int64  
 3   Customers                  844338 non-null  int64  
 4   Promo                      844338 non-null  object 
 5   StateHoliday               844338 non-null  object 
 6   SchoolHoliday              844338 non-null  object 
 7   StoreType                  844338 non-null  object 
 8   Assortment                 844338 non-null  object 
 9   CompetitionDistance        844338 non-null  float64
 10  CompetitionOpenSinceMonth  844338 non-null  float64
 11  CompetitionOpenSinceYear   844338 non-null  float64
 12  Promo2                     844338 non-null  object 
 13  Promo2SinceWeek            84

In [19]:
train,test = train_test_split(train_store,test_size=0.2)
train,val = train_test_split(train,test_size=0.2)

print('Train set:',len(train))
print('Validation set:',len(val))
print('Test set:',len(test))

Train set: 540376
Validation set: 135094
Test set: 168868


In [20]:
cat_cols = ['StateHoliday','SchoolHoliday','DayOfWeek','Promo','Promo2','Assortment','StoreType','Year','WeekOfYear']
num_cols = [i for i in train.columns if i not in cat_cols]

num_transformer = Pipeline(steps = [('imp', IterativeImputer(initial_strategy='median')),
                                    ('scaler', StandardScaler())])

cat_transformer = Pipeline(steps = [('imp', SimpleImputer(strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('cat',cat_transformer,cat_cols)])

# 
Random Forest Regression

In [21]:
rand_forest=Pipeline(steps=[('preprocessor',preprocessor),('random_forest', RandomForestRegressor(max_depth=10,random_state=2))],verbose=True)
rand_forest.fit(train,train['Sales'].values)
#predictions for validation data
rand_pred=rand_forest.predict(val)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total= 3.4min
[Pipeline] ..... (step 2 of 2) Processing random_forest, total= 5.7min


# 
Decision Tree Regression

In [22]:
dtree=Pipeline(steps=[('preprocessor',preprocessor),('decision_tree',DecisionTreeRegressor(splitter='random', max_depth=10, random_state=2))],verbose=True)
dtree.fit(train,train['Sales'].values)
#predictions for validation data
dtree_pred=dtree.predict(val)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total= 3.2min
[Pipeline] ..... (step 2 of 2) Processing decision_tree, total=   4.0s


# 
SGD Regression

In [23]:
sgd=Pipeline(steps=[('preprocessor',preprocessor),('sdg_reg',SGDRegressor(eta0=0.1,fit_intercept=False,shuffle=False,learning_rate='adaptive',random_state=2))],verbose=True)
sgd.fit(train,train['Sales'].values)
#predictions for validation data
sgd_pred=sgd.predict(val)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total= 3.1min
[Pipeline] ........... (step 2 of 2) Processing sdg_reg, total=  11.2s


# 
Serialising

In [24]:
models=[rand_forest,dtree,sgd]
with open("../pickle/30-07-2021-20-51-03-00.pkl.", "wb") as f:
    for model in models:
         pickle.dump(model, f)

FileNotFoundError: [Errno 2] No such file or directory: '../pickle/30-07-2021-20-51-03-00.pkl.'