In [1]:
#Call data manipulation libraries
import numpy as np
import pandas as pd
import seaborn as sns
#Graphing
import matplotlib.pyplot as plt 

#Data transformation classes
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#Dimensionality reduction
from sklearn.decomposition import PCA 

#Data splitting and model parameter search
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from xgboost.sklearn import XGBRegressor
#Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#Random forest modelling
from sklearn.ensemble import RandomForestRegressor
import time
import gc
from scipy.stats import uniform
import calendar

#Display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#To read data of train and store file
ross_train = pd.read_csv("train.csv",low_memory=False)
ross_store=store = pd.read_csv("store.csv")
ross_train.head()
ross_store.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [3]:
#To check for null value in the datasets
ross_train.isnull().sum()
ross_store.isnull().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [4]:
#Merging train and store file
data = pd.merge(ross_train, ross_store,on = 'Store', how='left')
data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,,,
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,,,


In [5]:
data.shape


(1017209, 18)

In [6]:
#Considering only the records have Sales > 0 since the dataset is already too large
#data = data[data['Sales'] > 0]

data.dropna(inplace = True)

In [7]:
#to check the reduction in the dataset
data.shape

(324326, 18)

In [8]:
def checkpromomonth(row):
 if (row['MonthName'] in row['PromoInterval']):
    return 1
 else:
    return 0

In [9]:
def ProcessData(data):
    data["CompetitionDistance"].fillna(data["CompetitionDistance"].mean(), inplace = True)
    
    data['StateHoliday']= data['StateHoliday'].map({'0':0, 0: 0,'a':1, 'b' : 2, 'c': 3})
    
    data['Date']=pd.to_datetime(data['Date'])
    data['Year']=data['Date'].dt.year
    data['MonthNumber']=data['Date'].dt.month
    data['MonthName']=data['MonthNumber'].apply(lambda x: calendar.month_abbr[x])
    data['Day']=data['Date'].dt.day
    data['WeekNumber']=data['Date'].dt.weekofyear

    data['CompetitionOpen'] = 12 * (data['Year'] - data['CompetitionOpenSinceYear']) + (data['MonthNumber'] - data['CompetitionOpenSinceMonth'])
    data['CompetitionOpen'] = data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)

    data['Promo2Open'] = 12 * (data['Year'] - data['Promo2SinceYear']) + (data['WeekNumber'] - data['Promo2SinceWeek']) / float(4)
    data['Promo2Open'] = data['Promo2Open'].apply(lambda x: x if x > 0 else 0)

    data['PromoInterval']=data['PromoInterval'].astype(str)
    
    data['IsPromoMonth'] =  data.apply(lambda row: checkpromomonth(row),axis=1)

    data.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], axis = 1,  inplace = True)
    data.drop(['Promo2SinceYear', 'Promo2SinceWeek'], axis = 1,  inplace = True)
    data.drop(['Date', 'MonthName','PromoInterval'], axis = 1,  inplace = True)


In [10]:
ProcessData(data)

  data['WeekNumber']=data['Date'].dt.weekofyear


In [13]:
#To check for null value in the dataset
data.isnull().sum()
data.shape
data.head()

Store                  0
DayOfWeek              0
Sales                  0
Customers              0
Open                   0
Promo                  0
StateHoliday           0
SchoolHoliday          0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
Year                   0
MonthNumber            0
Day                    0
WeekNumber             0
CompetitionOpen        0
Promo2Open             0
IsPromoMonth           0
dtype: int64

(324326, 19)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,MonthNumber,Day,WeekNumber,CompetitionOpen,Promo2Open,IsPromoMonth
1,2,5,6064,625,1,1,0,1,a,a,570.0,1,2015,7,31,31,92.0,64.5,1
2,3,5,8314,821,1,1,0,1,a,a,14130.0,1,2015,7,31,31,103.0,52.25,1
10,11,5,10457,1236,1,1,0,1,a,c,960.0,1,2015,7,31,31,44.0,43.5,1
13,14,5,6544,710,1,1,0,1,a,a,1300.0,1,2015,7,31,31,16.0,45.75,1
14,15,5,9191,766,1,1,0,1,d,c,4110.0,1,2015,7,31,31,64.0,52.25,1


In [14]:
data['StoreType'].value_counts()
data['Assortment'].value_counts()
data['StoreType']= data['StoreType'].map({'a':1, 'b' : 2, 'c': 3, 'd' : 4})
data['Assortment'] = data['Assortment'].map({'a':1, 'b' : 2, 'c': 3})

a    177912
d    103796
c     42618
Name: StoreType, dtype: int64

a    189304
c    135022
Name: Assortment, dtype: int64

In [15]:
data = data.astype('int32')

In [16]:
#Start of regression problem preparation
y = data['Sales']
data.drop(['Sales','Customers'], axis = 1,  inplace = True)

In [17]:
num_columns = data.columns[data.nunique() > 12]
cat_columns = data.columns[data.nunique() <= 12]
num_columns
cat_columns

Index(['Store', 'CompetitionDistance', 'Day', 'WeekNumber', 'CompetitionOpen',
       'Promo2Open'],
      dtype='object')

Index(['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'Promo2', 'Year', 'MonthNumber',
       'IsPromoMonth'],
      dtype='object')

In [18]:
ct=ColumnTransformer([
    ('rs',RobustScaler(),num_columns),
    ('ohe',OneHotEncoder(),cat_columns),
    ],
    remainder="passthrough"
    )
ct.fit_transform(data)

array([[-0.98076923, -0.33001988,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.97902098,  2.36580517,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.96503497, -0.25248509,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.95104895, -0.16500994, -1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.95454545,  0.25049702, -1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.95804196, -0.06560636, -1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [19]:
X=data

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.30)
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(227028, 17)

(97298, 17)

(227028,)

(97298,)

In [21]:
steps_xg = [('sts', StandardScaler() ),
            ('pca', PCA()),
            ('xg',  XGBRegressor(objective='reg:squarederror',silent = False, n_jobs=3, reg_lambda=1,gamma=0))
            ]

pipe_xg = Pipeline(steps_xg)

pipe_xg.get_params()

{'memory': None,
 'steps': [('sts', StandardScaler()),
  ('pca', PCA()),
  ('xg',
   XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
                colsample_bynode=None, colsample_bytree=None, gamma=0, gpu_id=None,
                importance_type='gain', interaction_constraints=None,
                learning_rate=None, max_delta_step=None, max_depth=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=3, num_parallel_tree=None,
                random_state=None, reg_alpha=None, reg_lambda=1,
                scale_pos_weight=None, silent=False, subsample=None,
                tree_method=None, validate_parameters=None, verbosity=None))],
 'verbose': False,
 'sts': StandardScaler(),
 'pca': PCA(),
 'xg': XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=0, gpu_id=None,
              importance_type='gain', inter

In [22]:
#defining root mean square percentage error for evaluation
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def RMSPE(y, yhat):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe 

In [23]:
#Randomized Search
parameters = {'xg__learning_rate':  uniform(0, 1),
              'xg__n_estimators':   range(50,300),
              'xg__max_depth':      range(3,10),
              'pca__n_components' : range(10,17)}

rs = RandomizedSearchCV(pipe_xg,
                        param_distributions=parameters,
                        #scoring=make_scorer(mean_squared_error, squared=False),
                        #scoring= RMSPE,
                        n_iter=15,    
                        verbose = 1,
                        #refit = RMSPE,
                        n_jobs = 3,
                        cv = 3              
                        )

In [24]:
start = time.time()
rs.fit(X_train, y_train)
end = time.time()
(end - start)/60 

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('sts', StandardScaler()),
                                             ('pca', PCA()),
                                             ('xg',
                                              XGBRegressor(base_score=None,
                                                           booster=None,
                                                           colsample_bylevel=None,
                                                           colsample_bynode=None,
                                                           colsample_bytree=None,
                                                           gamma=0, gpu_id=None,
                                                           importance_type='gain',
                                                           interaction_constraints=None,
                                                           learning_rate=None,
                                                           max

38.6602469642957

In [25]:
# Model with parameters of random search
model_rs = XGBRegressor(objective='reg:squarederror',silent = False, n_jobs=3, reg_lambda=1,gamma=0,
                    learning_rate = rs.best_params_['xg__learning_rate'],
                    max_depth = rs.best_params_['xg__max_depth'],
                    n_estimators=rs.best_params_['xg__max_depth']
                    )


model_rs.fit(X_train, y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1786622755196361, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=8, n_jobs=3, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=False,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [26]:
y_pred_rs = model_rs.predict(X_test)
RMSPE(y_test,y_pred_rs)

rs.best_score_

0.2952754304041464

0.8717232759373986

In [27]:
import math
accuracy_rs =  math.sqrt(sum((y_test - y_pred_rs)**2)/y_test.count())
print("Accuracy with Random search XGB model:",accuracy_rs*100)

Accuracy with Random search XGB model: 218473.69809623883


In [28]:
X_test_df = X_test.reset_index()
y_test_df = y_test.reset_index()
y_pred_df  = pd.DataFrame(y_pred_rs)

final = X_test_df
#final output
final = final.merge(y_test_df, left_index=True, right_index=True)
final = final.merge(y_pred_df, left_index=True, right_index=True)
final

Unnamed: 0,index_x,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,Year,MonthNumber,Day,WeekNumber,CompetitionOpen,Promo2Open,IsPromoMonth,index_y,Sales,0
0,548499,705,2,1,0,0,0,1,1,4140,...,2014,2,25,9,17,33,1,548499,5533,3860.925537
1,114481,752,1,1,0,0,0,1,1,970,...,2015,4,20,17,25,20,0,114481,3994,3533.118164
2,954307,653,2,1,0,0,0,4,3,7520,...,2013,2,26,9,0,39,1,954307,4454,4350.058594
3,282211,18,3,1,1,0,0,4,3,13840,...,2014,11,12,46,53,32,0,282211,6981,5549.661133
4,688519,235,2,1,1,0,1,1,1,5710,...,2013,10,22,43,19,49,1,688519,6377,5601.420410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97293,769828,149,6,1,0,0,0,4,1,2610,...,2013,8,10,32,85,28,0,769828,3987,3710.249023
97294,755936,752,5,1,0,0,1,1,1,970,...,2013,8,23,34,5,0,1,755936,3192,4050.667480
97295,246572,1006,7,0,0,0,0,3,3,3890,...,2014,12,21,51,97,23,0,246572,0,0.103553
97296,480037,258,7,0,0,0,0,1,1,27190,...,2014,4,27,17,45,55,1,480037,0,0.103553
