In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error,mean_squared_log_error

In [2]:
# Training data file, resolved relative to the notebook's working directory
path = "TRAIN.csv"

# Read the training set; 'Date' is parsed as datetime so date parts can be derived later
df = pd.read_csv(path, parse_dates=['Date'])
print(df.shape)  # (rows, columns) sanity check
df.head()

(188340, 10)


Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [3]:
# Day of the week as an integer (Monday=0 ... Sunday=6), taken from the
# vectorised .dt accessor instead of calling a Python lambda once per row.
df['Weekday'] = df['Date'].dt.weekday

In [4]:
# Divide Store_id by the constant 365/7 to rescale it.
# NOTE(review): the column is overwritten in place, so re-running this cell
# divides again -- restart the kernel and run top to bottom.
df["Store_id"] = df["Store_id"].div(365 / 7)

In [5]:
# Show the first two rows to confirm the new Weekday column and the rescaled Store_id
df.iloc[:2]

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales,Weekday
0,T1000001,0.019178,S1,L3,R1,2018-01-01,1,Yes,9,7011.84,0
1,T1000002,4.852055,S4,L2,R1,2018-01-01,1,Yes,60,51789.12,0


# Modelling

In [6]:
# Predictor columns fed to the model
used_cols = [
    'Store_id',
    'Store_Type',
    'Location_Type',
    'Region_Code',
    'Holiday',
    'Discount',
    'Weekday',
]

# Target, kept as a one-column DataFrame (later cells access it as y.Sales)
y = df[['Sales']]

# One-hot encode the categorical predictors; drop_first removes one level
# per categorical column
X = pd.get_dummies(df[used_cols], drop_first=True)

In [7]:
# Hold out 30% of the rows as a validation set; the seed is fixed so the split is repeatable
X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.3, random_state=1245)

# Shapes of the four pieces, in the order: X_train, y_train, X_cv, y_cv
tuple(part.shape for part in (X_train, y_train, X_cv, y_cv))

((131838, 14), (131838, 1), (56502, 14), (56502, 1))

# Gradient Boosting Regressor

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

In [9]:
# Gradient boosting regressor used for the validation run.
# criterion='squared_error' is the current name for plain mean squared error;
# the former alias 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2,
# where passing it raises an error.
gbr = GradientBoostingRegressor(
    n_estimators=183,
    criterion='squared_error',
    max_depth=7,
    init=None,
    random_state=12345,
    max_features=9,
)

# Fit on the training split; the target is passed as a 1-D array
gbr.fit(X_train, y_train.Sales.values)

# Predict the held-out validation split
yhat_cv = gbr.predict(X_cv)

# Mean squared log error on the validation split
msle = mean_squared_log_error(y_cv, yhat_cv)
msle

0.08890447465592752

# Prediction for Test Data

In [10]:
# Refit the same configuration on all labelled rows before predicting the test set.
# criterion='squared_error' replaces the alias 'mse', which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
gbr = GradientBoostingRegressor(
    n_estimators=183,
    criterion='squared_error',
    max_depth=7,
    init=None,
    random_state=12345,
    max_features=9,
)
gbr.fit(X=X, y=y.Sales.values)

# This score is computed on the same rows the model was just fitted on,
# so it is an in-sample error, not a generalisation estimate.
yhat = gbr.predict(X)
msle = mean_squared_log_error(y_true=y, y_pred=yhat)
msle

0.08786736935282981

In [11]:
# Load the test data, parsing 'Date' as datetime
test_df = pd.read_csv('TEST_FINAL.csv', parse_dates=['Date'])

test_id = test_df.ID  # kept aside for the submission file

# Apply the same feature engineering as the training data
test_df['Store_id'] = test_df.Store_id / (365 / 7)  # same rescaling as in training
test_df['Weekday'] = test_df['Date'].dt.weekday     # Monday=0 ... Sunday=6

# Keep only the predictors the model was trained on
test_df = test_df[['Store_id', 'Store_Type', 'Location_Type', 'Region_Code',
                   'Holiday', 'Discount', 'Weekday']]

# One-hot encode the categorical predictors
test_df = pd.get_dummies(test_df, drop_first=True)

# The test file is encoded independently of the training file, so its dummy
# columns can differ in set or order from the training matrix X. Align them:
# columns missing from the test data are added as all-zero, unseen extras are
# dropped, and the order is made identical to what the model was fitted on.
# NOTE(review): with drop_first=True a different baseline level could be dropped
# if the test file lacks a category present in training -- verify the category
# levels match.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [12]:
# Predict sales for every test row, then take a flattened 1-D copy of the result
test_predictions_raw = gbr.predict(test_df)
prediction_on_test = test_predictions_raw.flatten()

# Peek at the first ten predictions
prediction_on_test[:10]

array([55165.10958051, 39598.32207568, 78538.45352586, 38033.82824695,
       40935.81117574, 96321.31536702, 64277.17062325, 50519.95120478,
       54079.07457269, 35296.40171342])

In [13]:
# Stack the IDs and the predictions as two rows, then transpose so each
# row of `data` is one (ID, prediction) pair
stacked_rows = np.array([test_id.values, prediction_on_test])
data = stacked_rows.T
data.shape

(22265, 2)

In [14]:
# Submission frame with one row per test ID.
# `data` mixes the ID values and the float predictions in a single array, so the
# Sales column would otherwise come out as generic objects; cast it back to
# float64 so it behaves as a numeric column.
gbr_reg = pd.DataFrame(data=data, columns=['ID', 'Sales']).astype({'Sales': 'float64'})

In [15]:
# Percentage of predictions that are distinct values
gbr_reg['Sales'].nunique(dropna=False) * 100 / len(gbr_reg)

26.135189759712553

In [16]:
# Write the ID/Sales pairs to the submission file, without the DataFrame index column
gbr_reg.to_csv(path_or_buf='gbr_reg_final.csv', index=False)