# Data Model - Regression

## Initial Setup for XGBoost

In [1]:
#installation for xgboost
!pip install xgboost 



## Import Libraries

In [2]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Loading dataset 

In [3]:
# Read the merged training data from disk.
train_final = pd.read_csv(r'D:\Jupyter\Merged\train_final.csv')

# Parse the 'Date' strings (year-month-day) into timestamps, use them as the
# row index, and remove the now-redundant 'Date' column.
train_final['Date'] = pd.to_datetime(train_final['Date'], format='%Y-%m-%d')
train_final.index = train_final['Date']
train_final = train_final.drop(columns='Date')

# Preview the first rows.
train_final.head()

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Type,Size,Year,Week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-05,1,1,24924.5,0,3,151315,2010,5
2010-02-05,1,2,50605.27,0,3,151315,2010,5
2010-02-05,1,3,13740.12,0,3,151315,2010,5
2010-02-05,1,4,39954.04,0,3,151315,2010,5
2010-02-05,1,5,32229.38,0,3,151315,2010,5


## Setting Training Set 

In [4]:
# Split the frame into predictors (independent variables) and the
# regression target (dependent variable).
feature_columns = ['Store', 'Dept', 'IsHoliday', 'Type', 'Size', 'Year', 'Week']
x = train_final[feature_columns]
y = train_final['Weekly_Sales']

## Train Test Split

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

In [6]:
# Standardize every feature to zero mean and unit variance (StandardScaler
# does not bound values to a fixed range). The scaler is fitted on the
# training split only and then applied to both splits, so no statistics
# from the test rows are used for fitting.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

## Decision Tree Modeling

In [7]:
# Baseline model: a single decision tree regressor, seeded so the fitted
# tree is reproducible. Predictions are made on the held-out test features.
dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

## Random Forest Regressor Modeling

In [8]:
# Random forest regressor with the library's default hyperparameters.
# random_state is pinned (same seed as the split and the decision tree) so
# the bootstrap sampling, and therefore the reported metrics, are
# reproducible on a fresh kernel run.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

## XGBoost Modeling

In [9]:
# Gradient-boosted trees via XGBoost's scikit-learn wrapper (a regressor,
# despite the `_clf` suffix; the name is kept so other cells keep working).
# `n_jobs` is the wrapper's documented thread-count parameter and replaces
# the deprecated `nthread` alias.
xgb_clf = xgb.XGBRegressor(n_jobs=4, n_estimators=500, max_depth=6, learning_rate=0.5)
xb = xgb_clf.fit(X_train, y_train)  # holds whatever fit() returns
y_pred_xgb = xgb_clf.predict(X_test)

## Regression Model Comparison

In [10]:
# Compare the three models on the held-out test set using MSE, MAE and RMSE.
# Output is grouped by metric, with a single-space line between groups.
predictions = [('DT', y_pred_dt), ('RFR', y_pred_rfr), ('XGB', y_pred_xgb)]
metric_fns = [
    ('MSE', mean_squared_error),
    ('MAE', mean_absolute_error),
    # RMSE is the square root of the mean squared error.
    ('RMSE', lambda actual, predicted: math.sqrt(mean_squared_error(actual, predicted))),
]

for position, (metric_name, metric_fn) in enumerate(metric_fns):
    if position > 0:
        print(" ")  # separator between metric groups
    for model_name, model_pred in predictions:
        print(metric_name + '_' + model_name, metric_fn(y_test, model_pred))

MSE_DT 21696159.527696967
MSE_RFR 12115351.791808713
MSE_XGB 10625660.654117089
 
MAE_DT 1682.7541789817433
MAE_RFR 1356.0245261933567
MAE_XGB 1689.265133534543
 
RMSE_DT 4657.9136453670935
RMSE_RFR 3480.711391627969
RMSE_XGB 3259.7025407415763
