# TASK #1: UNDERSTAND THE PROBLEM STATEMENT/GOAL



- This dataset contains weekly sales from 99 departments belonging to 45 different stores. 
- Our aim is to forecast weekly sales for a particular department.
- The objective of this case study is to forecast weekly retail store sales based on historical data.
- The data contains holidays and promotional markdowns offered by various stores and several departments throughout the year.
- Markdowns are crucial for promoting sales, especially before key events such as the Super Bowl, Christmas, and Thanksgiving.
- Developing an accurate model will enable the business to make informed decisions and recommendations to improve business processes in the future.
- The data consists of three sheets: 
    - Stores
    - Features
    - Sales
- Data Source : https://www.kaggle.com/manjeetsingh/retaildataset

# TASK #2: IMPORT DATASET AND LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile

In [2]:
# Load the pre-processed dataset from a path relative to this notebook
# and preview the first rows.
df = pd.read_csv('../data/data_processed.csv')
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,month,Type,Size
0,1,1,2010-05-02,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
1,1,2,2010-05-02,50605.27,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
2,1,3,2010-05-02,13740.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
3,1,4,2010-05-02,39954.04,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
4,1,5,2010-05-02,32229.38,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315


In [3]:
# Separate the target from the predictors; 'Date' is left out of the feature set.
y = df['Weekly_Sales']  # target
X = df.drop(['Weekly_Sales', 'Date'], axis=1)  # features

In [4]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummy columns are not redundant.
X = pd.get_dummies(
    X,
    columns=['Type', 'Store', 'Dept'],
    drop_first=True,
)

In [6]:
# Dimensions of the feature frame after one-hot encoding.
X.shape

(421570, 138)

In [7]:
# Length of the target series; it should match the number of rows in X.
y.shape

(421570,)

In [8]:
# Display the encoded feature frame (the rendering is truncated for large frames).
X

Unnamed: 0,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,0,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
1,0,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
2,0,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
3,0,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
4,0,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,0,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,...,0,0,0,1,0,0,0,0,0,0
421566,0,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,...,0,0,0,0,1,0,0,0,0,0
421567,0,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,...,0,0,0,0,0,1,0,0,0,0
421568,0,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Convert features and target to float32 NumPy arrays for model training.
X = np.asarray(X, dtype='float32')
y = np.asarray(y, dtype='float32')

In [10]:
# Turn the 1-D target of shape (421570,) into a column vector of shape (421570, 1).
y = np.reshape(y, (-1, 1))
y.shape

(421570, 1)

In [11]:
# Optional scaling step — currently DISABLED: every statement below is
# commented out, so X and y reach the model unscaled.
# from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scaler_x = StandardScaler()
# X = scaler_x.fit_transform(X)

# scaler_y = StandardScaler()
# y = scaler_y.fit_transform(y)

In [12]:
# Split the data into train (50%), validation (25%) and test (25%) sets.
# A fixed seed makes both shuffles repeatable, so the split sizes and all
# downstream metrics are reproducible under Restart Kernel -> Run All.
# NOTE(review): rows are shuffled at random even though the stated goal is
# forecasting; a chronological split may be more appropriate — confirm.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=SPLIT_SEED
)
# Halve the 50% hold-out: the first half stays as the test set,
# the second half becomes the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=SPLIT_SEED
)

In [13]:
# Size of the training split (half of the rows, per test_size=0.5 above).
X_train.shape

(210785, 138)

In [14]:
# Size of the validation split (half of the hold-out rows).
X_val.shape

(105393, 138)

In [15]:
# Size of the test split (the other half of the hold-out rows).
X_test.shape

(105392, 138)

# TASK #9: TRAIN XGBOOST REGRESSOR IN LOCAL MODE

In [16]:
!pip install xgboost

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [17]:
%%time
# Train an XGBoost regressor model 
import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror', 
                         learning_rate = 0.1, 
                         max_depth = 2,
                         n_estimators = 10)
model.fit(X_train, y_train)

CPU times: user 9.75 s, sys: 321 ms, total: 10.1 s
Wall time: 5.27 s


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=0, reg_alpha=0, ...)

In [18]:
# Show the class of the fitted estimator.
type(model)

xgboost.sklearn.XGBRegressor

In [19]:
# Evaluate the trained model on the held-out test split; the value returned
# by score() is reported below as R2.
result = model.score(X_test, y_test)
print("R2 on test set : {}".format(result))

R2 on test set : 0.18942426009250504


In [20]:
# Generate predictions for the test split and display the resulting array.
y_predict = model.predict(X_test)
y_predict

array([25876.209 , 10775.623 ,  8114.9688, ..., 10775.623 ,  8114.9688,
        8916.834 ], dtype=float32)

In [21]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n = number of test samples, k = number of features; both feed the
# adjusted-R2 correction below.
n, k = X_test.shape

# Error metrics on the test split. RMSE is derived from the MSE computed
# here and kept to three decimal places.
MSE = mean_squared_error(y_test, y_predict)
RMSE = float(format(np.sqrt(MSE), '.3f'))
MAE = mean_absolute_error(y_test, y_predict)

# Goodness of fit: R2, and R2 adjusted for the number of features.
r2 = r2_score(y_test, y_predict)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print(
    'RMSE =', RMSE,
    '\nMSE =', MSE,
    '\nMAE =', MAE,
    '\nR2 =', r2,
    '\nAdjusted R2 =', adj_r2,
)

RMSE = 20375.156 
MSE = 415146980.0 
MAE = 12227.866 
R2 = 0.18942426009250501 
Adjusted R2 = 0.1883614927404368
