# TASK #1: UNDERSTAND THE PROBLEM STATEMENT/GOAL



- This dataset contains weekly sales from 99 departments belonging to 45 different stores. 
- Our aim is to forecast weekly sales from a particular department.
- The objective of this case study is to forecast weekly retail store sales based on historical data.
- The data contains holidays and promotional markdowns offered by various stores and several departments throughout the year.
- Markdowns are crucial to promote sales especially before key events such as Super Bowl, Christmas and Thanksgiving. 
- Developing accurate model will enable make informed decisions and make recommendations to improve business processes in the future. 
- The data consists of three sheets: 
    - Stores
    - Features
    - Sales
- Data Source : https://www.kaggle.com/manjeetsingh/retaildataset

# TASK #2: IMPORT DATASET AND LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile


In [2]:
df = pd.read_csv('../data/data_processed.csv')
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,month,Type,Size
0,1,1,2010-05-02,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
1,1,2,2010-05-02,50605.27,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
2,1,3,2010-05-02,13740.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
3,1,4,2010-05-02,39954.04,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
4,1,5,2010-05-02,32229.38,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315


In [3]:
# Drop the date
y = df['Weekly_Sales'] # target
X = df.drop(columns = ['Weekly_Sales', 'Date']) # features

In [4]:
X = pd.get_dummies(X, columns = ['Type', 'Store', 'Dept'], drop_first = True)

In [5]:
df =  pd.concat([y,X], axis=1)
df.head()

Unnamed: 0,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
1,50605.27,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
2,13740.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
3,39954.04,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
4,32229.38,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X.shape

(421570, 138)

In [7]:
y.shape

(421570,)

In [8]:
X.head()

Unnamed: 0,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
1,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
2,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
3,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
4,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# save columnames for later purposes
featurenames = list(X.columns)

In [10]:
X = np.array(X).astype('float32')
y = np.array(y).astype('float32')

In [11]:
# reshaping the array from (421570,) to (421570, 1)
y = y.reshape(-1,1)
y.shape

(421570, 1)

In [12]:
# spliting the data into train, validation and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5)

In [13]:
X_train.shape

(210785, 138)

In [14]:
X_val.shape

(105393, 138)

In [15]:
X_test.shape

(105392, 138)

# TASK #10: TRAIN XGBOOST USING SAGEMAKER

In [16]:
# Convert the array into dataframe in a way that target variable is set as the first column and followed by feature columns
# This is because sagemaker built-in algorithm expects the data in this format.

train_data = pd.DataFrame({'Target': y_train[:,0]})
for i in range(X_train.shape[1]):
    train_data[i] = X_train[:,i]

  


In [17]:
train_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,137
0,3288.580078,0.0,60.990002,3.848,8077.140137,512.0,5.13,2746.23999,3677.780029,214.548553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19197.25,1.0,82.089996,3.709,9082.610352,16.0,59.060001,7217.879883,8026.470215,130.932541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41284.929688,0.0,64.110001,2.795,0.0,0.0,0.0,0.0,0.0,211.944275,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3523.449951,0.0,68.980003,3.488,0.0,0.0,0.0,0.0,0.0,206.225922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,299.769989,1.0,57.84,3.596,5069.740234,0.0,10.15,817.150024,2158.820068,198.095047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
val_data = pd.DataFrame({'Target':y_val[:,0]})
for i in range(X_val.shape[1]):
    val_data[i] = X_val[:,i]

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
val_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,137
0,62.369999,0.0,27.030001,3.113,5453.540039,4918.310059,1.88,1838.97998,5303.890137,196.943268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50089.03125,0.0,40.299999,2.938,0.0,0.0,0.0,0.0,0.0,132.870834,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19001.269531,0.0,27.639999,3.08,5185.450195,12090.5,33.900002,1490.060059,4832.680176,130.157516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,19851.390625,0.0,68.720001,3.467,0.0,0.0,0.0,0.0,0.0,219.788574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4171.540039,0.0,68.550003,3.867,11407.219727,15.4,68.639999,3827.679932,2540.330078,215.08783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
val_data.shape

(105393, 139)

In [21]:
# convert df to libsvm data type
from sklearn.datasets import dump_svmlight_file

def df_to_libsvm(df: pd.DataFrame, filepath: str):
    x = df.drop(columns = ['Target'], axis=1)
    y = df['Target']
    dump_svmlight_file(X=x, y=y, f=filepath, zero_based=True)

In [22]:
# save train_data and validation_data as libsvm files compatible for sparkml-lib
df_to_libsvm(val_data, '../data/train')
df_to_libsvm(val_data, '../data/validation')

## also save train_data and validation_data as csv files.
#train_data.to_csv('../data/train.csv', header = False, index = False)
#val_data.to_csv('../data/validation.csv', header = False, index = False)

In [24]:
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developer to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3
from sagemaker import Session

# Let's create a Sagemaker session
sagemaker_session = sagemaker.Session()
#bucket = Session().default_bucket() 
bucket = 'salesprediction-ml-sagemaker'
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'
#Roles give learning and hosting access to the data
#This is specified while opening the sagemakers instance in "Create an IAM role"
role = sagemaker.get_execution_role()

In [25]:
print(role)

arn:aws:iam::103721820087:role/service-role/AmazonSageMaker-ExecutionRole-20190909T202771


In [26]:
# read the data from csv file and then upload the data to s3 bucket
import os
with open('../data/train','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/train/XGBoost-Regressor


In [27]:
# read the data from csv file and then upload the data to s3 bucket
with open('../data/validation','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training

    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(f)
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/validation/XGBoost-Regressor


In [28]:
# creates output placeholder in S3 bucket to store the output
output_path = "s3://{}/{}/output".format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_path))

training artifacts will be uploaded to: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/output


In [29]:
hyperparams = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "1",
}

instance_type = "ml.m5.2xlarge"
content_type = "libsvm"
#content_type = "csv"

In [31]:
# Open Source distributed script mode
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

session = Session()
script_path = "xgb_train.py"

xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.5-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=role,
    instance_count=1,
    instance_type=instance_type,
    output_path=output_path,
)


In [32]:
# Creating "train", "validation" channels to feed in the model
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
train_input = TrainingInput(
    "s3://{}/{}/{}/".format(bucket, prefix, "train"), content_type=content_type
)
valid_input = TrainingInput(
    "s3://{}/{}/{}/".format(bucket, prefix, "validation"), content_type=content_type
)

#train_input = sagemaker.session.s3_input(s3_data = s3_train_data, content_type=content_type,s3_data_type = 'S3Prefix')
#valid_input = sagemaker.session.s3_input(s3_data = s3_validation_data, content_type=content_type,s3_data_type = 'S3Prefix')

In [33]:
train_input.config

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
   'S3Uri': 's3://salesprediction-ml-sagemaker/XGBoost-Regressor/train/',
   'S3DataDistributionType': 'FullyReplicated'}},
 'ContentType': 'libsvm'}

In [None]:
# train the model
data_channels = {'train': train_input,'validation': valid_input}
xgb_script_mode_estimator.fit(data_channels)

2022-11-28 11:31:50 Starting - Starting the training job...
2022-11-28 11:32:14 Starting - Preparing the instances for trainingProfilerReport-1669635109: InProgress
......
2022-11-28 11:33:14 Downloading - Downloading input data...
2022-11-28 11:33:38 Training - Downloading the training image...
2022-11-28 11:34:14 Training - Training image download completed. Training in progress..[34m[2022-11-28 11:34:11.972 ip-10-2-68-228.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-11-28:11:34:12:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-11-28:11:34:12:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-11-28:11:34:12:INFO] Invoking user training script.[0m
[34m[2022-11-28:11:34:12:INFO] Module xgb_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2022-11-28:11:34:12:INFO] Generating setup.cfg[0m
[34m[2022-11-28:11:34:12:INFO] Generating MANIFEST.in[0m
[34m[2022-11-28:11:34:12:I

# TASK #11: DEPLOY THE MODEL TO MAKE PREDICTIONS

In [36]:
# Deploy the model to perform inference 
predictor = xgb_script_mode_estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')

------!

In [37]:
'''
Content type over-rides the data that will be passed to the deployed model, since the deployed model expects data
in text/csv format, we specify this as content -type.

Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content
type

Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html
'''
from sagemaker.predictor import csv_serializer, json_deserializer


Xgboost_regressor.serializer = csv_serializer


NameError: name 'Xgboost_regressor' is not defined

In [None]:
X_test.shape

In [None]:
# making prediction
predictions1 = Xgboost_regressor.predict(X_test[0:10000])

In [None]:
predictions2 = Xgboost_regressor.predict(X_test[10000:20000])

In [None]:
predictions3 = Xgboost_regressor.predict(X_test[20000:30000])

In [None]:
predictions4 = Xgboost_regressor.predict(X_test[30000:31618])

In [None]:
#str(predictions4).split('n')[:-1]

In [None]:
# custom code to convert the values in bytes format to array
def bytes_2_array(x):
    
    # makes entire prediction as string and splits based on ','
    l = str(x).split('n')[:-1]
    
    # Since the first element contains unwanted characters like (b,',') we remove them
    l[0] = l[0][2:]
    #same-thing as above remove the unwanted last character (')
    l[-1] = l[-1][:-1]
    
    # iterating through the list of strings and converting them into float type
    for i in range(len(l)):
        l[i] = float(l[i][:-1])
        
    # converting the list into array
    l = np.array(l).astype('float32')
    
    # reshape one-dimensional array to two-dimensional array
    return l.reshape(-1,1)
    

In [None]:
predicted_values_1 = bytes_2_array(predictions1)
predicted_values_1.shape

In [None]:
predicted_values_2 = bytes_2_array(predictions2)
predicted_values_2.shape

In [None]:
predicted_values_3 = bytes_2_array(predictions3)
predicted_values_3.shape

In [None]:
predicted_values_4 = bytes_2_array(predictions4)
predicted_values_4.shape

In [None]:
predicted_values = np.concatenate((predicted_values_1, predicted_values_2, predicted_values_3, predicted_values_4))

In [None]:
predicted_values.shape

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, predicted_values)),'.3f'))
MSE = mean_squared_error(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
# Delete the end-point
Xgboost_regressor.delete_endpoint()

# TASK #12: PERFORM HYPERPARAMETERS OPTIMIZATION

See Slides for detailed steps

# TASK #13: TRAIN THE MODEL WITH BEST PARAMETERS

In [None]:
# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training

Xgboost_regressor = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.m5.2xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sagemaker_session)

# We can tune the hyper-parameters to improve the performance of the model
Xgboost_regressor.set_hyperparameters(max_depth=25,
                           objective='reg:linear',
                           colsample_bytree = 0.3913546819101119,
                           alpha = 1.0994354985124635,
                           eta = 0.23848185159806115,
                           num_round = 237
                           )


In [None]:
train_input = sagemaker.session.s3_input(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.session.s3_input(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')
data_channels = {'train': train_input,'validation': valid_input}
Xgboost_regressor.fit(data_channels)

In [None]:
# Deploying the model to perform inference

Xgboost_regressor = Xgboost_regressor.deploy(initial_instance_count = 1,
                                             instance_type = 'ml.m5.2xlarge')

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer
Xgboost_regressor.serializer = csv_serializer

In [None]:
## Try to make inference with the entire testing dataset (Crashes!)
#predictions = Xgboost_regressor.predict(X_test)
#predicted_values = bytes_2_array(predictions)

In [None]:
predictions1 = Xgboost_regressor.predict(X_test[0:10000])

In [None]:
predicted_values_1 = bytes_2_array(predictions1)
predicted_values_1.shape

In [None]:
predictions2 = Xgboost_regressor.predict(X_test[10000:20000])
predicted_values_2 = bytes_2_array(predictions2)
predicted_values_2.shape

In [None]:
predictions3 = Xgboost_regressor.predict(X_test[20000:30000])
predicted_values_3 = bytes_2_array(predictions3)
predicted_values_3.shape

In [None]:
predictions4 = Xgboost_regressor.predict(X_test[30000:31618])
predicted_values_4 = bytes_2_array(predictions4)
predicted_values_4.shape

In [None]:
predicted_values = np.concatenate((predicted_values_1, predicted_values_2, predicted_values_3, predicted_values_4))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, predicted_values)),'.3f'))
MSE = mean_squared_error(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

In [None]:
# Delete the end-point
Xgboost_regressor.delete_endpoint()