# TASK #1: UNDERSTAND THE PROBLEM STATEMENT/GOAL



- This dataset contains weekly sales from 99 departments belonging to 45 different stores. 
- Our aim is to forecast weekly sales from a particular department.
- The objective of this case study is to forecast weekly retail store sales based on historical data.
- The data contains holidays and promotional markdowns offered by various stores and several departments throughout the year.
- Markdowns are crucial to promote sales especially before key events such as Super Bowl, Christmas and Thanksgiving. 
- Developing an accurate model will enable the business to make informed decisions and recommendations that improve its processes in the future. 
- The data consists of three sheets: 
    - Stores
    - Features
    - Sales
- Data Source : https://www.kaggle.com/manjeetsingh/retaildataset

# TASK #2: IMPORT DATASET AND LIBRARIES

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile


In [4]:
# Load the pre-processed dataset; its rows carry Store, Dept, Date and Weekly_Sales
# together with the holiday, markdown and economic columns.
df = pd.read_csv('../data/data_processed.csv')
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,month,Type,Size
0,1,1,2010-05-02,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
1,1,2,2010-05-02,50605.27,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
2,1,3,2010-05-02,13740.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
3,1,4,2010-05-02,39954.04,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315
4,1,5,2010-05-02,32229.38,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,5,A,151315


In [5]:
# Separate the regression target from the model inputs.
# 'Date' is discarded here; the frame already carries a 'month' column.
y = df.loc[:, 'Weekly_Sales']  # target
X = df.drop(['Weekly_Sales', 'Date'], axis=1)  # features

In [6]:
# One-hot encode the categorical identifiers (store type, store id, department id).
# drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, columns = ['Type', 'Store', 'Dept'], drop_first = True)

In [7]:
# Rebuild df with the target as first column, followed by the encoded features
df =  pd.concat([y,X], axis=1)
df.head()

Unnamed: 0,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
1,50605.27,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
2,13740.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
3,39954.04,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0
4,32229.38,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X.shape

(421570, 138)

In [9]:
y.shape

(421570,)

In [10]:
X.head()

Unnamed: 0,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
1,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
2,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
3,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0
4,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Keep the column names: X is converted to a plain array below and loses them
featurenames = X.columns.tolist()

In [12]:
# Convert features and target to float32 NumPy arrays
X = np.asarray(X, dtype='float32')
y = np.asarray(y, dtype='float32')

In [13]:
# Turn the 1-D target vector (421570,) into a column vector (421570, 1)
y = y[:, np.newaxis]
y.shape

(421570, 1)

In [14]:
# Split the data into train (50%), validation (25%) and test (25%) sets.
# A fixed random_state makes the shuffled split - and therefore every metric
# computed further down - reproducible between notebook runs.
# NOTE(review): rows are shuffled at random, so the split ignores the time
# ordering of the weekly records - confirm this is intended for a forecasting task.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

In [15]:
X_train.shape

(210785, 138)

In [16]:
X_val.shape

(105393, 138)

In [17]:
X_test.shape

(105392, 138)

# TRAIN XGBOOST USING SAGEMAKER SCRIPT MODE

In [18]:
# Convert the arrays into a dataframe with the target variable as the first column,
# followed by the feature columns (labelled 0, 1, ..., n_features - 1).
# This is because sagemaker built-in algorithm expects the data in this format.
# The frame is created from the whole feature matrix at once: adding 138 columns one
# by one in a loop is slow and makes pandas emit a "highly fragmented" PerformanceWarning.

train_data = pd.DataFrame(X_train)
train_data.insert(0, 'Target', y_train[:, 0])

  


In [19]:
train_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,137
0,10802.219727,0.0,69.709999,2.809,0.0,0.0,0.0,0.0,0.0,182.431564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14468.509766,0.0,69.709999,2.809,0.0,0.0,0.0,0.0,0.0,182.431564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10464.530273,0.0,32.360001,3.242,25204.669922,2436.75,167.880005,16066.759766,7289.689941,130.645798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,286.290009,0.0,49.259998,3.348,0.0,0.0,0.0,0.0,0.0,127.719582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,35345.730469,0.0,51.310001,3.112,0.0,0.0,0.0,0.0,0.0,132.15213,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Validation frame: target first, then the feature columns labelled 0..n_features-1.
# Built in one step instead of column by column, which is slow and makes pandas
# emit a "highly fragmented" PerformanceWarning.
val_data = pd.DataFrame(X_val)
val_data.insert(0, 'Target', y_val[:, 0])

  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
val_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,137
0,9263.660156,0.0,73.940002,3.667,7582.009766,585.900024,0.0,1019.280029,3522.110107,142.192032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,28.6,3.638,0.0,0.0,0.0,0.0,0.0,134.068253,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20628.429688,0.0,71.449997,3.881,0.0,0.0,0.0,0.0,0.0,208.153503,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7892.990234,0.0,33.59,3.827,26970.650391,2430.780029,152.580002,28367.779297,3625.669922,137.423889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3152.199951,0.0,55.209999,3.951,11984.620117,0.0,47.52,6150.629883,1775.540039,190.922211,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
val_data.shape

(105393, 139)

In [23]:
# Test frame: target first, then the feature columns labelled 0..n_features-1.
# Built in one step instead of column by column, which is slow and makes pandas
# emit a "highly fragmented" PerformanceWarning.
test_data = pd.DataFrame(X_test)
test_data.insert(0, 'Target', y_test[:, 0])

  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
# convert df to libsvm data type
from sklearn.datasets import dump_svmlight_file

def df_to_libsvm(df: pd.DataFrame, filepath: str):
    """Write *df* to *filepath* in svmlight/libsvm text format.

    The 'Target' column supplies the label of each row; every other column
    is written as a feature, with feature indices starting at 0.
    """
    labels = df['Target']
    features = df.drop(columns=['Target'])
    dump_svmlight_file(X=features, y=labels, f=filepath, zero_based=True)

In [25]:
# save train_data, val_data and test_data as libsvm files
df_to_libsvm(train_data, '../data/train')
df_to_libsvm(val_data, '../data/validation')
df_to_libsvm(test_data, '../data/test')

## alternative (disabled): save train_data and validation_data as csv files.
#train_data.to_csv('../data/train.csv', header = False, index = False)
#val_data.to_csv('../data/validation.csv', header = False, index = False)

In [26]:
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python.
# It lets Python code use services such as Amazon S3 and Amazon EC2.

import sagemaker
import boto3
from sagemaker import Session

# Create a SageMaker session
sagemaker_session = sagemaker.Session()
# Disabled alternative: use the session's default bucket instead of a fixed one
#bucket = Session().default_bucket() 
bucket = 'salesprediction-ml-sagemaker'  # S3 bucket that receives the data files and training output
prefix = 'XGBoost-Regressor'  # key prefix ("folder") inside the bucket
key = 'XGBoost-Regressor'  # object name used for each uploaded data file
# The role gives training and hosting access to the data.
# It is specified when creating the SageMaker instance under "Create an IAM role".
role = sagemaker.get_execution_role()

In [27]:
print(role)

arn:aws:iam::103721820087:role/service-role/AmazonSageMaker-ExecutionRole-20190909T202771


In [28]:
# Read the libsvm training file and upload it to the S3 bucket
import os

# S3 object keys always use '/' as separator, so the key is built explicitly rather
# than with os.path.join, which would produce '\\' on Windows and no longer match
# the s3:// URI printed below.
train_key = '{}/train/{}'.format(prefix, key)
with open('../data/train','rb') as f:
    # Upload the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/train/XGBoost-Regressor


In [29]:
# Read the libsvm validation file and upload it to the S3 bucket.
# S3 object keys always use '/' as separator, so the key is built explicitly rather
# than with os.path.join, which would produce '\\' on Windows and no longer match
# the s3:// URI printed below.
validation_key = '{}/validation/{}'.format(prefix, key)
with open('../data/validation','rb') as f:
    # Upload the data into the S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(validation_key).upload_fileobj(f)
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}'.format(bucket, validation_key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/validation/XGBoost-Regressor


In [30]:
# Read the libsvm test file and upload it to the S3 bucket.
# S3 object keys always use '/' as separator, so the key is built explicitly rather
# than with os.path.join, which would produce '\\' on Windows and no longer match
# the s3:// URI printed below.
test_key = '{}/test/{}'.format(prefix, key)
with open('../data/test','rb') as f:
    # Upload the data into the S3 bucket to be accessed later for evaluation
    boto3.Session().resource('s3').Bucket(bucket).Object(test_key).upload_fileobj(f)
# Let's print out the test data location in s3
s3_test_data = 's3://{}/{}'.format(bucket, test_key)
print('uploaded test data location: {}'.format(s3_test_data))

uploaded test data location: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/test/XGBoost-Regressor


In [31]:
# S3 location (bucket/prefix/output) passed to the estimator as output_path,
# i.e. where the training artifacts are written
output_path = "s3://{}/{}/output".format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_path))

training artifacts will be uploaded to: s3://salesprediction-ml-sagemaker/XGBoost-Regressor/output


In [33]:
# Hyperparameters handed to the estimator; every value is given as a string.
# NOTE(review): both "n_estimators" and "num_round" are set - which of them
# xgb_train.py honours cannot be seen from this notebook; confirm.
hyperparams = dict(
    max_depth="5",
    n_estimators="10",  # in scriptmode, we can set e.g. n_estimator, which we cannot in the built-in xgb!
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
    verbosity="1",
)

# Instance type for the training job and format of the data channels
instance_type = "ml.m5.2xlarge"
content_type = "libsvm"
#content_type = "csv"

In [34]:
# Open-source XGBoost in script mode: the estimator runs our own training script
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

# Session object; its runtime client is used further down to invoke the endpoint
session = Session()
# Training script used as the estimator's entry point
script_path = "xgb_train.py"

# Estimator configured with the script, the hyperparameters, the execution role,
# a single training instance and the S3 output location defined above
xgb_script_mode_estimator = XGBoost(
    entry_point=script_path,
    framework_version="1.5-1",  # Note: framework_version is mandatory
    hyperparameters=hyperparams,
    role=role,
    instance_count=1,
    instance_type=instance_type,
    output_path=output_path,
)


In [35]:
# Creating "train", "validation" and "test" channels to feed in the model
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
def _channel(name):
    """Return a TrainingInput pointing at the S3 prefix of channel *name*."""
    return TrainingInput(
        "s3://{}/{}/{}/".format(bucket, prefix, name), content_type=content_type
    )


train_input = _channel("train")
valid_input = _channel("validation")
test_input = _channel("test")

#train_input = sagemaker.session.s3_input(s3_data = s3_train_data, content_type=content_type,s3_data_type = 'S3Prefix')
#valid_input = sagemaker.session.s3_input(s3_data = s3_validation_data, content_type=content_type,s3_data_type = 'S3Prefix')

In [36]:
train_input.config

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
   'S3Uri': 's3://salesprediction-ml-sagemaker/XGBoost-Regressor/train/',
   'S3DataDistributionType': 'FullyReplicated'}},
 'ContentType': 'libsvm'}

In [37]:
# Train the model: only the 'train' and 'validation' channels are passed to the
# training job (test_input is not used here)
data_channels = {'train': train_input,'validation': valid_input}
xgb_script_mode_estimator.fit(data_channels)

2022-11-29 08:51:57 Starting - Starting the training job...
2022-11-29 08:52:13 Starting - Preparing the instances for trainingProfilerReport-1669711917: InProgress
......
2022-11-29 08:53:22 Downloading - Downloading input data...
2022-11-29 08:53:47 Training - Downloading the training image...
2022-11-29 08:54:23 Training - Training image download completed. Training in progress..[34m[2022-11-29 08:54:26.574 ip-10-2-82-147.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-11-29:08:54:26:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-11-29:08:54:26:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-11-29:08:54:26:INFO] Invoking user training script.[0m
[34m[2022-11-29:08:54:27:INFO] Module xgb_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2022-11-29:08:54:27:INFO] Generating setup.cfg[0m
[34m[2022-11-29:08:54:27:INFO] Generating MANIFEST.in[0m
[34m[2022-11-29:08:54:27:I

# TASK #11: DEPLOY THE MODEL TO MAKE PREDICTIONS

In [38]:
# Deploy the trained model to an endpoint (one ml.m5.2xlarge instance) to perform inference.
# The returned predictor is used below for its endpoint name and for clean-up.
predictor = xgb_script_mode_estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m5.2xlarge')

-----!

In [39]:
# test model on libsvm file
from sagemaker.serializers import LibSVMSerializer
# The predictor needs a serializer *instance*; assigning the class itself would make
# predictor.predict() fail as soon as it calls the serializer's methods.
predictor.serializer = LibSVMSerializer()

In [40]:
# Load the whole libsvm validation file as text; it is sent to the endpoint as the request body
with open('../data/validation', "r") as libsvm_file:
    payload = libsvm_file.read()

In [41]:
# Call the endpoint through the session's runtime client, sending the libsvm text as request body
runtime_client = session.sagemaker_runtime_client
response = runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="text/libsvm",
    Body=payload,
)

In [42]:
# Read the response body and decode it to text
result = response["Body"].read().decode("ascii")
# Drop the first character and the last two, then parse the comma-separated numbers.
# NOTE(review): presumably this strips enclosing brackets and a trailing newline -
# confirm against the actual response format of the endpoint.
predicted_values = np.fromstring(result[1:-2], dtype=float, sep=',')
print("Predicted values are {}.".format(predicted_values[:10]))

Predicted values are [13945.86523438  7645.62646484 22433.3828125   8799.90429688
  8799.90429688  6478.43505859 12579.07910156  7271.13378906
  8799.90429688 31876.49609375].


In [43]:
predicted_values.shape

(105393,)

In [44]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# Evaluate the endpoint predictions against the validation targets.
# n = number of validation rows, k = number of feature columns.
n, k = X_val.shape

MSE = mean_squared_error(y_val, predicted_values)
RMSE = float(format(sqrt(MSE), '.3f'))  # root of the MSE, kept to three decimals
MAE = mean_absolute_error(y_val, predicted_values)
r2 = r2_score(y_val, predicted_values)
# Adjusted R2 corrects R2 for the number of predictors k
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

RMSE = 9646.181 
MSE = 93048813.27250983 
MAE = 6324.915943286014 
R2 = 0.8166946874655691 
Adjusted R2 = 0.8164543532917633


In [45]:
# Clean up: delete the deployed model, then the endpoint itself
predictor.delete_model()
predictor.delete_endpoint()