### Importing Important Libraries

Steps To Be Followed
1. Importing necessary Libraries
2. Creating S3 bucket
3. Mapping train And Test Data in S3
4. Mapping The path of the models in S3

In [1]:
import sagemaker  # to use existing alogorithm like XGBoost : downlading an imagecontainer which has the XGBoost from get_image_uri library
import boto3  # Using this we can even read S3 buckets which are public from our local enviornment from python.
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session # if we want to use this instance w.r.t sagemaker we have to create the Session


In [8]:
bucket_name = 'bankappmlproject'  # for creating s3 bucket with the help of code so that it becomes an automated process
my_region = boto3.session.Session().region_name
print(my_region)

us-east-2


In [9]:
s3 = boto3.resource('s3')
try:
    if (my_region=='us-east-2'):
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': 'us-east-2'})
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error', e)

S3 bucket created successfully


In [10]:
# set an output path where the trained model will be saved, helps in versioning and refering back.
# on retraining the model will be saved 
prefix = 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://bankappmlproject/xgboost-as-a-built-in-algo/output


#### Downloading the Dataset and Storing in S3

In [11]:
# get dataset from urllib an rename as bank_clean.csv
import pandas as pd
import urllib
try:
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",'bank_clean.csv')
    print('Success : downloaded bank_clean.csv')
except Exception as e:
    print('Data load error: ', e)
    
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col=0)
    print('Success : Data Loaded into DataFrame.')
except Exception as e:
    print('Data load error: ',e)    
    
# will appear in the notebook instance folder once data is loaded.

Success : downloaded bank_clean.csv
Success : Data Loaded into DataFrame.


In [12]:
### Train Test Split

import numpy as np
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7*len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [14]:
# SageMaker requires the format that the dependent variable needs to be places before all other columns in the dataset.

# Saving Train and Test into Buckets
# Train Data into Buckets
import os
pd.concat([train_data['y_yes'], train_data.drop(['y_no','y_yes'], axis =1)], axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name,prefix), content_type='csv')

In [15]:
# Saving Test Data into Buckets
import os
pd.concat([test_data['y_yes'], test_data.drop(['y_no','y_yes'], axis =1)], axis=1).to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name,prefix), content_type='csv')

### Building and Training Models - XGBoost Inbuilt Algorithm

In [29]:
# Models are present in the form of container or images in the sage maker. 
# We have to pull it using library get_image_uri in our instance.
from sagemaker.amazon.amazon_estimator import get_image_uri

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = sagemaker.image_uris.retrieve('xgboost', boto3.Session().region_name,  version = '1.0-1')


In [30]:
# initialize hyperparameters
# Do not do hyper paarmeter tuning in sagemaker as you may get charged. Hence Krish has done it on his local and got the values.
hyperparameters = {
    'max-depth':"5",
    'eta':'0.2',
    'gamma':'4',
    'min_child_weight':'6',
    'subsample':'0.7',
    'objective':'binary:logistic'}

In [27]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                        role=sagemaker.get_execution_role(),
                                        instance_count=1,
                                        instance_type='ml.m5.2xlarge',
                                        volume_size=5,
                                        max_run=300,
                                        output_path=output_path,
                                        hyperparameters=hyperparameters,
                                        use_spot_instances=True,
                                        max_wait=600)

In [28]:
# Pass the train and test data paths to estimator
estimator.fit({'train':s3_input_train, 'validation':s3_input_test})

2021-08-07 22:19:18 Starting - Starting the training job...
2021-08-07 22:19:41 Starting - Launching requested ML instancesProfilerReport-1628374758: InProgress
...
2021-08-07 22:20:07 Starting - Preparing the instances for training............
2021-08-07 22:22:04 Downloading - Downloading input data
2021-08-07 22:22:04 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mERROR:sagemaker-containers:Reporting training FAILURE[0m
[34mERROR:sagemaker-containers:framework error: [0m
[34mTraceback (most recent call last):
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_container

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2021-08-07-22-19-18-486: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_xgboost_container/training.py", line 94, in main
    train(framework.training_env())
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_xgboost_container/training.py", line 90, in train
    run_algorithm_mode()
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_xgboost_container/training.py", line 68, in run_algorithm_mode
    checkpoint_config=checkpoint_config
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 110, in sagemaker_train
    validated_train_config = hyperparameters.validate(train_config)
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_algorithm_toolkit/hyperparameter_validation.py", line 270, in validate
    raise exc.UserError("Missing required hyperparameter: {}".format(hp)

### Deploying the Machine Learning Model

In [None]:
# Evreytime on training any Machine Learning algorithm new model files will be created and stored in the output folder of the S3 Bucket.
# Hence model versioning is very important as whenever new data comes you retrain the model and save as per the new timestamp.
# Model is saved in the form of zip files.
xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')  # Warning : This can cause billing.

### Deploy Machine Learning Model as Endpoints

### Prediction of the Test Data

In [None]:
from sagemaker.predictor import csv_serializer
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

In [None]:
predictions_array

In [None]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

### Deleting The Endpoints

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()