In [2]:
from IPython.display import display, Image
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.serializers import CSVSerializer
from time import gmtime, strftime
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Notebook-wide settings for the SageMaker XGBoost walkthrough.
# Key prefix under which this notebook keeps its training data and model output in S3.
prefix = "sage-boost"
# Region reported by a freshly created boto3 session; used to pick the container image
# and to decide how the bucket is created.
my_region = boto3.session.Session().region_name
# Execution role returned by the SageMaker SDK helper; handed to the estimator later on.
role = get_execution_role()

In [4]:
# Resolve the image URI of the built-in XGBoost algorithm for this region.
# NOTE(review): "latest" is a floating tag — consider pinning an explicit version, but
# re-check the hyperparameters and the response parsing further down before doing so.
XGB_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

In [5]:
# Confirm which region and container image the following cells will work with.
print(
    "Success - the MySageMakerInstance is in the {} region. "
    "You will use the {} container for your SageMaker endpoint.".format(my_region, XGB_container)
)

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [12]:
# Resource-level handle to S3, used below to create the working bucket.
s3 = boto3.resource("s3")
# Bucket that holds the training data and model artifacts of this notebook.
# S3 bucket names are global, so pick a different name if this one is already taken.
bucket_name = "sage-goose"

In [13]:
# Create the working bucket.
# us-east-1 is created without a LocationConstraint; every other region has to be
# named explicitly in CreateBucketConfiguration.
try:
    if my_region == "us-east-1":
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print("Successful Bucket Creation")
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook: the bucket is already ours, so it is safe to carry on.
    print("Bucket already exists and is owned by you - reusing it.")
except Exception as e:
    # Any other failure (name taken by another account, missing permissions, ...) means
    # the upload and training cells cannot work, so report it and stop here instead of
    # letting a later cell fail with a less obvious error.
    print("S3 error!", e)
    raise

Successful Bucket Creation


In [14]:
# Obtaining the data: download the pre-cleaned bank marketing CSV into the working directory.
try:
    urllib.request.urlretrieve(
        "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",
        "bank_clean.csv",
    )
    print("Success!")
except Exception as e:
    # Without the file nothing below can run; show the cause and stop the notebook here
    # rather than continuing with a missing (or stale) bank_clean.csv.
    print("Error.", e)
    raise

Success!


In [15]:
# Load the downloaded CSV; its first column is used as the row index.
try:
    df = pd.read_csv("./bank_clean.csv", index_col=0)
    print("Success!")
except Exception as e:
    # If loading fails `df` is never bound, and the next cell would die with an
    # unrelated NameError. Report the real cause and re-raise it.
    print("Error.", e)
    raise

Success!


In [16]:
# Shuffle every row reproducibly (fixed random_state), then cut by position:
# the first 70% of the shuffled rows are for training, the rest for testing.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes; the two resulting frames are the same.
shuffled_df = df.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(df))
train_data = shuffled_df.iloc[:split_at]
test_data = shuffled_df.iloc[split_at:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [17]:
# Write the training set as header-less CSV with the label (y_yes) in the first column
# and both one-hot label columns removed from the features.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], train_features], axis=1).to_csv('train.csv', index=False, header=False)

# S3 object keys always use '/' as separator, so build the key with plain string
# formatting instead of os.path.join (which would insert the OS path separator).
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the S3 "folder" that now contains train.csv.
trains = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [18]:
# SageMaker session handed to the estimator.
sess = sagemaker.Session()

# Training job definition: the XGBoost container resolved earlier, one ml.m4.xlarge
# instance, model artifacts written under <bucket>/<prefix>/output.
XGB = sagemaker.estimator.Estimator(
    image_uri=XGB_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

# Hyperparameters forwarded to the training container.
XGB.set_hyperparameters(
    max_depth=6,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [19]:
# Launch the training job; only a 'train' channel is supplied (no validation channel).
XGB.fit(inputs={'train': trains})

2021-06-09 16:11:15 Starting - Starting the training job...
2021-06-09 16:11:43 Starting - Launching requested ML instancesProfilerReport-1623255074: InProgress
......
2021-06-09 16:12:43 Starting - Preparing the instances for training.........
2021-06-09 16:14:04 Downloading - Downloading input data...
2021-06-09 16:14:44 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-06-09:16:14:54:INFO] Running standalone xgboost training.[0m
[34m[2021-06-09:16:14:54:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2021-06-09:16:14:54:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8419.96mb[0m
[34m[2021-06-09:16:14:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:14:54] S3DistributionType set as FullyReplicated[0m
[34m[16:14:54] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[16:14:54] src/tree/updater_prune.cc:

In [22]:
# Deploy the trained model to an endpoint backed by a single ml.m4.xlarge instance.
# The endpoint keeps running until the cleanup cell at the end deletes it.
XGB_predictor = XGB.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

-------------!

In [23]:
# Feature matrix for scoring: the test rows without the two one-hot label columns.
test_array = test_data.drop(columns=['y_yes', 'y_no']).values
# Send the rows to the endpoint as CSV.
XGB_predictor.serializer = CSVSerializer()
# The endpoint answers with bytes; decode them to text.
predictions = XGB_predictor.predict(test_array).decode("utf-8")
# Parse the complete comma-separated payload. The previous `predictions[1:]` discarded the
# first character of the first score, which only happened to be harmless while that
# score started with "0." (NOTE(review): assumes the response is a bare comma-separated
# list with no leading delimiter — confirm against the deployed container).
predictions_array = np.fromstring(predictions, sep=",")
print(predictions_array.shape)

(12357,)


In [31]:
# Confusion matrix: rows are the observed label (y_yes), columns are the endpoint
# scores rounded to the nearest integer.
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)

# Cell counts picked by position: first row/column is the "no" class, second is "yes".
tn = cm.iloc[0,0]
fn = cm.iloc[1,0]
tp = cm.iloc[1,1]
fp = cm.iloc[0,1]

# Percentage of all test rows that were classified correctly.
p = (tp+tn)/(tp+tn+fp+fn)*100

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
# The percentages below are taken per predicted column: each count is divided by the
# total of its "predicted no" (tn+fn) or "predicted yes" (tp+fp) column.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase", tn/(tn+fn)*100, tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase", fn/(tn+fn)*100, fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.4%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10757)    38% (179)
Purchase        10% (1131)     62% (290) 



In [32]:
# Closing time.
# Remove the endpoint together with its endpoint configuration.
XGB_predictor.delete_endpoint(delete_endpoint_config=True)

# Delete every object stored in the working bucket (the bucket itself is left in place).
# The delete call is the cell's last expression, so its response is shown as cell output.
s3_resource = boto3.resource("s3")
bucket_to_delete = s3_resource.Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'C08G6T477QWE45QV',
   'HostId': 'n4R/r0Zs/1TaBinKIqSX6dLV8Tv+Q873W1kWAYok6k3bW8N5ymXP+6iWgrfqQldzV1NpYgiQ7JI=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'n4R/r0Zs/1TaBinKIqSX6dLV8Tv+Q873W1kWAYok6k3bW8N5ymXP+6iWgrfqQldzV1NpYgiQ7JI=',
    'x-amz-request-id': 'C08G6T477QWE45QV',
    'date': 'Wed, 09 Jun 2021 16:36:12 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sage-boost/output/xgboost-2021-06-09-16-11-14-949/rule-output/ProfilerReport-1623255074/profiler-output/profiler-reports/GPUMemoryIncrease.json'},
   {'Key': 'sage-boost/output/xgboost-2021-06-09-16-11-14-949/profiler-output/system/incremental/2021060916/1623255240.algo-1.json'},
   {'Key': 'sage-boost/output/xgboost-2021-06-09-16-11-14-949/rule-output/ProfilerReport-1623255074/profiler-output/profiler-reports/Dataloader.json'},
   {'Key': 'sa