In [1]:
################################################################################
#
# Creates a supervised learning model trained on bank customer data that
# predicts whether customers will order new certificates of deposit.
#
# Uses an XGBoost library algorithm.
#
# To run, do the following:
#
#         1. Visit https://console.aws.amazon.com/sagemaker .
#         2. Create a notebook instance using the default settings except for
#                    creating a new IAM role.
#         3. When the notebook is ready select "Open Jupyter".
#         4. In Jupyter, create a new notebook using the conda_python3 kernel.
#         5. Paste this code into a cell and run it.
#
# To shut down, execute the commented lines at the end.
#
# Code inspired by the tutorial located here:
#
#         https://aws.amazon.com/getting-started/hands-on/build-train-deploy-machine-learning-model-sagemaker/
#
################################################################################

import boto3
import sagemaker
import sagemaker.serializers
import numpy
import pandas
import urllib.request
import os

# Location of the CSV file holding the bank customer data.
DATA_URL = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"

# The bucket name gets a random numeric suffix (presumably so that it does
# not clash with a bucket that already exists).
bucket_suffix = numpy.random.randint(1000000)
S3_BUCKET = f"unused-bucket-name-{bucket_suffix}"

# Key prefix inside the bucket; used for the training data and for the
# training output.
S3_FOLDER = "demo"

# Instance type used both for the training job and for the model server.
INSTANCE = "ml.m4.xlarge"

# Region of the current boto3 session.
region = boto3.Session().region_name

# Looks up the URI of the SageMaker XGBoost container image for that region.
xgb_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="latest",
)

print(f"CREATED: SageMaker container {xgb_container} in region {region}")

# Creates an S3 bucket.
#
# Outside us-east-1, S3 rejects a create request that does not name the
# region as a location constraint, so the constraint is added whenever the
# session region is anything else.

bucket_args = {"Bucket": S3_BUCKET}
if region not in (None, "us-east-1"):
    bucket_args["CreateBucketConfiguration"] = {"LocationConstraint": region}

try:
    boto3.resource("s3").create_bucket(**bucket_args)
except Exception as error:
    # Every later step needs the bucket, so report the actual cause and stop
    # here rather than failing further down with a less helpful message.
    print(f"Error: could not create bucket {S3_BUCKET}: {error}")
    raise
else:
    print(f"CREATED: bucket {S3_BUCKET}")

# Downloads the data to a file and creates a corresponding Pandas dataframe.
# The first CSV column is used as the dataframe index.

try:
    urllib.request.urlretrieve(DATA_URL, "bank_clean.csv")
    all_data = pandas.read_csv("./bank_clean.csv", index_col=0)
except Exception as error:
    # Without the dataframe nothing below can run, so report the actual
    # cause and stop here instead of continuing with all_data undefined.
    print(f"Error: could not load the data from {DATA_URL}: {error}")
    raise
else:
    print("CREATED: Pandas dataframe")

# Shuffles and splits the Pandas dataframe into two Pandas dataframes:
# the first 70% of the shuffled rows for training, the rest for testing.
#
# The rows are sliced with iloc rather than numpy.split, because splitting a
# dataframe through NumPy relies on DataFrame.swapaxes, which pandas has
# deprecated.

shuffled = all_data.sample(frac=1)
split_at = int(0.7 * len(all_data))
train_data = shuffled.iloc[:split_at]
test_data = shuffled.iloc[split_at:]

print(f"CREATED: NumPy {train_data.shape} and {test_data.shape} arrays")

# Writes the training data to a local CSV file with the "y_yes" column moved
# to the front, the "y_no" column removed, and no header or index column.

features = train_data.drop(columns=["y_no", "y_yes"])
modified = pandas.concat([train_data["y_yes"], features], axis=1)
modified.to_csv("train_data.csv", header=False, index=False)

# Saves the file to the S3 bucket.
#
# The object key is built with an explicit "/" because S3 keys always use a
# forward slash, whatever path separator the local operating system uses.

train_path = f"{S3_FOLDER}/train_data.csv"
bucket = boto3.Session().resource("s3").Bucket(S3_BUCKET)
bucket.Object(train_path).upload_file("train_data.csv")

# Describes the training input for SageMaker: the S3 folder that holds the
# uploaded training file, read as CSV.

key_prefix = os.path.dirname(train_path)
train_folder = "s3://" + S3_BUCKET + "/" + key_prefix
s3_conn = sagemaker.inputs.TrainingInput(train_folder, content_type="csv")

print(f"CREATED: S3 connection to the folder {train_folder}")

# Builds the SageMaker estimator used for training. It is configured with:
#
#         - the XGBoost container image
#         - the execution role of this notebook
#         - the number and type of instances to train on
#         - the S3 folder that receives the output
#         - the XGBoost hyperparameters

s3_folder = "s3://" + S3_BUCKET + "/" + S3_FOLDER
session = sagemaker.Session()
role = sagemaker.get_execution_role()

train_obj = sagemaker.estimator.Estimator(
    image_uri=xgb_container,
    role=role,
    instance_count=1,
    instance_type=INSTANCE,
    output_path=s3_folder,
    sagemaker_session=session,
)

hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "silent": 0,
    "objective": "binary:logistic",
    "num_round": 100,
}
train_obj.set_hyperparameters(**hyperparameters)

print("CREATED: SageMaker training object")

# Runs the training job, passing the S3 training input as the "train" channel.

channels = {"train": s3_conn}
train_obj.fit(inputs=channels)

print("RESULT: Training completed.")

# Deploys the trained model to a single-instance endpoint and makes the
# returned predictor send its requests as CSV.

model_server = train_obj.deploy(instance_type=INSTANCE,
                                initial_instance_count=1)
csv_serializer = sagemaker.serializers.CSVSerializer()
model_server.serializer = csv_serializer

print("CREATED: model server")

# Scores the model on the test data: sends the feature columns to the model
# server, rounds each returned value to the nearest integer, and reports the
# percentage of rows where that matches the "y_yes" column.

label_columns = ["y_no", "y_yes"]
test_data_ = test_data.drop(columns=label_columns).values
response = model_server.predict(test_data_).decode()

# NOTE(review): [1:] discards the first character of the response before the
# comma-separated values are parsed -- confirm this matches the payload
# format the endpoint actually returns.
scores = numpy.fromstring(response[1:], sep=",")
results = numpy.round(scores)

matches = (results == test_data["y_yes"]).astype(int)
accuracy = 100 * matches.mean()
print(f"RESULT: Accuracy is {accuracy:.1f}%.")

# Run these to delete the model server and to empty the bucket (the second
# line removes the objects in the bucket, not the bucket itself).

#model_server.delete_endpoint(delete_endpoint_config = True)
#boto3.resource("s3").Bucket(S3_BUCKET).objects.all().delete()


CREATED: SageMaker container 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest in region us-east-1
CREATED: bucket unused-bucket-name-140114
CREATED: Pandas dataframe
CREATED: NumPy (28831, 61) and (12357, 61) arrays
CREATED: S3 connection to the folder s3://unused-bucket-name-140114/demo
CREATED: SageMaker training object
2022-04-07 21:27:40 Starting - Starting the training job...
2022-04-07 21:28:08 Starting - Preparing the instances for trainingProfilerReport-1649366860: InProgress
.........
2022-04-07 21:29:30 Downloading - Downloading input data......
2022-04-07 21:30:24 Training - Downloading the training image...
2022-04-07 21:31:07 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-04-07:21:31:10:INFO] Running standalone xgboost training.[0m
[34m[2022-04-07:21:31:10:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-04-07:21:31:10:INFO] File size need to be processed in the node: 3.38mb. Avai