In [2]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from time import gmtime, strftime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image, display
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

# --- SageMaker session, storage and execution role -------------------------
sess = sagemaker.Session()
bucket_name = sess.default_bucket()  # bucket used for training data and model output

role = get_execution_role()
prefix = "sagemaker/tutorial"  # key prefix under which all artifacts of this notebook live
my_region = boto3.session.Session().region_name

# --- Built-in XGBoost container image for the current region ---------------
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    f"Success - the MySageMakerInstance is in the {my_region} region. "
    f"You will use the {xgboost_container} container for your SageMaker endpoint."
)

Success - the MySageMakerInstance is in the ap-northeast-1 region. You will use the 501404015308.dkr.ecr.ap-northeast-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [3]:
# Download the cleaned bank-marketing dataset next to the notebook.
urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")

model_data = pd.read_csv("./bank_clean.csv", index_col=0)

# Shuffle once with a fixed seed, then cut 70 % / 30 % by position.
# Plain .iloc slices are used instead of np.split(DataFrame, ...): np.split goes
# through DataFrame.swapaxes, which is deprecated and emits a FutureWarning on
# recent pandas. The resulting partitions are the same rows in the same order.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_size = int(0.7 * len(shuffled_data))
train_data = shuffled_data.iloc[:train_size]
test_data = shuffled_data.iloc[train_size:]

# Sanity check: the two partitions together cover every row exactly once.
assert len(train_data) + len(test_data) == len(model_data)

print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [4]:
# Build the training CSV: the target column "y_yes" goes first, followed by all
# features; both one-hot label columns are removed from the feature part.
# The file is written without header and without index.
train_csv = pd.concat(
    [train_data["y_yes"], train_data.drop(columns=["y_no", "y_yes"])],
    axis=1,
)
train_csv.to_csv("train.csv", index=False, header=False)

# S3 object keys always use "/" as the separator, so the key is built with an
# f-string rather than os.path.join (which would use the host OS separator and
# would not match the s3_data URI below on a non-POSIX machine).
train_key = f"{prefix}/train/train.csv"
boto3.Session().resource("s3").Bucket(bucket_name).Object(train_key).upload_file("train.csv")

# Training channel pointing at the uploaded prefix.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket_name}/{prefix}/train", content_type="csv")

In [None]:
# Estimator for the XGBoost container: one ml.m5.xlarge instance on spot
# capacity; max_wait is set equal to max_run (3600 s each). Model artifacts are
# written under the notebook's S3 prefix.
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    max_run=3600,
    use_spot_instances=True,
    max_wait=3600,
    output_path=f"s3://{bucket_name}/{prefix}/output",
    sagemaker_session=sess,
)

# Hyperparameters for a binary classifier that outputs probabilities.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "silent": 0,
    "objective": "binary:logistic",
    "num_round": 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Launch the training job on the single "train" channel.
training_channels = {"train": s3_input_train}
xgb.fit(training_channels)

In [None]:
from sagemaker.serverless import ServerlessInferenceConfig

# Serverless endpoint sizing: 1024 MB of memory, at most 5 concurrent invocations.
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=1024,
    max_concurrency=5,
)

# Deploy the trained estimator behind a serverless endpoint and keep the predictor handle.
xgb_predictor = xgb.deploy(
    serverless_inference_config=serverless_config,
)

In [7]:
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import classification_report

# Feature matrix for scoring: everything except the two one-hot label columns.
test_features = test_data.drop(columns=["y_no", "y_yes"])
test_data_array = test_features.values

# Send requests to the endpoint as CSV.
xgb_predictor.serializer = CSVSerializer()

# The whole test set is scored in a single request; the raw response is bytes.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode("utf-8")

# NOTE(review): the first character of the response is dropped before parsing.
# Presumably harmless when the first value starts with "0." — confirm against
# the container's actual response format before relying on element 0.
response_body = predictions[1:]
predictions_array = np.fromstring(response_body, sep=",")

# Round the parsed scores to 0/1 and compare against the true labels.
predicted_labels = np.round(predictions_array)
report = classification_report(test_data["y_yes"], predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94     10936
           1       0.63      0.20      0.31      1421

    accuracy                           0.89     12357
   macro avg       0.77      0.59      0.63     12357
weighted avg       0.87      0.89      0.87     12357

