In [None]:
%%sh
# Download the bank marketing archive (-N enables wget's timestamping mode),
# then extract it, overwriting files from any previous extraction (-o).
ARCHIVE=bank-additional.zip
wget -N "https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/${ARCHIVE}"
unzip -o "${ARCHIVE}"

In [None]:
import sagemaker

# Show which SDK version this kernel is running.
print(sagemaker.__version__)

# Key prefix for this example, the SageMaker session, and that session's
# default bucket.
prefix = 'xgboost-direct-marketing'
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the full bank marketing dataset from the extracted archive.
data = pd.read_csv('./bank-additional/bank-additional-full.csv')

# One-hot encode the categorical columns. The dummy dtype is pinned to uint8:
# pandas >= 2.0 defaults to bool, which to_csv() would write as True/False
# instead of the numeric 0/1 values needed for the label and features.
data = pd.get_dummies(data, dtype=np.uint8)

# Move labels to first column, which is what XGBoost expects.
# 'y_no' is the complement of 'y_yes', so only 'y_yes' is kept as the label.
data = data.drop(['y_no'], axis=1)
data = pd.concat([data['y_yes'], data.drop(['y_yes'], axis=1)], axis=1)

# Shuffle and split into training and validation (95%/5%).
# Both steps are seeded so the same split is produced on every run.
data = data.sample(frac=1, random_state=123)
train_data, val_data = train_test_split(data, test_size=0.05, random_state=123)

# Save to CSV files without header or index: label first, then features.
train_data.to_csv('training.csv', index=False, header=False)
val_data.to_csv('validation.csv', index=False, header=False)

In [None]:
# Local mode: run this cell to point training at the CSV files in the
# working directory and to write output under /tmp.
training_path, validation_path = 'file://training.csv', 'file://validation.csv'
output_path = 'file:///tmp'

# The same 'local' instance type is used for training and for deployment.
train_instance_type = 'local'
deploy_instance_type = 'local'

In [None]:
# Managed mode: run this cell to upload the CSV files through the session
# under the example prefix and to use the instance types below.
training_path = sess.upload_data(
    path='training.csv',
    key_prefix=f'{prefix}/training',
)
validation_path = sess.upload_data(
    path='validation.csv',
    key_prefix=f'{prefix}/validation',
)
output_path = f's3://{bucket}/{prefix}/output/'

train_instance_type = 'ml.m5.large'
deploy_instance_type = 'ml.t2.medium'

In [None]:
# Show the locations chosen by whichever mode cell was run, one per line.
print(training_path, validation_path, output_path, sep='\n')

In [None]:
from sagemaker import TrainingInput

# Wrap both dataset locations as CSV inputs for the training job.
train_input = TrainingInput(s3_data=training_path, content_type='text/csv')
val_input = TrainingInput(s3_data=validation_path, content_type='text/csv')

# Train and deploy on SageMaker

In [None]:
from sagemaker.xgboost import XGBoost

# To use a fixed IAM role instead of the execution role, pass for example:
# role = 'arn:aws:iam::0123456789012:role/Sagemaker-fullaccess'

# Estimator that runs the xgb-dm.py entry point on XGBoost framework
# version 1.2-2, using the instance type and output location set earlier.
xgb_estimator = XGBoost(
    entry_point='xgb-dm.py',
    framework_version='1.2-2',
    role=sagemaker.get_execution_role(),
    instance_type=train_instance_type,
    instance_count=1,
    output_path=output_path,
    # Parameter details at https://xgboost.readthedocs.io/en/latest/parameter.html
    hyperparameters={
        'num-round': 100,
        'early-stopping-rounds': 10,
        'max-depth': 5,
        'eval-metric': 'auc',
    },
)

In [None]:
# Start training, passing the two inputs under the 'train' and 'validation' keys.
xgb_estimator.fit(
    {
        'train': train_input,
        'validation': val_input,
    }
)

In [None]:
from time import strftime, gmtime

# Build an endpoint name from the example prefix and the current UTC time.
# The '-' separator keeps the prefix and the timestamp visually distinct
# (previously they were fused, e.g. '...marketing2024-01-01-...').
xgb_endpoint_name = prefix + '-' + strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# Deploy the trained estimator to a single instance of the configured type.
xgb_predictor = xgb_estimator.deploy(
    endpoint_name=xgb_endpoint_name,
    initial_instance_count=1,
    instance_type=deploy_instance_type)

print(xgb_endpoint_name)

In [None]:
# Build a CSV request body from the first ten validation rows with the
# label column removed, without a trailing newline.
payload = (
    val_data.iloc[:10]
    .drop(columns=['y_yes'])
    .to_csv(header=False, index=False)
    .rstrip('\n')
)
print(payload)

In [None]:
# Use the SDK's CSV serializer and deserializer for requests and responses.
xgb_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

# Send the sample rows to the endpoint and show what comes back.
response = xgb_predictor.predict(payload)
print(response)

In [None]:
# Delete the endpoint behind xgb_predictor (created by the deploy cell)
# once the predictions are done.
xgb_predictor.delete_endpoint()