In [30]:
!pip -q install xgboost

[0m

In [21]:
from time import gmtime, strftime
import re

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

import os
import boto3
import sagemaker

# SageMaker session first: the region and the default bucket are both read from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role handed to the estimator further down.
role = sagemaker.get_execution_role()

# Plain boto3 S3 client, used at the end of the notebook to download the results.
s3 = boto3.client("s3")

# Key prefix under which every artefact of this notebook is stored in the bucket.
prefix = "churn-prediction-xgboost"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Prepare dataset

In [2]:
# Load the raw data and discard the two columns that are not used any further.
# CustomerId is kept on purpose: it identifies the rows of the batch predictions.
df = pd.read_csv("data/Churn_Modelling.csv").drop(columns=["RowNumber", "Surname"])

# One-hot encode the remaining text columns. dtype=int guarantees 0/1 values:
# recent pandas versions default to booleans, which would be written to the
# headerless CSV files as "True"/"False" instead of numbers.
df = pd.get_dummies(df, dtype=int)

# Gender_Female already carries the full information of the Gender column.
df = df.drop(columns="Gender_Male")
df

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female
0,15634602,619,42,2,0.00,1,1,1,101348.88,1,1,0,0,1
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1
2,15619304,502,42,8,159660.80,3,1,0,113931.57,1,1,0,0,1
3,15701354,699,39,1,0.00,2,0,0,93826.63,0,1,0,0,1
4,15737888,850,43,2,125510.82,1,1,1,79084.10,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,39,5,0.00,2,1,0,96270.64,0,1,0,0,0
9996,15569892,516,35,10,57369.61,1,1,1,101699.77,0,1,0,0,0
9997,15584532,709,36,7,0.00,1,0,1,42085.58,1,1,0,0,1
9998,15682355,772,42,3,75075.31,2,1,0,92888.52,1,0,1,0,0


In [4]:
# Target is the "Exited" column; every other column is a feature.
y = df["Exited"]
X = df.drop(columns="Exited")

# First cut: 80% of the rows for training, 20% held back.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Second cut: the held-back rows are halved into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=1
)

# CustomerId is removed from the training and validation features only;
# X_test keeps it so the batch predictions can be matched back to customers.
X_train = X_train.drop(columns="CustomerId")
X_val = X_val.drop(columns="CustomerId")

Upload to S3

In [14]:
def _export_and_upload(frame, local_path, key_prefix):
    """Write `frame` to `local_path` as a headerless CSV and upload it.

    Returns whatever `sess.upload_data` returns for the uploaded file.
    """
    frame.to_csv(local_path, index=False, header=False)
    return sess.upload_data(local_path, key_prefix=key_prefix)


train_file = "data/train_data.csv"
validation_file = "data/validation_data.csv"
batch_file = "data/batch_data.csv"

# Training and validation files: label in the first column, features after it.
_export_and_upload(
    pd.concat([y_train, X_train], axis=1), train_file, "{}/train".format(prefix)
)
_export_and_upload(
    pd.concat([y_val, X_val], axis=1), validation_file, "{}/validation".format(prefix)
)

# Batch file: features only (no label); CustomerId is still the first column.
_export_and_upload(X_test, batch_file, "{}/batch".format(prefix))

's3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/batch/batch_data.csv'

## Training job and model creation

Set values for hyperparameters  
Note that we got these values from the experimentation notebook

In [6]:
# Ratio of label-0 rows to label-1 rows in the training target,
# used to weight the positive class.
val_cnts = y_train.value_counts()
negatives, positives = val_cnts[0], val_cnts[1]
scale_pos_weight = negatives / positives

# Tree settings
max_depth = 3
learning_rate = 0.07

In [7]:
%%time

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1" # latest version
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=3,
    eta=0.07,
    verbosity=0,
    num_round=100,
    scale_pos_weight=scale_pos_weight
)

train_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validation".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-2024-02-10-08-38-22


2024-02-10 08:38:22 Starting - Starting the training job...
2024-02-10 08:38:37 Starting - Preparing the instances for training......
2024-02-10 08:39:42 Downloading - Downloading input data...
2024-02-10 08:40:12 Downloading - Downloading the training image......
2024-02-10 08:41:23 Training - Training image download completed. Training in progress.
2024-02-10 08:41:23 Uploading - Uploading generated training model.[34m[2024-02-10 08:41:18.373 ip-10-0-221-11.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-02-10 08:41:18.395 ip-10-0-221-11.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-02-10:08:41:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-02-10:08:41:18:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-02-10:08:41:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-1

## Use batch transform on our test data. Since the ID column is present, we will update the output filter to keep only ID and prediction

Let's set `output_filter` to `"$[0,-1]"`, indicating that when presenting the output, we only want to keep column 0 (the `CustomerId`) and the last column (the inference result, i.e. the predicted probability that the customer churns).

In [18]:
# Transformer built from the trained estimator; output records are assembled
# line by line and returned as CSV.
sm_transformer = sm_estimator.transformer(
    1,
    "ml.m4.xlarge",
    assemble_with="Line",
    accept="text/csv",
)

# Name of the batch file as it was uploaded under the "batch" prefix.
batch_file = "batch_data.csv"
input_location = "s3://{}/{}/batch/{}".format(bucket, prefix, batch_file)

# start a transform job
sm_transformer.transform(
    input_location,
    content_type="text/csv",
    split_type="Line",
    # input_filter will filter out CustomerId before the record reaches the model
    input_filter="$[1:]",
    # join each prediction to its input record, then keep only the first and
    # the last column of the joined record
    join_source="Input",
    output_filter="$[0,-1]",
)
sm_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-10-09-11-22-323
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2024-02-10-09-11-23-129


...........................................[34m[2024-02-10:09:18:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-10:09:18:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-10:09:18:37:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    

Let's display the output

In [22]:
def get_csv_output_from_s3(s3uri, batch_file):
    """Download a batch-transform result file from S3 and load it as a DataFrame.

    The object ``<s3uri>/<batch_file>.out`` is downloaded to a local file named
    ``<batch_file>.out`` and then parsed as a comma-separated file without a
    header row.

    Args:
        s3uri: S3 location of the output folder, e.g. ``s3://bucket/some/prefix``.
            Trailing slashes are ignored.
        batch_file: Name of the input file the output was produced from.

    Returns:
        A pandas DataFrame with the content of the downloaded file.

    Raises:
        ValueError: If ``s3uri`` does not look like ``s3://<bucket>/...``.
    """
    file_name = "{}.out".format(batch_file)
    # Strip trailing slashes so the object key never contains an empty "//" segment.
    object_uri = "{}/{}".format(s3uri.rstrip("/"), file_name)
    match = re.match("s3://([^/]+)/(.*)", object_uri)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.group.
        raise ValueError(
            "Expected an S3 URI of the form 's3://bucket/prefix', got {!r}".format(s3uri)
        )
    output_bucket, output_key = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_key, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

# Fetch the result file written to the transformer's output location and preview it.
results_uri = sm_transformer.output_path
output_df = get_csv_output_from_s3(results_uri, batch_file)
output_df.head(8)

Unnamed: 0,0,1
0,15731026,0.384791
1,15792565,0.315646
2,15710316,0.6713
3,15781347,0.138538
4,15694859,0.23943
5,15739194,0.27852
6,15723894,0.695615
7,15652527,0.433185
