In [2]:
!pip -q install xgboost

[0m

In [64]:
from time import gmtime, strftime, sleep
from datetime import datetime
import re
from threading import Thread

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import os
import boto3
import sagemaker
from sagemaker import Session, image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.model import Model
from sagemaker.model_monitor import(DataCaptureConfig,
                                    ModelQualityMonitor,
                                    EndpointInput)
from sagemaker.predictor import Predictor
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.model_monitor.dataset_format import DatasetFormat

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

s3 = boto3.client("s3")
prefix = "churn-prediction-xgboost"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [26]:
# Setup S3 bucket
# You can use a different bucket, but make sure the role you chose for this notebook
# has the s3:PutObject permissions. This is the bucket into which the data is captured

##S3 prefixes
data_capture_prefix = f"{prefix}/datacapture"
s3_capture_upload_path = f"s3://{bucket}/{data_capture_prefix}"

ground_truth_upload_path = (
    f"s3://{bucket}/{prefix}/ground_truth_data/{datetime.now():%Y-%m-%d-%H-%M-%S}"
)

reports_prefix = f"{prefix}/reports"
s3_report_path = f"s3://{bucket}/{reports_prefix}"

##Get the model monitor image
monitor_image_uri = image_uris.retrieve(framework="model-monitor", region=region)

print("Image URI:", monitor_image_uri)
print(f"Capture path: {s3_capture_upload_path}")
print(f"Ground truth path: {ground_truth_upload_path}")
print(f"Report path: {s3_report_path}")

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: .
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Image URI: 156813124566.dkr.ecr.us-east-1.amazonaws.com/sagemaker-model-monitor-analyzer
Capture path: s3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/datacapture
Ground truth path: s3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/ground_truth_data/2024-02-12-23-07-59
Report path: s3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/reports


Prepare dataset

In [4]:
df = pd.read_csv("data/Churn_Modelling.csv")
df.drop(columns=["RowNumber", "Surname"], inplace=True)
df = pd.get_dummies(df)
df.drop(columns="Gender_Male", inplace=True)
df

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female
0,15634602,619,42,2,0.00,1,1,1,101348.88,1,1,0,0,1
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1
2,15619304,502,42,8,159660.80,3,1,0,113931.57,1,1,0,0,1
3,15701354,699,39,1,0.00,2,0,0,93826.63,0,1,0,0,1
4,15737888,850,43,2,125510.82,1,1,1,79084.10,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,39,5,0.00,2,1,0,96270.64,0,1,0,0,0
9996,15569892,516,35,10,57369.61,1,1,1,101699.77,0,1,0,0,0
9997,15584532,709,36,7,0.00,1,0,1,42085.58,1,1,0,0,1
9998,15682355,772,42,3,75075.31,2,1,0,92888.52,1,0,1,0,0


In [5]:
X = df.drop(columns="Exited")
y = df["Exited"]

X_train, X_test, y_train, y_test = train_test_split(
 X, y,
 test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(
 X_test, y_test,
 test_size=0.5, random_state=1)

X_train.drop(columns="CustomerId", inplace=True)
X_val.drop(columns="CustomerId", inplace=True)

Upload to S3

In [6]:
train_file = "data/train_data.csv"
pd.concat([y_train, X_train], axis=1).to_csv(train_file, index=False, header=False)
sess.upload_data(train_file, key_prefix="{}/train".format(prefix))

validation_file = "data/validation_data.csv"
pd.concat([y_val, X_val], axis=1).to_csv(validation_file, index=False, header=False)
sess.upload_data(validation_file, key_prefix="{}/validation".format(prefix))

batch_file = "data/batch_data.csv"
X_test.to_csv(batch_file, index=False, header=False)
sess.upload_data(batch_file, key_prefix="{}/batch".format(prefix))

's3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/batch/batch_data.csv'

In [55]:
# save validation data with predictions without the CustomerId column
# this will be used for calculating a baseline for monitoring

pd.concat([y_test, X_test.iloc[:, 1:]],
          axis=1).to_csv("data/test_data_noID.csv",
                         index=False, header=False)

## Training job and model creation

Set values for hyperparameters  
Note that we got these values from the experimentation notebook

In [7]:
%%time

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1" # latest version
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    colsample_bytree=0.7956926890881284,
    gamma=0.4224376554847273,
    eta=0.01698158072074776,
    max_depth=4,
    num_round=214,
    subsample=0.7127419099093599
)

train_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validation".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-2024-02-12-22-13-50


2024-02-12 22:13:51 Starting - Starting the training job...
2024-02-12 22:14:08 Starting - Preparing the instances for training.........
2024-02-12 22:15:34 Downloading - Downloading input data...
2024-02-12 22:16:04 Downloading - Downloading the training image......
2024-02-12 22:17:15 Training - Training image download completed. Training in progress...[34m[2024-02-12 22:17:26.895 ip-10-0-218-84.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-02-12 22:17:26.917 ip-10-0-218-84.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-02-12:22:17:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-02-12:22:17:27:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-02-12:22:17:27:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-12:22:17:27:INFO] Running XGBoost Sagemaker in algorithm mode[0

## Use batch transform on our test data. Since the ID column is present, we will update the output filter to keep only ID and prediction

Let's change __output_filter__ to "$[0,-1]", indicating that when presenting the output, we only want to keep column 0 (the 'CustomerId') and the last column (the inference result i.e. the probability of customer churn)

In [8]:
sm_transformer = sm_estimator.transformer(1, "ml.m4.xlarge")

sm_transformer.assemble_with = "Line"
sm_transformer.accept = "text/csv"

batch_file = "batch_data.csv"
# start a transform job
input_location = "s3://{}/{}/batch/{}".format(
    bucket, prefix, batch_file
)

sm_transformer.transform(
    input_location,
    split_type="Line",
    content_type="text/csv",
    input_filter="$[1:]", # input_filter will filter out CustomerId
    join_source="Input",
    output_filter="$[0,-1]",
)
sm_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-12-22-18-03-436
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2024-02-12-22-18-04-696


............................................[34m[2024-02-12:22:25:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-12:22:25:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-12:22:25:22:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
   

Let's display the output

In [33]:
def get_csv_output_from_s3(s3uri, batch_file):
    file_name = "{}.out".format(batch_file)
    match = re.match("s3://([^/]+)/(.*)", "{}/{}".format(s3uri, file_name))
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

output_df = get_csv_output_from_s3(sm_transformer.output_path, batch_file)
output_df.head(8)

Unnamed: 0,0,1
0,15731026,0.148549
1,15792565,0.132313
2,15710316,0.35572
3,15781347,0.071115
4,15694859,0.098018
5,15739194,0.132254
6,15723894,0.413681
7,15652527,0.212699


Create model entity

In [21]:
model_name = f"final-project-xgb-churn-pred-model-monitor-{datetime.utcnow():%Y-%m-%d-%H%M}"

image_uri = image
model_data = sm_estimator.model_data

model = Model(image_uri=image_uri, model_data=model_data, role=role, sagemaker_session=sess)

Deploy the model with data capture enabled

In [27]:
endpoint_name = f"final-project-xgboost-v1-wquality-monitor-{datetime.utcnow():%Y-%m-%d-%H%M}"
print("EndpointName =", endpoint_name)

data_capture_config = DataCaptureConfig(
    enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path
)

model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

EndpointName = final-project-xgboost-v1-wquality-monitor-2024-02-12-2308
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-12-23-08-12-228
INFO:sagemaker:Creating endpoint-config with name final-project-xgboost-v1-wquality-monitor-2024-02-12-2308
INFO:sagemaker:Creating endpoint with name final-project-xgboost-v1-wquality-monitor-2024-02-12-2308


--------!

Use predictor to invoke the model

In [49]:
predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=session, serializer=CSVSerializer()
)

Generate a baseline for model quality performance

In [56]:
churn_cutoff = 0.5
validate_dataset = "validation_with_predictions.csv"

limit = 200  # Need at least 200 samples to compute standard deviations
i = 0
with open(f"data/{validate_dataset}", "w") as baseline_file:
    baseline_file.write("probability,prediction,label\n")  # our header
    with open("data/test_data_noID.csv", "r") as f:
        for row in f:
            (label, input_cols) = row.split(",", 1)
            probability = float(predictor.predict(input_cols))
            prediction = "1" if probability > churn_cutoff else "0"
            baseline_file.write(f"{probability},{prediction},{label}\n")
            i += 1
            if i > limit:
                break
            print(".", end="", flush=True)
            sleep(0.5)
print()
print("Done!")

........................................................................................................................................................................................................
Done!


examine the file we just created

In [58]:
!head data/validation_with_predictions.csv

probability,prediction,label
0.1485494077205658,0,0
0.1323130875825882,0,0
0.35571980476379395,0,0
0.07111486047506332,0,0
0.09801846742630005,0,0
0.1322542130947113,0,0
0.4136806130409241,0,0
0.21269924938678741,0,0
0.08133785426616669,0,0


Upload the baseline data to s3

In [59]:
baseline_prefix = prefix + "/baselining"
baseline_data_prefix = baseline_prefix + "/data"
baseline_results_prefix = baseline_prefix + "/results"

baseline_data_uri = f"s3://{bucket}/{baseline_data_prefix}"
baseline_results_uri = f"s3://{bucket}/{baseline_results_prefix}"
print(f"Baseline data uri: {baseline_data_uri}")
print(f"Baseline results uri: {baseline_results_uri}")

Baseline data uri: s3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/baselining/data
Baseline results uri: s3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/baselining/results


In [63]:
baseline_dataset_uri = S3Uploader.upload(f"data/{validate_dataset}", baseline_data_uri)
baseline_dataset_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


's3://sagemaker-us-east-1-075039479415/churn-prediction-xgboost/baselining/data/validation_with_predictions.csv'

Create a baselining job

In [65]:
# Create the model quality monitoring object
churn_model_quality_monitor = ModelQualityMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=session,
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: .
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [66]:
# Name of the model quality baseline job
baseline_job_name = f"final-project-xgb-baseline-job-{datetime.utcnow():%Y-%m-%d-%H%M}"

In [None]:
# Execute the baseline suggestion job.
# You will specify problem type, in this case Binary Classification, and provide other required attributes.
job = churn_model_quality_monitor.suggest_baseline(
    job_name=baseline_job_name,
    baseline_dataset=baseline_dataset_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_uri,
    problem_type="BinaryClassification",
    inference_attribute="prediction",
    probability_attribute="probability",
    ground_truth_attribute="label",
)
job.wait(logs=False)

INFO:sagemaker:Creating processing-job with name final-project-xgb-baseline-job-2024-02-13-0129


......................................................................

## Upload model to Model Registry

In [19]:
sagemaker = boto3.client("sagemaker")

# model_name = "sagemaker-xgboost-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

model_name = job_name
print(model_name)


info = sagemaker.describe_training_job(TrainingJobName=model_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]

primary_container = {"Image": image, "ModelDataUrl": model_data}

# Save our model to the Sagemaker Model Registry
create_model_response = sagemaker.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

xgb-2024-02-12-06-24-51
arn:aws:sagemaker:us-east-1:075039479415:model/xgb-2024-02-12-06-24-51


In [21]:
# Inspect Training Job Details
# info

Create the endpoint configuration and endpoint

In [22]:
version_num = 1
endpoint_config_name = f'final-project-xgboost-v{version_num}-endpoint-config' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())                            
                            
instance_type = 'ml.m5.xlarge'

endpoint_config_response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name, # You will specify this name in a CreateEndpoint request.
    # List of ProductionVariant objects, one for each model that you want to host at this endpoint.
    ProductionVariants=[
        {
            "VariantName": "variant1", # The name of the production variant.
            "ModelName": model_name, 
            "InstanceType": instance_type, # Specify the compute instance type.
            "InitialInstanceCount": 1 # Number of instances to launch initially.
        }
    ]
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")


Created EndpointConfig: arn:aws:sagemaker:us-east-1:075039479415:endpoint-config/final-project-xgboost-v1-endpoint-config2024-02-12-07-43-18


In [23]:
# Deploy our model to real-time endpoint

endpoint_name = f'final-project-xgboost-v{version_num}-endpoint' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())                            


create_endpoint_response = sagemaker.create_endpoint(
                                            EndpointName=endpoint_name, 
                                            EndpointConfigName=endpoint_config_name) 

In [32]:
# Wait for endpoint to spin up

sagemaker.describe_endpoint(EndpointName=endpoint_name)

while True:
    print("Getting Job Status")
    res = sagemaker.describe_endpoint(EndpointName=endpoint_name)
    state = res["EndpointStatus"]
    
    if state == "InService":
        print("Endpoint in Service")
        break
    elif state == "Creating":
        print("Endpoint still creating...")
        sleep(60)
    else:
        print("Endpoint Creation Error - Check Sagemaker Console")
        break

Getting Job Status
Endpoint in Service


In [37]:
# Invoke Endpoint

sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)

response = sagemaker_runtime.invoke_endpoint(
                            EndpointName=endpoint_name,
                            ContentType='text/csv',
                            Body=X_test.iloc[:, 1:].to_csv(header=None, index=False).strip('\n').split('\n')[0]
                            )
print(response['Body'].read().decode('utf-8'))

0.1485494077205658



In [40]:
# Examine Response Body

# response