## Deploying Multiple XGBoost Models on a Single Model Endpoint

### Training

This step is optional: you can instead create a `model.tar.gz` from your pre-trained model artifacts and skip to the "Create SageMaker Model Object" section of the notebook.

In [None]:
import boto3
import sagemaker
from sagemaker.estimator import Estimator

# Session handles: boto3 provides the region, the SageMaker session provides the bucket.
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.Session()
region = boto_session.region_name
role = sagemaker.get_execution_role()

# S3 layout: the job prefix doubles as the key prefix for training output.
base_job_prefix = "xgboost-example"
s3_prefix = base_job_prefix
default_bucket = sagemaker_session.default_bucket()

# Instance type used both to resolve the container image and to run training.
training_instance_type = "ml.m5.xlarge"

In [None]:
# Download the abalone training CSV from the sagemaker-sample-files bucket into the current directory.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/uci_abalone/train_csv/abalone_dataset1_train.csv .

In [None]:
# Upload the CSV to the default bucket at xgboost-regression/train.csv, the key the training input reads from.
!aws s3 cp abalone_dataset1_train.csv s3://{default_bucket}/xgboost-regression/train.csv

In [None]:
from sagemaker.inputs import TrainingInput
# S3 location of the training CSV; it is passed to the estimator as the "train" channel.
training_path = f's3://{default_bucket}/xgboost-regression/train.csv'
train_input = TrainingInput(
    s3_data=training_path,
    content_type="text/csv",
)

In [None]:
# S3 prefix the training job writes its output (the model artifact) under.
model_path = f's3://{default_bucket}/{s3_prefix}/xgb_model'

# Resolve the XGBoost container image for this region and instance type.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_type,
)

# Single-instance training job definition; artifacts land under `model_path`.
xgb_train = Estimator(
    image_uri=image_uri,
    instance_type=training_instance_type,
    instance_count=1,
    output_path=model_path,
    sagemaker_session=sagemaker_session,
    role=role,
)

xgb_train.set_hyperparameters(
    # "reg:linear" is the deprecated alias of "reg:squarederror" (same loss);
    # the current name avoids the deprecation warning in XGBoost 1.0.
    objective="reg:squarederror",
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    # `silent` was superseded by `verbosity`; 1 (warnings) is the default level.
    verbosity=1,
)

In [None]:
# Launch the training job with the CSV mapped to the 'train' channel.
training_channels = {'train': train_input}
xgb_train.fit(training_channels)

### Retrieve Model Artifacts and Make Copies

Normally you'd have to retrieve the model artifacts for your specific training jobs, but in this case we just make copies of the trained model artifact as an example for setup.

In [None]:
# S3 location of the artifact produced by the training job; displayed as the cell output
# and downloaded with `aws s3 cp` in the next step.
model_artifacts = xgb_train.model_data
model_artifacts

In [None]:
# Download the trained artifact from S3 to a local file named model.tar.gz.
!aws s3 cp {model_artifacts} model.tar.gz

In [None]:
# Extracts into the current directory; the copy step below expects a file named `xgboost-model` here.
!tar -xf model.tar.gz #untar the trained model.tar.gz

In [None]:
!rm model.tar.gz #remove tarball; we will create a new one with all model artifacts

In [None]:
!mkdir models #store the model artifacts in this local directory

In [None]:
import shutil
from pathlib import Path

# Number of model copies to bundle into the new tarball; set this to the
# number of models you expect to host behind the endpoint.
NUM_MODEL_COPIES = 100

# Local staging directory for the copies; created here so the cell can be re-run on its own.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

# Each copy is named xgboost-model-<i>, the naming used in the inference requests.
for i in range(NUM_MODEL_COPIES):
    shutil.copy2("xgboost-model", MODELS_DIR / f"xgboost-model-{i}")

In [None]:
# Sanity check: the listing should show xgboost-model-0 through xgboost-model-99.
!ls models #list out all models in the directory to verify 100 copies

In [None]:
# `-C models .` archives the contents of models/ relative to that directory,
# so the model files are not nested under a models/ folder inside the tarball.
!tar -czvf model.tar.gz -C models . #create new tarball with all the model artifacts

### Create SageMaker Model Object

In [None]:
from sagemaker.xgboost import XGBoostModel
import subprocess
import boto3
import sagemaker
from sagemaker.estimator import Estimator

# Re-create the session objects so the notebook can be resumed from this
# section when the optional training step was skipped.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

boto_session = boto3.session.Session()
region = boto_session.region_name

In [None]:
# Upload the combined tarball; no bucket is passed, so the session picks the destination bucket.
model_url = sagemaker_session.upload_data(
    "model.tar.gz",
    key_prefix="xgboost-multiple-models-tuning",
)

In [None]:
# Show where the uploaded tarball lives; both endpoints below load their models from this URL.
print("Your model artifacts are stored here: {}".format(model_url))

### SageMaker Endpoint Creation

#### Untuned Endpoint Creation

This endpoint uses the default container settings; nothing is adjusted.

In [None]:
# Baseline model: no `env` overrides are passed to the container.
# The inference handler is loaded from the local file inference.py.
xgb_estimator_untuned = XGBoostModel(
    model_data=model_url,
    entry_point="inference.py",
    role=role,
    framework_version="1.0-1",
)

In [None]:
import time
from time import gmtime, strftime
# A UTC timestamp suffix gives each deployment a distinct endpoint name.
untuned_deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = 'xgboost-untuned' + untuned_deploy_stamp
xgb_estimator_untuned.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.2xlarge",
    endpoint_name=endpoint_name,
)

#### Tuned Endpoint Creation

In [None]:
# Environment variables handed to the serving container for the tuned endpoint.
# NOTE(review): presumably these pin OpenMP to one thread and start two model
# workers; confirm against the XGBoost serving container documentation.
tuned_container_env = {"OMP_NUM_THREADS": '1', "SAGEMAKER_NUM_MODEL_WORKERS": '2'}

# Same artifact and entry point as the untuned model; only `env` differs.
xgb_estimator_tuned = XGBoostModel(
    model_data=model_url,
    entry_point="inference.py",
    role=role,
    framework_version="1.0-1",
    env=tuned_container_env,
)

In [None]:
# Same instance type and count as the untuned endpoint so the two are comparable.
tuned_deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name_tuned = 'xgboost-tuned' + tuned_deploy_stamp
xgb_estimator_tuned.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.2xlarge",
    endpoint_name=endpoint_name_tuned,
)

### Sample Inference

In [None]:
import boto3
# Runtime client used for every invoke_endpoint call below.
smr = boto3.client(service_name='sagemaker-runtime')

In [None]:
# Target the endpoints deployed above rather than names hard-coded from a past
# run, so the notebook works after Restart & Run All. To test endpoints that
# are already running, assign their names here instead.
untuned_endpoint_name = endpoint_name
tuned_endpoint_name = endpoint_name_tuned

In [None]:
import json
content_type = "application/json"

# One comma-separated feature row, sent to every model listed in the request.
inpPayload = b'.345,0.224414,.131102,0.042329,.279923,-0.110329,-0.099358,0.0'

# Names of the model copies (files inside model.tar.gz) to run for this request.
target_models = [
    'xgboost-model-0', 'xgboost-model-93', 'xgboost-model-69', 'xgboost-model-50',
    'xgboost-model-51', 'xgboost-model-52', 'xgboost-model-53', 'xgboost-model-54',
    'xgboost-model-55',
]

request_body = {"input": inpPayload.decode("utf-8"), "models": target_models}

# Serialize data for endpoint. request_body already holds only JSON-native
# types, so it is serialized once; no dumps/loads round trip is needed.
data = request_body
payload = json.dumps(data)
payload

In [None]:
# Send the JSON request to the untuned endpoint.
response = smr.invoke_endpoint(
    EndpointName=untuned_endpoint_name,
    ContentType="application/json",
    Body=payload,
)

# Parse results: read the response body, decode it to text, then load the JSON.
untuned_raw_body = response['Body'].read()
result = json.loads(untuned_raw_body.decode())
result

In [None]:
# Send the same JSON request to the tuned endpoint.
response = smr.invoke_endpoint(
    EndpointName=tuned_endpoint_name,
    ContentType="application/json",
    Body=payload,
)

# Parse results: read the response body, decode it to text, then load the JSON.
tuned_raw_body = response['Body'].read()
result = json.loads(tuned_raw_body.decode())
result

### Sequential Test

In [None]:
import numpy as np 
import datetime
import math
import time
import matplotlib.pyplot as plt
import random

# Sequential load test against the UNTUNED endpoint: send `total_runs` requests
# one after another and report error rate, throughput and latency percentiles.
total_runs = 500

client_times = []  # client-side latency in milliseconds, successful calls only
errors_list = []   # exceptions raised by failed invocations
errors = 0         # number of failed invocations

# NOTE(review): cw_start/cw_end are naive UTC datetimes, presumably consumed by
# a later CloudWatch query; confirm before switching to tz-aware values.
cw_start = datetime.datetime.utcnow()

content_type = "application/json"

for _ in range(total_runs):
    # perf_counter is monotonic, so wall-clock adjustments cannot distort a sample.
    client_start = time.perf_counter()
    try:
        response = smr.invoke_endpoint(
            EndpointName=untuned_endpoint_name,
            ContentType=content_type,
            Body=payload)
    except smr.exceptions.ClientError as err:
        # Record the failure and keep going so one bad request does not abort the run.
        errors += 1
        errors_list.append(err)
        continue
    client_end = time.perf_counter()
    client_times.append((client_end - client_start)*1000)

cw_end = datetime.datetime.utcnow()

cw_duration = cw_end - cw_start
duration_in_s = cw_duration.total_seconds()

tps = total_runs/duration_in_s

# Counts are integers, so they are printed without decimals.
print('\nErrors - {} out of {} total runs | {:.4f}% in {:.4f} seconds \n'.format(errors, total_runs, (errors/total_runs)*100, duration_in_s))

print('\nTPS: {:.4f}'.format(tps))

if client_times:
    print('Client end-to-end latency percentiles:')
    client_avg = np.mean(client_times)
    client_p50, client_p90, client_p95, client_p100 = np.percentile(client_times, [50, 90, 95, 100])
    print('Avg | P50 | P90 | P95 | P100')
    print('{:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} \n'.format(client_avg, client_p50, client_p90, client_p95, client_p100))
else:
    # Every request failed; percentiles of an empty sample are undefined.
    print('No successful invocations; latency percentiles are unavailable.')

# Give 5 minute buffer to end
cw_end += datetime.timedelta(minutes=5)

In [None]:
import numpy as np 
import datetime
import math
import time
import matplotlib.pyplot as plt
import random

# Sequential load test against the TUNED endpoint: send `total_runs` requests
# one after another and report error rate, throughput and latency percentiles.
total_runs = 500

client_times = []  # client-side latency in milliseconds, successful calls only
errors_list = []   # exceptions raised by failed invocations
errors = 0         # number of failed invocations

# NOTE(review): cw_start/cw_end are naive UTC datetimes, presumably consumed by
# a later CloudWatch query; confirm before switching to tz-aware values.
cw_start = datetime.datetime.utcnow()

content_type = "application/json"

for _ in range(total_runs):
    # perf_counter is monotonic, so wall-clock adjustments cannot distort a sample.
    client_start = time.perf_counter()
    try:
        response = smr.invoke_endpoint(
            EndpointName=tuned_endpoint_name,
            ContentType=content_type,
            Body=payload)
    except smr.exceptions.ClientError as err:
        # Record the failure and keep going so one bad request does not abort the run.
        errors += 1
        errors_list.append(err)
        continue
    client_end = time.perf_counter()
    client_times.append((client_end - client_start)*1000)

cw_end = datetime.datetime.utcnow()

cw_duration = cw_end - cw_start
duration_in_s = cw_duration.total_seconds()

tps = total_runs/duration_in_s

# Counts are integers, so they are printed without decimals.
print('\nErrors - {} out of {} total runs | {:.4f}% in {:.4f} seconds \n'.format(errors, total_runs, (errors/total_runs)*100, duration_in_s))

print('\nTPS: {:.4f}'.format(tps))

if client_times:
    print('Client end-to-end latency percentiles:')
    client_avg = np.mean(client_times)
    client_p50, client_p90, client_p95, client_p100 = np.percentile(client_times, [50, 90, 95, 100])
    print('Avg | P50 | P90 | P95 | P100')
    print('{:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} \n'.format(client_avg, client_p50, client_p90, client_p95, client_p100))
else:
    # Every request failed; percentiles of an empty sample are undefined.
    print('No successful invocations; latency percentiles are unavailable.')

# Give 5 minute buffer to end
cw_end += datetime.timedelta(minutes=5)