# Chapter 9: Optimize Model Hosting and Inference Costs

In this chapter we'll explore different techniques for optimizing the cost and performance of model hosting and inference.

In order to complete this chapter, you need to fill out the following inputs:

* `s3_bucket`: The S3 bucket containing your data
* `s3_prefix`: The folder in the S3 bucket containing the prepared data set
* `s3_prefix_parquet`: The location of the Parquet tables in S3
* `s3_output_prefix`: The location for new data output in S3
* `region`: The AWS region you're working in
* `m_prefix`: The folder in the S3 bucket to store temporary models and output
* `test_file_name`: The name of a file in the test data set

In [None]:
# Notebook inputs -- edit these values to match your own environment.
s3_bucket = "MyBucket"  # S3 bucket containing your data
s3_prefix = "prepared"  # folder in the bucket holding the prepared data set
s3_prefix_parquet = "openaq/realtime-parquet-gzipped/tables"  # location of the Parquet tables
s3_output_prefix = "prepared_param"  # location for new data output
region = "us-east-1"  # AWS region you're working in
m_prefix = "xgboost-sample"  # folder for temporary models and output
test_file_name = "part-0000.csv"  # name of a file in the test data set

## Real-time and Batch Inference

In [None]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

# XGBoost hyperparameters for the baseline model
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "5",
}

# S3 location where the trained model artifact will be saved
output_path = f"s3://{s3_bucket}/{m_prefix}/xgboost/output"

# look up the URI of the built-in XGBoost container image for this region;
# change the version string to select a different XGBoost release
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, version="1.2-1")

# construct a SageMaker estimator that runs the XGBoost container
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.12xlarge",
    volume_size=200,  # size of the attached training volume, in GB
    output_path=output_path,
)

# define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = TrainingInput(f"s3://{s3_bucket}/{s3_prefix}/train/", content_type=content_type)
validation_input = TrainingInput(f"s3://{s3_bucket}/{s3_prefix}/validation/", content_type=content_type)

# execute the XGBoost training job
estimator.fit({"train": train_input, "validation": validation_input})

In [None]:
# Batch inference: score every file under the test prefix and write the
# results under the model prefix (m_prefix) rather than a hard-coded folder.
batch_input = f"s3://{s3_bucket}/{s3_prefix}/test/"
batch_output = f"s3://{s3_bucket}/{m_prefix}/xform/"
transformer = estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    output_path=batch_output,
    max_payload=3,  # maximum size of a single request payload, in MB
)
# split_type='Line' sends the CSV input to the model split on line boundaries
transformer.transform(
    data=batch_input,
    data_type="S3Prefix",
    content_type=content_type,
    split_type="Line",
)

In [None]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
# Host the trained model behind a real-time endpoint: requests are sent as
# CSV and responses are parsed as JSON.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
import boto3
# Fetch one file of the test data set to the local working directory.
s3 = boto3.client("s3")
s3.download_file(
    Bucket=s3_bucket,
    Key=f"{s3_prefix}/test/{test_file_name}",
    Filename="t_file.csv",
)

In [None]:
# Load the downloaded test file as a list of lines (newlines included).
with open('t_file.csv', 'r') as test_file:
    t_lines = list(test_file)

In [None]:
# Send the first five test records to the real-time endpoint.
for sample in t_lines[:5]:
    print(predictor.predict(sample.strip()))

## A/B Testing

In this section we'll run two versions of the same model in an endpoint.

In [None]:
# Second model version: identical to the baseline except for a deeper tree.
hyperparameters_v2 = {
    "max_depth": "10",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "5",
}

estimator_v2 = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters_v2,  # the v2 settings, not the baseline ones
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.12xlarge",
    volume_size=200,  # size of the attached training volume, in GB
    output_path=output_path,
)

# The estimator must be trained before it can be deployed. The inputs are
# rebuilt here so this cell always trains on the baseline data set, even if
# train_input/validation_input were reassigned by another cell.
estimator_v2.fit({
    "train": TrainingInput(f"s3://{s3_bucket}/{s3_prefix}/train/", content_type="csv"),
    "validation": TrainingInput(f"s3://{s3_bucket}/{s3_prefix}/validation/", content_type="csv"),
})

predictor_v2 = estimator_v2.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
def _endpoint_model_name(pred):
    """Return the name of the first model behind the predictor's endpoint.

    Resolves the name through the public SageMaker describe APIs instead of
    the predictor's private ``_model_names`` attribute, which is not part of
    the SDK's public interface and is not guaranteed to be populated.
    """
    sm_client = pred.sagemaker_session.sagemaker_client
    config_name = sm_client.describe_endpoint(
        EndpointName=pred.endpoint_name
    )["EndpointConfigName"]
    variants = sm_client.describe_endpoint_config(
        EndpointConfigName=config_name
    )["ProductionVariants"]
    return variants[0]["ModelName"]


# Names of the two SageMaker models created by the earlier deploy() calls
model1 = _endpoint_model_name(predictor)
model2 = _endpoint_model_name(predictor_v2)

In [None]:
from sagemaker.session import production_variant

# One production variant per model version; both use the same instance
# settings and the same initial weight.
variant1, variant2 = (
    production_variant(
        model_name=name,
        instance_type="ml.m5.xlarge",
        initial_instance_count=1,
        variant_name=label,
        initial_weight=1,
    )
    for name, label in ((model1, 'Variant1'), (model2, 'Variant2'))
)

In [None]:
from sagemaker.session import Session

# Create a single endpoint that hosts both production variants.
smsession = Session()
smsession.endpoint_from_production_variants(
    production_variants=[variant1, variant2],
    name="mmendpoint",
)

In [None]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
import boto3
from botocore.response import StreamingBody

# Low-level runtime client, used because its response reports which
# production variant served each request.
smrt = boto3.Session().client("sagemaker-runtime")

for tl in t_lines[0:50]:
    result = smrt.invoke_endpoint(EndpointName='mmendpoint',
                                  ContentType="text/csv",
                                  Body=tl.strip())
    # result['Body'] is already a botocore StreamingBody, so read it directly
    # instead of re-wrapping it using the content-length header.
    prediction = result['Body'].read().decode('utf-8')
    print(f"Result from {result['InvokedProductionVariant']} = {prediction}")


## Multiple models in a single endpoint

In this section we'll create an endpoint that serves traffic for different air quality parameters using different models.

In [None]:
from sagemaker.spark.processing import PySparkProcessor

# Processing cluster that runs the Spark preprocessing script
spark_processor = PySparkProcessor(
    base_job_name="spark-preprocessor",
    framework_version="3.0",
    role=sagemaker.get_execution_role(),
    instance_count=15,
    instance_type="ml.m5.4xlarge",
    max_runtime_in_seconds=7200,
)

# Spark and YARN settings applied to every run below
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {
            "spark.executor.memory": "18g",
            "spark.yarn.executor.memoryOverhead": "3g",
            "spark.driver.memory": "18g",
            "spark.yarn.driver.memoryOverhead": "3g",
            "spark.executor.cores": "5",
            "spark.driver.cores": "5",
            "spark.executor.instances": "44",
            "spark.default.parallelism": "440",
            "spark.dynamicAllocation.enabled": "false",
        },
    },
    {
        "Classification": "yarn-site",
        # turn off YARN's virtual-memory and physical-memory limit checks
        # (the physical-memory property is "pmem", not "mmem")
        "Properties": {
            "yarn.nodemanager.vmem-check-enabled": "false",
            "yarn.nodemanager.pmem-check-enabled": "false",
        },
    },
]

# Run the same preprocessing script once per air quality parameter; each run
# writes its output under its own sub-prefix of s3_output_prefix.
for parameter in ("pm25", "o3"):
    spark_processor.run(
        submit_app="scripts/preprocess_param.py",
        submit_jars=["s3://crawler-public/json/serde/json-serde.jar"],
        arguments=[
            "--s3_input_bucket", s3_bucket,
            "--s3_input_key_prefix", s3_prefix_parquet,
            "--s3_output_bucket", s3_bucket,
            "--s3_output_key_prefix", f"{s3_output_prefix}/{parameter}",
            "--parameter", parameter,
        ],
        spark_event_logs_s3_uri=f"s3://{s3_bucket}/sparklogs/spark_event_logs",
        logs=True,
        configuration=configuration,
    )

In [None]:
# Train an XGBoost model on the data prepared for the o3 parameter.
output_path = f"s3://{s3_bucket}/{m_prefix}/o3/output"

estimator_o3 = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.12xlarge",
    volume_size=200,
    output_path=output_path,
)

# training and validation channels, both read as CSV
content_type = "csv"
train_input = TrainingInput(
    f"s3://{s3_bucket}/{s3_output_prefix}/o3/train/",
    content_type=content_type,
)
validation_input = TrainingInput(
    f"s3://{s3_bucket}/{s3_output_prefix}/o3/validation/",
    content_type=content_type,
)

# launch the training job
estimator_o3.fit({"train": train_input, "validation": validation_input})

In [None]:
# Train an XGBoost model on the data prepared for the pm25 parameter.
output_path = f"s3://{s3_bucket}/{m_prefix}/pm25/output"

estimator_pm25 = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.12xlarge",
    volume_size=200,
    output_path=output_path,
)

# training and validation channels, both read as CSV
content_type = "csv"
train_input = TrainingInput(
    f"s3://{s3_bucket}/{s3_output_prefix}/pm25/train/",
    content_type=content_type,
)
validation_input = TrainingInput(
    f"s3://{s3_bucket}/{s3_output_prefix}/pm25/validation/",
    content_type=content_type,
)

# launch the training job
estimator_pm25.fit({"train": train_input, "validation": validation_input})

In [None]:
from sagemaker.multidatamodel import MultiDataModel

# Model object built from the o3 estimator with the XGBoost image; it is
# handed to the multi-model wrapper below.
model = estimator_o3.create_model(
    image_uri=xgboost_container,
    role=sagemaker.get_execution_role(),
)

# S3 prefix under which the multi-model endpoint looks for model artifacts
model_data_prefix = f's3://{s3_bucket}/{m_prefix}/mma/'
model_name = 'xgboost-mma'

mme = MultiDataModel(
    name=model_name,
    model_data_prefix=model_data_prefix,
    model=model,
)

In [None]:
# Deploy the multi-model endpoint; the endpoint reuses the model's name.
# Note that this rebinds `predictor` to the multi-model endpoint.
predictor = mme.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name=model_name,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
# Register each trained model with the multi-model endpoint. The artifact
# name is given explicitly per parameter instead of being sliced out of the
# S3 path by position, which breaks when m_prefix contains a '/'.
for param_name, est in (("o3", estimator_o3), ("pm25", estimator_pm25)):
    artifact_path = est.latest_training_job.describe()['ModelArtifacts']['S3ModelArtifacts']
    # Copy the model artifact to the S3 location used by the MME; the name
    # given here is what callers later pass as target_model.
    mme.add_model(model_data_source=artifact_path,
                  model_data_path=f"{param_name}.tar.gz")

list(mme.list_models())

In [None]:
# Download one test file per parameter to the local working directory.
s3.download_file(
    Bucket=s3_bucket,
    Key=f"{s3_output_prefix}/pm25/test/part-00120-81a51ddd-c8b5-47d0-9431-0a5da6158754-c000.csv",
    Filename='pm25.csv',
)
s3.download_file(
    Bucket=s3_bucket,
    Key=f"{s3_output_prefix}/o3/test/part-00214-ae1a5b74-e187-4b62-ae4a-385afcbaa766-c000.csv",
    Filename='o3.csv',
)

In [None]:
# Load both downloaded test files as lists of lines (newlines included).
with open('pm25.csv', 'r') as pm25_file:
    pm_lines = list(pm25_file)
with open('o3.csv', 'r') as o3_file:
    o_lines = list(o3_file)

In [None]:
# Send the first five records of each test file to the multi-model endpoint,
# using target_model to name the artifact that should serve each request.
for sample_lines, target in ((pm_lines, 'pm25.tar.gz'), (o_lines, 'o3.tar.gz')):
    for sample in sample_lines[:5]:
        print(predictor.predict(data=sample.strip(), target_model=target))

## Elastic Inference

In this section we'll deploy an endpoint with an Elastic Inference accelerator attached.

In [None]:
# Deploy the baseline model to a new endpoint with an Elastic Inference
# accelerator attached. deploy() is a method of the estimator; a Predictor
# object has no deploy() method, so it cannot be called on `predictor`.
predictor_ei = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
    accelerator_type='ml.eia2.medium',
)

## Model optimization with SageMaker Neo

In [None]:
# Number of comma-separated fields in the first test record
ncols = t_lines[0].count(',') + 1

In [None]:
import sagemaker
from sagemaker.model import Model

# S3 location for the compiled model
n_prefix = 'xgboost-sample-neo'
n_output_path = f's3://{s3_bucket}/{n_prefix}/xgboost-neo/output'

# Wrap the artifact produced by the baseline training job in a Model object
m1 = Model(
    xgboost_container,
    model_data=estimator.latest_training_job.describe()['ModelArtifacts']['S3ModelArtifacts'],
    role=sagemaker.get_execution_role(),
)

# Compile the model for the ml_m5 instance family; the input shape is one
# row with as many columns as a test record.
neo_model = m1.compile(
    target_instance_family='ml_m5',
    input_shape={'data': [1, ncols]},
    output_path=n_output_path,
    role=sagemaker.get_execution_role(),
    framework='xgboost',
    framework_version='latest',
    job_name='neojob',
)

In [None]:
neo_predictor = neo_model.deploy(initial_instance_count = 1, instance_type = 'ml.m5.xlarge', 
                                 serializer=CSVSerializer(),
                                deserializer=JSONDeserializer(),
                                endpoint_name='neo_endpoint')

In [None]:
for tl in t_lines[0:5]:
    result = smrt.invoke_endpoint(EndpointName='neo_endpoint',
                                   ContentType="text/csv",
                                   Body=tl.strip())
    rbody = StreamingBody(raw_stream=result['Body'],content_length=int(result['ResponseMetadata']['HTTPHeaders']['content-length']))
    print(f"Result from {result['InvokedProductionVariant']} = {rbody.read().decode('utf-8')}")