## Multi Container endpoints

In this example we will deploy two different models for summarization and Q&A tasks.
Please note that loading and packaging models may take several minutes.

In [None]:
# Download artifacts for DistilBert model for Question-Answering task

! mkdir -p distilbert-base-uncased-distilled-squad/1
! mkdir -p distilbert-base-uncased-distilled-squad/code

! wget https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/saved_model.tar.gz
! tar -zxvf saved_model.tar.gz -C distilbert-base-uncased-distilled-squad/1

! cp 1_src/inference.py distilbert-base-uncased-distilled-squad/code
! cp 1_src/requirements.txt distilbert-base-uncased-distilled-squad/code


In [2]:
!tar -C "$PWD" -czf distilbert-base-uncased-distilled-squad.tar.gz distilbert-base-uncased-distilled-squad/

In [6]:
# Download artifacts for Bart model for Summarization task

! mkdir -p distilbart-cnn-6-6/code
! wget https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/pytorch_model.bin -P distilbart-cnn-6-6
! wget https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/tokenizer.json -P distilbart-cnn-6-6
! wget https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/vocab.json -P distilbart-cnn-6-6
! wget https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/config.json -P distilbart-cnn-6-6
! wget https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/merges.txt -P distilbart-cnn-6-6
! cp 1_src/inference.py distilbart-cnn-6-6/code
! cp 1_src/requirements.txt distilbart-cnn-6-6/code

In [27]:
!tar -C "$PWD" -czf distilbart-cnn-6-6.tar.gz distilbart-cnn-6-6/

### Upload model data to S3

In [1]:

import sagemaker
from sagemaker import get_execution_role
import os

sagemaker_session = sagemaker.Session()
role = get_execution_role()

bucket = sagemaker_session.default_bucket()
prefix = 'multi-container'
s3_path = 's3://{}/{}'.format(bucket, prefix)


In [28]:
qa_model_data = sagemaker_session.upload_data('distilbert-base-uncased-distilled-squad.tar.gz',
                                           bucket,
                                           os.path.join(prefix, 'model-artifacts'))

summarization_model_data = sagemaker_session.upload_data('distilbart-cnn-6-6.tar.gz',
                                           bucket,
                                           os.path.join(prefix, 'model-artifacts'))    

# Create Inference script

In [None]:
! pygmentize 1_src/inference.py

In [121]:
### For TensorFlow we need to check signature of the model

! saved_model_cli show --dir distilbert-base-uncased-distilled-squad/1 --tag_set serve --signature_def serving_default

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The given SavedModel SignatureDef contains the following input(s):
  inputs['attention_mask'] tensor_info:
      dtype: DT_INT32
      shape: (-1, 384)
      name: serving_default_attention_mask:0
  inputs['input_ids'] tensor_info:
      dtype: DT_INT32
      shape: (-1, 384)
      name: serving_default_input_ids:0
The given SavedModel SignatureDef contains the following output(s):
  outputs['output_0'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 384)
      name: StatefulPartitionedCall:0
  outputs['output_1'] tensor_info:
      dtype: DT_FLOAT
      shape: (-1, 384)
      name: StatefulPartitionedCall:1
Method name is: tensorflow/serving/predict


# Deploy Multi Container Endpoint


In [64]:
region = sagemaker_session.boto_region_name

instance_type = "ml.m5.4xlarge"

In [65]:
summarization_env = {
    "NLP_TASK" : "summarization",
    "SAGEMAKER_PROGRAM" : "inference.py",
    "SAGEMAKER_SUBMIT_DIRECTORY": summarization_model_data,
}

pt_inference_image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    region=region,
    version="1.9.0",
    py_version="py38",
    instance_type=instance_type,
    image_scope="inference",
)

pytorch_container = {
    "ContainerHostname": "pytorch-bart-summarizer",
    "Image": pt_inference_image_uri,
    "ModelDataUrl": summarization_model_data,
    "Environment" : summarization_env
}


In [66]:
qa_env = {
    "NLP_TASK" : "question-answering"
}

tf_inference_image_uri = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    region=region,
    version="2.8",
    py_version="py38",
    instance_type=instance_type,
    image_scope="inference",
)

tensorflow_container = {
    "ContainerHostname": "tensorflow-distilbert-qa",
    "Image": tf_inference_image_uri,
    "ModelDataUrl": qa_model_data,
    "Environment" : qa_env
}


In [67]:
print(pytorch_container)
print(tensorflow_container)

{'ContainerHostname': 'pytorch-bart-summarizer', 'Image': '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.8.1-cpu-py36', 'ModelDataUrl': 's3://sagemaker-us-east-1-941656036254/multi-container/model-artifacts/distilbart-cnn-6-6.tar.gz', 'Environment': {'NLP_TASK': 'summarization', 'SAGEMAKER_PROGRAM': 'inference.py', 'SAGEMAKER_SUBMIT_DIRECTORY': 's3://sagemaker-us-east-1-941656036254/multi-container/model-artifacts/distilbart-cnn-6-6.tar.gz'}}
{'ContainerHostname': 'tensorflow-distilbert-qa', 'Image': '763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-inference:2.8-cpu', 'ModelDataUrl': 's3://sagemaker-us-east-1-941656036254/multi-container/model-artifacts/distilbert-base-uncased-distilled-squad.tar.gz', 'Environment': {'NLP_TASK': 'question-answering'}}


## Create Multi Container Endpoint

In [68]:
sm_client = sagemaker_session.sagemaker_client
runtime_sm_client = sagemaker_session.sagemaker_runtime_client



In [69]:
import datetime

unique_id = datetime.datetime.now().strftime("%Y-%m-%d%H-%M-%S")

model_name = f"mce-nlp-model-{unique_id}"

create_model_response = sm_client.create_model(
    ModelName=model_name,
    Containers=[tensorflow_container, pytorch_container],
    InferenceExecutionConfig={"Mode": "Direct"},
    ExecutionRoleArn=role,
)

In [70]:
endpoint_config_name = f"{model_name}-ep-config"

endpoint_config = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "prod",
            "ModelName": model_name,
            "InitialInstanceCount": 1,
            "InstanceType": instance_type,
        },
    ],
)

In [71]:
endpoint_name = f"{model_name}-ep"

endpoint = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)

In [56]:
print(endpoint_name)
print(model_name)

mce-nlp-model-2022-08-1609-34-19-ep
mce-nlp-model-2022-08-1609-34-19


## Testing Multi Container Endpoint

In [57]:
import json

article = r"""
The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.
"""

question="What is Spanish name for Amazon?"


In [58]:
summarization_input = {"article":article, "max_length":100}

In [21]:
#  preparing data for TF Serving format

from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf
import numpy as np

max_length = 384
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

encoded_input = tokenizer(question, article, padding='max_length', max_length=max_length)
encoded_input = dict(encoded_input)
qa_inputs = [{"input_ids": np.array(encoded_input["input_ids"]).tolist(), "attention_mask":np.array(encoded_input["attention_mask"]).tolist()}]
qa_inputs = {"instances" : qa_inputs}

Some layers from the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased-distilled-squad and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import numpy as np

tf_response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Accept="application/json",
    TargetContainerHostname="tensorflow-distilbert-qa",
    Body=json.dumps(qa_inputs),
)

In [23]:
predictions = json.loads(tf_response["Body"].read().decode())

In [24]:
answer_start_index = int(tf.math.argmax(predictions['predictions'][0]['output_0']))
answer_end_index = int(tf.math.argmax(predictions['predictions'][0]['output_1']))

predict_answer_tokens = encoded_input["input_ids"][answer_start_index : answer_end_index + 1]
tf_response = tokenizer.decode(predict_answer_tokens)

print(f"Question: {question}, answer: {tf_response}")


Question: What is Spanish name for Amazon?, answer: Selva Amazónica


In [None]:
pt_result = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Accept="application/json",
    TargetContainerHostname="pytorch-bart-summarizer", 
    Body=json.dumps(summarization_input),
)

In [None]:
sm_client.delete_endpoint(endpoint_name)