In [2]:
# SageMaker SDK plus the Hugging Face model wrapper used in the cells below.
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

# Execution role of the current SageMaker environment; handed to the model definition.
role = sagemaker.get_execution_role()

In [3]:
# Environment passed to the inference container: the Hub model id to serve and
# the task to serve it for. Models are listed at https://huggingface.co/models
# NOTE(review): this looks like a base checkpoint rather than one fine-tuned for
# classification (recorded scores sit near 0.55) -- presumably fine for a latency
# test, but confirm the predictions themselves are not relied on.
hub = dict(
    HF_MODEL_ID='distilbert-base-uncased',
    HF_TASK='text-classification',
)

# Hugging Face model definition with pinned framework versions; the actual
# deployment happens in a later cell via `.deploy(...)`.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    py_version='py38',
    pytorch_version='1.10.2',
    transformers_version='4.17.0',
)

In [4]:
# Create a SageMaker inference endpoint for the model defined above and keep
# the returned predictor for the test calls that follow.
predictor = huggingface_model.deploy(
    instance_type='ml.c5.xlarge',  # EC2 instance type backing the endpoint
    initial_instance_count=1,      # number of instances
)

-----!

In [5]:
# Smoke-test the endpoint through the SDK predictor with one sample sentence.
predictor.predict({'inputs': "I am super happy right now."})

[{'label': 'LABEL_0', 'score': 0.5521041750907898}]

In [19]:
import json

import boto3

# Call the same endpoint through the low-level runtime client instead of the
# SDK predictor, to confirm the raw request/response format.
client = boto3.client('sagemaker-runtime')
content_type = "application/json"
request_body = {'inputs': "I am super happy right now."}
# Serialize exactly once: the dict is already JSON-compatible, so a
# dumps -> loads -> dumps round trip adds nothing.
payload = json.dumps(request_body)
response = client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType=content_type,
    Body=payload)
# read() on the response body yields the raw JSON bytes shown as the cell output.
result = response['Body'].read()
result

b'[{"label":"LABEL_0","score":0.5521041750907898}]'

In [23]:
# Build the JSON request string with a single serialization step and echo it,
# so the exact payload text can be inspected.
request_body = {'inputs': "I am super happy right now."}
payload = json.dumps(request_body)
payload

'{"inputs": "I am super happy right now."}'

In [21]:
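# Locust is needed for the load test in this notebook; uncomment to install it into the kernel's environment.
#%pip install locust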

In [22]:
# Check that the locust executable can be found on PATH and print its location.
!which locust

/home/ec2-user/anaconda3/envs/python3/bin/locust


In [24]:
import boto3
from botocore.config import Config
import pandas as pd
import itertools
import datetime

# --- Target endpoint and request settings -----------------------------------
region = 'us-east-1'
endpoint_name = 'huggingface-pytorch-inference-2022-07-25-17-21-31-881'
content_type = 'application/json'
payload = '{"inputs": "I am super happy right now."}'

# --- Runtime client with an explicit retry policy ---------------------------
boto3config = Config(retries={'max_attempts': 100, 'mode': 'standard'})
sagemaker_client = boto3.client(
    'sagemaker-runtime',
    region_name=region,
    config=boto3config,
)

# --- Single request to verify the endpoint answers before load testing ------
response = sagemaker_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=payload,
)
response_body = response["Body"].read()

response_body

b'[{"label":"LABEL_1","score":0.5474691987037659}]'

In [25]:
cw_start = datetime.datetime.utcnow()
!/home/ec2-user/anaconda3/envs/python3/bin/locust -f locust_script.py -u 10 --headless --host=http://huggingface-pytorch-inference-2022-07-25-17-21-31-881 --stop-timeout 90 -L DEBUG -t 5m --logfile=logfile.log --csv=locust.csv --csv-full-history --reset-stats              
cw_end = datetime.datetime.utcnow() 

 Name  # reqs      # fails  |     Avg     Min     Max  Median  |   req/s failures/s
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 Aggregated       0     0(0.00%)  |       0       0       0       0  |    0.00    0.00

huggingface-pytorch-inference-2022-07-25-17-21-31-881
huggingface-pytorch-inference-2022-07-25-17-21-31-881
 Name  # reqs      # fails  |     Avg     Min     Max  Median  |   req/s failures/s
--------------------------------------------------------------------------------
 custom_protocol_boto3 sagemaker_client_invoke_endpoint      41     0(0.00%)  |      68      56     127      63  |    0.00    0.00
--------------------------------------------------------------------------------
 Aggregated      41     0(0.00%)  |      68      56     127      63  |    0.00    0.00

huggingface-pytorch-inference-2022-07-25-17-21-31-881
huggingface-pytorch-inference-2022-07-2

In [27]:
# Load the aggregated stats file written by the Locust run and print the first
# two rows field by field.
locust_data = pd.read_csv('locust.csv_stats.csv')
first_two_rows = locust_data.head(n=2)
for index, row in first_two_rows.iterrows():
    print(index, row)

0 Type                                custom_protocol_boto3
Name                     sagemaker_client_invoke_endpoint
Request Count                                       12836
Failure Count                                           0
Median Response Time                                  220
Average Response Time                             225.559
Min Response Time                                     158
Max Response Time                                    1793
Average Content Size                                   48
Requests/s                                        44.2107
Failures/s                                              0
50%                                                   220
66%                                                   230
75%                                                   240
80%                                                   240
90%                                                   250
95%                                                   260
98%         