## BERT Custom Inference Script SageMaker Deployment

This notebook is based on the following HuggingFace example notebook: https://github.com/huggingface/notebooks/blob/main/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb

In [None]:
!pip install sagemaker>=2.127.0 transformers==4.12.0 

In [None]:
import sagemaker
import boto3
# Bucket used for uploading data, models and logs. Leave it as None to fall
# back to the session's default bucket (SageMaker creates that bucket
# automatically if it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_info = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_info["Role"]["Arn"]

# Recreate the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
import os
from transformers import BertTokenizer, BertModel

# Load the tokenizer and the model from the same pretrained checkpoint so
# the two always stay in sync.
pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertModel.from_pretrained(pretrained_name)

### Local Inference Test

Running the model locally first makes it easier to debug the inference script you pass to SageMaker: you can confirm the input and output data formats before deploying.

In [None]:
# Sample sentence used to exercise the tokenizer and the model locally.
samp = "This is a test string that I am trying out with the BERT model."

# Tokenize with the same options the inference script uses.
encoded_input = tokenizer(
    samp,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Display the tokenizer output to inspect what will be fed to the model.
encoded_input

In [None]:
import torch
# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(**encoded_input)

# Keep the first element of the model output and flatten it to a plain list,
# mirroring what the inference script returns.
model_output = outputs[0]
res = model_output.flatten().tolist()
# Uncomment to display the flattened values:
# res

### Create model.tar.gz

In [None]:
# Local staging directory for the model artifacts; creating it is a no-op
# when it already exists.
save_dir = "tmp"
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Write the tokenizer, the model and the model config into the staging
# directory, in that order.
for artifact in (tokenizer, model, model.config):
    artifact.save_pretrained(save_dir)

### Create Inference Script

In [None]:
!mkdir code

In [None]:
%%writefile code/inference.py

from transformers import BertTokenizer, BertModel
import torch

def model_fn(model_dir):
    """Load the BERT model and its tokenizer from ``model_dir``.

    Args:
        model_dir: Directory containing the saved model and tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    bert = BertModel.from_pretrained(model_dir)
    bert_tokenizer = BertTokenizer.from_pretrained(model_dir)
    return bert, bert_tokenizer

def predict_fn(data, model_and_tokenizer):
    """Run BERT on the request text and return the flattened output.

    Args:
        data: Deserialized request payload. The text is read from its
            ``"inputs"`` key; when that key is absent the payload itself is
            handed to the tokenizer.
        model_and_tokenizer: The ``(model, tokenizer)`` pair returned by
            ``model_fn``.

    Returns:
        A JSON-serializable dict ``{"vectors": [...]}`` holding the first
        element of the model output, flattened into a single flat list.
    """
    model, tokenizer = model_and_tokenizer

    # Read with .get() rather than .pop() so the caller's payload is left intact.
    sentences = data.get("inputs", data)
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded_input)[0]

    # Tensors are not JSON serializable; convert to a plain Python list.
    return {"vectors": model_output.flatten().tolist()}

In [None]:
# copy inference.py into the code/ directory of the model directory.
!cp -r code/ tmp/code/
# create a model.tar.gz archive with all the model artifacts and the inference.py script.
%cd tmp
!tar zcvf model.tar.gz *
%cd ..

In [None]:
from sagemaker.s3 import S3Uploader
# Destination prefix inside the session's default bucket.
s3_model_path = f"s3://{sess.default_bucket()}/bert-mars"
print(s3_model_path)

# Upload the packaged archive and keep the resulting S3 URI for deployment.
s3_model_uri = S3Uploader.upload(
    local_path="tmp/model.tar.gz",
    desired_s3_uri=s3_model_path,
)
print(f"model artifcats uploaded to {s3_model_uri}")

### Create Model and Endpoint

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Describe the model for SageMaker: where the archive lives, which IAM role
# to use, and which framework versions the serving container should provide.
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,  # S3 URI of the uploaded model.tar.gz (model + script)
    role=role,  # IAM role with permissions to create an endpoint
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py38",
)

In [None]:
# Deploy the model behind an endpoint: one instance of the given type.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge",
)

In [None]:
import json
import boto3
# Runtime client used to invoke the deployed endpoint.
runtime = boto3.client(service_name="sagemaker-runtime")

# JSON request body; the inference script reads the text from the "inputs" key.
embedding_gen_payload = json.dumps(
    {"inputs": "The company HuggingFace is based in New York City"}
)
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="application/json",
    Body=embedding_gen_payload,
)

# Read the raw response body returned by the endpoint.
response["Body"].read()