In [None]:
import sagemaker
import boto3

from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.serializers import DataSerializer

In [None]:
# Placeholders for the two locations handed to the async inference
# configuration further down; fill both in before running the notebook.
# NOTE(review): presumably these are meant to be s3:// URIs - confirm.
failure_path = ""  # failure path for async inference
output_path = ""  # output path for async inference

In [None]:
# Resolve the IAM role ARN that the model will be created with.
# First ask the SageMaker SDK for the current execution role; if that call
# raises ValueError, look up the role named "sagemaker_execution_role" through
# the IAM API and take its ARN instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

In [None]:
# Hub model configuration (browse models at https://huggingface.co/models).
# These two entries are handed to the model below as its environment.
hub = dict(
    HF_MODEL_ID="distil-whisper/distil-medium.en",
    HF_TASK="automatic-speech-recognition",
)

# Build the Hugging Face model object from the role resolved earlier, the hub
# settings above, and the pinned framework / Python versions.
# NOTE(review): confirm that these container versions support the chosen
# checkpoint before deploying.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    py_version="py39",
    pytorch_version="1.13.1",
    transformers_version="4.26.0",
)

In [None]:
# Async endpoint configuration: bundles the output and failure paths defined
# near the top of the notebook so they can be passed to deploy() later.
async_config = AsyncInferenceConfig(
    failure_path=failure_path,
    output_path=output_path,
)

In [None]:
# Serializer given to deploy(); it is configured with the "audio/x-audio"
# content type.
audio_serializer = DataSerializer(content_type="audio/x-audio")

# Name under which the endpoint is created.
endpoint_name = "distil-whisper-async"

# Extra environment settings passed to deploy(). All values are strings.
# Both size limits are 500 * 1024 * 1024 (i.e. 500 MiB), raised from the
# default of roughly 6.2 MiB.
env = dict(
    MODEL_SERVER_WORKERS="1",
    MMS_MAX_REQUEST_SIZE=str(500 * 1024 * 1024),
    MMS_MAX_RESPONSE_SIZE=str(500 * 1024 * 1024),
)

In [None]:
# Deploy the model to SageMaker Inference with the async configuration, on a
# single ml.m5.xlarge instance, under the endpoint name chosen above.
# NOTE(review): the model was already constructed with env=hub, and `env` is
# passed again here as a deploy() keyword - confirm that deploy() accepts it
# and that both sets of variables actually reach the container.
predictor = huggingface_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
    async_inference_config=async_config,
    serializer=audio_serializer,
    env=env,
)