In [None]:
! pip install -U sagemaker

## Create an endpoint from a Hugging Face model

https://aws.amazon.com/blogs/machine-learning/configuring-autoscaling-inference-endpoints-in-amazon-sagemaker/

https://medium.com/p/b1b6e6731c59


In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel

# Resolve the IAM role ARN used for the model. If the SageMaker SDK cannot
# provide an execution role (it raises ValueError), look the role up by name
# through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Hub model configuration (https://huggingface.co/models), handed to the model
# through its `env` argument: the model id and the task to serve.
hub = dict(
    HF_MODEL_ID='openpecha/wav2vec2_run9',
    HF_TASK='automatic-speech-recognition',
)

# Build the Hugging Face model object with the chosen framework versions.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version='4.37.0',
    pytorch_version='2.1.0',
    py_version='py310',
)

# Deploy the model to SageMaker Inference: one instance of the given EC2 type.
predictor = huggingface_model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

Check if the endpoint works

In [None]:

from sagemaker.serializers import DataSerializer
	
# Send request bodies as raw bytes tagged with the audio content type.
audio_serializer = DataSerializer(content_type='audio/x-audio')
predictor.serializer = audio_serializer

# The sample clip must exist at this path; it is read fully into memory.
sample_path = "/home/ec2-user/SageMaker/stt-split-audio/STT_NS/segments_ns/STT_NS_M0322_0400_2713489_to_2721015.wav"
with open(sample_path, "rb") as audio_file:
    data = audio_file.read()

# Last expression of the cell, so the prediction is shown as the cell output.
predictor.predict(data)

In [None]:
# Use invoke_endpoint to run inference

In [None]:
import os
import io
import boto3
import json
import csv

# Name of the deployed endpoint to call (hard-coded) and the runtime client
# used to invoke it.
ENDPOINT_NAME = 'huggingface-pytorch-inference-2024-02-21-10-00-23-117'
runtime = boto3.client('runtime.sagemaker')

# Read the sample clip as raw bytes; they are sent unchanged as the request body.
sample_path = "/home/ec2-user/SageMaker/stt-split-audio/STT_NS/segments_ns/STT_NS_M0322_0400_2713489_to_2721015.wav"
with open(sample_path, "rb") as audio_file:
    data = audio_file.read()

response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType='audio/x-audio',
    Body=data,
)

# Read the response body, decode it, parse it as JSON and keep its 'text' field.
raw_body = response['Body'].read()
result = json.loads(raw_body.decode())
inference = {"inference": result['text']}

# Wrap the transcription in a status/body envelope and print it.
response_dict = dict(statusCode=200, body=inference)
print(response_dict)

Check the endpoint description

In [None]:
import pprint
import boto3
from sagemaker import get_execution_role
import sagemaker
import json

# Pretty-printer shared by the cells that display AWS API responses.
pp = pprint.PrettyPrinter(indent=4, depth=4)

# get_execution_role() raises ValueError when no SageMaker execution role can be
# resolved. The deployment cell handles that with an IAM lookup by role name;
# do the same here so this cell does not abort before describing the endpoint.
try:
    role = get_execution_role()
except ValueError:
    role = boto3.client('iam').get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Fetch and display the full description of the deployed endpoint.
sagemaker_client = boto3.Session().client(service_name='sagemaker')
endpoint_name = 'huggingface-pytorch-inference-2024-02-21-10-00-23-117'
response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
pp.pprint(response)

# make the endpoint autoscaling 

In [None]:
# Application Auto Scaling client; it covers SageMaker among other services.
client = boto3.client('application-autoscaling')

endpoint_name = 'huggingface-pytorch-inference-2024-02-21-10-00-23-117'
# Application Auto Scaling references an endpoint variant with this id format.
resource_id = f'endpoint/{endpoint_name}/variant/AllTraffic'

# Arguments shared by both calls below: the service namespace, the endpoint
# variant, and the dimension to scale (SageMaker supports only instance count).
scalable_target = {
    'ServiceNamespace': 'sagemaker',
    'ResourceId': resource_id,
    'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
}

# Register the variant as scalable between 1 and 5 instances.
response = client.register_scalable_target(
    MinCapacity=1,
    MaxCapacity=5,
    **scalable_target,
)

# Example 1 - SageMakerVariantInvocationsPerInstance metric.
tracking_config = {
    # Target value for the metric below.
    'TargetValue': 10.0,
    'PredefinedMetricSpecification': {
        # Average number of times per minute that each instance for a variant
        # is invoked.
        'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance',
    },
    # Cooldowns keep further instances from being launched or terminated before
    # the effects of previous scaling activities are visible; tune them to the
    # instance startup time or other application needs.
    # Seconds after a scale-in activity completes before another can start.
    'ScaleInCooldown': 600,
    # Seconds after a scale-out activity completes before another can start.
    'ScaleOutCooldown': 300,
    # Optional 'DisableScaleIn': True|False - indicates whether scale in by the
    # target tracking policy is disabled. If true, the policy won't remove
    # capacity from the scalable resource.
}

response = client.put_scaling_policy(
    PolicyName='Invocations-ScalingPolicy',
    PolicyType='TargetTrackingScaling',  # 'StepScaling'|'TargetTrackingScaling'
    TargetTrackingScalingPolicyConfiguration=tracking_config,
    **scalable_target,
)

## check the scaling policy

In [None]:
# List the scaling policies registered under the 'sagemaker' namespace.
response = client.describe_scaling_policies(ServiceNamespace='sagemaker')

# For each policy print its name, then its target-tracking configuration when
# present, otherwise its step-scaling configuration, separated by blank lines.
for policy in response['ScalingPolicies']:
    print('')
    pp.pprint(policy['PolicyName'])
    print('')
    config_key = (
        'TargetTrackingScalingPolicyConfiguration'
        if 'TargetTrackingScalingPolicyConfiguration' in policy
        else 'StepScalingPolicyConfiguration'
    )
    pp.pprint(policy[config_key])
    print('')

## See how many instances are running

In [None]:
# Describe the endpoint again and report its status together with the current
# instance count of its first production variant.
sagemaker_client = boto3.Session().client(service_name='sagemaker')
response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
status = response['EndpointStatus']
first_variant = response['ProductionVariants'][0]
instance_count = first_variant['CurrentInstanceCount']

print(f"Status: {status}")
print("Current Instance count: " + str(instance_count))

## increase or decrease the number of instances manually

In [None]:
# Manually request a specific instance count for the 'AllTraffic' variant.
desired_capacity = {
    'VariantName': 'AllTraffic',
    'DesiredInstanceCount': 2,
}
response = sagemaker_client.update_endpoint_weights_and_capacities(
    EndpointName=endpoint_name,
    DesiredWeightsAndCapacities=[desired_capacity],
)