
# <span style="color:DarkSeaGreen">JumpStart Lab 2</span>

This lab does the following:

- Uses the endpoint created in Lab 1
- Implements SageMaker application-autoscaling
- Tests the functionality 



# <span style="color:DarkSeaGreen">Prepare Your Environment</span>
### Note if you want a venv, see Lab 1

# Lab 2 Starts Here!

# <span style="color:DarkSeaGreen">Setup</span>

In [15]:
# region
# for the purpose of this lab, us-east-1, us-west-2, eu-west-1 has the broadest coverage of JumpStart models and instance types
# if you provision in other regions, you may not have access to all the models or instance types, and may need to request increase of quotas for some instance types
myRegion='us-east-1'

# parameter store
myParameterStoreEndpointName='doit-jumpstart-sagemaker-endpoint-name'
myParameterStoreIAMARN='doit-jumpstart-sagemaker-iam-arn'

# application auto scaling policy
myEndpointScalingPolicyName='doit-jumpstart-sagemaker-endpoint-scaling-policy'

print ('Done! Move to the next cell ->')

Done! Move to the next cell ->


In [16]:
# import libraries
import boto3
from certifi import where

botoSession = boto3.Session(region_name=myRegion)

# Configure boto3 to use certifi's certificates - helps avoid SSL errors if your system’s certificate store is out of date or missing root certs
sts_client = boto3.client('sts', verify=where())
myAccountNumber = sts_client.get_caller_identity()["Account"]
print(myAccountNumber)
print(sts_client.get_caller_identity()["Arn"])

# create clients we can use later
# iam
iam = boto3.client('iam', region_name=myRegion, verify=where())
# ssm
ssm = boto3.client('ssm', region_name=myRegion, verify=where())

print ('Done! Move to the next cell ->')

546709318047
arn:aws:iam::546709318047:user/simon-davies-cli
Done! Move to the next cell ->


In [17]:
# define tags added to all services we create
myTags = [
    {"Key": "env", "Value": "non_prod"},
    {"Key": "owner", "Value": "doit-jumpstart"},
    {"Key": "project", "Value": "lab1"},
    {"Key": "author", "Value": "simon"},
]
myTagsDct = {
    "env": "non_prod",
    "owner": "doit-jumpstart",
    "project": "lab1",
    "author": "simon",
}

print ('Done! Move to the next cell ->')

Done! Move to the next cell ->


# <span style="color:DarkSeaGreen">IAM</span>

In [18]:
def getSageMakerExecutionRole():
    """
    Gets a role required for SageMaker to run jobs on your behalf
    Only needed if this is being run in a local IDE, not needed if in SageMaker Studio or SageMaker Notebook Instance

    Args:
        None

    Returns:
        An IAM execution role ARN
    """

    # get the role we created in the previous lab from the parameter store
    response = ssm.get_parameter(Name=myParameterStoreIAMARN)
    myRoleSageMakerExecutionARN = response['Parameter']['Value']
    print(f"Retrieved role from parameter store: {myRoleSageMakerExecutionARN}")    

    return myRoleSageMakerExecutionARN

# <span style="color:DarkSeaGreen">Get Execution Role and Session</span>
- SageMaker requires an execution role to assume on your behalf

In [19]:
from sagemaker.session import get_execution_role
from sagemaker.session import Session

try:
    # if this is being run in a SageMaker AI JupyterLab Notebook
    myRoleSageMakerExecutionARN = get_execution_role()
except:
    # if this is being run in a local IDE - we need to create our own role
    myRoleSageMakerExecutionARN = getSageMakerExecutionRole()

# make sure we get a session in the correct region (needed as it can use the aws configure region if running this locally
sageMakerSession = Session(boto_session=botoSession)

print(myRoleSageMakerExecutionARN)
print(sageMakerSession)

print ('Done! Move to the next cell ->')

Couldn't call 'get_role' to get Role ARN from role name simon-davies-cli to get Role path.


Retrieved role from parameter store: arn:aws:iam::546709318047:role/doit-jumpstart-sagemaker-execution-role
arn:aws:iam::546709318047:role/doit-jumpstart-sagemaker-execution-role
<sagemaker.session.Session object at 0x10c6783e0>
Done! Move to the next cell ->


# <span style="color:DarkSeaGreen">Get the Endpoint from Lab 1</span>

In [20]:
# get the endpoint created in lab1
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# get the endpoint name from parameter store
response = ssm.get_parameter(
    Name=myParameterStoreEndpointName
)
endpointName = response['Parameter']['Value']
print(f"Using endpoint name: {endpointName}")  

# create a predictor to interact with the endpoint - need to specify the default serializer and deserializer this time
predictor = Predictor(
    endpoint_name=endpointName,
    sagemaker_session=sageMakerSession,
    serializer=JSONSerializer(),      
    deserializer=JSONDeserializer()
)

print ('Done! Move to the next cell ->')

Using endpoint name: meta-textgeneration-llama-2-7b-2025-09-23-07-37-06-134
Done! Move to the next cell ->


In [21]:
# test the endpoint
example_payloads = [
    {
        "body": {
            "inputs": "Write a Python function to check if a number is prime",
            "parameters": {"max_new_tokens": 128, "temperature": 0.2, "top_p": 0.9},
        },
        "content_type": "application/json",
        "accept": "application/json",
    },
    {
        "body": {
            "inputs": "Describe what a llm model can do for someone who is sceptical about them",
            "parameters": {"max_new_tokens": 128, "temperature": 0.2, "top_p": 0.9},
        },
        "content_type": "application/json",
        "accept": "application/json",
    },
]


for payload in example_payloads:
    body = payload.body if hasattr(payload, "body") else payload["body"]
    response = predictor.predict(body)
    response = response[0] if isinstance(response, list) else response
    print("Input:\n", body, end="\n\n")
    print("Output:\n", response["generated_text"].strip(), end="\n\n\n")

print ('Done! Move to the next cell ->')

Input:
 {'inputs': 'Write a Python function to check if a number is prime', 'parameters': {'max_new_tokens': 128, 'temperature': 0.2, 'top_p': 0.9}}

Output:
 Write a Python function to check if a number is prime or not.
Write a Python function to check if a number is prime or not. The function should take a number as input and return True if the number is prime and False otherwise.
Write a Python function to check if a number is prime or not. The function should take a number as input and return True if the number is prime and False otherwise. The function should use the Sieve of Eratosthenes algorithm to check if the number is prime or not.
The Sieve of Eratosthenes algorithm is a simple and efficient way to check if a number is prime or not. The algorithm works by


Input:
 {'inputs': 'Describe what a llm model can do for someone who is sceptical about them', 'parameters': {'max_new_tokens': 128, 'temperature': 0.2, 'top_p': 0.9}}

Output:
 Describe what a llm model can do for someo

# <span style="color:DarkSeaGreen">Create Scalability Plan</span>
- Uses SageMaker Application Auto Scaling
- Works especially well for generative AI models, which are typically concurrency-bound and can take many seconds to complete each inference request
- Using the new high-resolution metrics allow you to greatly decrease the time it takes to scale up an endpoint using Application Auto Scaling

In [27]:
# https://github.com/aws/amazon-sagemaker-examples/blob/main/inference/generativeai/huggingfacetgi/meta-llama/llama3-8b/faster-autoscaling/realtime-endpoints/FasterAutoscaling-SME-Llama3-8B-AppAutoScaling.ipynb
# https://aws.amazon.com/blogs/machine-learning/amazon-sagemaker-inference-launches-faster-auto-scaling-for-generative-ai-models/
# https://docs.aws.amazon.com/autoscaling/application/userguide/what-is-application-auto-scaling.html

# define a new auto scaling target for Application Auto Scaling
# auto scaling
autoScaling = boto3.client('application-autoscaling', region_name=myRegion, verify=where())
variantName = "AllTraffic"
ResourceId  = "endpoint/" + endpointName + "/variant/" + variantName

# Register scalable target
scalableTarget = autoScaling.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId=ResourceId,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=2,  # Replace with your desired maximum instances
)

print ('Done! Move to the next cell ->')

Done! Move to the next cell ->


In [None]:
# create target tracking scaling policy
# this is a target tracking policy that uses the new high-resolution metrics for SageMaker endpoints
# you can also create a step-scaling policy if you prefer
# a step-scaling policy is more complex to set up, but gives you more control over how your endpoint scales

# create a policy that scales out when the endpoint receives more than n ConcurrentRequestsPerModel
# this new metric will be tracked when th predefined metric type used below is SageMakerVariantConcurrentRequestsPerModelHighResolution
targetTrackingPolicyResponse = autoScaling.put_scaling_policy(
    PolicyName=myEndpointScalingPolicyName,
    ServiceNamespace="sagemaker",
    ResourceId=ResourceId,
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 5.0,  # Scaling triggers when endpoint receives 5 ConcurrentRequestsPerModel
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantConcurrentRequestsPerModelHighResolution"
        },
        "ScaleInCooldown": 180,  # Cooldown period after scale-in activity
        "ScaleOutCooldown": 180,  # Cooldown period after scale-out activity
    },
)

# print(target_tracking_policy_response)
print(f"[b]Policy ARN:[/b] [i blue]{targetTrackingPolicyResponse['PolicyARN']}")

# print Cloudwatch Alarms
alarms = targetTrackingPolicyResponse["Alarms"]

for alarm in alarms:
    print(f"[b]Alarm Name:[/b] [b magenta]{alarm['AlarmName']}")
    # print(f"[b]Alarm ARN:[/b] [i green]{alarm['AlarmARN']}[/i green]")
    print("===" * 15)

print ('Done! Move to the next cell ->')

Done! Move to the next cell ->


# <span style="color:DarkSeaGreen">Test Scalability Plan</span>
- Lets just test the endpoint first, make sure its all good
- Simulate load

In [36]:
# lets just test the endpoint again to make sure it still works
example_payloads = [
    {
        "body": {
            "inputs": "Please explain what load testing is and why its important in reference to sagemaker endpoints",
            "parameters": {"max_new_tokens": 128, "temperature": 0.2, "top_p": 0.9},
        },
        "content_type": "application/json",
        "accept": "application/json",
    },
]

for payload in example_payloads:
    body = payload.body if hasattr(payload, "body") else payload["body"]
    response = predictor.predict(body)
    response = response[0] if isinstance(response, list) else response
    print("Input:\n", body, end="\n\n")
    print("Output:\n", response["generated_text"].strip(), end="\n\n\n")

print ('Done! Move to the next cell ->')

Input:
 {'inputs': 'Please explain what load testing is and why its important in reference to sagemaker endpoints', 'parameters': {'max_new_tokens': 128, 'temperature': 0.2, 'top_p': 0.9}}

Output:
 Please explain what load testing is and why its important in reference to sagemaker endpoints.
Load testing is a type of performance testing that measures the behavior of a system under a specific load. It is important in reference to SageMaker endpoints because it helps to ensure that the system can handle the expected load and provides a baseline for future performance.
Load testing helps to identify any potential bottlenecks or issues that may arise when the system is under a heavy load. It also helps to ensure that the system is scalable and can handle an increase in traffic.
Load testing is important in reference to SageMaker endpoints because it helps to ensure that the system can handle the expected load


Done! Move to the next cell ->


In [44]:
# now we're going to use locust to simulate load on the endpoint
# https://docs.locust.io/en/stable/ 
# https://aws.amazon.com/blogs/machine-learning/best-practices-for-load-testing-amazon-sagemaker-real-time-inference-endpoints/
# see the locust_script_lab2.py file for details of the load test
# it gathers the endpoint name, etc via os environment vars we export below
# run this cell, then paste and run in a terminal window, make sure its run in your virtual environment created in lab 1, or in your own that has boto3 and locust installed

print ("export AWS_REGION={}".format(myRegion))
print ("export ENDPOINT_NAME={}".format(endpointName))
print ("export CONTENT_TYPE={}".format("application/json"))
print ("export PAYLOAD='{}'".format('{"inputs": "I am going to siulate a load test on your endpoint in a few minutes. What do you think of that?"}'))
print ("export HOST={}".format('http://localhost')) # locust needs a host, but we don't use it

export AWS_REGION=us-east-1
export ENDPOINT_NAME=meta-textgeneration-llama-2-7b-2025-09-23-07-37-06-134
export CONTENT_TYPE=application/json
export PAYLOAD='{"inputs": "I am going to siulate a load test on your endpoint in a few minutes. What do you think of that?"}'
export HOST=http://localhost


In [None]:
# these are picked up by the locust file
# Paste the following in a terminal window, make sure its run in your virtual environment created in lab 1, or in your own that has boto3 and locust installed
# LOCUST_USERS is the number of simulated users
# LOCUST_SPAWN_RATE is the rate per second to spawn (add new) users - so 20 users at rate of 2 means add 2 users every second, so take 10 seconds to get to 20 users
# LOCUST_RUN_TIME is how long to run the test for
export LOCUST_USERS=20
export LOCUST_SPAWN_RATE=2
export LOCUST_RUN_TIME=10m

In [None]:
# now we're going to use locust to simulate load on the endpoint
# https://docs.locust.io/en/stable/ 
# https://aws.amazon.com/blogs/machine-learning/best-practices-for-load-testing-amazon-sagemaker-real-time-inference-endpoints/
# see the locust_script_lab2.py file for details of the load test
# it gathers the endpoint name, etc from the parameter store where we stored it in lab 1

# Paste the following in a terminal window, make sure its run in your virtual environment created in lab 1, or in your own that has boto3 and locust installed
locust -f locust_script_lab2.py --headless -u $LOCUST_USERS -r $LOCUST_SPAWN_RATE --run-time $LOCUST_RUN_TIME --host http://localhost

# <span style="color:DarkSeaGreen">White Locust is Running</span>
- Go to the CloudWatch console
- Monitor the alarm being target tracked for ConcurrentRequestsPerModel, eg 
  - TargetTracking-endpoint/*endpoint name*-Alarm**High**-*uuid*
  - TargetTracking-endpoint/*endpoint name*-Alarm**Low**-*uuid*
- Run the cell below to monitor the instance count



In [49]:
import time
from datetime import datetime, timedelta

# --- Configuration ---
endpoint_name = endpointName
region = myRegion
poll_interval = 10    # seconds between checks

# --- Clients ---
sm_client = boto3.client("sagemaker", region_name=region)
cw_client = boto3.client("cloudwatch", region_name=region)

print(f"Monitoring endpoint '{endpoint_name}' variants (press Ctrl+C to stop)...\n")

try:
    while True:
        # --- Describe endpoint variants ---
        response = sm_client.describe_endpoint(EndpointName=endpoint_name)
        for variant in response["ProductionVariants"]:
            variant_name = variant["VariantName"]
            current_instances = variant["CurrentInstanceCount"]
            desired_instances = variant["DesiredInstanceCount"]

            # --- Fetch latest ConcurrentRequestsPerModel metric ---
            end_time = datetime.utcnow()
            start_time = end_time - timedelta(seconds=poll_interval*2)  # small window to get the latest datapoint

            metric_resp = cw_client.get_metric_statistics(
                Namespace="AWS/SageMaker",
                MetricName="ConcurrentRequestsPerModel",
                Dimensions=[
                    {"Name": "EndpointName", "Value": endpoint_name},
                    {"Name": "VariantName", "Value": variant_name},
                ],
                StartTime=start_time,
                EndTime=end_time,
                Period=poll_interval,
                Statistics=["Average"],
            )

            datapoints = metric_resp.get("Datapoints", [])
            concurrent_requests = round(datapoints[-1]["Average"], 2) if datapoints else 0

            print(
                f"[{time.strftime('%H:%M:%S')}] Variant: {variant_name} | "
                f"Current instances: {current_instances} | Desired instances: {desired_instances} | "
                f"ConcurrentRequestsPerModel: {concurrent_requests}"
            )

        print("-" * 80)
        time.sleep(poll_interval)

except KeyboardInterrupt:
    print("Monitoring stopped.")


Monitoring endpoint 'meta-textgeneration-llama-2-7b-2025-09-23-07-37-06-134' variants (press Ctrl+C to stop)...



  end_time = datetime.utcnow()


[18:21:13] Variant: AllTraffic | Current instances: 1 | Desired instances: 1 | ConcurrentRequestsPerModel: 0.0
--------------------------------------------------------------------------------
[18:21:24] Variant: AllTraffic | Current instances: 1 | Desired instances: 1 | ConcurrentRequestsPerModel: 0.0
--------------------------------------------------------------------------------
[18:21:36] Variant: AllTraffic | Current instances: 1 | Desired instances: 1 | ConcurrentRequestsPerModel: 0.0
--------------------------------------------------------------------------------
[18:21:47] Variant: AllTraffic | Current instances: 1 | Desired instances: 1 | ConcurrentRequestsPerModel: 20.0
--------------------------------------------------------------------------------
[18:21:59] Variant: AllTraffic | Current instances: 1 | Desired instances: 1 | ConcurrentRequestsPerModel: 20.0
--------------------------------------------------------------------------------
[18:22:10] Variant: AllTraffic | Curre

# <span style="color:DarkSeaGreen">SageMaker Inference Recommender</span>
- Helps you select the best instance type and configuration for your ML models and workloads
- https://docs.aws.amazon.com/sagemaker/latest/dg/inference-recommender.html
- NOTE NEEDS TO BE INVESTIGATED TO SEE IF IT WORKS WITH LLM REAL TIME ENDPOINTS

# <span style="color:DarkSeaGreen">Clean Up Architecture</span>
### <span style="color:Red">Only do this if you have finished with this lab and any labs that depend on it!</span>
##### It will delete all architecture created, make sure you no longer need any of it!!!

In [None]:
# when finished with the endpoint, delete it
# endpoint is deleted in lab 1
# remmeber to delete the architecture in lab 1 too