# re:Invent 2025 Workshop
# AIM406 - Build agentic workflows with Small Language Models and SageMaker AI
# Lab 1 - Model deployment

Leverage open-source models such as Qwen3 to build AI agents with frameworks like LangGraph and Strands. Create autonomous agents that reason, plan, and execute complex tasks through advanced prompt engineering and state management. Get hands-on experience with tool calling and MCP integration patterns to manage context between different tools, enabling seamless multi-step workflows. Learn to use Bedrock AgentCore to deploy agents for real-time use cases with security, observability, and scale. By integrating LLMs deployed on SageMaker AI with Bedrock AgentCore and open-source frameworks, you will learn to build production-ready AI agents that deliver measurable business value while maintaining security, scalability, and cost efficiency.

![AWS Agentic Portfolio](img/aim406_agentic_portfolio.png)

## Deploy a model on an Amazon SageMaker AI endpoint

### Setup environment

In [None]:
# Upgrade the SageMaker Python SDK in the notebook kernel's environment;
# --quiet and --no-warn-conflicts keep pip's output short.
%pip install sagemaker --upgrade --quiet --no-warn-conflicts

In [None]:
import json
import sagemaker
import boto3

# Session-wide handles reused by every later cell in this lab.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # SageMaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts
# Use the public accessor rather than the private `_region_name` attribute.
region = sess.boto_region_name  # region name of the current SageMaker Studio environment

sm_client = boto3.client("sagemaker")  # client to interact with SageMaker
smr_client = boto3.client("sagemaker-runtime")  # client to interact with SageMaker endpoints

# Echo the resolved environment so misconfiguration is visible before deploying.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")
print(f"sagemaker version: {sagemaker.__version__}")

### Create a SageMaker model object and deploy it to an endpoint

![SageMaker Endpoint](img/aim406_sagemaker_endpoint.png)

Select one of the available Large Model Inference (LMI) container images for hosting. In this workshop we are going to use the latest (at the time of the workshop) LMI v16 (`0.34.0-lmi16.0.0-cu128`) container image. Ensure that you are using the image URI for the region that corresponds with your deployment region.

When using an LMI container, we can set inference parameters using environment variables.

In [None]:
# Container image: the djl-inference repository in the deployment region,
# pinned to a specific LMI release tag.
CONTAINER_VERSION = "0.34.0-lmi16.0.0-cu128"
inference_image = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/"
    f"djl-inference:{CONTAINER_VERSION}"
)

# Model to serve and the instance it is served on.
model_id = "Qwen/Qwen3-4B"
instance = dict(type="ml.g5.2xlarge", num_gpu=1)

# One generated name is shared by the model, the endpoint and its config.
model_name = sagemaker.utils.name_from_base("model", short=True)
endpoint_config_name = endpoint_name = model_name

timeout = 600

# Environment variables handed to the container; the LMI-specific options
# are kept separate from the generic ones and merged at the end.
common_env = {"HF_MODEL_ID": model_id}
lmi_env = dict(
    SERVING_FAIL_FAST="true",
    OPTION_ASYNC_MODE="true",
    OPTION_ROLLING_BATCH="disable",
    OPTION_ENTRYPOINT="djl_python.lmi_vllm.vllm_async_service",
    # Environment values must be strings, hence the conversion of the GPU count.
    OPTION_TENSOR_PARALLEL_DEGREE=str(instance["num_gpu"]),
    OPTION_MAX_MODEL_LEN="16384",
    OPTION_TRUST_REMOTE_CODE="true",
    OPTION_ENABLE_AUTO_TOOL_CHOICE="true",
    OPTION_TOOL_CALL_PARSER="hermes",  # "qwen3_xml"
)
env = {**common_env, **lmi_env}

#### Deploy the model

In [None]:
# Describe the model: which container to run, with which environment,
# under which execution role.
lmi_model = sagemaker.Model(
    name=model_name,
    role=role,
    image_uri=inference_image,
    env=env,
)

# Create the endpoint on a single instance of the configured type.
lmi_model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance["type"],
    initial_instance_count=1,
    container_startup_health_check_timeout=timeout,
)

# Predictor bound to the new endpoint; requests and responses are JSON.
llm = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

#### View logs for the endpoint

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` has been loaded.
import urllib.parse

# URL-encode the endpoint's log group name; safe="" makes "/" get escaped too.
cw_path = urllib.parse.quote_plus(f"/aws/sagemaker/Endpoints/{endpoint_name}", safe="")

print(
    "You can view your endpoint logs here:\n\n"
    f" https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logsV2:log-groups/log-group/{cw_path}"
)

### Test inference

In [None]:
# Single-turn chat request: one user message, no extra parameters.
payload = {
    "messages": [{"role": "user", "content": "What is bigger 9.11 or 9.8?"}],
}
res = llm.predict(payload)

# Show the text of the first choice between separators, then the usage block.
answer = res["choices"][0]["message"]["content"]
print("-----\n" + answer + "\n-----\n")
print(res["usage"])

In [None]:
import io
import json
import time
import boto3
from IPython.display import clear_output

class LineIterator:
    """Iterate over newline-delimited lines of a chunked event stream.

    ``stream`` is an iterable of event dicts. Events carrying a
    ``"PayloadPart"`` key contribute ``event["PayloadPart"]["Bytes"]`` to an
    internal byte buffer; any other event is reported and skipped. Because a
    line may be split across several events, bytes are accumulated until a
    full line is available.

    Each iteration yields one line as ``bytes`` without its trailing
    ``b"\\n"``. If the stream ends with data that is not newline-terminated,
    that remainder is yielded as a final line before ``StopIteration``.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in the buffer not yet returned to the caller.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # Try to serve a complete line from what is already buffered.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line.endswith(b"\n"):
                self.read_pos += len(line)
                return line[:-1]

            # No complete line yet: pull the next event from the stream.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # Flush the unterminated tail exactly once; previously this
                    # path retried forever because no more data could arrive.
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                # chunk is a dict, so it must be formatted rather than
                # concatenated to a str.
                print(f"Unknown event type: {chunk}")
                continue

            # Append at the end; the read position is restored at the loop top.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

def stream_response(endpoint_name, inputs, max_tokens=8189, temperature=0.7, top_p=0.9):
    """Stream a chat completion from a SageMaker endpoint and render it live.

    Sends ``inputs`` as a single user message with ``"stream": True`` and,
    after every streamed chunk, clears the cell output and reprints the text
    accumulated so far together with a running rate (chunks received per
    second, labelled "Tokens per Second").

    Args:
        endpoint_name: Name of the endpoint to invoke.
        inputs: Text of the user message.
        max_tokens: Sent as ``max_tokens`` in the request body.
        temperature: Sent as ``temperature`` in the request body.
        top_p: Sent as ``top_p`` in the request body.

    Returns:
        The concatenation of the ``content`` of every streamed delta.
    """
    body = {
        "messages": [
            {"role": "user", "content": [{"type": "text", "text": inputs}]}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "stream": True,
    }

    resp = smr_client.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        Body=json.dumps(body),
        ContentType="application/json",
    )

    start_json = b"{"
    full_response = ""
    token_count = 0
    start_time = time.time()

    for line in LineIterator(resp["Body"]):
        # Only lines containing a JSON object carry a delta; anything before
        # the first "{" on the line is ignored.
        json_start = line.find(start_json)
        if json_start < 0:
            continue
        data = json.loads(line[json_start:].decode("utf-8"))

        # Skip chunks that carry no choice instead of failing on index 0.
        choices = data.get("choices")
        if not choices:
            continue

        # A delta may omit "content" or set it to null; treat both as empty
        # text so the string concatenation below cannot raise.
        delta = choices[0].get("delta") or {}
        token_text = delta.get("content") or ""
        full_response += token_text
        token_count += 1

        # Calculate tokens per second
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0

        # Clear the output and reprint everything
        clear_output(wait=True)
        print(full_response)
        print(f"\nTokens per Second: {tps:.2f}", end="")

    print("\n")  # Add a newline after response is complete

    return full_response

In [None]:
# Ask the deployed model a question and stream the answer into the cell output.
inputs = "What is greater 9.11 or 9.8?"
output = stream_response(
    endpoint_name=endpoint_name,
    inputs=inputs,
    max_tokens=8000,
)

## (Optional) Cleanup

#### DO NOT RUN THE CELL BELOW IF YOU WANT TO RUN LAB2 IN THE WORKSHOP

In [None]:
# Tear down what this lab created: the endpoint, its config, then the model.
sess.delete_endpoint(endpoint_name)
# The config is deleted by the endpoint's name; `endpoint_config_name` was
# assigned the same value as `endpoint_name` when the names were generated.
sess.delete_endpoint_config(endpoint_name)
sess.delete_model(model_name)