#### 1. Serving Preparation
##### 
##### 1.1 Configure Workspace details
To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.



In [2]:
import sys
import os
import yaml
from datetime import datetime
snapshot_date = datetime.now().strftime("%Y-%m-%d")

sys.path.append(os.path.abspath(os.path.join('..')))
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))

config_file = "/llama-fc-wo-descriptions_config.yaml"

with open(f'./{config_file}') as f:
    d = yaml.load(f, Loader=yaml.FullLoader)
    
AZURE_SUBSCRIPTION_ID = d['config']['AZURE_SUBSCRIPTION_ID']
AZURE_RESOURCE_GROUP = d['config']['AZURE_RESOURCE_GROUP']
AZURE_WORKSPACE = d['config']['AZURE_WORKSPACE']
AZURE_DATA_NAME = d['config']['AZURE_SFT_DATA_NAME']    
DATA_DIR = d['config']['SFT_DATA_DIR']
CLOUD_DIR = d['config']['CLOUD_DIR']
HF_MODEL_NAME_OR_PATH = d['config']['HF_MODEL_NAME_OR_PATH']
IS_DEBUG = d['config']['IS_DEBUG']
USE_LOWPRIORITY_VM = d['config']['USE_LOWPRIORITY_VM']

azure_env_name = d['serve']['azure_env_name']
azure_compute_cluster_name = d['serve']['azure_compute_cluster_name']
azure_compute_cluster_size = d['serve']['azure_serving_cluster_size']

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CLOUD_DIR, exist_ok=True)

In [3]:
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Set this to the lowest level you want to capture

# Create console handler with a higher log level
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)  # Set this to the lowest level you want to capture

# Create file handler which logs even debug messages
file_handler = logging.FileHandler("debug.log")
file_handler.setLevel(logging.DEBUG)  # Set this to the lowest level you want to capture

# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

In [4]:
logger.info("===== 0. Azure ML Training Info =====")
logger.info(f"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}")
logger.info(f"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}")
logger.info(f"AZURE_WORKSPACE={AZURE_WORKSPACE}")
logger.info(f"AZURE_DATA_NAME={AZURE_DATA_NAME}")
logger.info(f"DATA_DIR={DATA_DIR}")
logger.info(f"CLOUD_DIR={CLOUD_DIR}")
logger.info(f"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}")
logger.info(f"IS_DEBUG={IS_DEBUG}")
logger.info(f"USE_LOWPRIORITY_VM={USE_LOWPRIORITY_VM}")
logger.info(f"azure_env_name={azure_env_name}")
logger.info(f"azure_compute_cluster_name={azure_compute_cluster_name}")
logger.info(f"azure_compute_cluster_size={azure_compute_cluster_size}")

2025-01-08 06:46:23,913 - __main__ - INFO - ===== 0. Azure ML Training Info =====
2025-01-08 06:46:23,914 - __main__ - INFO - AZURE_SUBSCRIPTION_ID=8cebb108-a4d5-402b-a0c4-f7556126277f
2025-01-08 06:46:23,915 - __main__ - INFO - AZURE_RESOURCE_GROUP=azure-ml-priya-demo
2025-01-08 06:46:23,916 - __main__ - INFO - AZURE_WORKSPACE=azure-ml-priya-westus3
2025-01-08 06:46:23,918 - __main__ - INFO - AZURE_DATA_NAME=sft-data-function-call-wo-desc
2025-01-08 06:46:23,918 - __main__ - INFO - DATA_DIR=./dataset_wo_desc
2025-01-08 06:46:23,920 - __main__ - INFO - CLOUD_DIR=./cloud
2025-01-08 06:46:23,920 - __main__ - INFO - HF_MODEL_NAME_OR_PATH=unsloth/Llama-3.2-3B-Instruct
2025-01-08 06:46:23,921 - __main__ - INFO - IS_DEBUG=True
2025-01-08 06:46:23,922 - __main__ - INFO - USE_LOWPRIORITY_VM=False
2025-01-08 06:46:23,923 - __main__ - INFO - azure_env_name=slm-serving-llama
2025-01-08 06:46:23,924 - __main__ - INFO - azure_compute_cluster_name=gpu-a100-demo-vm
2025-01-08 06:46:23,926 - __main__ 

In [5]:
# import required libraries
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component
from azure.ai.ml import command
from azure.ai.ml.entities import Data, Environment, BuildContext
from azure.ai.ml.entities import Model
from azure.ai.ml import Input
from azure.ai.ml import Output
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError

credential = DefaultAzureCredential()
ml_client = None
try:
    ml_client = MLClient.from_config(credential)
except Exception as ex:
    print(ex)
    ml_client = MLClient(credential, AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE)

Found the config file in: /config.json


##### 1.2 Create Model asset

In [6]:
def get_or_create_model_asset(ml_client, model_name, job_name, model_dir="outputs", model_type="custom_model", update=False):
    
    try:
        if update:
            raise ResourceExistsError('Found Model asset, but will update the Model.')
        else:
            latest_model_version = max([int(m.version) for m in ml_client.models.list(name=model_name)])
            model_asset = ml_client.models.get(name=model_name, version=latest_model_version)
            logger.info(f"Found Model asset: {model_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")        
        model_path = f"azureml://jobs/{job_name}/outputs/artifacts/paths/{model_dir}/"    
        run_model = Model(
            name=model_name,        
            path=model_path,
            description="Model created from run.",
            type=model_type # mlflow_model, custom_model, triton_model
        )
        model_asset = ml_client.models.create_or_update(run_model)
        logger.info(f"Created Model asset: {model_name}")

    return model_asset

In [7]:
model_dir = d['train']['model_dir']
model = get_or_create_model_asset(ml_client, d["serve"]["azure_model_name"], "<job_name>", model_dir, model_type="custom_model", update=False)

# model = get_or_create_model_asset(ml_client, d['serve']['azure_model_name'], update = False)

2025-01-08 06:46:58,633 - __main__ - INFO - Found Model asset: llama-fc-wo-descriptions-ft. Will not create again


#### 1.3 Create AzureML environment
####
Azure ML defines containers (called environment asset) in which your code will run. You can use a pre-built enviornment or create a custom enviornment. For this hands-on session, we will buid a custom Docker enviornment

In [12]:
%%writefile {CLOUD_DIR}/serve/Dockerfile

FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu124-py310-torch241:biweekly.202410.2

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/

RUN /var/requirements/install_system_requirements.sh && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
RUN apt-get update
RUN apt-get install -y openssh-server openssh-client

RUN MAX_JOBS=4 pip install flash-attn==2.6.3 --no-build-isolation

Writing ./cloud/serve/Dockerfile


In [13]:
%%writefile {CLOUD_DIR}/serve/requirements.txt
azureml-core==1.58.0
azureml-dataset-runtime==1.58.0
azureml-defaults==1.58.0
azure-ml==0.0.1
azure-ml-component==0.9.18.post2
azureml-mlflow==1.58.0
azureml-contrib-services==1.58.0
azureml-contrib-services==1.58.0
azureml-automl-common-tools==1.58.0
torch-tb-profiler==0.4.3
azureml-inference-server-http~=1.3
inference-schema==1.8.0
MarkupSafe==3.0.2
regex
pybind11
bitsandbytes==0.44.1
transformers==4.46.1
peft==0.13.2
accelerate==1.1.0
datasets
scipy
azure-identity
packaging==24.1
timm==1.0.11
einops

Writing ./cloud/serve/requirements.txt


In [8]:
from azure.ai.ml.entities import Environment, BuildContext

def get_or_create_docker_environment_asset(ml_client, env_name, docker_dir, update=False):
    
    try:
        latest_env_version = max([int(e.version) for e in ml_client.environments.list(name=env_name)])
        if update:
            raise ResourceExistsError('Found Environment asset, but will update the Environment.')
        else:
            env_asset = ml_client.environments.get(name=env_name, version=latest_env_version)
            print(f"Found Environment asset: {env_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        print(f"Exception: {e}")
        env_docker_image = Environment(
            build=BuildContext(path=docker_dir),
            name=env_name,
            description="Environment created from a Docker context.",
        )
        env_asset = ml_client.environments.create_or_update(env_docker_image)
        print(f"Created Environment asset: {env_name}")
    
    return env_asset

env = get_or_create_docker_environment_asset(ml_client, "slm-serving-florence", f"{CLOUD_DIR}/inference", update=False)

Found Environment asset: slm-serving-florence. Will not create again


#### 1.4 Serving script
####
If you are not serving using a MLflow model but instead using a custom model, you can write your own script. This step demosntrates how to write the scoring script to run the inference.

The scoring script consists of two components:

1. init() : This is where you define the global initialization logic like loading of LLM models and tokenizers
2. run() : Inference logic called for every invocation of the endpoint

In [24]:
%%writefile {CLOUD_DIR}/serve/score.py
import os
import re
import json
import torch
import base64
import logging

from io import BytesIO
from transformers import AutoTokenizer, AutoProcessor, pipeline
from transformers import AutoModelForCausalLM, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def init():
    """
    This function is called when the container is initialized/started, typically after create/update of the deployment.
    You can write the logic here to perform init operations like caching the model in memory
    """
    global model
    global tokenizer
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    # Please provide your model's folder name if there is one
    model_name_or_path = os.path.join(
        os.getenv("AZUREML_MODEL_DIR"), "{{score_model_dir}}"
    )
    
    model_kwargs = dict(
        trust_remote_code=True,    
        device_map={"":0},
        torch_dtype="auto" 
    )
    
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)    

    logging.info("Loaded model.")
    
def run(json_data: str):
    logging.info("Request received")
    data = json.loads(json_data)
    input_data = data["input_data"]
    params = data['params']

    # pipe = pipeline("text-generation", model = model, tokenizer = tokenizer)
    # output = pipe(input_data, **params)
    # result = output[0]["generated_text"]
    # logging.info(f"Generated text : {result}")
    inputs = tokenizer.apply_chat_template(input_data, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    outputs = model.generate(input_ids = inputs, max_new_tokens = 1024, do_sample = True, temperature = 0.1)
    result = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens = True)

    json_result = {"result" : result}

    return json_result

Overwriting ./cloud/inference/score.py


Plug in the appropriate variable in the inference script

In [25]:
import jinja2
from pathlib import Path
TRAINED_MLFLOW = False

jinja_env = jinja2.Environment()  

template = jinja_env.from_string(Path(f"{CLOUD_DIR}/inference/score.py").open().read())
score_model_dir = model_dir.split("/")[-1]

Path(f"{CLOUD_DIR}/inference/score.py").open("w").write(
    template.render(score_model_dir=score_model_dir)
)

1986

#### 2. Serving
#####
##### 2.1 Create the endpoint
Online endpoints give a durable REST API that can be used to integrate with applications that need to use the model.
 
Note : This step doesn't provision the GPU

In [9]:
%%time
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    IdentityConfiguration,
    ManagedIdentityConfiguration,
)

azure_endpoint_name = d['serve']['azure_endpoint_name']
# Check if the endpoint already exists in the workspace
try:
    endpoint = ml_client.online_endpoints.get(azure_endpoint_name)
    print("---Endpoint already exists---")
except:
    # Create an online endpoint if it doesn't exist

    # Define the endpoint
    endpoint = ManagedOnlineEndpoint(
        name=azure_endpoint_name,
        description=f"Test endpoint for {model.name}",
    )

# Trigger the endpoint creation
try:
    ml_client.begin_create_or_update(endpoint).wait()
    print("\n---Endpoint created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Endpoint creation failed. Detailed Response:\n{err}"
    ) from err


---Endpoint created successfully---

CPU times: user 70.3 ms, sys: 9.14 ms, total: 79.4 ms
Wall time: 1min 33s


#### 3.2 Create the Deployment
This process takes lot of time as GPU clusters needs to be provisioned and serving environment must be built

In [26]:
%%time
from azure.ai.ml.entities import (    
    OnlineRequestSettings,
    CodeConfiguration,
    ManagedOnlineDeployment,
    ProbeSettings,
    Environment
)

azure_deployment_name = f"{d['serve']['azure_deployment_name']}"

deployment = ManagedOnlineDeployment(
    name=azure_deployment_name,
    endpoint_name=azure_endpoint_name,
    model=model,
    instance_type=azure_compute_cluster_size,
    instance_count=1,
    #code_configuration=code_configuration,
    environment = env,
    scoring_script="score.py",
    code_path=f"./{CLOUD_DIR}/serve",
    #environment_variables=deployment_env_vars,
    request_settings=OnlineRequestSettings(max_concurrent_requests_per_instance=50,
                                           request_timeout_ms=90000, max_queue_wait_ms=60000),
    liveness_probe=ProbeSettings(
        failure_threshold=30,
        success_threshold=1,
        period=100,
        initial_delay=500,
    ),
    readiness_probe=ProbeSettings(
        failure_threshold=30,
        success_threshold=1,
        period=100,
        initial_delay=500,
    ),
)

# Trigger the deployment creation
try:
    ml_client.begin_create_or_update(deployment).wait()
    print("\n---Deployment created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Deployment creation failed. Detailed Response:\n{err}"
    ) from err
    
endpoint.traffic = {azure_deployment_name: 100}
endpoint_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)  

Check: endpoint llama-endpoint-ft exists
[32mUploading inference (0.0 MBs): 100%|██████████| 2301/2301 [00:00<00:00, 12618.12it/s]
[39m



.......................................................................................................

#### 3. Inference
##### Test invocation
Run inference on managed endpoint using sample data

In [5]:
import json
import os 

sample = {
    "input_data": 
        [
            {'role': 'system', 'content': 'You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed- { "name": "calculate_shipping_cost", "description": "Calculate the cost of shipping a package", "parameters": { "type": "object", "properties": { "weight": { "type": "number", "description": "The weight of the package in pounds" }, "destination": { "type": "string", "description": "The destination of the package" } }, "required": [ "weight", "destination" ] }}}"'},
            {'role': 'user', 'content': 'Can you help me with shipping cost for a package?'},
            {'role': 'assistant', 'content': 'Sure! I can help you with that. Please provide me with the weight and destination of the package.'},
            {'role': 'user', 'content': 'The weight of the package is 10 pounds and the destination is New York.'}
        ],
    "params": {
        "temperature": 0.1,
        "max_new_tokens": 512,
        "do_sample": True,
        "return_full_text": False
    }
}

# sample = {
#     "input_data": 
#         [
#             {"role": "user", "content": "Tell me Microsoft's brief history."},
#             {"role": "assistant", "content": "Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975, to develop and sell a BASIC interpreter for the Altair 8800."},
#             {"role": "user", "content": "What about Amazon's history?"}
#         ],
#     "params": {
#         "temperature": 0.1,
#         "max_new_tokens": 128,
#         "do_sample": True,
#         "return_full_text": False
#     }
# }

test_inference_dir = "./inference"
os.makedirs(test_inference_dir, exist_ok=True)

request_file = os.path.join(test_inference_dir, "sample_request.json")

with open(request_file, "w") as f:
    json.dump(sample, f)



In [21]:
azure_endpoint_name = d['serve']['azure_endpoint_name']
azure_deployment_name = f"{d['serve']['azure_deployment_name']}"

result = ml_client.online_endpoints.invoke(
    endpoint_name=azure_endpoint_name,
    deployment_name=azure_deployment_name,
    request_file=request_file
)

result_json = json.loads(result)
result = result_json['result']

print(result)

{'tool_uses': [{'recipient_name': 'functions.calculate_shipping_cost', 'parameters': {'weight': 10, 'destination': 'New York'}}]}


### 