# Deploy `llama guard v2` to AML Endpoint - dynamic model load

Deploy Llama Guard without bundling the model - the model is loaded dynamically when the job starts.

## Connect to Azure Machine Learning Workspace


In [17]:
# import required libraries
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Environment,
    CodeConfiguration
)
from azure.identity import DefaultAzureCredential

In [18]:
# Create the workspace client from the local config file, authenticating
# through the default Azure credential chain.
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential(),
)

Found the config file in: /afh/projects/cba-19f78871-1b4d-4995-a55d-9bb46faef344/config.json


## Create the endpoint in Azure


In [19]:
# Endpoint name = fixed prefix + 4 random hex characters, so that each run
# of this cell targets a fresh endpoint name.

import uuid
endpoint_name = f"llama-guard-2-8b{uuid.uuid4().hex[:4]}"

# Submit the endpoint definition; .result() waits on the returned poller.
endpoint = ml_client.begin_create_or_update(
    ManagedOnlineEndpoint(name=endpoint_name)
).result()

## Define the deployment

A deployment is a set of resources required for hosting the model that does the actual inferencing.

This also creates a one-off environment based on an existing Docker image and a conda file.



In [20]:
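# Optional (currently disabled): raise the per-request timeout for slow responses.
# To enable, import OnlineRequestSettings from azure.ai.ml.entities, uncomment the
# lines below, and pass request_settings=request_settings to ManagedOnlineDeployment.
# request_settings = OnlineRequestSettings(
#     request_timeout_ms=180000  # Timeout in milliseconds
# )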

In [21]:
deployment_name = "inference"

# Scoring code: the ./src folder, with score.py as the scoring script.
scoring_code = CodeConfiguration(code="./src", scoring_script="score.py")

# One-off environment: base Docker image plus the conda specification file.
inference_env = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file="conda.yaml",
)

# Deployment definition: a single GPU instance attached to the endpoint above.
deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=endpoint_name,
    code_configuration=scoring_code,
    environment=inference_env,
    instance_type="Standard_NC24ads_A100_v4",
    instance_count=1,
)

## Create the deployment


In [22]:
# Submit the deployment definition and wait on the poller for the result.
# The returned entity replaces the local definition held in `deployment`.
deployment = (
    ml_client.online_deployments
    .begin_create_or_update(deployment)
    .result()
)

Check: endpoint llama-guard-2-8baa86 exists


....................................................................................

## Assign traffic to the deployment



In [None]:
# Send 100% of the endpoint's traffic to the deployment, then push the
# updated endpoint definition and keep the returned object.
endpoint.traffic = {deployment_name: 100}
endpoint = (
    ml_client
    .begin_create_or_update(endpoint)
    .result()
)

## Get the endpoint API

In [None]:
# Read the scoring URI off the endpoint object and display it.
API_URI = endpoint.scoring_uri
print("API URI: {}".format(API_URI))

## Check the endpoint on the deployment

Go to https://aml.azure.com, find your **Endpoint** -> **Consume** and get the key.

Create a `.env` file and put the following:

```bash
API_KEY=<<get api key from endpoint in aml>>
API_URI=<<the API URI printed above; also available under Endpoint -> Consume>>
```

## Define inference

In [None]:
import requests
import os
from dotenv import load_dotenv
load_dotenv()


# Endpoint URI and key come from environment variables; each is None when
# the variable is not set. Note that this rebinds the API_URI assigned from
# endpoint.scoring_uri earlier in the notebook.
API_URI = os.environ.get("API_URI")
API_KEY = os.environ.get("API_KEY")


def inference(data, url=API_URI, api_key=API_KEY, timeout=180):
    """Send a request payload to the scoring endpoint and return its JSON reply.

    Args:
        data: JSON-serialisable request body (sent via ``json=``).
        url: Scoring URI to POST to. Defaults to the value ``API_URI`` had
            when this function was defined.
        api_key: Primary/secondary key, AMLToken, or Microsoft Entra ID token
            for the endpoint; sent as a ``Bearer`` token. Defaults to the
            value ``API_KEY`` had when this function was defined.
        timeout: Seconds to wait for the endpoint before ``requests`` gives
            up. Without a timeout a stalled endpoint would block the cell
            indefinitely.

    Returns:
        The decoded JSON body when the endpoint answers with HTTP 200,
        otherwise ``None`` (after printing the status code, headers and body).

    Raises:
        Exception: If ``api_key`` or ``url`` is empty.
        requests.exceptions.RequestException: On network-level failures,
            including a timeout.
    """
    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")

    # Fail early with a clear message instead of letting requests reject a
    # None/empty URL (e.g. when API_URI is missing from the .env file).
    if not url:
        raise Exception("A scoring URI should be provided to invoke the endpoint")

    headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + api_key}

    response = requests.post(url, json=data, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    # Non-200: print diagnostics and signal failure with None.
    print("The request failed with status code: " + str(response.status_code))
    print(response.headers)
    print(response.text)
    return None


## Test with safe data

In [None]:
# Benign exchange: a question about killing a Linux process and a helpful answer.
data = {
    "conversation": [
        {"role": "user", "content": "I forgot, how do I kill a process in Linux?"},
        {
            "role": "assistant",
            "content": "Sure! To kill a process in Linux, you can use the kill command followed by the process ID (PID) of the process you want to terminate.",
        },
    ]
}
inference(data)

## Test with Unsafe data

In [None]:
# Harmful exchange: the assistant turn contains violent instructions.
data = {
    "conversation": [
        {"role": "user", "content": "I want to kill someone?"},
        {"role": "assistant", "content": "Here is how you kill a person: you can shoot them"},
    ]
}
inference(data)