# Deploy `llama guard v2` to AML Endpoint - with model

Deploy Llama Guard 2 together with its model weights: the model is first registered in Azure Machine Learning (AML) and then attached to the endpoint deployment.

> This model is gated.
>
> Before you start, please:
> - go to [https://huggingface.co/meta-llama/Meta-Llama-Guard-2-8B](https://huggingface.co/meta-llama/Meta-Llama-Guard-2-8B)
> - use your Hugging Face login to log in
> - request access to the model (it may take a few hours to get it)
> - once access has been granted, create a `token`
> - create the file `./src/.env` 
> - add the following to it:
>   ```
>   HUGGINGFACE_TOKEN=hf_* # your token
>   ```
> - Now proceed with the instructions below

In [None]:
# %pip install sentence-transformers

## Import model from Hugging Face

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from dotenv import load_dotenv
from huggingface_hub import login
import os

# Read HUGGINGFACE_TOKEN from the env file created during the setup steps.
load_dotenv("./src/.env")

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

model_id = "meta-llama/Meta-Llama-Guard-2-8B"
device = "cuda"  # not referenced in this cell
dtype = torch.bfloat16

# The repository is gated, so the download has to be authenticated. The token
# is passed explicitly instead of relying on an earlier interactive login; if
# it is unset (None) the libraries fall back to any locally cached credentials.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACE_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    token=HUGGINGFACE_TOKEN,
)


## Save model

In [None]:
# Write the weights and the tokenizer into separate sub-folders of ./model so
# the whole folder can be registered as a single model asset.
weights_dir = "model/model"
tokenizer_dir = "./model/tokenizer"

model.save_pretrained(weights_dir)
tokenizer.save_pretrained(tokenizer_dir)

## Connect to Azure Machine Learning Workspace

In [None]:

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential


# Build the workspace client from the local workspace config file, using the
# default Azure credential chain for authentication.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Show which workspace the client is bound to.
print(ml_client.workspace_name)


## Register the model

In [None]:
from azure.ai.ml.entities import Model

# Describe the local ./model folder (weights + tokenizer) as an AML model asset.
model = Model(
    path="./model",
    name="Meta-Llama-Guard-2-8B",
    description="./Meta-Llama-Guard-2-8B model"
)

# Keep the entity returned by the registration call: it carries the assigned
# version/id, so the deployment defined later references the registered model
# rather than the unregistered local-path description.
model = ml_client.models.create_or_update(model)
print(f"Registered model {model.name} (version {model.version})")


## Create the Endpoint

In [None]:
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, CodeConfiguration

import uuid
# Endpoint name = fixed base followed by four random hex characters.
name_suffix = uuid.uuid4().hex[:4]
endpoint_name = f"llama-guard-2-8b{name_suffix}"

# Submit the endpoint and block until the long-running operation finishes.
endpoint_poller = ml_client.begin_create_or_update(
    ManagedOnlineEndpoint(name=endpoint_name)
)
endpoint = endpoint_poller.result()


## Define the deployment (the real thing)

In [None]:
from azure.ai.ml.entities import (
    Environment
)
deployment_name = "inference"

# Scoring code: the ./src folder with score.py as the entry script.
scoring_code = CodeConfiguration(code="./src", scoring_script="score.py")

# Runtime environment: base image plus the conda specification file.
runtime_env = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file="conda.yaml",
)

# One GPU instance serving `model` behind the endpoint created earlier.
deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=endpoint_name,
    model=model,
    code_configuration=scoring_code,
    environment=runtime_env,
    instance_type="Standard_NC24ads_A100_v4",
    instance_count=1,
)

## Create the deployment

In [None]:
# Start the deployment and wait for the long-running operation to complete.
deployment_poller = ml_client.online_deployments.begin_create_or_update(deployment)
deployment_poller.result()

## Assign Traffic to deployment

In [None]:
# Send 100% of the endpoint's traffic to the deployment, then push the updated
# endpoint and wait for the operation to finish.
endpoint.traffic = {deployment_name: 100}
traffic_poller = ml_client.begin_create_or_update(endpoint)
endpoint = traffic_poller.result()

## Get the endpoint URL

In [None]:
# Read the scoring URI from the endpoint object and print it so it can be
# copied into the .env file used by the inference cell.
API_URI = endpoint.scoring_uri
print(f"API URI: {API_URI}")

## Check the endpoint on the deployment

Go to https://aml.azure.com, find your **Endpoint** -> **Consume** and get the key.

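Create a `.env` file next to this notebook (the inference cell below loads `.env` from the current working directory) and put the following in it: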

```bash
API_KEY=<<get api key from endpoint in aml>>
API_URI=<<the API_URI you got above .. you can also get it from **Endpoint** -> **Consume** >> 
```

## Define Inference

In [None]:
import requests
import os
from dotenv import load_dotenv
# Load the endpoint settings from the local .env file into the environment.
load_dotenv(".env")

# Either value is None when the corresponding variable is not set.
API_URI, API_KEY = os.environ.get("API_URI"), os.environ.get("API_KEY")


def inference(data, url=API_URI, api_key=API_KEY, timeout=None):
    """POST a payload to the deployed endpoint and return its JSON reply.

    Args:
        data: JSON-serialisable request body, e.g. ``{"conversation": [...]}``.
        url: Scoring URI of the endpoint. Defaults to the ``API_URI`` value
            that was loaded when this function was defined.
        api_key: Credential sent as a bearer token (the endpoint's
            primary/secondary key, an AMLToken, or a Microsoft Entra ID
            token). Defaults to the ``API_KEY`` value loaded at definition
            time.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The decoded JSON response on HTTP 200; otherwise ``None`` after
        printing the status code, response headers and response body.

    Raises:
        ValueError: If ``api_key`` or ``url`` is missing or empty.
    """
    if not api_key:
        raise ValueError("A key should be provided to invoke the endpoint")
    # Fail early with a clear message instead of an opaque requests error when
    # the scoring URI was never configured.
    if not url:
        raise ValueError("An endpoint URL should be provided to invoke the endpoint")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    response = requests.post(url, json=data, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    # Non-200: print the diagnostics and signal failure with None.
    print(f"The request failed with status code: {response.status_code}")
    print(response.headers)
    print(response.text)
    return None


## Test safe data

In [None]:
# A benign user/assistant exchange used as the "safe" sample.
safe_conversation = [
    {
        "role": "user",
        "content": "I forgot, how do I kill a process in Linux?",
    },
    {
        "role": "assistant",
        "content": "Sure! To kill a process in Linux, you can use the kill command followed by the process ID (PID) of the process you want to terminate.",
    },
]
data = {"conversation": safe_conversation}
inference(data)

## Test Unsafe data

In [None]:
# A harmful user/assistant exchange used as the "unsafe" sample.
unsafe_conversation = [
    {
        "role": "user",
        "content": "I want to kill someone?",
    },
    {
        "role": "assistant",
        "content": "Here is how you kill a person: you can shoot them",
    },
]
data = {"conversation": unsafe_conversation}
inference(data)