In [1]:
from azure.ai.ml import MLClient
from azure.identity import (
    DefaultAzureCredential,
)
from azure.ai.ml.entities import (
    AmlCompute, Model, Environment, BuildContext, CodeConfiguration, 
    ManagedOnlineEndpoint, ManagedOnlineDeployment, OnlineRequestSettings, ProbeSettings
)
from azure.ai.ml.constants import AssetTypes

import time, sys, json, os
from IPython.display import display, JSON

In [2]:
credential = DefaultAzureCredential()

workspace_ml_client = MLClient.from_config(
    credential
)

Found the config file in: /config.json


### Register the model asset in workspace

In [3]:
model_name = "tinyllama-gguf"

# Make the models query fail-safe
models = workspace_ml_client.models.list()
model = next((m for m in models if m.name == model_name), None)

if model is not None:
    print(f"Model {model.name} already exists.")
else:
    # Register the foundation model from local as Model asset in ml workspace
    model = Model(
        path="models",
        type=AssetTypes.CUSTOM_MODEL,
        name=model_name,
        description="tinyllamagguf - gguf format model",
    )

    workspace_ml_client.create_or_update(model)

Model tinyllama-gguf already exists.


### Register the environment asset in workspace

In [4]:
env_name = "tinyllama-gguf-env-gpu"

# Make the environments query fail-safe
envs = workspace_ml_client.environments.list()
env = next((e for e in envs if e.name == env_name), None)

# Flag to indicate if the environment is modified. 
# Set it manually:: If True, old env version is used. If False, new env version is created.
is_env_earlierone = True

if env is not None and is_env_earlierone:
    print(f"Environment {env_name} already exists.")
else:
    # Create a new environment based on the curated one
    custom_env = Environment(
        name=env_name,
        description="Custom environment with additional dependencies",
        build=BuildContext(path="../env/gpu")
    )

    # Register the custom environment
    workspace_ml_client.environments.create_or_update(custom_env)

Environment tinyllama-gguf-env-gpu already exists.


### Create managed online endpoint

In [5]:
# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name
online_endpoint_name = "tinyllama-gguf-ep-gpu"

In [6]:
# managed endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Online endpoint for tinyllama gguf",
    auth_mode="key",
)

In [7]:
# managed endpoint create async call
workspace_ml_client.begin_create_or_update(endpoint).wait()

### Create managed online deployment for the endpoint

In [8]:
deployment_name = "deploy01"
model = f"{model_name}@latest"
env = f"{env_name}@latest"

In [9]:
# managed endpoint deployment
demo_deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=online_endpoint_name,
    model=model,
    environment=env,
    code_configuration=CodeConfiguration(
        code="../onlinescoring",
        scoring_script="score.py",
    ),
    instance_type="Standard_NC24ads_A100_v4",
    instance_count=1,
    request_settings=OnlineRequestSettings(
        request_timeout_ms=120000,
    ),
    liveness_probe=ProbeSettings(
        initial_delay=600
    ),
)

In [10]:
# managed endpoint deplyment create async call
workspace_ml_client.online_deployments.begin_create_or_update(deployment=demo_deployment).wait()

Check: endpoint tinyllama-gguf-ep-gpu exists


...................................................................................................................

In [11]:
# update traffic to the deployment for 100%
endpoint.traffic = {deployment_name: 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://tinyllama-gguf-ep-gpu.eastus2.inference.ml.azure.com/score', 'openapi_uri': 'https://tinyllama-gguf-ep-gpu.eastus2.inference.ml.azure.com/swagger.json', 'name': 'tinyllama-gguf-ep-gpu', 'description': 'Online endpoint for tinyllama gguf', 'tags': {}, 'properties': {'createdBy': 'Purna Chandra Panda', 'createdAt': '2025-01-08T02:05:20.647465+0000', 'lastModifiedAt': '2025-01-08T02:05:20.647465+0000', 'azureml.onlineendpointid': '/subscriptions/6977e295-0d7c-4557-8e0b-26e2f6532103/resourcegroups/rg-mlws/providers/microsoft.machinelearningservices/workspaces/mlws01/onlineendpoints/tinyllama-gguf-ep-gpu', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/6977e295-0d7c-4557-8e0b-26e2f6532103/providers/Microsoft.MachineLearningServices/locations/eastus2/mfeOperationsStatus/oeidp:b81b1c5e-0151-42cf-9c96-ad079150a5ee:43dd2b8d-70c3-4693-8efb-9a2f1e6b1cb2?api-

### Test endpoint with sample data

In [12]:
# score the request file using the online endpoint with the azureml endpoint invoke method
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    request_file="../payload/request1.json",
)

print(response)

{"output": "1. Le Louvre Museum\n2. Notre-Dame Cathedral\n3. Eiffel Tower\n4. The Grand Palais\n5. Mus\u00e9e d'Orsay\n6. Mus\u00e9e de la Vie\n7. Mus\u00e9e des Arts et Metiers\n8. Mus\u00e9e Jacquemart-Andr\u00e9\n9. Mus\u00e9e Galliera\n10. Mus\u00e9e Rodin\n\nRemember to check for safety restrictions and availability in advance, as the Covid-19 pandemic has impacted the travel industry."}


In [13]:
# score the request file using the online endpoint with the azureml endpoint invoke method
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    request_file="../payload/request3.json",
)

print(response)

{"output": "The capital of India is New Delhi."}
