In [1]:
from azure.ai.ml import MLClient
from azure.identity import (
    DefaultAzureCredential,
)
from azure.ai.ml.entities import (
    AmlCompute, Model, Environment, BuildContext, CodeConfiguration, 
    ManagedOnlineEndpoint, ManagedOnlineDeployment, OnlineRequestSettings, ProbeSettings
)
from azure.ai.ml.constants import AssetTypes

import time, sys, json, os
from IPython.display import display, JSON

In [2]:
credential = DefaultAzureCredential()

ml_client = MLClient.from_config(
    credential
)

Found the config file in: /config.json


### Download model from hf and register

- In terminal, run `python [build-hf-env.py`](./job-register-hf-model/build-hf-env.py).
- In terminal, run `python [hf-model-job.py`](./job-register-hf-model/hf-model-job.py).
- It will use a serverless compute to download model from hf. 
- Once model is in that machine, it will be registered as model asset in workspace.

### Registered model details

In [3]:
model_name = "deepseek-r1-gguf"

# Make the models query fail-safe
models = ml_client.models.list()
model = next((m for m in models if m.name == model_name), None)

if model is not None:
    print(f"Model {model.name} already exists.")
else:
    print(f"Model {model_name} does not exist.")

Model deepseek-r1-gguf already exists.


### Register the environment asset in workspace

In [4]:
env_name = "deepseek-r1-gguf-env-gpu"

# Make the environments query fail-safe
envs = ml_client.environments.list()
env = next((e for e in envs if e.name == env_name), None)

# Flag to indicate if the environment is modified. 
# Set it manually:: If True, old env version is used. If False, new env version is created.
is_env_earlierone = True

if env is not None and is_env_earlierone:
    print(f"Environment {env_name} already exists.")
else:
    # Create a new environment based on the curated one
    custom_env = Environment(
        name=env_name,
        description="Custom environment with additional dependencies",
        build=BuildContext(path="../env/gpu")
    )

    # Register the custom environment
    ml_client.environments.create_or_update(custom_env)

    print("Environment is registered and be built now.")

Environment deepseek-r1-gguf-env-gpu already exists.


### Create managed online endpoint

In [5]:
# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name
online_endpoint_name = "deepseekr1-gguf-ep-gpu"

In [6]:
# managed endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Online endpoint for deepseek-r1 gguf",
    auth_mode="key",
)

In [7]:
# managed endpoint create async call
ml_client.begin_create_or_update(endpoint).wait()

### Create managed online deployment for the endpoint

In [8]:
deployment_name = "deploy01"
model = f"{model_name}@latest"
env = f"{env_name}@latest"

In [9]:
# managed endpoint deployment
demo_deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=online_endpoint_name,
    model=model,
    environment=env,
    code_configuration=CodeConfiguration(
        code="../onlinescoring",
        scoring_script="score.py",
    ),
    instance_type="Standard_NC48ads_A100_v4",
    instance_count=1,
    request_settings=OnlineRequestSettings(
        request_timeout_ms=120000,
    ),
    liveness_probe=ProbeSettings(
        initial_delay=600
    ),
)

In [10]:
# managed endpoint deplyment create async call
ml_client.online_deployments.begin_create_or_update(deployment=demo_deployment).wait()

Check: endpoint deepseekr1-gguf-ep-gpu exists
[32mUploading onlinescoring (0.02 MBs): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

............................................................................................................................................................................

In [None]:
# update traffic to the deployment for 100%
endpoint.traffic = {deployment_name: 100}
ml_client.begin_create_or_update(endpoint).result()

### Test endpoint with sample data

In [12]:
# score the request json file using the online endpoint with the azureml endpoint invoke method
response = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    request_file="../payload/request3.json",
)

print(response)

{"output": "<think>\nOkay, the user is asking for the capital of India. I know that's a common question, so I need to make sure my answer is accurate and clear. Let me think... The capital city of India is New Delhi. Wait, sometimes people confuse it with Delhi. Is there a difference between Delhi and New Delhi? Yes, New Delhi is actually a part of the larger Delhi metropolitan area, which serves as the capital. So I should specify that to avoid confusion. Also, maybe add a bit about its significance, like being a political and administrative hub. That would make the answer more informative. Let me double-check: India's capital is indeed New Delhi. It was established during British rule and became the capital in 1911. Post-independence, it remained the capital. So I can mention that to give some historical context. Make sure not to get mixed up with other cities like Mumbai or Chennai. No, those are different. Okay, so putting it all together: The capital is New Delhi, part of the Delh

In [13]:
from pprint import pprint

# Assuming response is a dict
pprint(response)

('{"output": "<think>\\nOkay, the user is asking for the capital of India. I '
 "know that's a common question, so I need to make sure my answer is accurate "
 'and clear. Let me think... The capital city of India is New Delhi. Wait, '
 'sometimes people confuse it with Delhi. Is there a difference between Delhi '
 'and New Delhi? Yes, New Delhi is actually a part of the larger Delhi '
 'metropolitan area, which serves as the capital. So I should specify that to '
 'avoid confusion. Also, maybe add a bit about its significance, like being a '
 'political and administrative hub. That would make the answer more '
 "informative. Let me double-check: India's capital is indeed New Delhi. It "
 'was established during British rule and became the capital in 1911. '
 'Post-independence, it remained the capital. So I can mention that to give '
 'some historical context. Make sure not to get mixed up with other cities '
 'like Mumbai or Chennai. No, those are different. Okay, so putting it all '

In [15]:
# score the request json file using the online endpoint with the azureml endpoint invoke method
response = ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    request_file="../payload/request4_reasoning.json",
)

print(response)

{"output": "<think>\nOkay, so I'm trying to understand the concept of Mixture of Experts (MoE) in Large Language Models. The user mentioned first principles thinking, which means breaking it down into fundamental ideas and building up from there. Let me start by recalling what I know about machine learning models.\n\nI remember that traditional neural networks have layers where each neuron is connected to all others in the next layer\u2014like a fully connected layer. This can get really big as the model size increases, leading to high computational costs. But with MoE, maybe they found a way to make this more efficient?\n\nThe term \"Mixture of Experts\" sounds like it combines multiple specialized models (experts) and uses them together somehow. So instead of one big network, you have several smaller ones? Maybe each expert handles different parts of the input data. But how does that work exactly in the context of language models?\n\nI recall something about routing mechanisms. The m

In [16]:
pprint(response)

('{"output": "<think>\\nOkay, so I\'m trying to understand the concept of '
 'Mixture of Experts (MoE) in Large Language Models. The user mentioned first '
 'principles thinking, which means breaking it down into fundamental ideas and '
 'building up from there. Let me start by recalling what I know about machine '
 'learning models.\\n\\nI remember that traditional neural networks have '
 'layers where each neuron is connected to all others in the next '
 'layer\\u2014like a fully connected layer. This can get really big as the '
 'model size increases, leading to high computational costs. But with MoE, '
 'maybe they found a way to make this more efficient?\\n\\nThe term \\"Mixture '
 'of Experts\\" sounds like it combines multiple specialized models (experts) '
 'and uses them together somehow. So instead of one big network, you have '
 'several smaller ones? Maybe each expert handles different parts of the input '
 'data. But how does that work exactly in the context of language '
