In [None]:
import os
import json
import requests
from time import sleep, time
from openai import OpenAI
import asyncio

from config import *

In [None]:
# Export the values loaded from config as process environment variables.
# Entries are written in the order listed.
_nvidia_env = {
    "NVIDIA_DATASET_NAMESPACE": NMS_NAMESPACE,
    "NVIDIA_PROJECT_ID": PROJECT_ID,
    # Inference
    "NVIDIA_BASE_URL": NIM_URL,
    # Data Store
    "NVIDIA_DATASETS_URL": ENTITY_STORE_URL,
    # Customizer
    "NVIDIA_CUSTOMIZER_URL": CUSTOMIZER_URL,
    "NVIDIA_OUTPUT_MODEL_DIR": CUSTOMIZED_MODEL_DIR,
    # Evaluator
    "NVIDIA_EVALUATOR_URL": EVALUATOR_URL,
    # Guardrails
    "GUARDRAILS_SERVICE_URL": GUARDRAILS_URL,
}
for _env_name, _env_value in _nvidia_env.items():
    os.environ[_env_name] = _env_value


In [None]:
from llama_stack.core.library_client import LlamaStackAsLibraryClient

# Create the in-process Llama Stack library client and initialize it.
# NOTE(review): "nvidia" is presumably the distribution/template name, and initialization
# presumably reads the environment variables exported in the previous cell — confirm.
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

In [None]:
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.core.datatypes import Api

async def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):
    """Poll an evaluation job until it leaves the scheduled/in-progress states.

    Relies on the module-level ``client`` to reach the eval implementation.

    Args:
        benchmark_id: Benchmark the job belongs to.
        job_id: Identifier of the evaluation job to poll.
        polling_interval: Seconds to sleep between two status checks.
        timeout: Maximum number of seconds to keep waiting while the job is
            still scheduled or in progress.

    Returns:
        The last job status object returned by the eval implementation.

    Raises:
        RuntimeError: If the job is still scheduled or in progress once more
            than ``timeout`` seconds have elapsed.
    """
    pending_states = (JobStatus.scheduled, JobStatus.in_progress)
    start_time = time()

    # Access eval through impls
    eval_impl = client.async_client.impls[Api.eval]

    job_status = await eval_impl.job_status(benchmark_id=benchmark_id, job_id=job_id)

    print(f"Waiting for Evaluation job {job_id} to finish.")
    print(f"Job status: {job_status.status} after {time() - start_time} seconds.")

    while job_status.status in pending_states:
        await asyncio.sleep(polling_interval)
        job_status = await eval_impl.job_status(benchmark_id=benchmark_id, job_id=job_id)

        print(f"Job status: {job_status.status} after {time() - start_time} seconds.")

        # Only treat the deadline as exceeded while the job is still pending: a job
        # that reached a final state on this very poll must be returned, not
        # reported as timed out.
        if job_status.status in pending_states and time() - start_time > timeout:
            raise RuntimeError(f"Evaluation Job {job_id} took more than {timeout} seconds.")

    return job_status

In [None]:
# Echo the endpoints and identifiers this notebook will use, one per line.
_config_summary = (
    ("Data Store endpoint", DATA_STORE_URL),
    ("Entity Store endpoint", ENTITY_STORE_URL),
    ("Customizer endpoint", CUSTOMIZER_URL),
    ("Evaluator endpoint", EVALUATOR_URL),
    ("NIM endpoint", NIM_URL),
    ("Namespace", NMS_NAMESPACE),
    ("Base Model", BASE_MODEL),
)
for _label, _setting in _config_summary:
    print(f"{_label}: {_setting}")

In [None]:
# Name of the customized model evaluated later in this notebook. The cells below check
# that it appears both in the Llama Stack model list (prefixed with "nvidia/") and in
# the NIM /v1/models listing.
CUSTOMIZED_MODEL = "nvidia-tool-calling-tutorial/test-llama-stack@v1"

In [None]:
# Confirm the customized model is known to Llama Stack under its "nvidia/"-prefixed identifier.
models = client.models.list()
model_ids = [registered.identifier for registered in models]

_expected_model_id = f"nvidia/{CUSTOMIZED_MODEL}"
assert _expected_model_id in model_ids, f"Model {CUSTOMIZED_MODEL} not registered"

In [None]:
# Ask the NIM endpoint which models it serves and confirm the customized model is among them.
# A timeout keeps the cell from hanging indefinitely if the endpoint is unreachable.
resp = requests.get(f"{NIM_URL}/v1/models", timeout=30)
# Surface HTTP errors directly; otherwise an error body without a "data" key would be
# read as an empty model list and reported as a misleading "not found".
resp.raise_for_status()

models = resp.json().get("data", [])
model_names = [model["id"] for model in models]

assert CUSTOMIZED_MODEL in model_names, \
    f"Model {CUSTOMIZED_MODEL} not found"

In [None]:
# Display both listings for inspection: model ids reported by NIM and
# model identifiers registered in Llama Stack.
model_names, model_ids

In [None]:
# Dataset reference in "<namespace>/<dataset name>" form; used below in the Entity Store
# dataset URL and as the dataset_id when registering the benchmark.
repo_id = f"{NMS_NAMESPACE}/{DATASET_NAME}"
print(repo_id)

In [None]:
# Look the dataset up in the Entity Store and stop early if it is missing.
# A timeout keeps the cell from hanging indefinitely if the service is unreachable.
response = requests.get(url=f"{ENTITY_STORE_URL}/v1/datasets/{repo_id}", timeout=30)
assert response.status_code in (200, 201), \
    f"Dataset {repo_id} not found in Entity Store (status {response.status_code}): {response.text}"

# Keep the parsed dataset record and report where its files live.
dataset_info = response.json()
print(f"✓ Dataset '{DATASET_NAME}' exists in Entity Store")
print(f"  Full ID: {repo_id}")
print(f"  Files URL: {dataset_info['files_url']}")


In [None]:
# Confirm the dataset is registered with Llama Stack under its bare name.
datasets = client.datasets.list()
dataset_ids = [registered.identifier for registered in datasets]

assert DATASET_NAME in dataset_ids, f"Dataset {DATASET_NAME} not registered"

In [None]:
# Display the dataset identifiers currently registered with Llama Stack.
dataset_ids

In [None]:
# Report the "<namespace>/<dataset name>" reference used by the cells that follow.
print(f"✓ Using dataset: {repo_id}")

In [None]:
# Fetch the dataset record from the Entity Store and show its files URL.
# A timeout keeps the cell from hanging indefinitely if the service is unreachable.
response = requests.get(url=f"{ENTITY_STORE_URL}/v1/datasets/{repo_id}", timeout=30)
assert response.status_code in (200, 201), f"Status Code {response.status_code} Failed to fetch dataset {response.text}"

print("Files URL:", response.json()["files_url"])

In [None]:
benchmark_id = "simple-tool-calling-1"

# Dataset file the task reads. NOTE(review): "limit" presumably caps the number of
# evaluated rows — confirm against the Evaluator documentation.
_eval_dataset = {
    "files_url": f"hf://datasets/{NMS_NAMESPACE}/{DATASET_NAME}/testing/xlam-test-single.jsonl",
    "limit": 50,
}

# Request template; the "{{ ... }}" placeholders are passed through as literal strings.
_request_template = {
    "messages": "{{ item.messages | tojson}}",
    "tools": "{{ item.tools | tojson }}",
    "tool_choice": "auto",
}

# Metric definition that compares against the item's ground-truth tool calls.
_tool_calling_metric = {
    "type": "tool-calling",
    "params": {"tool_calls_ground_truth": "{{ item.tool_calls | tojson }}"},
}

# Full evaluation config, passed as benchmark metadata when the benchmark is registered.
simple_tool_calling_eval_config = {
    "type": "custom",
    "tasks": {
        "custom-tool-calling": {
            "type": "chat-completion",
            "dataset": _eval_dataset,
            "params": {"template": _request_template},
            "metrics": {"tool-calling-accuracy": _tool_calling_metric},
        }
    },
}


In [None]:
# Register the benchmark. A failure whose message looks like a conflict is treated as
# "already registered" so the cell can be re-run; any other error is reported and re-raised.
_conflict_markers = ("409", "Conflict", "already exists")
try:
    response = client.benchmarks.register(
        benchmark_id=benchmark_id,
        dataset_id=repo_id,
        scoring_functions=[],
        metadata=simple_tool_calling_eval_config,
    )
    print(f"✓ Registered benchmark '{benchmark_id}'")
except Exception as e:
    _error_text = str(e)
    if not any(marker in _error_text for marker in _conflict_markers):
        print(f"Error registering benchmark: {e}")
        raise
    print(f"✓ Benchmark '{benchmark_id}' already registered")

In [None]:
# Register the benchmark with the same conflict tolerance as the previous cell.
# That cell has normally registered it already, so a plain call here would fail on
# the conflict it tolerates; treat "already exists"-style errors as success instead.
try:
    response = client.benchmarks.register(
        benchmark_id=benchmark_id,
        dataset_id=repo_id,
        scoring_functions=[],
        metadata=simple_tool_calling_eval_config
    )
    print(f"✓ Registered benchmark '{benchmark_id}'")
except Exception as e:
    if "409" in str(e) or "Conflict" in str(e) or "already exists" in str(e):
        print(f"✓ Benchmark '{benchmark_id}' already registered")
    else:
        print(f"Error registering benchmark: {e}")
        raise

In [None]:
import requests
import json

# Register the base Llama 3.2 1B Instruct model in the Entity Store under the "meta" namespace.
model_payload = {
    "namespace": "meta",  # Use meta as the namespace
    "name": "llama-3.2-1b-instruct",
    "description": "Base Llama 3.2 1B Instruct model",
    "type": "llm",
}

try:
    # A timeout keeps the cell from hanging indefinitely if the service is unreachable.
    response = requests.post(
        f"{ENTITY_STORE_URL}/v1/models",
        json=model_payload,
        timeout=30,
    )
    response.raise_for_status()
    print("✓ Registered model in 'meta' namespace!")
    print(json.dumps(response.json(), indent=2))
except requests.HTTPError as e:
    # Best-effort: show the rejection (status and body) and let the notebook continue.
    print(f"Status: {e.response.status_code}")
    print(f"Response: {e.response.text}")


In [None]:
from llama_stack.core.datatypes import Api
from llama_stack.apis.eval import BenchmarkConfig, EvalCandidate

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Create the benchmark config using proper data types
from llama_stack.apis.eval import ModelCandidate, SamplingParams

benchmark_config = BenchmarkConfig(
    eval_candidate=ModelCandidate(
        type="model",
        model=BASE_MODEL,
        sampling_params=SamplingParams()
    )
)

# Create evaluation job
response = await eval_impl.run_eval(
    benchmark_id=benchmark_id,
    benchmark_config=benchmark_config
)

job_id = response.job_id
print(f"Created evaluation job: {job_id}")



In [None]:
# Wait for the evaluation job started above: poll every 5 seconds and give up
# (RuntimeError) if it is still pending after 600 seconds.
job = await wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)

In [None]:
import requests
import json

# Get the full job details from the Evaluator service to see the error, if any.
# A timeout keeps the cell from hanging indefinitely if the service is unreachable.
response = requests.get(f"{EVALUATOR_URL}/v1/evaluation/jobs/{job_id}", timeout=30)
response.raise_for_status()
job_details = response.json()

print("Job status:", job_details.get("status"))
print("\nStatus details:")
# Status details are only printed when the service included them in the response.
if "status_details" in job_details:
    print(json.dumps(job_details["status_details"], indent=2))

# Uncomment to dump the complete job payload:
# print("\nFull job details:")
# print(json.dumps(job_details, indent=2))

In [None]:
from llama_stack.core.datatypes import Api

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Retrieve job results
job_results = await eval_impl.job_result(benchmark_id=benchmark_id, job_id=job_id)
print(f"Job results: {json.dumps(job_results.model_dump(), indent=2)}")

In [None]:
# Pull the two tool-calling accuracy values for the base model out of the aggregated results.
aggregated_results = job_results.scores[benchmark_id].aggregated_results

# Both values live under the same task/metric path, so resolve that path once.
_base_scores = aggregated_results["tasks"]["custom-tool-calling"]["metrics"]["tool-calling-accuracy"]["scores"]
base_function_name_accuracy_score = _base_scores["function_name_accuracy"]["value"]
base_function_name_and_args_accuracy = _base_scores["function_name_and_args_accuracy"]["value"]

print(f"Base model: function_name_accuracy: {base_function_name_accuracy_score}")
print(f"Base model: function_name_and_args_accuracy: {base_function_name_and_args_accuracy}")

In [None]:
from llama_stack.core.datatypes import Api
from llama_stack.apis.eval import BenchmarkConfig, EvalCandidate

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Create the benchmark config using proper data types
from llama_stack.apis.eval import ModelCandidate, SamplingParams

benchmark_config = BenchmarkConfig(
    eval_candidate=ModelCandidate(
        type="model",
        model=CUSTOMIZED_MODEL,
        sampling_params=SamplingParams()
    )
)

# Create evaluation job
response = await eval_impl.run_eval(
    benchmark_id=benchmark_id,
    benchmark_config=benchmark_config
)

job_id = response.job_id
print(f"Created evaluation job: {job_id}")

In [None]:
# Wait for the customized-model evaluation job: poll every 5 seconds and give up
# (RuntimeError) if it is still pending after 600 seconds.
job = await wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)

In [None]:
from llama_stack.core.datatypes import Api

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Retrieve job results
job_results = await eval_impl.job_result(benchmark_id=benchmark_id, job_id=job_id)
print(f"Job results: {json.dumps(job_results.model_dump(), indent=2)}")

In [None]:
# Pull the two tool-calling accuracy values for the customized model out of the aggregated results.
aggregated_results_custom = job_results.scores[benchmark_id].aggregated_results

# Both values live under the same task/metric path, so resolve that path once.
_custom_scores = aggregated_results_custom["tasks"]["custom-tool-calling"]["metrics"]["tool-calling-accuracy"]["scores"]
custom_function_name_accuracy_score = _custom_scores["function_name_accuracy"]["value"]
custom_function_name_and_args_accuracy = _custom_scores["function_name_and_args_accuracy"]["value"]

print(f"Custom model: function_name_accuracy: {custom_function_name_accuracy_score}")
print(f"Custom model: function_name_and_args_accuracy: {custom_function_name_and_args_accuracy}")