In [0]:
%pip install mlflow==2.20.1 openai==1.61.0 databricks-agents httpx==0.27.2
%restart_python

In [0]:
import os

from mlflow.models import ModelConfig

# Notebook-wide configuration, read from config.yaml.
config = ModelConfig(development_config='config.yaml')

# Each environment variable is filled from the 'shm' secret scope; the secret's
# key name is looked up in the config under the paired config entry.
for env_var_name, config_entry in (
    ("API_SECRET_KEY", "api_secret_key"),
    ("SEARCH_SECRET_KEY", "search_secret_key"),
):
    os.environ[env_var_name] = dbutils.secrets.get('shm', config.get(config_entry))

Load the Test Approaches and Questions

In [0]:
from pyfunc_azure_chat_retriever import AzureChatRetriever

# The single approach under test:
#   name          - suffix reused later for the registered model name and the
#                   serving endpoint name (spelling kept exactly as-is)
#   file          - source file handed to MLflow when the model is logged
#   model         - in-memory instance used for direct (unpackaged) calls
#   input_example - chat-style request payload used as the example input
approach = dict(
    name='azurechatretreiver',
    file='pyfunc_azure_chat_retriever.py',
    model=AzureChatRetriever(),
    input_example={
        "messages": [{"role": "user", "content": "What is Databricks?"}],
    },
)

# Questions sent one at a time, as a single user message, in each timing run.
test_questions = [
    "What is Databricks?",
    "How does Databricks handle data storage?",
    "What are the key features of Databricks?",
    "How can I integrate Databricks with other tools?",
    "What is the Databricks Lakehouse Platform?",
    "How does Databricks support machine learning?",
    "What are the benefits of using Databricks?",
    "How can I secure my data in Databricks?",
    "What is the pricing model for Databricks?",
    "How can I get started with Databricks?",
    "What is the role of Apache Spark in Databricks?",
]

Test the approach and infer the MLflow signature

In [0]:
import mlflow
from mlflow.models import infer_signature
from mlflow.models.signature import ModelSignature

# Pull the in-memory model and its example request out of the approach definition.
model, input_example = approach['model'], approach['input_example']

# Call the class directly, passing None as the context argument.
# NOTE(review): predict receives only the message list here, whereas the
# signature below is inferred from the full {"messages": [...]} payload;
# confirm that AzureChatRetriever.predict accepts both input shapes.
prediction = model.predict(None, input_example['messages'])

# Infer the MLflow signature from the example request and the observed response.
signature = infer_signature(model_input=input_example, model_output=prediction)

# Bare last expression so the notebook renders the inferred signature.
signature

Time Raw Endpoint Calls via Class Only

In [0]:
import time
import numpy as np
import mlflow.pyfunc

# Time each test question against the in-memory model instance (called
# directly, without MLflow packaging or serving) to get a baseline latency.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() can jump when the system clock is
# adjusted, which would distort the measurements.
response_times = []
for question in test_questions:
    start_time = time.perf_counter()
    model.predict(None, [{"role": "user", "content": question}])
    response_times.append(time.perf_counter() - start_time)

# Mean and standard deviation of the per-question latency, in seconds.
print(f"Response Time {np.mean(response_times):.2f} +- {np.std(response_times):.2f}")

Log the Model to MLflow

In [0]:
# Packages recorded with the logged model as its pip requirements.
model_pip_requirements = [
    "mlflow==2.20.1",
    "openai==1.61.0",
    "httpx==0.27.2",
    "databricks-sdk[openai]",
    "databricks-agents==0.12.0",
]

with mlflow.start_run():
    # Point the model registry at Unity Catalog.
    mlflow.set_registry_uri('databricks-uc')

    # Log the model from its source file together with the config file and an
    # example request. No explicit signature is passed to log_model here.
    logged_agent_info = mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=approach['file'],
        model_config='config.yaml',
        input_example=input_example,
        pip_requirements=model_pip_requirements,
    )

    print(f"Model logged with URI: {logged_agent_info.model_uri}")

Register the Model in UC and Deploy

In [0]:
from databricks import agents

# Target Unity Catalog for model registration.
mlflow.set_registry_uri("databricks-uc")

# Register the previously logged model under shm.default.<approach name>.
model_name = f'shm.default.{approach["name"]}'
uc_model_info = mlflow.register_model(
    name=model_name,
    model_uri=logged_agent_info.model_uri,
)

# The secret key names come from the config. The doubled-up braces in the
# f-strings render as the literal text {{secrets/shm/<key name>}}, so only a
# secret reference string (not the secret value) is passed to the deployment.
api_secret_key_name = config.get("api_secret_key")
search_secret_key_name = config.get("search_secret_key")
secret_environment_vars = {
    "API_SECRET_KEY": f"{{{{secrets/shm/{api_secret_key_name}}}}}",
    "SEARCH_SECRET_KEY": f"{{{{secrets/shm/{search_secret_key_name}}}}}",
}

# Deploy the registered version to enable the review app and create an API endpoint.
deployment_info = agents.deploy(
    model_name=model_name,
    model_version=uc_model_info.version,
    environment_vars=secret_environment_vars,
)

Reload the packaged model and profile local inference without serving

In [0]:
import time
import numpy as np
import mlflow.pyfunc

# Load the registered model version back from the model registry as a generic
# pyfunc model, so the packaged artifact (not the in-memory class) is exercised.
reloaded_model = mlflow.pyfunc.load_model(
    f"models:/{uc_model_info.name}/{uc_model_info.version}"
)

# Time each test question against the reloaded model.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() can jump when the system clock is
# adjusted, which would distort the measurements.
response_times = []
for question in test_questions:
    start_time = time.perf_counter()
    response = reloaded_model.predict({"messages": [{"role": "user", "content": question}]})
    response_times.append(time.perf_counter() - start_time)

# Mean and standard deviation of the per-question latency, in seconds.
print(f"Response Time {np.mean(response_times):.2f} +- {np.std(response_times):.2f}")

Test the Serving Endpoint Example

In [0]:
import time
import requests
import numpy as np
import mlflow.pyfunc

# Serving endpoint name, built from the approach name.
serving_endpoint_name = f'agents_shm-default-{approach["name"]}'

# NOTE(review): the workspace host is hard-coded; update it when running this
# notebook in a different workspace.
API_URL = f"https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints/{serving_endpoint_name}/invocations"
# API token taken from the current notebook context.
API_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# Upper bound (seconds) per request so a hung connection cannot stall the
# notebook indefinitely; requests applies no timeout by default.
REQUEST_TIMEOUT_SECONDS = 300

# The headers are identical for every request, so build them once.
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_TOKEN}"}

# Time each test question against the deployed serving endpoint.
response_times = []
for question in test_questions:
    data = {"messages": [{"role": "user", "content": question}]}
    start_time = time.perf_counter()
    response = requests.post(
        url=API_URL,
        json=data,
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    elapsed = time.perf_counter() - start_time
    # Fail loudly on HTTP errors (bad token, endpoint missing or not ready)
    # instead of silently counting an error response as a latency sample.
    response.raise_for_status()
    response_times.append(elapsed)

# Mean and standard deviation of the per-question latency, in seconds.
print(f"Response Time {np.mean(response_times):.2f} +- {np.std(response_times):.2f}")