In [None]:
import os 

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Connection settings are read from the process environment (after load_dotenv()
# has run above); any variable that is not set resolves to None.
azure_ai_project = os.getenv("AZURE_PROJECT_ENDPOINT")

# Azure OpenAI settings: deployment name, endpoint, key and API version.
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Function to create model-specific configuration
def get_model_config(model_name):
    """Build the connection settings dict for a model alias.

    Args:
        model_name: One of the aliases "grok", "gpt5" or "claude". Any other
            value selects the deployment and API version taken from the
            environment-derived module variables.

    Returns:
        A dict with the keys "azure_endpoint", "azure_deployment", "api_key"
        and "api_version" (in that order).
    """
    # (alias, deployment name, API version) for the models with fixed settings.
    presets = (
        ("grok", "grok-4", "2024-05-01-preview"),
        ("gpt5", "gpt-5-pro", "2024-12-01-preview"),
        ("claude", "claude-sonnet-4-5", "20250929"),
    )
    for alias, deployment, version in presets:
        if model_name == alias:
            return {
                "azure_endpoint": azure_openai_endpoint,
                "azure_deployment": deployment,
                "api_key": azure_openai_api_key,
                "api_version": version,
            }

    # Fallback: environment-provided deployment. A configured API version that
    # sorts (as a string) below the minimum is raised to that minimum; an unset
    # or empty version is passed through unchanged.
    minimum_version = "2024-12-01-preview"
    version = azure_openai_api_version
    if version and version < minimum_version:
        version = minimum_version

    return {
        "azure_endpoint": azure_openai_endpoint,
        "azure_deployment": azure_openai_deployment,
        "api_key": azure_openai_api_key,
        "api_version": version,
    }

In [None]:
# Azure Credential imports
from azure.identity import AzureCliCredential

# The leading "!" makes the notebook run this line as a shell command:
# it invokes the Azure CLI sign-in before the credential object is built.
!az login

# Initialize Azure credentials
# This object is what gets passed as `credential` to AIProjectClient further down.
credential = AzureCliCredential()

Load all ground truth values into a variable

In [None]:
import json
import pathlib

# Resolve the dataset file relative to the current working directory.
path = str(pathlib.Path.cwd() / "data" / "software_engineering_data.jsonl")

# Load the dataset content from the local file.
# The explicit UTF-8 encoding keeps parsing independent of the platform's
# default text encoding, and blank lines (for example a trailing empty line)
# are skipped instead of making json.loads raise.
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Every record is expected to carry a "ground_truth" key (KeyError otherwise).
ground_truth = [item["ground_truth"] for item in data]
print(f"Loaded {len(ground_truth)} ground truth values")

Set up the AI Foundry project client

In [None]:
from azure.ai.projects import AIProjectClient 

# Build the project client from the Foundry project endpoint and the credential.
project_client = AIProjectClient(endpoint=azure_ai_project, credential=credential)

# Announce the step, then obtain the OpenAI client that the later cells call
# for conversations, responses and evals.
print("Creating an OpenAI client from the AI Project client")
client = project_client.get_openai_client()

In [None]:
import json

from openai.types.evals.create_eval_jsonl_run_data_source_param import (
    CreateEvalJSONLRunDataSourceParam,
    SourceFileID,
)
from azure.ai.projects.models import (
    DatasetVersion,
)

def get_responses(path, azure_openai_deployment):
    """Send every query of a JSONL dataset to a model and return its response.

    All queries are added as user messages to one newly created conversation,
    and a single response is then requested for that conversation.

    Args:
        path: Path of a JSONL file. Each non-blank line must be a JSON object
            containing a "query" key.
        azure_openai_deployment: Deployment name, passed as ``model`` when the
            response is created.

    Returns:
        The object returned by ``client.responses.create``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "query" key.
    """
    # Load the dataset content from the local file. The encoding is pinned to
    # UTF-8 so parsing does not depend on the platform default, and blank
    # lines are skipped rather than handed to json.loads.
    with open(path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    conversation = client.conversations.create(
        items=[{"type": "message", "role": "user", "content": item["query"]} for item in data],
    )
    print(f"Created conversation with {len(data)} user messages (id: {conversation.id})")

    # The prompt content lives in the conversation, so the direct input is empty.
    response = client.responses.create(
        conversation=conversation.id,
        model=azure_openai_deployment,
        input="",
    )
    print(f"Response output: {response.output_text} (id: {response.id})")
    return response

In [None]:
# Data source configuration handed to the eval definition ("responses" scenario).
data_source_config = dict(type="azure_ai_source", scenario="responses")

# Testing criteria: built-in evaluators for violence, self-harm, sexual content
# and hate/unfairness, followed by coherence. The violence entry is the only
# one of the first four without an "evaluator_version"; coherence receives the
# environment-configured deployment name as an initialization parameter.
evaluation_criteria = [
    dict(
        type="azure_ai_evaluator",
        name="violence_detection",
        evaluator_name="builtin.violence",
    ),
    dict(
        type="azure_ai_evaluator",
        name="self_harm",
        evaluator_name="builtin.self_harm",
        evaluator_version="1",
    ),
    dict(
        type="azure_ai_evaluator",
        name="Sexual",
        evaluator_name="builtin.sexual",
        evaluator_version="1",
    ),
    dict(
        type="azure_ai_evaluator",
        name="Hate Unfairness",
        evaluator_name="builtin.hate_unfairness",
        evaluator_version="1",
    ),
    dict(
        type="azure_ai_evaluator",
        name="coherence",
        evaluator_name="builtin.coherence",
        initialization_parameters=dict(deployment_name=f"{azure_openai_deployment}"),
    ),
]


In [None]:
import datetime

# Register one eval definition; its name gets a second-resolution timestamp
# suffix. Runs created later attach to it through eval_object.id.
eval_object = client.evals.create(
    name=f"vk-2255-eval_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
    data_source_config=data_source_config,
    testing_criteria=evaluation_criteria,
)

def evaluate_model(response, model_name):
    """Start an evaluation run for one stored response.

    Args:
        response: Object exposing an ``id`` attribute; that id is the only
            item submitted to the run.
        model_name: Label embedded in the run name.

    Returns:
        The object returned by ``client.evals.runs.create``. It is returned
        (as well as printed) so the caller can inspect or poll the run;
        callers that ignore the return value are unaffected.
    """
    # A single inline item carries the response id; "data_mapping" names the
    # item field ("resp_id") that holds it.
    data_source = {
        "type": "azure_ai_responses",
        "item_generation_params": {
            "type": "response_retrieval",
            "data_mapping": {"response_id": "{{item.resp_id}}"},
            "source": {"type": "file_content", "content": [{"item": {"resp_id": response.id}}]},
        },
    }

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    run = client.evals.runs.create(
        eval_id=eval_object.id,
        name=f"eval_id_run_{model_name}_{timestamp}",
        data_source=data_source,
    )
    print(run)
    return run

Evaluations

In [None]:
import pathlib


# Model aliases to evaluate, in run order; each is resolved by get_model_config.
models = ["grok", "gpt5", "claude"]

# Simple evaluator: reports the length of the response.
def answer_length(response, **kwargs):
    """Return ``{"answer_length": len(response)}``; extra keyword arguments are ignored."""
    size = len(response)
    return dict(answer_length=size)

# Dataset file shared by every model: <cwd>/data/software_engineering_data.jsonl
path = f"{pathlib.Path.cwd()}/data/software_engineering_data.jsonl"

# For each alias: resolve its settings, generate a response over the dataset
# with that deployment, then start an evaluation run on the response.
for model in models:
    model_config = get_model_config(model)

    print(f"Running evaluation for model: {model}")
    response = get_responses(path, model_config["azure_deployment"])
    evaluate_model(response, model)
    