In [None]:
import os 

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Connection settings taken from the process environment (after load_dotenv above).
# A variable that is not set resolves to None.
azure_ai_project = os.getenv("AZURE_PROJECT_ENDPOINT")
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Function to create model-specific configuration
def get_model_config(model_name):
    """Build the model configuration dict for ``model_name``.

    The endpoint and API key always come from the module-level
    ``azure_openai_endpoint`` / ``azure_openai_api_key`` values, read at call
    time. The keys "grok", "gpt5" and "claude" select a fixed deployment name
    and API version; any other value falls back to the module-level
    ``azure_openai_deployment`` / ``azure_openai_api_version``.

    Args:
        model_name: Model key, compared with ``==`` against the known presets.

    Returns:
        dict with the keys "azure_endpoint", "azure_deployment", "api_key"
        and "api_version" (in that order).
    """
    shared_key = azure_openai_api_key

    # (model key, deployment name, pinned API version)
    presets = (
        ("grok", "grok-4", "2024-05-01-preview"),
        ("gpt5", "gpt-5-pro", "2024-12-01-preview"),
        ("claude", "claude-sonnet-4-5", "20250929"),
    )
    for preset_key, deployment, pinned_version in presets:
        if model_name == preset_key:
            return {
                "azure_endpoint": azure_openai_endpoint,
                "azure_deployment": deployment,
                "api_key": shared_key,
                "api_version": pinned_version,
            }

    # Fallback: configuration from the environment-derived module values.
    # A configured version that sorts below the floor (plain string comparison)
    # is raised to the floor; an unset or empty version is passed through as-is.
    version_floor = "2024-12-01-preview"
    resolved_version = azure_openai_api_version
    if resolved_version and resolved_version < version_floor:
        resolved_version = version_floor

    return {
        "azure_endpoint": azure_openai_endpoint,
        "azure_deployment": azure_openai_deployment,
        "api_key": azure_openai_api_key,
        "api_version": resolved_version,
    }

In [None]:
# Azure Credential imports
from azure.identity import AzureCliCredential

# IPython shell escape: runs the Azure CLI `az login` command from the notebook.
!az login

# Initialize Azure credentials
# NOTE(review): AzureCliCredential presumably reuses the session established by
# `az login` above — confirm against the azure-identity documentation.
credential = AzureCliCredential()

Set up the GPT-5-pro model configuration if you want to evaluate only that model

In [None]:
import os 

# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

# GPT-5-pro specific settings. These assignments reuse the generic variable
# names from the first cell, so running this cell replaces those values for
# every later cell (including what get_model_config reads at call time).
azure_ai_project = os.environ.get("AZURE_PROJECT_ENDPOINT")
azure_openai_deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT_GPT5_PRO")
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT_GPT5_PRO")
azure_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY_GPT5_PRO")
azure_openai_api_version = os.environ.get("AZURE_OPENAI_API_VERSION_GPT5_PRO")

model_config = {
    "azure_endpoint": azure_openai_endpoint,
    "azure_deployment": azure_openai_deployment,
    "api_key": azure_openai_api_key,
}
# The configured API version was previously read but never used. Forward it
# only when it is set, so an unset variable leaves the key absent (as before)
# rather than passing an explicit None.
if azure_openai_api_version:
    model_config["api_version"] = azure_openai_api_version

Load all ground-truth values into a variable

In [None]:
import json
import pathlib

# Dataset location under the current working directory. Joined with pathlib
# rather than string concatenation; kept as a plain str so existing uses of
# `path` keep receiving a string.
path = str(pathlib.Path.cwd() / "data" / "hiking_data.jsonl")

# Load the dataset content from the local file (JSON Lines: one object per line).
# Decode explicitly as UTF-8 instead of the platform default, and skip blank
# lines (e.g. a trailing newline at end of file), which json.loads would reject.
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Every record is expected to carry a "ground_truth" field (KeyError otherwise).
ground_truth = [item["ground_truth"] for item in data]
print(f"Loaded {len(ground_truth)} ground truth values")

In [None]:
from azure.ai.projects import AIProjectClient 

# Project client for the Foundry project at `azure_ai_project`, authenticated with `credential`.
project_client = AIProjectClient(credential=credential, endpoint=azure_ai_project)

In [None]:
import json

from openai.types.evals.create_eval_jsonl_run_data_source_param import (
    CreateEvalJSONLRunDataSourceParam,
    SourceFileID,
)
from azure.ai.projects.models import (
    DatasetVersion,
)

print("Creating an OpenAI client from the AI Project client")
client = project_client.get_openai_client()

#dataset: DatasetVersion = project_client.datasets.upload_file(
#    name="software_engineering_data",
#    version=1,
#    file_path=path,
#)
#print(dataset)
#dataset: DatasetVersion = project_client.datasets.get(
#    name="software_engineering_data",
#    version=1,
#)

# Load the dataset content from the local file (JSON Lines). Decode as UTF-8
# explicitly and skip blank lines, which json.loads would reject.
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Seed one conversation with the system prompt followed by every dataset query
# as a user message. Each item is a "message"; the role distinguishes the
# system prompt from user turns (the system item previously used
# "type": "system").
# NOTE(review): the service may cap how many items one create call accepts —
# confirm the limit and batch the items if the dataset is larger.
conversation = client.conversations.create(
    items=[
        {
            "type": "message",
            "role": "system",
            "content": "You are an expert in hiking and know all about outdoor activities. Keep your responses brief and to the point.",
        },
        *(
            {"type": "message", "role": "user", "content": item["query"]}
            for item in data
        ),
    ],
)
print(f"Created conversation with {len(data)} user messages (id: {conversation.id})")

# Generate a single response for the whole conversation; the input is empty
# because the content to answer is already stored in the conversation.
response = client.responses.create(
    conversation=conversation.id,
    model=azure_openai_deployment,
    input="",
)
print(f"Response output: {response.output_text} (id: {response.id})")

In [None]:
# Testing criteria for the eval: built-in evaluators for violence, self-harm,
# sexual content and hate/unfairness, plus a coherence evaluator that is
# initialised with the configured deployment name.
evaluation_criteria = [
    {
        "type": "azure_ai_evaluator",
        "name": "violence_detection",
        "evaluator_name": "builtin.violence",
    },
    {
        "type": "azure_ai_evaluator",
        "name": "self_harm",
        "evaluator_name": "builtin.self_harm",
        "evaluator_version": "1",
    },
    {
        "type": "azure_ai_evaluator",
        "name": "Sexual",
        "evaluator_name": "builtin.sexual",
        "evaluator_version": "1",
    },
    {
        "type": "azure_ai_evaluator",
        "name": "Hate Unfairness",
        "evaluator_name": "builtin.hate_unfairness",
        "evaluator_version": "1",
    },
    {
        "type": "azure_ai_evaluator",
        "name": "coherence",
        "evaluator_name": "builtin.coherence",
        "initialization_parameters": {
            "deployment_name": f"{azure_openai_deployment}",
        },
    },
]


In [None]:
import datetime
#data_source = {
#        "type": "azure_ai_target_completions",
#        "source": {
#            "type": "file_content",
#            "content": json.dumps(dataset.as_dict()),
#        },
#        "input_messages": {
#            "type": "template",
#            "template": [
#                {"type": "message", "role": "user", "content": {"type": "input_text", "text": "{{item.query}}"}}
#            ],
#        },
#        "target": {
#            "type": "azure_ai_model",
#            "model": azure_openai_deployment,
#        },
#    }
# Eval-level data source configuration.
data_source_config = {"type": "azure_ai_source", "scenario": "responses"}

# Run-level data source: one row per ground-truth value. Every row refers to
# the id of the single `response` created earlier.
data_source = {
    "type": "azure_ai_responses",
    "item_generation_params": {
        "type": "response_retrieval",
        "data_mapping": {"response_id": "{{item.response}}"},
        "source": {
            "type": "file_content",
            "content": [
                {"item": {"response": response.id, "ground_truth": gt}}
                for gt in ground_truth
            ],
        },
    },
}

# Register the eval definition with its testing criteria.
eval_object = client.evals.create(
    name="vk-2255-eval",
    data_source_config=data_source_config,
    testing_criteria=evaluation_criteria,
)

# Start a run of that eval; the run name embeds the deployment name and a
# second-resolution timestamp.
results = client.evals.runs.create(
    eval_id=eval_object.id,
    name="eval_id_run" + azure_openai_deployment + datetime.datetime.now().strftime("%Y%m%d%H%M%S"),
    data_source=data_source,
)
print(results)

Evaluations

In [None]:
import pathlib
import random

from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    #QAEvaluator,
    #F1ScoreEvaluator,
    FluencyEvaluator,
)
from model_endpoints import ModelEndpoints

# Model keys to evaluate; each one is passed to get_model_config and ModelEndpoints.
models = ["grok", "gpt5", "claude"]

# Define a simple answer length evaluator
def answer_length(response, **kwargs):
    """Custom evaluator that reports the length of a response.

    Args:
        response: The value to measure; anything supporting ``len()``.
            ``None`` (no response produced) is counted as length 0 instead
            of raising ``TypeError``.
        **kwargs: Accepted and ignored, so additional columns can be passed.

    Returns:
        dict: ``{"answer_length": <length>}``.
    """
    if response is None:
        return {"answer_length": 0}
    return {"answer_length": len(response)}

# Dataset evaluated for every model, located under the current working directory.
path = str(pathlib.Path.cwd()) + "/data/software_engineering_data.jsonl"

for model in models:
    # Configuration for this model, handed to the evaluators below
    model_config = get_model_config(model)

    # Random four-digit number used in the run name
    randomNum = random.randint(1111, 9999)

    # Evaluators built from the model-specific configuration
    relevance_evaluator = RelevanceEvaluator(model_config, is_reasoning_model=True)
    coherence_evaluator = CoherenceEvaluator(model_config, is_reasoning_model=True)
    groundedness_eval = GroundednessEvaluator(model_config=model_config, is_reasoning_model=True)
    #f1_eval = F1ScoreEvaluator(threshold=0.6)
    #qa_eval = QAEvaluator(model_config=model_config, is_reasoning_model=True)
    fluency_eval = FluencyEvaluator(model_config=model_config, is_reasoning_model=True)

    print(f"Running evaluation for model: {model}")
    evaluation_name = f"Eval-Run-{randomNum}-{model.title()}"

    # Run the dataset through the model endpoint and score each row; results
    # are also written to a JSON file named after the run.
    results = evaluate(
        evaluation_name=evaluation_name,
        data=path,
        target=ModelEndpoints(model),
        evaluators={
            "coherence": coherence_evaluator,
            "relevance": relevance_evaluator,
            #"groundedness": groundedness_eval,
            "answer_length": answer_length,
            #"qa": qa_eval,
            #"f1-score": f1_eval,
            "fluency": fluency_eval,
        },
        azure_ai_project=azure_ai_project,
        evaluator_config={
            # Explicit column mapping for the relevance evaluator only
            "relevance": {
                "column_mapping": {
                    "response": "${target.response}",
                    "context": "${data.context}",
                    "query": "${data.query}",
                },
            },
        },
        output_path=f"./software_engineering-{evaluation_name}.json",
    )