In [None]:
# This is the same example as 3-test_EvalwithClient.py, but as a notebook.

In [1]:
import os
from dotenv import load_dotenv
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval import evaluate
from azure.ai.projects import AIProjectClient
from azure.ai.agents.models import ListSortOrder
from azure.identity import AzureCliCredential

# Load environment variables via python-dotenv. AZURE_AI_PROJECT_ENDPOINT is
# read from the environment later, when the project client is created.
load_dotenv()

True

Define the Azure AI Project client, get the AI Foundry agent, and run the agent

In [2]:
# Initialize Azure AI Project Client
def get_project_client():
    """Build an ``AIProjectClient`` for the endpoint named in the environment.

    The endpoint is read from the ``AZURE_AI_PROJECT_ENDPOINT`` environment
    variable and the client is created with an ``AzureCliCredential``.

    Returns:
        The constructed ``AIProjectClient``.

    Raises:
        ValueError: If ``AZURE_AI_PROJECT_ENDPOINT`` is unset or empty.
    """
    project_endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT")
    if project_endpoint:
        return AIProjectClient(
            endpoint=project_endpoint,
            credential=AzureCliCredential(),
        )
    # Fail early with a clear message instead of letting the client constructor
    # receive a missing endpoint.
    raise ValueError("AZURE_AI_PROJECT_ENDPOINT not set in environment.")

# Get agent by ID
def get_agent(client, agent_id):
    """Fetch an agent by ID through ``client.agents.get_agent``.

    Args:
        client: Object exposing ``agents.get_agent(agent_id)``.
        agent_id: Identifier of the agent to look up.

    Returns:
        Whatever ``client.agents.get_agent`` returned, when it is truthy.

    Raises:
        ValueError: If the lookup returns a falsy value.
    """
    found = client.agents.get_agent(agent_id)
    if found:
        return found
    raise ValueError(f"Agent with ID {agent_id} not found.")

# Run agent thread and get response
def run_agent_thread(client, agent_id, user_message):
    """Run the agent on one user message in a new thread and return its reply.

    A thread containing ``user_message`` is created and processed via
    ``client.agents.create_thread_and_process_run``. The thread's messages are
    then listed in ascending order, and the first message that belongs to this
    run and has text content supplies the result.

    Args:
        client: Object exposing ``agents.create_thread_and_process_run`` and
            ``agents.messages.list``.
        agent_id: Identifier of the agent that should process the thread.
        user_message: Text sent as the single user message of the thread.

    Returns:
        The value of the last text part of the first matching message, or
        ``""`` when no message of this run carries text.

    Raises:
        RuntimeError: If the run's status is anything other than
            ``"completed"``. The message includes the status and the run's
            ``last_error`` (if the run object has one) to aid diagnosis.
    """
    final_run = client.agents.create_thread_and_process_run(
        agent_id=agent_id,
        thread={
            "messages": [
                {"role": "user", "content": user_message}
            ]
        },
    )
    if final_run.status != "completed":
        # Keep the original sentence and append the details, so a failed run
        # can be diagnosed from the exception alone.
        last_error = getattr(final_run, "last_error", None)
        raise RuntimeError(
            "Agent run did not complete successfully. "
            f"status={final_run.status!r}, last_error={last_error!r}"
        )
    messages = client.agents.messages.list(
        thread_id=final_run.thread_id,
        order=ListSortOrder.ASCENDING,
    )
    for message in messages:
        # Only messages produced by this run count as the agent's reply.
        if message.run_id == final_run.id and message.text_messages:
            return message.text_messages[-1].text.value
    return ""

In [3]:
# Evaluation metric: a GEval "Correctness" check that judges the actual output
# against the expected output for a given input.
correctness_steps = [
    "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
    "You should also heavily penalize omission of detail",
    "Vague language, or contradicting OPINIONS, are OK",
]
# Test-case fields made available to the metric.
correctness_params = [
    LLMTestCaseParams.INPUT,
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.EXPECTED_OUTPUT,
]
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=correctness_steps,
    evaluation_params=correctness_params,
)

Get the agent and run it on the question

In [4]:
client = get_project_client()
# The agent ID belongs to one specific deployment. Allow overriding it through
# the AZURE_AI_AGENT_ID environment variable, and fall back to the original ID
# so existing setups behave exactly as before.
agent_id = os.environ.get("AZURE_AI_AGENT_ID") or "asst_IpYYAJDLI5qwEc2XcEe7CGkc"
user_question = "The dog chased the cat up the tree, who ran up the tree?"
# Resolve the agent first so a bad ID fails before a run is started.
agent = get_agent(client, agent_id)
# Text reply of the agent; an empty string means the run produced no text.
response = run_agent_thread(client, agent.id, user_question)

Run the test

In [6]:
# Bundle the question, the agent's reply, and the reference answer into one case.
# NOTE(review): the reference answer "The dog." disagrees with the usual reading
# of the question (the cat is the one chased up the tree), and the saved output
# of this cell records a failing score. Confirm whether the failure is an
# intentional demonstration or the reference answer should be corrected.
test_case = LLMTestCase(
    input=user_question,
    actual_output=response,
    expected_output="The dog.",
)

# Kept as the cell's last expression so the returned result is displayed.
evaluate(test_cases=[test_case], metrics=[correctness_metric])



Metrics Summary

  - ❌ Correctness [GEval] (score: 0.0, threshold: 0.5, strict: False, evaluation model: Azure OpenAI (gpt-4.1), reason: The actual output directly contradicts the expected output by stating that the cat ran up the tree, while the expected output is 'The dog.' This is a clear factual contradiction and a critical error according to the evaluation steps., error: None)

For test case:

  - input: The dog chased the cat up the tree, who ran up the tree?
  - actual output: The cat ran up the tree.
  - expected output: The dog.
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Correctness [GEval]: 0.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Correctness [GEval]', threshold=0.5, success=False, score=0.0, reason="The actual output directly contradicts the expected output by stating that the cat ran up the tree, while the expected output is 'The dog.' This is a clear factual contradiction and a critical error according to the evaluation steps.", strict_mode=False, evaluation_model='Azure OpenAI (gpt-4.1)', error=None, evaluation_cost=0.001088, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradict any facts in \'expected output\'",\n    "You should also heavily penalize omission of detail",\n    "Vague language, or contradicting OPINIONS, are OK"\n] \n \nRubric:\nNone \n \nScore: 0.0')], conversational=False, multimodal=False, input='The dog chased the cat up the tree, who ran 