# Evaluate Demo

In [16]:
# Prerequisites:
#   * Install "Evaluate" - https://github.com/RGGH/evaluate
#   * Configure .env
#   * Install the Python SDK into the kernel's environment:
# %pip install llmeval-sdk

from llmeval import EvalClient

# Address of the eval server this notebook talks to.
EVAL_SERVER_URL = "http://127.0.0.1:8080"

# Client used by the following cells to reach the eval server.
client = EvalClient(base_url=EVAL_SERVER_URL)

### Check Server

In [17]:
# Check server health before running any evaluations.
# In the recorded run below the returned value printed as a dict-like
# structure with 'service', 'status' and 'version' entries.
status = client.health_check()
print(status)

{'service': 'eval-api', 'status': 'healthy', 'version': '0.1.0'}


### Check Models

In [18]:
# Get available models.
# The recorded output lists identifiers of the form "provider:model"
# (e.g. "ollama:llama3"); the same strings are passed as `model` and
# `judge_model` in the evaluation cells that follow.
models = client.get_models()
print(f"Available models: {models}")

Available models: ['anthropic:claude-opus-4', 'anthropic:claude-sonnet-4-5', 'anthropic:claude-haiku-4', 'gemini:gemini-2.5-pro', 'gemini:gemini-2.5-flash', 'ollama:llama3', 'ollama:gemma', 'openai:gpt-4o', 'openai:gpt-4o-mini', 'openai:gpt-3.5-turbo']


In [None]:
# Single evaluation: gemini:gemini-2.5-pro answers the prompt and
# ollama:llama3 judges the answer against the expected value.
eval_request = {
    "model": "gemini:gemini-2.5-pro",
    "prompt": "What is the capital of France?",
    "expected": "Paris",
    "judge_model": "ollama:llama3",
}
result = client.run_eval(**eval_request)

# Report the model's answer, the judge's verdict and the pass flag, one per line.
print(
    f"Model output: {result.model_output}",
    f"Judge verdict: {result.judge_verdict}",
    f"Passed: {result.passed}",
    sep="\n",
)

{'service': 'eval-api', 'status': 'healthy', 'version': '0.1.0'}
Available models: ['anthropic:claude-opus-4', 'anthropic:claude-sonnet-4-5', 'anthropic:claude-haiku-4', 'gemini:gemini-2.5-pro', 'gemini:gemini-2.5-flash', 'ollama:llama3', 'ollama:gemma', 'openai:gpt-4o', 'openai:gpt-4o-mini', 'openai:gpt-3.5-turbo']
Model output: The capital of France is **Paris**.
Judge verdict: Pass
Passed: True


In [4]:
# Single evaluation with the roles swapped relative to the previous cell:
# ollama:llama3 answers the prompt and gemini:gemini-2.5-pro judges it.
eval_request = {
    "model": "ollama:llama3",
    "prompt": "What is the capital of France?",
    "expected": "Paris",
    "judge_model": "gemini:gemini-2.5-pro",
}
result = client.run_eval(**eval_request)

# Report the model's answer, the judge's verdict and the pass flag, one per line.
print(
    f"Model output: {result.model_output}",
    f"Judge verdict: {result.judge_verdict}",
    f"Passed: {result.passed}",
    sep="\n",
)

Model output: The capital of France is Paris.
Judge verdict: Fail
Passed: False


In [5]:
# Single evaluation: same answering model as the previous cell
# (ollama:llama3), but judged by gemini:gemini-2.5-flash instead of
# gemini:gemini-2.5-pro.
eval_request = {
    "model": "ollama:llama3",
    "prompt": "What is the capital of France?",
    "expected": "Paris",
    "judge_model": "gemini:gemini-2.5-flash",
}
result = client.run_eval(**eval_request)

# Report the model's answer, the judge's verdict and the pass flag, one per line.
print(
    f"Model output: {result.model_output}",
    f"Judge verdict: {result.judge_verdict}",
    f"Passed: {result.passed}",
    sep="\n",
)

Model output: The capital of France is Paris.
Judge verdict: Pass
Passed: True


In [9]:
# Batch evaluation: submit several eval requests in one call and report
# both the aggregate statistics and each individual result.
#
# This cell reuses the `client` created in the setup cell at the top of the
# notebook (as the single-evaluation cells do) instead of re-importing
# EvalClient and building a second client against a differently spelled
# host name.

# 1. Prepare multiple evaluation requests.
#    Each entry names the answering model, the prompt, the expected answer
#    and the judging model; "criteria" is optional.
batch_inputs = [
    {
        "model": "gemini:gemini-2.5-pro",
        "prompt": "What is 2 + 2?",
        "expected": "4",
        "judge_model": "gemini:gemini-2.5-flash",
        # NOTE: "criteria" is deliberately omitted here. Adding
        #   "criteria": "Exact match"
        # makes this eval fail -- presumably because the model replies with
        # a sentence (recorded output: "2 + 2 = 4.") rather than a bare "4".
    },
    {
        "model": "gemini:gemini-2.5-flash",
        "prompt": "Translate 'hello' to French",
        "expected": "bonjour",
        "judge_model": "gemini:gemini-2.5-pro",
        "criteria": "Semantic equivalence"
    },
    {
        "model": "gemini:gemini-2.5-flash",
        "prompt": "Who wrote '1984'?",
        "expected": "George Orwell",
        "judge_model": "gemini:gemini-2.5-pro",
        "criteria": "Exact match"
    },
]

# 2. Run them as a batch
batch_result = client.run_batch(batch_inputs)

# 3. Inspect summary stats
print(f"\n📊 Batch {batch_result.batch_id}")
print(f"Status: {batch_result.status}")
print(f"✅ Passed: {batch_result.passed}/{batch_result.total}")
print(f"📈 Pass rate: {batch_result.pass_rate:.2f}%")
print(f"⚡ Avg model latency: {batch_result.average_model_latency_ms} ms")
print(f"⚖️ Avg judge latency: {batch_result.average_judge_latency_ms} ms")

# 4. Inspect individual evals
for res in batch_result.results:
    print("\n--- Eval ---")
    print(f"Prompt: {res.prompt}")
    print(f"Model output: {res.model_output}")
    print(f"Expected: {res.expected}")
    print(f"Verdict: {res.judge_verdict}")
    print(f"Passed: {res.passed}")
    # The judge's explanation is only printed when a judge result with
    # non-empty reasoning is attached to this eval.
    if res.judge_result and res.judge_result.reasoning:
        print(f"Reasoning: {res.judge_result.reasoning}")



📊 Batch 879df5f4-ac82-468b-83a5-533e24586183
Status: completed
✅ Passed: 2/3
📈 Pass rate: 66.67%
⚡ Avg model latency: 2112 ms
⚖️ Avg judge latency: 7909 ms

--- Eval ---
Prompt: What is 2 + 2?
Model output: 2 + 2 = 4.
Expected: 4
Verdict: Pass
Passed: True
Reasoning: Verdict: PASS
The core meaning of both outputs is the numerical value '4'. While the actual output provides additional context in the form of an equation, it ultimately arrives at and clearly states '4' as its result. Thus, the fundamental information conveyed—the number four—is consistent between both texts.

--- Eval ---
Prompt: Translate 'hello' to French
Model output: The most common and direct translation of "hello" to French is **Bonjour**.

You can also use:
*   **Salut** (more informal, like "hi")
Expected: bonjour
Verdict: Pass
Passed: True
Reasoning: Verdict: PASS
The actual output contains the expected answer, "Bonjour," and correctly identifies it as the primary translation. While the actual output is more ver