Introduction

In [1]:
# Automatically reload modules before executing each cell
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Any

from inspect_ai import eval

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import exact, scorer, Score, Target
from inspect_ai.scorer._metrics import accuracy, stderr
from inspect_ai.solver import solver, TaskState
from inspect_ai.agent import bridge

from inspect_ai.dataset import FieldSpec, json_dataset

In [3]:
@solver
def run_agent(agent_instance):
    """Adapt ``agent_instance`` to the ``sample -> result`` coroutine that is handed to ``bridge``.

    Args:
        agent_instance: Object exposing ``invoke(state)``. It is called with a
            ``{"messages": [...]}`` dict and must return a mapping whose
            ``"messages"`` entry is a reversible sequence of objects that have
            a ``.content`` attribute.

    Returns:
        A coroutine function ``run(sample)`` returning ``{"output": content}``,
        where ``content`` is the ``.content`` of the last message in the
        agent's reply, or ``None`` when the reply contains no messages.
    """

    def _as_messages(sample_input: Any) -> list[Any]:
        # Inspect's bridge documentation describes ``input`` as a list of
        # OpenAI-style message dicts. When that is what arrives, forward the
        # conversation unchanged instead of nesting it inside the ``content``
        # of a single user message. Anything else (plain string, content
        # parts, ...) keeps the original single-user-message wrapping.
        if (
            isinstance(sample_input, list)
            and sample_input
            and all(isinstance(item, dict) and "role" in item for item in sample_input)
        ):
            return list(sample_input)
        return [{"role": "user", "content": sample_input}]

    async def run(sample: dict[str, Any]) -> dict[str, Any]:
        state = {"messages": _as_messages(sample["input"])}

        # NOTE(review): invoke() is a plain synchronous call made from inside
        # a coroutine, so this coroutine does not yield until it returns.
        response = agent_instance.invoke(state)

        # Content of the last message in the reply, whatever its role;
        # None when the agent returned no messages at all.
        final_content = next(
            (message.content for message in reversed(response["messages"])),
            None,
        )
        return {"output": final_content}

    return run

In [4]:
@scorer(metrics=[accuracy(), stderr()])
def score_agent():
    """Scorer that marks a sample correct when the completion equals the target.

    Both strings are compared after stripping leading and trailing whitespace.
    The reported ``answer`` is the raw (unstripped) completion.

    Returns:
        The async ``score(state, target)`` callable.
    """

    async def score(state: TaskState, target: Target) -> Score:
        completion = state.output.completion
        # TODO: replace this exact-match check with an LLM-as-a-judge
        # evaluation; for now only whitespace differences are tolerated.
        normalized_answer = completion.strip()
        normalized_target = target.text.strip()
        is_match = normalized_answer == normalized_target
        return Score(value=is_match, answer=completion)

    return score

In [5]:
@task
def agent_task(dataset, agent_instance):
    """Assemble the evaluation task for one agent and one dataset.

    Args:
        dataset: Dataset passed straight through to ``Task``.
        agent_instance: Agent handed to ``run_agent`` to build the solver.

    Returns:
        A ``Task`` whose solver is the bridged ``run_agent`` runner and whose
        scorer is ``score_agent()``.
    """
    # Can only use bridge for non-LLM calls (author's note, kept as written).
    bridged_solver = bridge(run_agent(agent_instance))

    # Scored with the custom score_agent(); the imported exact() scorer was
    # the alternative previously noted here.
    return Task(
        dataset=dataset,
        solver=bridged_solver,
        scorer=score_agent(),
    )

Scenario 1:

In [6]:
import os
import sys

# Put the notebook's working directory on the import path ahead of the
# `agents` import below. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.append(current_dir)

from agents.retail.agent import retail_agent as agent

# Each JSONL record's `question` field becomes the sample input and its
# `target` field becomes the sample target.
dataset_path = './datasets/retail/dataset.jsonl'
dataset = json_dataset(
    dataset_path,
    FieldSpec(
        input="question",
        target="target",
    ),
)

# `eval` is inspect_ai.eval (imported in the imports cell), not the builtin.
eval(agent_task(dataset, agent))

Output()