In [40]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): no versions are pinned here — consider pinning for reproducible runs.
%pip install autoevals duckdb braintrust openai python-dotenv nest_asyncio --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [36]:
from dotenv import load_dotenv
import os

# Must run before the os.environ lookups below — presumably populates the
# environment from a local .env file (python-dotenv); confirm a .env is expected.
load_dotenv()

import braintrust
from openai import AsyncOpenAI

# Both keys are required: os.environ[...] raises KeyError when a variable is unset.
braintrust.login(api_key=os.environ["BRAINTRUST_API_KEY"])
# `client` is the async OpenAI client, passed through braintrust.wrap_openai;
# later cells use it for chat completions.
client = braintrust.wrap_openai(AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]))

In [37]:
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Exchange:
    """A single turn in a dialog: who spoke and what they said."""

    speaker: str  # "Student" or "Teacher"
    message: str  # The text spoken by `speaker` in this turn

@dataclass
class StudentTeacherDialog:
    """A dialog (list of Exchange turns) together with a background description."""

    background: str  # Free-text context describing what the dialog is about
    dialog: List[Exchange]  # The turns of the conversation, in list order

# Sample dialog used to exercise the rater: a student asks whether their answer
# to a word problem is correct and the teacher replies with the right number
# (3 pages * 2 friends * 2 times a week * 52 weeks = 624) but no explanation.
sample_dialog = StudentTeacherDialog(
    # The background is sent to the model as context, so it must describe the
    # dialog below (a math word problem) rather than an unrelated topic.
    background="This is a discussion about a math word problem.",
    dialog=[
        Exchange(speaker="Student", message="I got this problem 'James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?' The answer I got is 312. Is it correct?"),
        Exchange(speaker="Teacher", message="No, that is not correct. The correct answer is actually 624"),
        Exchange(speaker="Student", message="Oh, thx. I will write it down")
    ]
)

# The collection of dialogs to evaluate; currently just the single sample.
dialogs = [sample_dialog]

In [43]:
import json

# Prompt template for the rater. It has two str.format placeholders,
# {background} and {conversation}. Adjacent literals are concatenated by the
# parser, so the value keeps its leading and trailing newline.
PROMPT = (
    "\n"
    "You are comparing a student's answer to a teacher's answer in a dialog. Here is the data:\n"
    "[BEGIN DATA]\n"
    "************\n"
    "[Background]: {background}\n"
    "************\n"
    "{conversation}\n"
    "************\n"
    "[END DATA]\n"
    "\n"
    "Please rate the teacher's effectiveness in teaching the student on a scale of 1 to 10.\n"
    "Provide your answer in the following structured format:\n"
    "Rating: <numeric_rating>\n"
    "Reasoning: <your_reasoning>\n"
)

@braintrust.traced
async def numeric_rater(background, conversation):
    """Ask the model to rate the teacher's effectiveness in a dialog.

    PROMPT is formatted with ``background`` and ``conversation`` and sent as a
    single user message to ``gpt-4o`` at temperature 0. The request forces the
    ``rate`` function tool via ``tool_choice``, so the answer is read from the
    tool call's JSON arguments rather than from free text.

    Args:
        background: Text substituted for ``{background}`` in PROMPT.
        conversation: Text substituted for ``{conversation}`` in PROMPT.

    Returns:
        A dict with two keys: ``"rating"`` (the tool call's ``rating``
        argument) and ``"reasoning"`` (the tool call's ``reasons`` argument).

    Raises:
        ValueError: If the response carries no tool call, if the tool call's
            arguments are not a JSON object, or if a required argument is
            missing.
    """
    # JSON-schema description of the single tool the model is forced to call.
    rate_tool = {
        "type": "function",
        "function": {
            "name": "rate",
            "description": "Rate the teacher's effectiveness and provide reasoning.",
            "parameters": {
                "type": "object",
                "properties": {
                    "reasons": {
                        "description": "Write out in a step-by-step manner your reasoning to ensure the conclusion is correct.",
                        "type": "string",
                    },
                    "rating": {
                        "description": "The numeric rating on a scale of 1 to 10.",
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 10,
                    },
                },
                "required": ["rating", "reasons"],
            },
        },
    }

    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": PROMPT.format(background=background, conversation=conversation),
            }
        ],
        temperature=0,
        tools=[rate_tool],
        tool_choice={"type": "function", "function": {"name": "rate"}},
    )

    # tool_calls can be None/empty (e.g. a refusal); fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        raise ValueError("Model response did not include the expected 'rate' tool call.")

    # The arguments string is model-generated and not guaranteed to be valid,
    # complete JSON (for instance when the output is truncated).
    raw_arguments = tool_calls[0].function.arguments
    try:
        arguments = json.loads(raw_arguments)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"'rate' tool call arguments are not valid JSON: {raw_arguments!r}"
        ) from exc

    if not isinstance(arguments, dict):
        raise ValueError(
            f"'rate' tool call arguments must be a JSON object, got: {raw_arguments!r}"
        )
    missing = [key for key in ("rating", "reasons") if key not in arguments]
    if missing:
        raise ValueError(f"'rate' tool call arguments are missing required keys: {missing}")

    return {
        "rating": arguments["rating"],
        "reasoning": arguments["reasons"]
    }

async def evaluate_dialogs(dialogs):
    """Rate each dialog with numeric_rater, one at a time, in input order.

    Every dialog's turns are rendered as ``[speaker]: message`` lines joined
    by newlines; that transcript and the dialog's background are passed to
    numeric_rater.

    Args:
        dialogs: Iterable of objects exposing ``background`` and ``dialog``
            (an iterable of items with ``speaker`` and ``message``).

    Returns:
        A list with numeric_rater's result for each dialog, in the same order.
    """

    async def _rate(item):
        # Build the transcript first, then hand it to the rater.
        lines = [f"[{turn.speaker}]: {turn.message}" for turn in item.dialog]
        transcript = "\n".join(lines)
        return await numeric_rater(item.background, transcript)

    # Awaiting inside the comprehension keeps the calls strictly sequential.
    return [await _rate(item) for item in dialogs]


In [44]:
# Run the evaluation
import asyncio
import nest_asyncio

# Applied before asyncio.run() — presumably so asyncio.run() can be used even
# though the notebook kernel already has an event loop running; confirm.
nest_asyncio.apply()
# `results` holds one {"rating": ..., "reasoning": ...} dict per entry in `dialogs`.
results = asyncio.run(evaluate_dialogs(dialogs))

# Display the results
# Each result is printed as indented JSON.
for result in results:
    print(json.dumps(result, indent=2))


{
  "rating": 5,
  "reasoning": "The teacher provided the correct answer to the student's question, which is essential for effective teaching. However, the teacher did not explain why the student's answer was incorrect or how to arrive at the correct answer. This lack of explanation means the student may not understand the reasoning behind the correct answer, which is crucial for learning and preventing similar mistakes in the future. Therefore, while the teacher was effective in correcting the student's mistake, the lack of explanation limits the overall effectiveness of the teaching."
}
