This code invokes an LLM (gpt-4o-mini) with the provided prompt and returns the results

In [None]:
pip install python-frontmatter


In [67]:
import io
import zipfile
import requests
import frontmatter




In [68]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.

    The ``main`` branch is fetched as one zip archive from
    codeload.github.com, and every entry whose name ends in ``.md`` or
    ``.mdx`` (case-insensitive) is parsed with ``frontmatter``.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name

    Returns:
        List of dictionaries containing file content and metadata, each with
        an added 'filename' key holding the entry's path inside the archive.

    Raises:
        Exception: If the download does not return HTTP 200.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, timeout=60)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if iteration raises.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: one unreadable file must not abort the import.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

In [69]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into overlapping windows.

    Windows start at offsets 0, step, 2*step, ... and hold up to `size`
    items. Iteration stops with the first window that reaches the end of
    `seq`, so the last window may be shorter than `size`.

    Args:
        seq: Any sliceable sequence (string, list, ...)
        size: Maximum number of items per window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of {'start': offset, 'chunk': slice} dictionaries

    Raises:
        ValueError: If `size` or `step` is not positive
    """
    if not (size > 0 and step > 0):
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    start = 0

    while start < total:
        windows.append({'start': start, 'chunk': seq[start:start + size]})
        # This window already covers the tail, so no further window is needed.
        if start + size >= total:
            break
        start += step

    return windows

In [70]:
import re

def split_markdown_by_level(text, level=2):
    """
    Break markdown text into sections at headers of one specific level.

    Each section is the header line followed, after a blank line, by the
    stripped text up to the next header of the same level. Text before the
    first matching header is not returned.

    :param text: Markdown text as a string
    :param level: Header level to split on (2 means lines starting with "## ")
    :return: List of sections as strings
    """
    heading_re = re.compile(
        r'^(#{' + str(level) + r'} )(.+)$',
        re.MULTILINE,
    )

    # With two capturing groups, split() returns
    # [preamble, marker, title, body, marker, title, body, ...]
    pieces = heading_re.split(text)
    markers = pieces[1::3]
    titles = pieces[2::3]
    bodies = pieces[3::3]

    sections = []
    for marker, title, body in zip(markers, titles, bodies):
        heading = (marker + title).strip()
        body = body.strip()
        sections.append(f'{heading}\n\n{body}' if body else heading)

    return sections


In [80]:
from minsearch import Index
# Small in-memory FAQ of financial question/answer pairs used as the search corpus.
fin_faq = [
    {"question": "What was Apple's revenue in Q4 2022?", "answer": "Apple reported revenue of $90.15 billion in Q4 2022."},
    {"question": "Does Tesla disclose revenue from regulatory credits?", "answer": "Yes, Tesla reported $286 million in revenue from regulatory credits in Q2 2022."},
    {"question": "What percentage of Microsoft’s revenue comes from cloud services?", "answer": "In FY2022, Microsoft reported that 51% of its revenue came from cloud-based services."},
    {"question": "How much cash did Alphabet hold at the end of 2021?", "answer": "Alphabet reported cash and cash equivalents of $20.9 billion at the end of 2021."},
    {"question": "Did Amazon’s advertising revenue grow in 2021?", "answer": "Yes, Amazon’s advertising revenue grew 32% in 2021, reaching $31 billion."}
]

# Index the fields the records actually contain: every entry has "question"
# and "answer", and none has a "content" key.
fin_index = Index(
    text_fields=["question", "answer"],
    keyword_fields=[]
)

fin_index.fit(fin_faq)


[{'content': '# FinanceBench: A New Benchmark for Financial Question Answering\n\n<p align="center">\n    <img src="fig1.png" alt="drawing" style="width: 400px; display: block; margin: 0 auto; text-align:center;"/>\n</p>\n\n**Abstract:** \nFinanceBench is a first-of-its-kind test suite for evaluating the performance of LLMs on open book financial question answering (QA). This repository contains an open source sample of 150 annotated examples used in the evaluation and analysis of models assessed in the FinanceBench paper. FinanceBench comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings. The questions in FinanceBench are ecologically valid and cover a diverse set of scenarios. They are intended to be clear-cut and straightforward to answer to serve as a minimum performance standard. We test 16 state of the art model configurations (including GPT-4-Turbo, Llama2 and Claude2, with vector stores and long context prompts) on a sample 

In [None]:
# Text index over the FAQ entries: both the question and the answer are
# searchable, and no keyword fields are configured.
fin_index = Index(keyword_fields=[], text_fields=["question", "answer"])
fin_index.fit(fin_faq)

In [None]:
%pip install minsearch

In [78]:
from tqdm.auto import tqdm
import numpy as np


from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Encode "question answer" for every record that has both fields; records
# missing either one are reported and skipped.
vectors = []

for record in tqdm(fin_faq):
    if not ('question' in record and 'answer' in record):
        print(f"Skipping document due to missing 'question' or 'answer' key: {record.get('filename', 'Unknown file')}")
        continue

    combined = record['question'] + ' ' + record['answer']
    vectors.append(embedding_model.encode(combined))

# Stack the per-record vectors into a single array.
fin_embeddings = np.array(vectors)
print(fin_embeddings)

  0%|          | 0/2 [00:00<?, ?it/s]

Skipping document due to missing 'question' or 'answer' key: financebench-main/README.md
Skipping document due to missing 'question' or 'answer' key: financebench-main/vectorstores/README.md
[]


In [None]:
display(fin_faq)

Pydantic AI (Agentic Library)

In [40]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 2 search results returned by the FAQ index.
    """
    # Keep the docstring and this limit in step: the function is registered
    # as an agent tool, so its description should state what it really returns.
    return fin_index.search(query, num_results=2)


In [41]:
# System prompt for the first FAQ agent: search before answering, and fall
# back to general guidance when the search finds nothing relevant.
# NOTE(review): the wording refers to "a  course" / "course materials", while
# fin_faq in this notebook holds financial Q&A — confirm the prompt matches the data.
system_prompt = """
You are a helpful assistant for a  course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""


In [42]:
from pydantic_ai import Agent
import os
from google.colab import userdata

# Copy the OPENAI_API_KEY secret from Colab's userdata store into the
# process environment.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# First FAQ agent: gpt-4o-mini with text_search as its only tool.
agent = Agent(
    name="faq_agent",
    model='gpt-4o-mini',
    instructions=system_prompt,
    tools=[text_search],
)

In [None]:
# Sample question; it matches the first fin_faq entry apart from the trailing "?".
question = "What was Apple's revenue in Q4 2022"



In [45]:
import asyncio



In [None]:
# Top-level await: this only works inside a notebook cell, not in a plain script.
result = await agent.run(user_prompt=question)

In [None]:
result.new_messages()


In [None]:
%pip install pydantic-ai

Evaluation Agents

In [47]:
from pydantic_ai.messages import ModelMessagesTypeAdapter


def log_entry(agent, messages, source="user"):
    """
    Build a plain-dict log record for one agent interaction.

    Args:
        agent: The agent whose name, instructions, model and tool names are recorded
        messages: Messages of the run, converted with ModelMessagesTypeAdapter
        source: Free-form label for where the interaction came from

    Returns:
        Dictionary with the keys agent_name, system_prompt, provider, model,
        tools, messages and source
    """
    # Flatten the tool names of every toolset attached to the agent.
    tool_names = [
        tool_name
        for toolset in agent.toolsets
        for tool_name in toolset.tools.keys()
    ]

    serialized_messages = ModelMessagesTypeAdapter.dump_python(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tool_names,
        "messages": serialized_messages,
        "source": source,
    }

Create Log

In [48]:
import json
import secrets
from pathlib import Path
from datetime import datetime


# Directory (relative to the working directory) that receives the interaction
# logs; it is created here and left untouched if it already exists.
LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


def serializer(obj):
    """
    Fallback encoder for json.dump: datetimes become ISO-8601 strings.

    Raises:
        TypeError: For any object that is not a datetime
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()


def log_interaction_to_file(agent, messages, source='user'):
    """
    Write one interaction to a JSON file inside LOG_DIR.

    The file is named "<agent name>_<timestamp>_<random hex>.json". The
    timestamp comes from the last message of the record, and the random
    suffix keeps names distinct within the same second.

    Args:
        agent: Agent that produced the messages
        messages: Messages to log
        source: Label stored in the record's 'source' field

    Returns:
        Path of the file that was written
    """
    record = log_entry(agent, messages, source)

    last_timestamp = record['messages'][-1]['timestamp']
    stamp = last_timestamp.strftime("%Y%m%d_%H%M%S")
    suffix = secrets.token_hex(3)

    target = LOG_DIR / f"{agent.name}_{stamp}_{suffix}.json"

    with target.open("w", encoding="utf-8") as handle:
        json.dump(record, handle, indent=2, default=serializer)

    return target

In [None]:
# Read a question interactively, answer it with the agent, and persist the
# exchange; `source` is left at its default ('user').
question = input()
result = await agent.run(user_prompt=question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

In [None]:
question = "What was Apple's revenue in Q4 2022"



Adding References

In [50]:
# Second system prompt: same search-first behaviour, plus a requirement to
# cite the source file as a markdown link.
# NOTE(review): the citation rule relies on a filename in the search results
# and on the DataTalksClub/faq repository, while the fin_faq entries in this
# notebook only have 'question' and 'answer' — confirm the prompt matches the
# indexed data.
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.

Always include references by citing the filename of the source material you used.
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"
Format: [LINK TITLE](FULL_GITHUB_LINK)

If the search doesn't return relevant results, let the user know and provide general guidance.
""".strip()

# Rebind `agent` to a second version, named faq_agent_v2, that uses the
# citation-aware prompt above with the same tool and model.
agent = Agent(
    name="faq_agent_v2",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)


LLM as a Judge

In [51]:
# Instructions for the judge model: a fixed checklist of seven named checks,
# each to be answered true/false with a short explanation. The tags it refers
# to (<INSTRUCTIONS>, <QUESTION>, <ANSWER>, <LOG>) are filled in by
# user_prompt_format.
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met.

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do
- answer_relevant: The response directly addresses the user's question
- answer_clear: The answer is clear and correct
- answer_citations: The response includes proper citations or sources when required
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked?

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

The LLM will produce output that matches the following schema exactly.


In [52]:
from pydantic import BaseModel

# One judged checklist item.
class EvaluationCheck(BaseModel):
    # Name of the check; presumably one of the checklist keys listed in
    # evaluation_prompt (e.g. "answer_relevant") — verify against model output.
    check_name: str
    # Short explanation of the verdict.
    justification: str
    # True when the condition is met.
    check_pass: bool

# Full evaluation result: the individual checks plus a free-text summary.
class EvaluationChecklist(BaseModel):
    # One EvaluationCheck per checklist item.
    checklist: list[EvaluationCheck]
    # Overall summary text (printed after each evaluation run below).
    summary: str

With Pydantic AI, we make the output follow the specified class by passing the `output_type` parameter:


In [53]:
# Judge agent: follows evaluation_prompt and returns an EvaluationChecklist.
eval_agent = Agent(
    name='eval_agent',
    instructions=evaluation_prompt,
    output_type=EvaluationChecklist,
    model='gpt-5-nano',
)


In order to run the agent, it needs input. We'll start with a template:


In [54]:
# Template for the judge's user message. Fill it with str.format using the
# keys instructions, question, answer and log.
user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()

In [55]:
def load_log_file(log_file):
    """
    Load one JSON log record from disk.

    Args:
        log_file: Location of the JSON log file (str or Path)

    Returns:
        The parsed JSON object with an extra 'log_file' key holding the
        `log_file` argument exactly as it was passed in
    """
    # Logs are written as UTF-8 by log_interaction_to_file, so read them the
    # same way instead of relying on the platform's default encoding.
    with open(log_file, 'r', encoding='utf-8') as f_in:
        log_data = json.load(f_in)

    log_data['log_file'] = log_file
    return log_data


In [None]:
# Load one previously saved log; the file name is specific to an earlier run
# and must exist under ./logs.
log_record = load_log_file('./logs/faq_agent_v2_20250926_072928_467470.json')

# The question is taken from the first part of the first message and the
# answer from the first part of the last message.
instructions = log_record['system_prompt']
question = log_record['messages'][0]['parts'][0]['content']
answer = log_record['messages'][-1]['parts'][0]['content']
# The full, unsimplified message log goes into the prompt here.
log = json.dumps(log_record['messages'])

user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log
)


The user input is ready and we can test it!

In [None]:
# Run the judge on the prepared prompt; output_type repeats the type that
# eval_agent was already constructed with.
result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)

checklist = result.output
print(checklist.summary)

# Print every individual check (name, justification, pass/fail).
for check in checklist.checklist:
    print(check)

Note that we're putting the entire conversation log into the prompt, which is not really necessary. We can reduce it to make it less verbose.


In [57]:
# Fields removed from a message part, keyed by the part's 'part_kind'.
_DROPPED_KEYS_BY_PART_KIND = {
    'user-prompt': ('timestamp',),
    'tool-call': ('tool_call_id',),
    'tool-return': ('tool_call_id', 'metadata', 'timestamp'),
    'text': ('id',),
}


def simplify_log_messages(messages):
    """
    Return a trimmed copy of logged messages for use in a prompt.

    Bookkeeping fields (timestamps, tool-call ids, metadata, text ids) are
    removed and the content of tool returns is replaced by a placeholder to
    save tokens. A field that is already absent is skipped rather than
    raising, so logs whose parts lack an optional field can still be processed.

    Args:
        messages: List of message dicts, each with 'kind' and 'parts'; every
            part is a dict with a 'part_kind' key

    Returns:
        New list of {'kind': ..., 'parts': [...]} dicts; the input messages
        and their parts are not modified
    """
    log_simplified = []

    for m in messages:
        parts = []

        for original_part in m['parts']:
            # Shallow copy so the caller's log record stays intact.
            part = original_part.copy()
            kind = part['part_kind']

            for key in _DROPPED_KEYS_BY_PART_KIND.get(kind, ()):
                part.pop(key, None)

            if kind == 'tool-return':
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'

            parts.append(part)

        log_simplified.append({'kind': m['kind'], 'parts': parts})

    return log_simplified


In [None]:
async def evaluate_log_record(eval_agent, log_record):
    """
    Judge one logged interaction with the evaluation agent.

    The question is read from the first part of the first message, the
    answer from the first part of the last message, and the simplified
    message log is embedded as JSON.

    Args:
        eval_agent: Agent that is run with output_type=EvaluationChecklist
        log_record: Log dictionary with 'messages' and 'system_prompt'

    Returns:
        The `output` of the evaluation run
    """
    messages = log_record['messages']

    fields = {
        'instructions': log_record['system_prompt'],
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
        'log': json.dumps(simplify_log_messages(messages)),
    }
    prompt = user_prompt_format.format(**fields)

    response = await eval_agent.run(prompt, output_type=EvaluationChecklist)
    return response.output


# Evaluate a single saved log; the file name is specific to an earlier run
# and must exist under ./logs.
log_record = load_log_file('./logs/faq_agent_v2_20250926_072928_467470.json')
eval1 = await evaluate_log_record(eval_agent, log_record)

Data Generation
We can ask AI to help. What if we used it for generating more questions? Let's do that.
We can sample some records from our database. Then for each record, ask an LLM to generate a question based on the record. We use this question as input to our agent and log the answers.
Let’s start by defining the question generator:






In [None]:
# Prompt for the question generator: one realistic student-style question per
# provided record.
# NOTE(review): the prompt talks about "a data engineering course", while the
# records in this notebook (fin_faq) are financial Q&A — confirm the wording.
question_generation_prompt = """
You are helping to create test questions for an AI agent that answers questions about a data engineering course.

Based on the provided FAQ content, generate realistic questions that students might ask.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific technical questions and general course questions

Generate one question for each record.
""".strip()

# Structured output of the generator: a flat list of question strings.
class QuestionsList(BaseModel):
    questions: list[str]

# Agent that turns a JSON list of records into a QuestionsList.
question_generator = Agent(
    name="question_generator",
    instructions=question_generation_prompt,
    model='gpt-4o-mini',
    output_type=QuestionsList
)


Now we simply iterate over each of the questions, ask our agent, and log the results:


In [None]:
import random

sample = random.sample(fin_faq, 10)
prompt_docs = [d['content'] for d in sample]
prompt = json.dumps(prompt_docs)

result = await question_generator.run(prompt)
questions = result.output.questions

First, collect all the AI-generated logs for the v2 agent:

In [59]:
# Collect the logs written by faq_agent_v2 whose source is 'ai-generated'.
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    if 'faq_agent_v2' in log_file.name:
        log_record = load_log_file(log_file)
        if log_record['source'] == 'ai-generated':
            eval_set.append(log_record)

And evaluate them:

In [60]:
# Judge every collected log one after another, keeping each log paired with
# its evaluation so the two can be tabulated together later.
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

0it [00:00, ?it/s]


This code:
Loops through each AI-generated log
Runs our evaluation agent on it
Stores both the original log and evaluation result


The results are collected, but we still need to display them and calculate some statistics. The best tool for this is Pandas. We should already have it installed because minsearch depends on it.


In [61]:
# One flat row per evaluated log: file name, question, answer, and one
# boolean column per check name.
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    rows.append({
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
        **{c.check_name: c.check_pass for c in eval_result.checklist},
    })

This code:
Extracts key information from each log (file, question, answer)
Converts the evaluation checks into a dictionary format


In [62]:
import pandas as pd

# One DataFrame row per evaluated log, one column per key of the row dicts.
df_evals = pd.DataFrame(rows)

In [None]:
# Mean of each check column, i.e. the share of evaluated logs that passed it.
df_evals.mean(numeric_only=True)

instructions_follow    0.3
instructions_avoid     1.0
answer_relevant        1.0
answer_clear           1.0
answer_citations       0.3
completeness           0.7
tool_call_search       1.0


This tells us:
Only 30% of responses follow instructions completely
All responses avoid forbidden actions (good!)
All responses are relevant and clear (great!)
Only 30% include proper citations (needs improvement)
70% of responses are complete
All responses use the search tool (as expected)


Evaluating functions and tools


This is how we can implement hitrate and MRR calculation in Python:

In [64]:
def evaluate_search_quality(search_function, test_queries, num_results=5,
                            doc_id_field='filename'):
    """
    Compute a per-query hit flag and reciprocal rank for a search function.

    The results of each query are scanned once for the first document whose
    identifier is among the expected ones. Both metrics come from that single
    scan, so they stay consistent even if the search function returns a
    one-shot iterator.

    Args:
        search_function: Callable invoked as search_function(query, num_results=...)
            that returns an iterable of dict-like documents
        test_queries: Iterable of (query, expected_docs) pairs, where
            expected_docs is a container of relevant document identifiers
        num_results: Number of results requested per query
        doc_id_field: Key of the document field compared against expected_docs

    Returns:
        List with one {'query', 'hit', 'mrr'} dictionary per query. 'hit' is
        True when a relevant document was returned; 'mrr' is 1 / rank of the
        first relevant document, or 0 when there is none.
    """
    results = []

    for query, expected_docs in test_queries:
        search_results = search_function(query, num_results=num_results)

        # 1-based rank of the first relevant document, or None if absent.
        first_rank = next(
            (
                rank
                for rank, doc in enumerate(search_results, start=1)
                if doc[doc_id_field] in expected_docs
            ),
            None,
        )

        results.append({
            'query': query,
            'hit': first_rank is not None,
            'mrr': 1 / first_rank if first_rank is not None else 0,
        })

    return results
