In [4]:
import os
from typing import Literal

from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv) before any client is built.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
# As the last expression of the cell, the call's return value is what gets displayed.
load_dotenv()

True

### Setup / Grabbing Documents

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Articles that make up the knowledge base for this notebook.
urls = [
    "https://ai-office-hours.beehiiv.com/p/beyond-benchmarks",
    "https://ai-office-hours.beehiiv.com/p/evaluating-ai-agent-tool-selection",
    "https://ai-office-hours.beehiiv.com/p/re-ranking-rag",
    "https://ai-office-hours.beehiiv.com/p/quantizing-llms-llama-3",
    "https://ai-office-hours.beehiiv.com/p/llm-probing"
]

# Each loader returns a list of Documents, so `docs` is a list of lists that is
# flattened into `docs_list`. (`docs` is rebound to retrieval results further down.)
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [doc for sublist in docs for doc in sublist]

# Split with the tiktoken-based length function: chunk_size=250, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs_list)

# The collection is persisted to ./chroma_db. Without explicit ids every re-run
# of this cell would store a fresh copy of each chunk under a new random id and
# retrieval would start returning duplicates. Deterministic "<source>#chunk-<n>"
# ids make a re-run overwrite the same entries instead.
# NOTE(review): copies written by earlier runs (random ids) are not removed by
# this; delete ./chroma_db once to start clean. If a page shrinks, its trailing
# chunks from an earlier run also remain.
chunk_ids = []
chunks_per_source = {}
for chunk in split_docs:
    source = chunk.metadata["source"]
    position = chunks_per_source.get(source, 0)
    chunks_per_source[source] = position + 1
    chunk_ids.append(f"{source}#chunk-{position}")

vector_store = Chroma.from_documents(
    documents=split_docs,
    embedding=OpenAIEmbeddings(),
    ids=chunk_ids,
    collection_name="rag-chroma",
    persist_directory="./chroma_db"
)
retriever = vector_store.as_retriever()

print("loaded docs and stored in chroma vector store")


loaded docs and stored in chroma vector store


In [9]:
# Quick retrieval check: fetch documents for a sample question and show
# a 50-character preview of each hit next to the page it came from.
question = "tell me about benchmarks"
docs = retriever.invoke(question)

for doc in docs:
    preview, source = doc.page_content[:50], doc.metadata['source']
    print(preview, source)

top-line conversation starter when evaluating an L https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
(like testing a model’s financial tool selecting a https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
Beyond BenchmarksAI Office HoursLoginSubscribe0AI  https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
fine-tuning process. Test sets in general (benchma https://ai-office-hours.beehiiv.com/p/beyond-benchmarks


### Retrieval Grader

In [12]:
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI


class GradeDocuments(BaseModel):
    """Binary relevance grade for a single retrieved document."""

    # Constrained to the two allowed answers so the structured output cannot
    # come back as "Yes.", "maybe", etc.; callers still read a plain string.
    binary_score: Literal["yes", "no"] = Field(description="A binary score of yes or no indicating if the document answers the question.")
    
    
# Grader model; its output is parsed into a GradeDocuments instance.
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

system = """Give a binary score of 'yes' or 'no' indicating if the document answers the question."""

# System instruction plus a user turn carrying the question/document pair.
grade_messages = [
    ("system", system),
    ("user", "Question: {question}\n\n Document: {document}\n"),
]
grade_prompt = ChatPromptTemplate.from_messages(grade_messages)

retrieval_grader = grade_prompt | structured_llm_grader

# Grade every document retrieved for `question` and print the verdict
# together with a short preview and the source URL.
for doc in docs:
    grader_input = {"question": question, "document": doc.page_content}
    result = retrieval_grader.invoke(grader_input)
    print(f"binary score: {result.binary_score}")
    print(f"page content: {doc.page_content[:50]}\n")
    print(f"Document Source: {doc.metadata['source']}")


binary score: yes
page content: top-line conversation starter when evaluating an L

Document Source: https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
binary score: yes
page content: (like testing a model’s financial tool selecting a

Document Source: https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
binary score: yes
page content: Beyond BenchmarksAI Office HoursLoginSubscribe0AI 

Document Source: https://ai-office-hours.beehiiv.com/p/beyond-benchmarks
binary score: yes
page content: fine-tuning process. Test sets in general (benchma

Document Source: https://ai-office-hours.beehiiv.com/p/beyond-benchmarks


### Generate Components

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Two-message prompt for answer generation: fixed system instructions and a
# human turn with {question} and {context} placeholders.
rag_messages = [
    ("system", "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."),
    ("human", "Question: {question}\n\nContext: {context}"),
]
prompt = ChatPromptTemplate.from_messages(rag_messages)

# Dump each message's type and raw template so the prompt can be inspected.
for message in prompt.messages:
    print(type(message), message.prompt.template, "----", sep="\n")

<class 'langchain_core.prompts.chat.SystemMessagePromptTemplate'>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
----
<class 'langchain_core.prompts.chat.HumanMessagePromptTemplate'>
Question: {question}

Context: {context}
----


In [14]:
# Chat model used for answer generation. Same constructor arguments as the `llm`
# created in the retrieval-grader cell; rebinding it here keeps this cell readable
# on its own. NOTE(review): temperature=0 is presumably for repeatable answers.
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)

def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` is empty.
    """
    pieces = []
    for item in docs:
        pieces.append(item.page_content)
    return "\n\n".join(pieces)

# Generation chain: fill the prompt, call the model, reduce the reply to plain text.
rag_chain = prompt | llm | StrOutputParser()

# Answer the sample question from the documents retrieved earlier.
rag_inputs = {"context": format_docs(docs), "question": question}
generation = rag_chain.invoke(rag_inputs)
print("Generated Answer:", generation, sep="\n")

Generated Answer:
Benchmarks are standardized open-source test sets used to evaluate AI models on specific tasks, ensuring fair comparison by using agreed-upon train/validation/test splits. They serve as a useful starting point to shortlist models but should not be the sole measure of performance, as models can exploit biases or shortcuts in benchmarks, leading to inflated scores without true understanding. Additionally, over-optimizing for benchmarks can cause models to "game" the system, and high benchmark scores do not necessarily indicate real-world effectiveness or generalization.


### Question Re-write / The Corrective Part

In [15]:
# Larger model with a little sampling freedom for rephrasing the question.
bigger_llm = ChatOpenAI(model_name="gpt-4.1", temperature=0.1)

system = """You are a question re-writer that converts an input question to a better version that is optimized \n
     for web search. Look at the input and try to reason about the underlying semantic intent / meaning."""

# System instruction plus a human turn that carries the original question.
rewrite_messages = [
    ("system", system),
    ("human", "Here is the initial question: \n\n {question} \n Formulate an improved question."),
]
re_write_prompt = ChatPromptTemplate.from_messages(rewrite_messages)

# Prompt -> model -> plain string.
re_writer_chain = re_write_prompt | bigger_llm | StrOutputParser()

rewrite_inputs = {"question": question}
improved_question = re_writer_chain.invoke(rewrite_inputs)
print("Improved Question:", improved_question, sep="\n")

Improved Question:
What are benchmarks and how are they used to measure performance in different fields?
