## Evaluation of RAG systems

In [1]:
! pip install "langchain>=0.3.5,<0.4" \
            "langchain-openai>=0.3.0" \
            "langchain-community>=0.3.0" \
            "langfuse>=3.0.0"
! pip install datasets ragas python-dotenv openai chromadb

Collecting langchain-openai>=0.3.0
  Downloading langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
Collecting langchain-community>=0.3.0
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langfuse>=3.0.0
  Downloading langfuse-3.8.1-py3-none-any.whl.metadata (2.4 kB)
INFO: pip is looking at multiple versions of langchain-openai to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-openai>=0.3.0
  Downloading langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-0.3.35-py3-none-any.whl.metadata (2.4 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community>=0.3.0
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting re

In [4]:
import os

# Langfuse and OpenAI settings are handed to the client libraries through the
# process environment. The three credential values are blank in this notebook.
os.environ.update(
    {
        "LANGFUSE_TRACING": "true",
        "LANGFUSE_HOST": "https://cloud.langfuse.com",
        "LANGFUSE_SECRET_KEY": "",
        "LANGFUSE_PUBLIC_KEY": "",
        "OPENAI_API_KEY": "",
    }
)

In [6]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

# ---- Crawl configuration ----
# Site root; clean_url resolves non-"http" links against it.
BASE = "https://www.kapitalbank.az"
# Seed URL of the crawl, also used by clean_url as the required URL prefix.
START = BASE + "/en"
# Headers sent with every request.
UA = {
    "User-Agent": "kb-minicrawl/0.2",
}
# Passed as the `timeout` argument of requests.get.
TIMEOUT = 15
# The crawl loop stops once this many URLs have been marked visited.
MAX_PAGES = 50

def clean_url(u):
    """Normalise a link and decide whether it is worth crawling.

    The ``#fragment`` is dropped and links that do not start with "http" are
    resolved against BASE. The resulting URL is returned only when it starts
    with START and does not end in one of the listed file extensions;
    otherwise None is returned.
    """
    without_fragment, _fragment = urldefrag(u)
    if not without_fragment:
        return None

    if without_fragment.startswith("http"):
        absolute = without_fragment
    else:
        absolute = urljoin(BASE, without_fragment)

    if not absolute.startswith(START):
        return None

    # Case-insensitive match on the end of the URL.
    looks_like_asset = re.search(
        r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", absolute, re.I
    )
    return None if looks_like_asset else absolute

def extract_text(html):
    """Return the text of an HTML page as one whitespace-normalised string.

    The elements listed in ``boilerplate`` are removed first. Text is then
    taken from the first ``<main>``, else the first ``<article>``, else
    ``<body>``, else the whole parsed document.
    """
    soup = BeautifulSoup(html, "lxml")

    boilerplate = ["script", "style", "noscript", "svg", "footer", "nav", "header"]
    for tag in soup(boilerplate):
        tag.decompose()

    container = soup.select_one("main") or soup.select_one("article") or soup.body or soup
    source = container if container else soup
    raw_text = source.get_text(" ", strip=True)

    # split()/join collapses every run of whitespace into a single space.
    return " ".join(raw_text.split())

# ---- Breadth-first crawl starting at START ----
# visited: URLs already attempted (fetched or failed)
# queue:   FIFO of URLs still to fetch
# pages:   [{"url": ..., "text": ...}] for every HTML page with enough text
visited, queue, pages = set(), [START], []
queued = {START}  # every URL ever enqueued, so no URL enters the queue twice

while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited:
        continue
    # Mark the URL up front so a failed request is not retried.
    visited.add(url)

    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
    except requests.RequestException:
        continue

    if r.ok and "text/html" in r.headers.get("Content-Type", ""):
        txt = extract_text(r.text)
        if len(txt) > 200:  # skip near-empty pages
            pages.append({"url": url, "text": txt})

        # extract_text strips nav/header/footer, so links are collected from a
        # fresh parse of the full document.
        s = BeautifulSoup(r.text, "lxml")
        for a in s.find_all("a", href=True):
            # Resolve the href against the page it was found on (the final URL
            # after redirects); resolving against the site root would
            # mis-resolve document-relative links such as "cards" or "../x".
            u = clean_url(urljoin(r.url, a["href"]))
            if u and u not in queued:
                queued.add(u)
                queue.append(u)

    time.sleep(0.15)  # small pause between requests

import json

# Write the crawled pages to disk as UTF-8 JSON for later use
pages_outfile = "kapitalbank_pages.json"
serialized_pages = json.dumps(pages, indent=2, ensure_ascii=False)
with open(pages_outfile, "w", encoding="utf-8") as out_fh:
    out_fh.write(serialized_pages)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Read the same file back so `pages` holds exactly what was stored
with open(pages_outfile, "r", encoding="utf-8") as in_fh:
    pages = json.loads(in_fh.read())
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
# docs[i] is a text chunk and metas[i] the {"url": ...} of the page it came from.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    page_chunks = splitter.split_text(p["text"])
    docs.extend(page_chunks)
    # One separate metadata dict per chunk, all pointing at the same page URL.
    metas.extend({"url": p["url"]} for _ in page_chunks)

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid

# Stable, unique ids (page URL + running chunk index). Without explicit ids a
# fresh random id is generated per chunk, so every re-run of this cell would
# append another copy of all chunks to the persisted collection.
# NOTE(review): chunks stored by earlier runs under random ids are not removed
# by this change; delete the persist directory once to start clean.
chunk_ids = [f"{meta['url']}#{idx}" for idx, meta in enumerate(metas)]

vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
    ids=chunk_ids,
)
# No vs.persist() call: it is deprecated in langchain-community because
# Chroma >= 0.4 writes to persist_directory automatically.
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Re-open the collection that was written to disk, using the same embedding
# model that was used for indexing.
collection_name = "kapitalbank_en"
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

chroma_settings = dict(
    collection_name=collection_name,
    embedding_function=emb,
    persist_directory=persist_dir,
)
vs = Chroma(**chroma_settings)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


  vs.persist()
  vs = Chroma(


## Let's try manual RAG evaluation

In [7]:
from langfuse.langchain import CallbackHandler
from langchain.chat_models import init_chat_model
from langchain_core.tools import create_retriever_tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langfuse import get_client

# Langfuse client and the LangChain callback handler passed to the agent run.
langfuse = get_client()
langfuse_handler = CallbackHandler()

# Expose the vector store `vs` (built in an earlier cell) as a retriever tool
# that returns the 3 nearest chunks.
retriever = vs.as_retriever(search_kwargs={"k": 3})
retrieve_tool = create_retriever_tool(
    retriever=retriever,
    name="kb_search",
    description="Search internal KB for relevant passages.",
)
tools = [retrieve_tool]

system_prompt = "\n".join(
    [
        "Retrieve relevant information from the internal knowledge base and then answer the question.",
        "If there is nothing relevant in the knowledge base, just say \"I don't know\".",
        "Always consider using the tool `kb_search` first.",
    ]
)

model = init_chat_model("openai:gpt-4o-mini", temperature=0)

# System instructions, prior turns, the user question, then the agent's
# intermediate tool-calling messages.
prompt_messages = [
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
    MessagesPlaceholder("agent_scratchpad"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

agent = create_tool_calling_agent(model, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, handle_parsing_errors=True)

agent_input = {"input": "What is the main product of KapitalBank?", "chat_history": []}
run_config = {"callbacks": [langfuse_handler], "metadata": {"feature": "rag"}}
result = agent_executor.invoke(agent_input, config=run_config)
print(result["output"])



The main product of Kapital Bank includes a variety of financial services, with a significant focus on consumer loans and banking services. They also offer products like the Birbank installment card. For more detailed information about their products and services, you can visit their website at [kapitalbank.az](https://kapitalbank.az).


## RAGAS evaluation

In [9]:
from langchain_openai import ChatOpenAI
from ragas.embeddings import OpenAIEmbeddings
import openai

# OpenAI handles for the RAGAS section: an embeddings object built on a raw
# OpenAI client, and a LangChain chat model (`llm`) that is wrapped as the
# evaluator LLM in the evaluation cell.
openai_client = openai.OpenAI()
embeddings = OpenAIEmbeddings(client=openai_client)
llm = ChatOpenAI(model="gpt-4o")

import numpy as np

class RAG:
    """Minimal in-memory retrieval-augmented generation pipeline.

    Documents are embedded once by ``load_documents``; a query is answered by
    ranking the documents by cosine similarity to the query embedding and
    passing the best match(es) to a chat model.
    """

    def __init__(self, model="gpt-4o"):
        """Create the chat model and the embeddings client.

        Args:
            model: Name of the chat model used by ``generate_answer``.
        """
        import openai
        self.llm = ChatOpenAI(model=model)
        openai_client = openai.OpenAI()
        self.embeddings = OpenAIEmbeddings(client=openai_client)
        self.doc_embeddings = None
        self.docs = None

    def load_documents(self, documents):
        """Load documents and compute their embeddings.

        Args:
            documents: Iterable of document strings. It is materialised into a
                list so it can be indexed later.
        """
        self.docs = list(documents)
        self.doc_embeddings = self.embeddings.embed_texts(self.docs)

    def get_most_relevant_docs(self, query, top_k=1):
        """Return the documents most similar to ``query``.

        Args:
            query: Query string to embed and compare against the documents.
            top_k: Number of documents to return, best match first. The
                default of 1 returns a one-element list.

        Returns:
            A list with up to ``top_k`` documents, ordered by decreasing
            cosine similarity. Ties keep the original document order.

        Raises:
            ValueError: If no documents/embeddings are loaded, or ``top_k`` is
                smaller than 1.
        """
        # Explicit None/len checks: truthiness of a numpy array is ambiguous.
        if (
            self.docs is None
            or self.doc_embeddings is None
            or len(self.docs) == 0
            or len(self.doc_embeddings) == 0
        ):
            raise ValueError("Documents and their embeddings are not loaded.")
        if top_k < 1:
            raise ValueError("top_k must be at least 1.")

        query_vec = np.asarray(self.embeddings.embed_text(query), dtype=float)
        doc_matrix = np.asarray(self.doc_embeddings, dtype=float)

        # Cosine similarity of the query against every document at once.
        dots = doc_matrix @ query_vec
        norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
        # A zero-length vector gets similarity 0 instead of NaN.
        similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

        # Stable sort on the negated scores: best first, first-seen wins ties.
        ranked = np.argsort(-similarities, kind="stable")[:top_k]
        return [self.docs[int(i)] for i in ranked]

    def generate_answer(self, query, relevant_doc):
        """Generate an answer for a given query based on the retrieved documents.

        Args:
            query: The user question.
            relevant_doc: Retrieved context; it is formatted into the prompt
                with its default string representation.

        Returns:
            The ``content`` of the chat model's reply.
        """
        prompt = f"question: {query}\n\nDocuments: {relevant_doc}"
        messages = [
            ("system", "You are a helpful assistant that answers questions based on given documents only."),
            ("human", prompt),
        ]
        ai_msg = self.llm.invoke(messages)
        return ai_msg.content

In [10]:
import json

# The pages file was written as UTF-8, so read it with the same encoding and
# close the handle deterministically instead of relying on the platform
# default encoding and garbage collection.
with open("kapitalbank_pages.json", "r", encoding="utf-8") as pages_file:
    sample_docs = [page["text"] for page in json.load(pages_file)]

# Initialize RAG instance
rag = RAG()

# Load documents
rag.load_documents(sample_docs)

# Query and retrieve the most relevant document
query = "Who introduced the theory of relativity?"
relevant_doc = rag.get_most_relevant_docs(query)

# Generate an answer
answer = rag.generate_answer(query, relevant_doc)

print(f"Query: {query}")
print(f"Relevant Document: {relevant_doc}")
print(f"Answer: {answer}")

Query: Who introduced the theory of relativity?
Relevant Document: ["Currency rates Currency is provided according to Azerbaijan Republic Central Bank's for today. Rates may differ on some branches. 02.11.2025 currency_date_selector Currency Central Bank Buy Sell Cash/Birbank current account Non-cash Buy PC Cash/Birbank current account Non-cash Sell PC USD 1.7000 1.6970 1.6990 1.6950 1.7020 1.7200 1.7200 EUR 1.9672 1.9272 1.9222 1.9380 2.0072 2.0572 2.0010 100 RUB 2.1247 1.9200 1.7100 0.0000 2.2800 2.4800 0.0000 GBP 2.2367 2.1717 2.1917 2.2030 2.2817 2.2967 2.2640 CHF 2.1204 2.0704 2.0704 0.0000 2.1804 2.1804 0.0000 TRY 0.0404 0.0222 0.0272 0.0000 0.0522 0.0522 0.0000 AED 0.4628 0.3700 0.4500 0.0000 0.4660 0.4660 0.0000 CNY 0.2391 0.0000 0.1891 0.0000 0.0000 0.2891 0.0000 Currency Buy Sell USD 1.6970 1.7020 EUR 1.9272 2.0072 100 RUB 1.9200 2.2800 GBP 2.1717 2.2817 CHF 2.0704 2.1804 TRY 0.0222 0.0522 AED 0.3700 0.4660 CNY 0.0000 0.0000 I am selling I am buying AZN USD EUR RUB GBP CHF TR

In [11]:
from openai import OpenAI
import pandas as pd

client = OpenAI()

def generate_banking_qa_pairs(num_pairs=20, *, model="gpt-4o", openai_client=None):
    """Generate synthetic customer questions and model-written answers.

    For each pair, one chat completion produces a question and a second one
    answers it. The answers come from the model alone; no knowledge base is
    consulted.

    Args:
        num_pairs: Number of question/answer rounds to attempt.
        model: Chat model used for both completions.
        openai_client: Client exposing ``chat.completions.create``. Defaults
            to the module-level ``client``.

    Returns:
        ``(questions, answers)``: two lists of equal length. A round whose
        question or answer comes back empty is skipped, so the lists may hold
        fewer than ``num_pairs`` items.
    """
    api = client if openai_client is None else openai_client

    def _complete(messages, temperature):
        # `message.content` may be None; normalise to a stripped string.
        completion = api.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        return (completion.choices[0].message.content or "").strip()

    questions = []
    answers = []
    for _ in range(num_pairs):
        # Generate question
        question_text = _complete(
            [
                {
                    "role": "system",
                    "content": (
                        "You are a helpful customer service chatbot for Kapitalbank in Azerbaijan. "
                        "Please generate a short, realistic question from a customer."
                    )
                }
            ],
            1,
        )
        if not question_text:
            continue

        # Now generate an answer to that question
        answer_text = _complete(
            [
                {
                    "role": "system",
                    "content": (
                        "You are a knowledgeable Kapitalbank customer support assistant providing clear and concise answers to customer queries."
                    )
                },
                {
                    "role": "user",
                    "content": question_text
                }
            ],
            0.7,
        )
        if not answer_text:
            continue

        questions.append(question_text)
        answers.append(answer_text)
    return questions, answers

# Ask the model for ten question/answer pairs
banking_questions, banking_answers = generate_banking_qa_pairs(num_pairs=10)

# Tabulate the pairs row by row and write them to CSV without the index column
qa_rows = list(zip(banking_questions, banking_answers))
df = pd.DataFrame(qa_rows, columns=["Question", "Answer"])
df.to_csv("kapitalbank_qa_pairs.csv", index=False)

In [12]:
import pandas as pd
from ragas import EvaluationDataset

# Load the generated question/answer pairs; the answers serve as references.
qa_data = pd.read_csv("kapitalbank_qa_pairs.csv")

sample_queries = qa_data["Question"].tolist()
expected_responses = qa_data["Answer"].tolist()

# Run the RAG pipeline on every question and collect one record per question.
dataset = []
for query, reference in zip(sample_queries, expected_responses):
    relevant_docs = rag.get_most_relevant_docs(query)
    response = rag.generate_answer(query, relevant_docs)
    record = dict(
        user_input=query,
        retrieved_contexts=relevant_docs,
        response=response,
        reference=reference,
    )
    dataset.append(record)

evaluation_dataset = EvaluationDataset.from_list(dataset)

In [13]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, LLMContextPrecisionWithoutReference

# Metrics to compute for every record of the evaluation dataset.
metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    ResponseRelevancy(),
    LLMContextPrecisionWithoutReference(),
]

# The LangChain chat model `llm` is wrapped so RAGAS can use it as the judge.
evaluator_llm = LangchainLLMWrapper(llm)

result = evaluate(dataset=evaluation_dataset, metrics=metrics, llm=evaluator_llm)
result

  evaluator_llm = LangchainLLMWrapper(llm)


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]



{'context_recall': 0.0533, 'faithfulness': 0.4917, 'factual_correctness(mode=f1)': 0.4280, 'answer_relevancy': 0.3721, 'llm_context_precision_without_reference': 0.1000}