ragas==0.1.9

In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# Load environment variables from app/.env, resolved against the current
# working directory (presumably the OpenAI credentials used later -- confirm).
app_dir = os.path.join(os.getcwd(), "app")
env_file = os.path.join(app_dir, ".env")
load_dotenv(env_file)

# Read every .txt file matched recursively under ./data.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()

# Split the documents into chunks of at most 350 characters (length is
# measured with len) that overlap by 20 characters.
splitter_options = dict(
    chunk_size=350,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(docs)

ModuleNotFoundError: No module named 'langchain_community'

In [2]:
# Display the first chunk to sanity-check the result of the split.
chunks[0]

Document(page_content='margherita pizza; $12; classic with tomato, mozzarella, and basil; main dish\n\nspaghetti carbonara; $15; creamy pasta with pancetta and parmesan; main dish\n\nbruschetta; $8; toasted bread with tomato, garlic, and olive oil; appetizer\n\ncaprese salad; $10; fresh tomatoes, mozzarella, and basil; salad', metadata={'source': 'data\\food.txt'})

In [3]:
# RAGAS expects a "file_name" key in each document's metadata dict, so copy
# the loader-provided "source" path under that key.
for chunk in chunks:
    chunk.metadata.update(file_name=chunk.metadata["source"])

In [4]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

# The same chat model is used both to generate questions and to critique them.
embeddings = OpenAIEmbeddings()
model = ChatOpenAI()

generator = TestsetGenerator.from_langchain(
    generator_llm=model,
    critic_llm=model,
    embeddings=embeddings,
)

# Requested mix of question types; the weights sum to 1.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Ask for 8 synthetic question/ground-truth pairs built from the chunks.
testset = generator.generate_with_langchain_docs(
    chunks, test_size=8, distributions=question_mix
)

  from .autonotebook import tqdm as notebook_tqdm
Filename and doc_id are the same for all nodes.                 
Generating:  88%|████████▊ | 7/8 [00:48<00:09,  9.03s/it]max retries exceeded for ReasoningEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=60, max_retries=15, max_wait=90, max_workers=16, thread_timeout=80.0, exception_types=<class 'openai.RateLimitError'>, log_tenacity=False)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x0000023059DCDF10>, nodes=[Node(page_content='margherita pizza; $12; classic with tomato, mozzarella, and basil; main dish\n\nspaghetti carbonara; $15; creamy pasta with pancetta and parmesan; main dish\n\nbruschetta; $8; toasted bread with tomato, garlic, and olive oil; appetizer\n\ncaprese salad; $10; fresh tomatoes, mozzarella, and basil; salad', metadata={'source': 'data\\food.txt', 'file_name': 'data\\food.txt'}, doc_id='cbd0c2b5-ab37-416e-8b56-f5b4a31bcf9b'), Node(page_

In [5]:
# Render the generated test set as a table for inspection.
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How has Chef Amico's lifetime of love influenc...,[Amico’s legacy is not just in the dishes he c...,Chef Amico's lifetime of love has influenced h...,simple,"[{'source': 'data\founder.txt', 'file_name': '...",True
1,How does Chef Amico's story highlight the powe...,[Amico’s legacy is not just in the dishes he c...,Chef Amico's story highlights the power of foo...,simple,"[{'source': 'data\founder.txt', 'file_name': '...",True
2,What are the ingredients in a Negroni cocktail?,[panna cotta; $9; creamy italian dessert with ...,"The Negroni cocktail typically contains gin, v...",simple,"[{'source': 'data\food.txt', 'file_name': 'dat...",True
3,How do the flavors of Sicily play a role in Ch...,[Amico’s legacy is not just in the dishes he c...,The flavors of Sicily play a significant role ...,simple,"[{'source': 'data\founder.txt', 'file_name': '...",True
4,What's the price of the tortellini if ribollit...,[ribollita; $10; tuscan bread and vegetable so...,$14,reasoning,"[{'source': 'data\food.txt', 'file_name': 'dat...",True
5,How does hospitality contribute to community a...,"[For Amico, hospitality was an art form. He be...",Hospitality at Amico's contributes to the comm...,multi_context,"[{'source': 'data\founder.txt', 'file_name': '...",True
6,How does Chef Amico's culinary influence exten...,"[Continuing the Legacy\n\nToday, Chef Amico st...",Chef Amico's culinary influence extends beyond...,multi_context,"[{'source': 'data\founder.txt', 'file_name': '...",True


In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings

from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

# Embedding model and chat model used by the RAG pipeline under evaluation.
embedding = OpenAIEmbeddings()
model = ChatOpenAI()

# Index the chunks in a Chroma vector store and expose it as a retriever.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding)
retriever = vectorstore.as_retriever()

In [4]:
from langchain_core.prompts import PromptTemplate

# Prompt text with two placeholders: {context} receives the retrieved
# material and {question} receives the user's query.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# First stage: the incoming query is sent to the retriever to fill "context"
# and is passed through unchanged to fill "question".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Remaining stages: fill the prompt, call the chat model, return plain text.
rag_chain = chain_inputs | prompt | model | StrOutputParser()

In [None]:
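# Alternative: take the questions and reference answers from the
# RAGAS-generated testset instead of the CSV file loaded below.
# questions = testset.to_pandas()["question"].to_list()
# ground_truth = testset.to_pandas()["ground_truth"].to_list()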

import pandas as pd

# Read the semicolon-separated question/answer file and pull the two columns
# used for evaluation out as plain Python lists.
qa_path = "./questions_answers/qa.csv"
df = pd.read_csv(qa_path, sep=";")
questions, ground_truth = (
    df[column].tolist() for column in ("question", "ground_truth")
)

In [None]:
# Display the list of reference answers read from the CSV.
ground_truth

In [None]:
from datasets import Dataset

# Column-oriented records for evaluation; the reference answers are already
# known, the other three columns are filled in one query at a time.
data = dict(question=[], answer=[], contexts=[], ground_truth=ground_truth)

for query in questions:
    print(f"Query: {query}")
    # Generate the answer with the chain, then query the retriever again to
    # record the text of the documents it returns for the same question.
    answer = rag_chain.invoke(query)
    retrieved = retriever.invoke(query)
    data["question"].append(query)
    data["answer"].append(answer)
    data["contexts"].append([doc.page_content for doc in retrieved])

dataset = Dataset.from_dict(data)

In [None]:
# Collect the first row of every column so one complete record can be inspected.
first_entry = {
    column: data[column][0]
    for column in ("question", "answer", "contexts", "ground_truth")
}
first_entry

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

# Metrics to compute, in the order they are passed to RAGAS.
selected_metrics = [
    context_relevancy,
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
]

# Evaluate with the same chat model and embeddings the RAG pipeline uses.
result = evaluate(
    dataset=dataset,
    metrics=selected_metrics,
    llm=model,
    embeddings=embedding,
)

In [None]:
# Render the per-question evaluation scores as a table.
result.to_pandas()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Per-question scores: one row per question, one column per metric.
df = result.to_pandas()

metric_columns = [
    "context_relevancy",
    "context_precision",
    "context_recall",
    "faithfulness",
    "answer_relevancy",
]
heatmap_data = df[metric_columns]

# Red (low) -> green (high) colour ramp.
cmap = LinearSegmentedColormap.from_list("green_red", ["red", "green"])

plt.figure(figsize=(10, 8))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap=cmap,
    # Pin the colour scale to the nominal 0-1 score range so colours are
    # comparable between runs instead of being stretched to the data's range.
    vmin=0,
    vmax=1,
    # Let seaborn place the question labels: it centres them on each row
    # (positions i + 0.5). Setting ticks at range(n) afterwards would put the
    # labels on the cell borders, half a row off.
    yticklabels=df["question"].tolist(),
)

# Keep the (long) question labels horizontal so they stay readable.
plt.yticks(rotation=0)

plt.show()