In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from pymilvus import MilvusClient, connections
from tqdm.auto import tqdm
import pandas as pd
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, answer_relevancy, context_precision
from ragas import evaluate
import google.generativeai as genai
import os
import dotenv

  from .autonotebook import tqdm as notebook_tqdm


# Get RAG Evaluation Dataset

In [4]:
csv_path = "ragas_evaluation_dataset.csv"
queries = pd.read_csv(csv_path)
print(queries.head())

   id                                           question  \
0   1  What are the three main evaluation metrics use...   
1   2  What type of model does ARES fine-tune to act ...   
2   3  What is the role of Prediction-Powered Inferen...   
3   4  Who introduced the concept of Knowledge Distil...   
4   5   What is the main goal of Knowledge Distillation?   

                                              answer  
0  ARES evaluates RAG systems based on context re...  
1  ARES fine-tunes DeBERTa-v3-Large models as LLM...  
2  Prediction-Powered Inference provides confiden...  
3  Knowledge Distillation was formally introduced...  
4  The goal of Knowledge Distillation is to trans...  


# Load environment variables and dataset

In [6]:
dotenv.load_dotenv()

# Prepare lists to store results
generated_answers = []
retrieved_contexts = []
questions = []
ground_truths = []

# Initialize the embedding model and Milvus client
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
client = MilvusClient()
connections.connect()

# Initialize the LLM
google_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)
gemini_model = genai.GenerativeModel('gemini-2.5-pro')

# Run queries and store results

In [8]:
for index, row in tqdm(queries.iterrows(), total=len(queries)):
    question = row['question']
    ground_truth = row['answer']
    query_embedding = embedding_model.embed_query(question)
    search_results = client.search(
        collection_name="learning_portal",
        data=[query_embedding],
        limit=5,
        output_fields=["passage"]
    )
    context = "\n".join([hit['entity']['passage'] for hit in search_results[0]])

    prompt = f"""
        You are a personalized learning assistant. Your goal is to provide a clear and comprehensive answer to the user's question.

        **User's Question:**
        {question}

        **Here is some context retrieved from the learning materials:**
        <retrieved_context>
        {context}
        </retrieved_context>


        **Instructions:**
        1. Synthesize the information from the retrieved context, conversation history, and long-term memory to formulate your answer.
        2. If the provided context is insufficient or the question requires very recent information, use your built-in Google Search tool to find the most up-to-date facts.
        3. Provide a direct and helpful answer. Cite the source of your information if it comes from an external search.
        """
    try:
        response = gemini_model.generate_content(prompt)
        answer = response.text
    except Exception as e:
        answer = f"Error generating response: {e}"
    questions.append(question)
    generated_answers.append(answer)
    retrieved_contexts.append([context])  # Ragas expects list of lists
    ground_truths.append(ground_truth)

rag_results = pd.DataFrame({
    "question": questions,
    "answer": generated_answers,
    "contexts": retrieved_contexts,
    "ground_truths": ground_truths
})

data = {
    "user_input": rag_results["question"].tolist(),
    "response": rag_results["answer"].tolist(),
    "retrieved_contexts": rag_results["contexts"].tolist(),
    "reference": rag_results["ground_truths"].tolist()
}

dataset = Dataset.from_dict(data)

100%|██████████| 25/25 [05:37<00:00, 13.49s/it]


# Run RAGAS evaluation

In [9]:
openai_api_key =  os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
openai_llm = ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key)
evaluator_llm = LangchainLLMWrapper(openai_llm)

result = evaluate(
    dataset=dataset,
    metrics=[
        Faithfulness(),
        answer_relevancy,
        LLMContextRecall(),
        context_precision,
        FactualCorrectness()
    ],
    llm=evaluator_llm,
    embeddings=embeddings
)

print(result)

  evaluator_llm = LangchainLLMWrapper(openai_llm)
Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   1%|          | 1/125 [00:08<18:20,  8.87s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   8%|▊         | 10/125 [00:54<07:39,  4.00s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  13%|█▎        | 16/125 [01:39<10:19,  5.68s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  19%|█▉        | 24/125 [01:50<05:31,  3.28s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  31%|███       | 39/125 [02:47<04:07,  2.87s/it]LLM returned 1 generations

{'faithfulness': 0.5634, 'answer_relevancy': 0.9645, 'context_recall': 0.7567, 'context_precision': 0.8400, 'factual_correctness(mode=f1)': 0.5084}
