🔧 **Setup Required**: Before running this notebook, please follow the [setup instructions](../README.md#setup-instructions) to configure your environment and API keys. **You will need to ensure you've executed the Indexing pipeline before completing this exercise**

In [1]:
import pandas as pd
from pathlib import Path
from haystack import component, Pipeline
from typing import List, Optional, Dict, Any, Union

@component
class CSVReaderComponent:
    """Reads a CSV file into a Pandas DataFrame."""

    @component.output_types(data_frame=pd.DataFrame)
    def run(self, source: Union[str, Path]):
        """
        Reads the CSV file from the first source in the list.
        
        Args:
            sources: List of file paths to CSV files. Only the first file will be processed.
            
        Returns:
            dict: Dictionary containing the loaded DataFrame under 'data_frame' key.
            
        Raises:
            FileNotFoundError: If the file doesn't exist or can't be read.
            ValueError: If the DataFrame is empty after loading.
        """
        if not source:
            raise ValueError("No sources provided")
            

        try:
            df = pd.read_csv(source)
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at {source}")
        except Exception as e:
            raise ValueError(f"Error reading CSV file {source}: {str(e)}")

        # Check if DataFrame is empty using proper pandas method
        if df.empty:
            raise ValueError(f"DataFrame is empty after loading from {source}")

        print(f"Loaded DataFrame with {len(df)} rows from {source}.")
        return {"data_frame": df}

In [2]:
from haystack import SuperComponent

@component
class RAGDataAugmenterComponent:
    """
    Applies a RAG SuperComponent to each query in a DataFrame and 
    augments the data with the generated answer and retrieved contexts.
    """

    def __init__(self, rag_supercomponent: SuperComponent):
        # We store the pre-initialized SuperComponent
        self.rag_supercomponent = rag_supercomponent
        self.output_names = ["augmented_data_frame"]

    @component.output_types(augmented_data_frame=pd.DataFrame)
    def run(self, data_frame: pd.DataFrame):
        
        # New columns to store RAG results
        answers: List[str] = []
        contexts: List[List[str]] = []

        print(f"Running RAG SuperComponent on {len(data_frame)} queries...")

        # Iterate through the queries (user_input column)
        for _, row in data_frame.iterrows():
            query = row["user_input"]
            
            # 1. Run the RAG SuperComponent
            # It expects 'query' as input and returns a dictionary.
            rag_output = self.rag_supercomponent.run(query=query)
            
            # 2. Extract answer and contexts
            # Based on the naive_rag_sc/hybrid_rag_sc structure:
            answer = rag_output.get('replies', [''])[0]
            
            # Extract content from the Document objects
            retrieved_docs = rag_output.get('documents', [])
            retrieved_contexts = [doc.content for doc in retrieved_docs]
            
            answers.append(answer)
            contexts.append(retrieved_contexts)
        
        # 3. Augment the DataFrame
        data_frame['response'] = answers
        data_frame['retrieved_contexts'] = contexts
        
        print("RAG processing complete.")
        return {"augmented_data_frame": data_frame}

In [3]:
from ragas import EvaluationDataset, evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity

from ragas.llms import llm_factory
from haystack.utils import Secret
import os
from ragas.llms import HaystackLLMWrapper
from haystack.components.generators import OpenAIGenerator

# Note: Ensure ragas and its dependencies (like litellm or openai) are installed
@component
class RagasEvaluationComponent:
    """
    Prepares data for Ragas, runs the evaluation, and returns the metrics.
    """
    
    def __init__(self, 
                 metrics: Optional[List[Any]] = None,
                 ragas_llm: Optional[Any] = None):
        
        # Default metrics for RAG evaluation
        self.metrics = metrics
        
        # Ragas requires an LLM for evaluation, often provided through OpenAI or Anthropic.
        # It's best practice to use a strong model like gpt-4o-mini or gpt-4.
        if ragas_llm is None:
            # Assumes OPENAI_API_KEY is set in the environment
            self.ragas_llm = HaystackLLMWrapper(OpenAIGenerator(model="gpt-4o-mini",
                                                               api_key=Secret.from_env_var("OPENAI_API_KEY")))
        else:
            self.ragas_llm = ragas_llm

    @component.output_types(metrics=Dict[str, float], evaluation_df=pd.DataFrame)
    def run(self, augmented_data_frame: pd.DataFrame):
        
        # 1. Map columns to Ragas requirements - correct column mapping for SingleTurnSample
        ragas_data = pd.DataFrame({
            'user_input': augmented_data_frame['user_input'],
            'response': augmented_data_frame['response'], 
            'retrieved_contexts': augmented_data_frame['retrieved_contexts'],
            'reference': augmented_data_frame['reference'],
            'reference_contexts': augmented_data_frame['reference_contexts'].apply(eval)
        })

        print("Creating Ragas EvaluationDataset...")
        # 2. Create EvaluationDataset using from_pandas which handles the format correctly
        dataset = EvaluationDataset.from_pandas(ragas_data)

        print("Starting Ragas evaluation...")
        
        # 3. Run Ragas Evaluation
        # Pass the configured LLM to Ragas
        results = evaluate(
            dataset=dataset,
            metrics=self.metrics,
            llm=self.ragas_llm
        )
        

        results_df = results.to_pandas()
        
        print("Ragas evaluation complete.")
        print(f"Overall metrics: {results}")
        
        return {"metrics": results, "evaluation_df": results_df}

  from .autonotebook import tqdm as notebook_tqdm


Naive RAG evaluation

In [4]:
# --- Setup Environment & Dependencies ---
# You need to ensure:
# 1. Elasticsearch is running (as NaiveRAG/HybridRAG rely on it, see files).
# 2. OPENAI_API_KEY is set in your environment.
# 3. The document store has been indexed with your data.

# --- 1. Import RAG SuperComponents ---
# Assuming naiverag.py and hybridrag.py are in your environment
from scripts.rag.naiverag import naive_rag_sc
from scripts.rag.hybridrag import hybrid_rag_sc
from pathlib import Path

# --- 2. Define Configurations to Test ---

# The RAG SuperComponent to test (change this to swap RAG configurations)
rag_sc_to_test = naive_rag_sc # OR hybrid_rag_sc

# If you want to test different internal configurations (e.g., chunk size, embedder model), 
# you should create and index new SuperComponents with those changes 
# and then choose the appropriate object here.

# --- 3. Instantiate Custom Components ---

metrics = [LLMContextRecall(), \
                Faithfulness(), \
                FactualCorrectness(), \
                ResponseRelevancy(), \
                ContextEntityRecall(), \
                NoiseSensitivity()]


reader = CSVReaderComponent()
augmenter = RAGDataAugmenterComponent(rag_supercomponent=rag_sc_to_test)
evaluator = RagasEvaluationComponent(metrics=metrics)

# --- 4. Build the Evaluation Pipeline ---

evaluation_pipeline = Pipeline()

evaluation_pipeline.add_component("reader", reader)
evaluation_pipeline.add_component("augmenter", augmenter)
evaluation_pipeline.add_component("evaluator", evaluator)

# Connect the flow: CSV -> Augment -> Evaluate
evaluation_pipeline.connect("reader.data_frame", "augmenter.data_frame")
evaluation_pipeline.connect("augmenter.augmented_data_frame", "evaluator.augmented_data_frame")



<haystack.core.pipeline.pipeline.Pipeline object at 0x169eabf20>
🚅 Components
  - reader: CSVReaderComponent
  - augmenter: RAGDataAugmenterComponent
  - evaluator: RagasEvaluationComponent
🛤️ Connections
  - reader.data_frame -> augmenter.data_frame (DataFrame)
  - augmenter.augmented_data_frame -> evaluator.augmented_data_frame (DataFrame)

In [5]:

# --- 5. Run the Evaluation Pipeline ---
csv_file_path = "data_for_eval/synthetic_tests_advanced_branching_3.csv"
print(f"Starting evaluation of {rag_sc_to_test.__class__.__name__}...")

results = evaluation_pipeline.run({"reader": {"source": csv_file_path}})


Starting evaluation of SuperComponent...
Loaded DataFrame with 4 rows from data_for_eval/synthetic_tests_advanced_branching_3.csv.
Running RAG SuperComponent on 4 queries...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.50it/s]


RAG processing complete.
Creating Ragas EvaluationDataset...
Starting Ragas evaluation...


Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   4%|▍         | 1/24 [00:02<00:59,  2.60s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  17%|█▋        | 4/24 [00:08<00:38,  1.92s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 24/24 [01:07<00:00,  2.82s/it]


Ragas evaluation complete.
Overall metrics: {'context_recall': 1.0000, 'faithfulness': 0.4500, 'factual_correctness(mode=f1)': 0.1875, 'answer_relevancy': 0.2355, 'context_entity_recall': 0.1833, 'noise_sensitivity(mode=relevant)': 0.1333}


In [18]:

# --- 6. Access Metrics ---
final_metrics = results


In [19]:
final_metrics['evaluator']['evaluation_df']

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,context_recall,faithfulness,factual_correctness(mode=f1),answer_relevancy,context_entity_recall,noise_sensitivity(mode=relevant)
0,How is Amazon utilizing AI technology in its p...,"[What is AI, how does it work and why are some...","[What is AI, how does it work and why are some...",The provided documents do not contain specific...,Amazon's AI technology is prominently featured...,1.0,1.0,0.0,0.0,0.333333,1.0
1,What percentage of ChatGPT queries related to ...,[Sampling details available in Section 3.\n5.2...,[<1-hop>\n\n5.1 What share of ChatGPT queries ...,"As of June 2025, the percentage of ChatGPT que...",The LLM classifier identified that 53% of mess...,1.0,0.8,0.86,0.941918,0.166667,0.0
2,How is ChatGPT Business used in different occu...,[X’s indicate that the ranking is\nunavailable...,[<1-hop>\n\nCorporate users may also use ChatG...,The provided information does not specify deta...,ChatGPT Business is utilized across various oc...,0.666667,0.75,0.17,0.0,0.0,0.0
3,"How does the adoption of generative AI, partic...","[What is AI, how does it work and why are some...",[<1-hop>\n\nCorporate users may also use ChatG...,The provided documents do not specifically add...,"The adoption of generative AI, such as ChatGPT...",1.0,0.333333,0.0,0.0,0.111111,0.333333


In [11]:
final_metrics['evaluator']['metrics']

{'context_recall': 1.0000, 'faithfulness': 0.4500, 'factual_correctness(mode=f1)': 0.1875, 'answer_relevancy': 0.2355, 'context_entity_recall': 0.1833, 'noise_sensitivity(mode=relevant)': 0.1333}

Hybrid RAG evaluation

In [None]:
rag_sc_to_test = hybrid_rag_sc
metrics = [LLMContextRecall(), \
                Faithfulness(), \
                FactualCorrectness(), \
                ResponseRelevancy(), \
                ContextEntityRecall(), \
                NoiseSensitivity()]


reader = CSVReaderComponent()
augmenter = RAGDataAugmenterComponent(rag_supercomponent=rag_sc_to_test)
evaluator = RagasEvaluationComponent(metrics=metrics)

# --- 4. Build the Evaluation Pipeline ---

evaluation_pipeline = Pipeline()

evaluation_pipeline.add_component("reader", reader)
evaluation_pipeline.add_component("augmenter", augmenter)
evaluation_pipeline.add_component("evaluator", evaluator)

# Connect the flow: CSV -> Augment -> Evaluate
evaluation_pipeline.connect("reader.data_frame", "augmenter.data_frame")
evaluation_pipeline.connect("augmenter.augmented_data_frame", "evaluator.augmented_data_frame")

<haystack.core.pipeline.pipeline.Pipeline object at 0x35cfaa030>
🚅 Components
  - reader: CSVReaderComponent
  - augmenter: RAGDataAugmenterComponent
  - evaluator: RagasEvaluationComponent
🛤️ Connections
  - reader.data_frame -> augmenter.data_frame (DataFrame)
  - augmenter.augmented_data_frame -> evaluator.augmented_data_frame (DataFrame)

In [15]:
# --- 5. Run the Evaluation Pipeline ---
csv_file_path = "data_for_eval/synthetic_tests_advanced_branching_3.csv"
print(f"Starting evaluation of {rag_sc_to_test.__class__.__name__}...")

results = evaluation_pipeline.run({"reader": {"source": csv_file_path}})


Starting evaluation of SuperComponent...
Loaded DataFrame with 4 rows from data_for_eval/synthetic_tests_advanced_branching_3.csv.
Running RAG SuperComponent on 4 queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 15.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.08it/s]


RAG processing complete.
Creating Ragas EvaluationDataset...
Starting Ragas evaluation...


Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   4%|▍         | 1/24 [00:02<00:48,  2.13s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:   8%|▊         | 2/24 [00:04<00:54,  2.49s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  29%|██▉       | 7/24 [00:09<00:16,  1.03it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating: 100%|██████████| 24/24 [01:01<00:00,  2.55s/it]


Ragas evaluation complete.
Overall metrics: {'context_recall': 0.9167, 'faithfulness': 0.7208, 'factual_correctness(mode=f1)': 0.2575, 'answer_relevancy': 0.2355, 'context_entity_recall': 0.1528, 'noise_sensitivity(mode=relevant)': 0.3333}


In [16]:
results['evaluator']['evaluation_df']

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,context_recall,faithfulness,factual_correctness(mode=f1),answer_relevancy,context_entity_recall,noise_sensitivity(mode=relevant)
0,How is Amazon utilizing AI technology in its p...,"[What is AI, how does it work and why are some...","[What is AI, how does it work and why are some...",The provided documents do not contain specific...,Amazon's AI technology is prominently featured...,1.0,1.0,0.0,0.0,0.333333,1.0
1,What percentage of ChatGPT queries related to ...,[Sampling details available in Section 3.\n5.2...,[<1-hop>\n\n5.1 What share of ChatGPT queries ...,"As of June 2025, the percentage of ChatGPT que...",The LLM classifier identified that 53% of mess...,1.0,0.8,0.86,0.941918,0.166667,0.0
2,How is ChatGPT Business used in different occu...,[X’s indicate that the ranking is\nunavailable...,[<1-hop>\n\nCorporate users may also use ChatG...,The provided information does not specify deta...,ChatGPT Business is utilized across various oc...,0.666667,0.75,0.17,0.0,0.0,0.0
3,"How does the adoption of generative AI, partic...","[What is AI, how does it work and why are some...",[<1-hop>\n\nCorporate users may also use ChatG...,The provided documents do not specifically add...,"The adoption of generative AI, such as ChatGPT...",1.0,0.333333,0.0,0.0,0.111111,0.333333


In [17]:
final_metrics['evaluator']['metrics']

{'context_recall': 1.0000, 'faithfulness': 0.4500, 'factual_correctness(mode=f1)': 0.1875, 'answer_relevancy': 0.2355, 'context_entity_recall': 0.1833, 'noise_sensitivity(mode=relevant)': 0.1333}