# Lab 3.5.6 Solutions: RAG Evaluation with RAGAS

Complete solutions for comprehensive RAG evaluation using RAGAS metrics.

## Setup

In [None]:
import sys
sys.path.insert(0, '..')

from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
import numpy as np
import torch
import json

# Report whether PyTorch can see a CUDA device in this environment.
print(f"GPU available: {torch.cuda.is_available()}")

## Exercise 1 Solution: Implement RAGAS Metrics

**Task**: Implement the four core RAGAS metrics from scratch.

In [None]:
@dataclass
class EvaluationSample:
    """One question/answer record passed through the RAG evaluation.

    Attributes:
        question: The question being asked.
        ground_truth: The expected (reference) answer.
        contexts: Retrieved context passages; defaults to a fresh empty list.
        answer: The generated answer; defaults to an empty string.
        expected_source: Optional source label, for retrieval evaluation.
    """

    question: str
    ground_truth: str
    contexts: List[str] = field(default_factory=list)
    answer: str = ""
    expected_source: Optional[str] = None


@dataclass
class EvaluationResult:
    """Scores for one evaluated question across the four metrics.

    Attributes:
        question: The question the scores belong to.
        faithfulness: Whether the answer is grounded in the context.
        answer_relevancy: Whether the answer addresses the question.
        context_precision: Whether the retrieved contexts are relevant.
        context_recall: Whether the contexts contain the needed information.
    """

    question: str
    faithfulness: float
    answer_relevancy: float
    context_precision: float
    context_recall: float

    @property
    def average(self) -> float:
        """Unweighted mean of the four metric scores."""
        # Accumulate left to right so the floating-point result matches a
        # plain a + b + c + d expression exactly.
        total = self.faithfulness + self.answer_relevancy
        total = total + self.context_precision
        total = total + self.context_recall
        return total / 4

    def to_dict(self) -> Dict:
        """Return the question, the four scores and their average as a dict."""
        record: Dict = {"question": self.question}
        for metric in (
            "faithfulness",
            "answer_relevancy",
            "context_precision",
            "context_recall",
        ):
            record[metric] = getattr(self, metric)
        record["average"] = self.average
        return record


class RAGASEvaluator:
    """
    RAGAS-style evaluator using LLM-as-judge.

    Four core metrics:
    1. Faithfulness: Is the answer grounded in the context?
    2. Answer Relevancy: Does the answer address the question?
    3. Context Precision: Are the retrieved contexts relevant?
    4. Context Recall: Do contexts contain info for ground truth?

    Every metric is a float in [0, 1]. The judge model is reached through the
    `ollama` package, imported lazily so the class can still be defined and
    constructed when the package or the server is missing.
    """

    def __init__(self, llm_model: str = "llama3.1:8b"):
        """
        Initialize evaluator with an LLM.

        Args:
            llm_model: Ollama model name for evaluation
        """
        self.llm_model = llm_model
        self._verify_llm()

    def _verify_llm(self):
        """Send a one-message test chat to the judge model and report the outcome.

        Never raises: any failure (missing package, unreachable server, unknown
        model) is printed as a warning and scoring later falls back to neutral
        values.
        """
        try:
            import ollama
            ollama.chat(model=self.llm_model, messages=[{"role": "user", "content": "Hi"}])
            print(f"LLM verified: {self.llm_model}")
        except Exception as e:
            print(f"Warning: LLM not available ({e}). Using fallback scoring.")

    def _llm_judge(self, prompt: str) -> str:
        """Ask the judge model a single question and return its stripped reply.

        Args:
            prompt: Full prompt text sent as one user message.

        Returns:
            The model's reply, or the literal string "0.5" when the call fails
            for any reason, so that numeric metrics degrade to a neutral score.
        """
        try:
            import ollama
            response = ollama.chat(
                model=self.llm_model,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": 0}  # Deterministic
            )
            return response["message"]["content"].strip()
        except Exception as e:
            # Deliberate best-effort: keep the evaluation running, but make the
            # failure visible instead of silently returning a made-up score.
            import warnings
            warnings.warn(
                f"LLM judge call failed ({e}); using neutral fallback score 0.5",
                RuntimeWarning,
                stacklevel=2,
            )
            return "0.5"  # Fallback

    def _parse_score(self, response: str) -> float:
        """Extract a score in [0, 1] from a free-form judge reply.

        Resolution order:
        1. The first standalone number between 0 and 1 ("0", "0.5", "1.0", ...).
        2. A verdict keyword: "partial..." -> 0.5, "yes"/"true" -> 1.0,
           "no"/"false" -> 0.0.
        3. 0.5 when nothing is recognised.

        Args:
            response: Raw text returned by the judge.

        Returns:
            The parsed score as a float.
        """
        import re

        text = response.lower()

        # The look-arounds reject digits that belong to a larger token, so
        # "2.0", "10", "1.5", "128GB" or "v1" are not mistaken for a 0 or 1
        # score (a plain \b-delimited pattern would accept the "0" in "2.0").
        match = re.search(r"(?<![\w.])(?:0(?:\.\d+)?|1(?:\.0+)?)(?![\w]|\.\d)", text)
        if match:
            return float(match.group())

        # Whole-word matching: a substring test would read "not", "know" or
        # "enough" as "no". "partial" is tested first because it is the more
        # specific verdict ("partially true" should not score 1.0).
        if re.search(r"\bpartial", text):
            return 0.5
        if re.search(r"\b(?:yes|true)\b", text):
            return 1.0
        if re.search(r"\b(?:no|false)\b", text):
            return 0.0

        return 0.5  # Default

    def evaluate_faithfulness(self, sample: EvaluationSample) -> float:
        """
        Faithfulness: Are all claims in the answer supported by context?

        High faithfulness = No hallucinations
        Low faithfulness = Answer contains unsupported claims

        Returns 0.0 without calling the judge when the sample has no contexts
        or no answer. Only the first five contexts are shown to the judge.
        """
        if not sample.contexts or not sample.answer:
            return 0.0

        context_str = "\n---\n".join(sample.contexts[:5])

        prompt = f"""Evaluate if the answer is faithfully grounded in the context.

CONTEXT:
{context_str}

ANSWER:
{sample.answer}

Instructions:
1. Identify each factual claim in the answer
2. Check if each claim is supported by the context
3. Score based on percentage of supported claims

Score:
- 1.0: All claims are supported by context
- 0.5: Some claims are supported, some are not
- 0.0: Most claims are not supported (hallucination)

Respond with ONLY a number: 0.0, 0.5, or 1.0"""

        return self._parse_score(self._llm_judge(prompt))

    def evaluate_answer_relevancy(self, sample: EvaluationSample) -> float:
        """
        Answer Relevancy: Does the answer address what was asked?

        High relevancy = Direct, complete answer
        Low relevancy = Off-topic or incomplete

        Returns 0.0 without calling the judge when the answer is empty.
        """
        if not sample.answer:
            return 0.0

        prompt = f"""Evaluate if the answer directly addresses the question.

QUESTION: {sample.question}

ANSWER: {sample.answer}

Instructions:
1. Does the answer attempt to address the question?
2. Is the answer complete or partial?
3. Is there irrelevant information?

Score:
- 1.0: Answer fully and directly addresses the question
- 0.5: Answer partially addresses the question
- 0.0: Answer is off-topic or doesn't address the question

Respond with ONLY a number: 0.0, 0.5, or 1.0"""

        return self._parse_score(self._llm_judge(prompt))

    def evaluate_context_precision(self, sample: EvaluationSample) -> float:
        """
        Context Precision: What fraction of retrieved contexts are relevant?

        High precision = All contexts are useful
        Low precision = Many irrelevant contexts retrieved

        Each of the first five contexts (truncated to 500 characters) is judged
        separately with a YES/NO prompt. A context counts as relevant only when
        the reply contains "YES", so a failed judge call (fallback "0.5")
        counts as not relevant. Returns 0.0 when there are no contexts.
        """
        if not sample.contexts:
            return 0.0

        relevant_count = 0
        total_count = min(5, len(sample.contexts))  # Evaluate top 5

        for context in sample.contexts[:total_count]:
            prompt = f"""Is this context relevant to answering the question?

QUESTION: {sample.question}

CONTEXT: {context[:500]}

Respond with ONLY: YES or NO"""

            response = self._llm_judge(prompt).upper()
            if "YES" in response:
                relevant_count += 1

        return relevant_count / total_count

    def evaluate_context_recall(self, sample: EvaluationSample) -> float:
        """
        Context Recall: Do contexts contain info needed for ground truth?

        High recall = All needed info is in contexts
        Low recall = Missing important information

        Returns 0.0 without calling the judge when the sample has no contexts
        or no ground truth. The first five contexts are joined and the joined
        text is cut to 2000 characters before being shown to the judge.
        """
        if not sample.contexts or not sample.ground_truth:
            return 0.0

        context_str = "\n---\n".join(sample.contexts[:5])

        prompt = f"""Do the contexts contain enough information to produce the ground truth answer?

GROUND TRUTH ANSWER: {sample.ground_truth}

CONTEXTS:
{context_str[:2000]}

Instructions:
1. Identify key facts in the ground truth
2. Check if each fact can be found in the contexts
3. Score based on coverage

Score:
- 1.0: All information in ground truth is present in contexts
- 0.5: Some information is present, some is missing
- 0.0: Critical information is missing from contexts

Respond with ONLY a number: 0.0, 0.5, or 1.0"""

        return self._parse_score(self._llm_judge(prompt))

    def evaluate(self, sample: EvaluationSample) -> EvaluationResult:
        """Run all four metrics on one sample and bundle the scores."""
        return EvaluationResult(
            question=sample.question,
            faithfulness=self.evaluate_faithfulness(sample),
            answer_relevancy=self.evaluate_answer_relevancy(sample),
            context_precision=self.evaluate_context_precision(sample),
            context_recall=self.evaluate_context_recall(sample)
        )

    def evaluate_batch(
        self,
        samples: List[EvaluationSample],
        verbose: bool = True
    ) -> List[EvaluationResult]:
        """Evaluate samples one after another.

        Args:
            samples: Samples to score, already carrying contexts and answers.
            verbose: When true, print progress and the four scores per sample.

        Returns:
            One EvaluationResult per sample, in input order.
        """
        results = []

        for i, sample in enumerate(samples):
            if verbose:
                print(f"Evaluating {i+1}/{len(samples)}: {sample.question[:50]}...")

            result = self.evaluate(sample)
            results.append(result)

            if verbose:
                print(f"  Scores: F={result.faithfulness:.2f}, AR={result.answer_relevancy:.2f}, "
                      f"CP={result.context_precision:.2f}, CR={result.context_recall:.2f}")

        return results

# Visible confirmation that the class definition above has been executed.
print("RAGASEvaluator class defined")

## Exercise 2 Solution: Create Test Dataset

**Task**: Build a golden dataset for evaluation.

In [None]:
def create_evaluation_dataset() -> List[EvaluationSample]:
    """
    Build the golden question set used for RAG evaluation.

    Returns:
        One EvaluationSample per entry with `question`, `ground_truth` and
        `expected_source` filled in; `contexts` and `answer` keep their
        defaults.
    """
    # (question, ground-truth answer, expected source) triples.
    golden_rows = [
        (
            "What is the memory capacity of DGX Spark?",
            "DGX Spark has 128GB of unified memory shared between CPU and GPU.",
            "dgx_spark",
        ),
        (
            "How does LoRA reduce the number of trainable parameters?",
            "LoRA adds low-rank matrices A and B alongside frozen original weights. Only these small matrices are trained, reducing parameters by 10-100x.",
            "lora",
        ),
        (
            "What is the key innovation of the transformer architecture?",
            "The self-attention mechanism that allows each token to attend to all other tokens in the sequence, capturing long-range dependencies.",
            "transformer",
        ),
        (
            "What is NVFP4 quantization?",
            "NVFP4 is NVIDIA's 4-bit floating point format that maintains more precision than integer quantization while reducing model size.",
            "quantization",
        ),
        (
            "What are the main components of a RAG system?",
            "A RAG system has three main components: a retriever (finds relevant documents), a knowledge base (stores documents as vectors), and a generator (LLM that produces answers).",
            "rag",
        ),
        (
            "How does ChromaDB differ from FAISS?",
            "ChromaDB provides built-in persistence, filtering, and a simpler API. FAISS offers superior performance especially with GPU acceleration but lacks built-in persistence.",
            "vector_database",
        ),
        (
            "What is the recommended rank (r) for LoRA fine-tuning?",
            "Common values are 8-64. Lower ranks (8-16) for simple tasks, higher ranks (32-64) for complex domain adaptation.",
            "lora",
        ),
        (
            "What is the purpose of positional encoding in transformers?",
            "Positional encoding adds information about token positions since self-attention is position-agnostic. It allows the model to understand sequence order.",
            "transformer",
        ),
    ]

    return [
        EvaluationSample(question=prompt, ground_truth=truth, expected_source=source)
        for prompt, truth, source in golden_rows
    ]

# Build the golden dataset once; later cells read `evaluation_dataset`.
evaluation_dataset = create_evaluation_dataset()
print(f"Created evaluation dataset with {len(evaluation_dataset)} samples")

# Preview the first three samples; ground truths are cut to 100 characters.
for i, sample in enumerate(evaluation_dataset[:3]):
    print(f"\nSample {i+1}:")
    print(f"  Q: {sample.question}")
    print(f"  A: {sample.ground_truth[:100]}...")

## Exercise 3 Solution: Build End-to-End Evaluation Pipeline

**Task**: Combine the RAG system with the evaluation pipeline.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

class EvaluatableRAG:
    """
    RAG system designed for easy evaluation.

    Provides access to intermediate outputs:
    - Retrieved contexts
    - Retrieval scores
    - Generated answer

    Construction loads every `*.md` file in a folder, splits the text into
    overlapping chunks, embeds them and stores them in a Chroma collection
    persisted under `./eval_chroma_db`.
    """

    def __init__(
        self,
        documents_path: str = "../data/sample_documents",
        embedding_model_name: str = "BAAI/bge-large-en-v1.5",
        llm_model: str = "llama3.1:8b"
    ):
        """
        Build the vector store from the documents folder.

        Args:
            documents_path: Folder scanned (non-recursively) for `*.md` files.
            embedding_model_name: HuggingFace model used to embed chunks.
            llm_model: Ollama model name used by `generate`.
        """
        self.llm_model = llm_model

        # Load and chunk documents
        documents = self._load_documents(documents_path)
        chunks = self._chunk_documents(documents)
        if not chunks:
            # A wrong path otherwise only surfaces later as empty retrievals
            # or an opaque vector-store error.
            import warnings
            warnings.warn(
                f"No '*.md' documents found under {documents_path!r}; "
                "the vector store will be empty.",
                RuntimeWarning,
                stacklevel=2,
            )

        # Create vector store
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={"device": device}
        )

        # The collection is persisted on disk, so building it again (e.g.
        # re-running the notebook cell) with auto-generated ids would insert
        # every chunk a second time and fill the top-k with duplicates. Stable
        # ids let a rebuild overwrite the existing entries instead.
        # NOTE(review): relies on the LangChain Chroma wrapper upserting on
        # existing ids - confirm for the installed langchain/chromadb version.
        # Chunks left over from documents that were later removed or shortened
        # are not cleaned up.
        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embedding_model,
            ids=self._chunk_ids(chunks),
            persist_directory="./eval_chroma_db"
        )

        print(f"RAG system initialized with {len(chunks)} chunks")

    @staticmethod
    def _chunk_ids(chunks: List[Document]) -> List[str]:
        """Return one `"<source>:<n>"` id per chunk, numbered within its source."""
        next_index: Dict[str, int] = {}
        ids = []
        for chunk in chunks:
            source = chunk.metadata.get("source", "unknown")
            position = next_index.get(source, 0)
            next_index[source] = position + 1
            ids.append(f"{source}:{position}")
        return ids

    def _load_documents(self, path: str) -> List[Document]:
        """Read every `*.md` file directly inside `path` as one Document.

        The file name is stored in the `source` metadata field. Files are
        visited in sorted order because `glob` order is unspecified.
        """
        documents = []
        for file_path in sorted(Path(path).glob("*.md")):
            content = file_path.read_text(encoding='utf-8')
            documents.append(Document(
                page_content=content,
                metadata={"source": file_path.name}
            ))
        return documents

    def _chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks of size 500 with an overlap of 100."""
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        return splitter.split_documents(documents)

    def retrieve(self, query: str, k: int = 5) -> List[Dict]:
        """Retrieve documents with scores.

        Args:
            query: Search text.
            k: Number of chunks to request from the vector store.

        Returns:
            One dict per hit with `content`, `metadata` and `score`, in the
            order returned by the vector store.
        """
        results = self.vectorstore.similarity_search_with_score(query, k=k)
        return [
            {
                "content": doc.page_content,
                "metadata": doc.metadata,
                # Convert distance to similarity: smaller distance -> score
                # closer to 1 (assumes a non-negative distance).
                "score": 1 / (1 + score)
            }
            for doc, score in results
        ]

    def generate(self, query: str, contexts: List[str]) -> str:
        """Generate answer from contexts.

        Only the first five contexts are placed in the prompt.

        Returns:
            The model's reply, or an "Error generating response: ..." string
            when the Ollama call fails (no exception is propagated).
        """
        context_str = "\n\n".join(contexts[:5])

        prompt = f"""Answer the question based ONLY on the provided context.
If the context doesn't contain the answer, say "I don't have enough information."

Context:
{context_str}

Question: {query}

Answer:"""

        try:
            import ollama
            response = ollama.chat(
                model=self.llm_model,
                messages=[{"role": "user", "content": prompt}]
            )
            return response["message"]["content"]
        except Exception as e:
            return f"Error generating response: {e}"

    def query(self, question: str, k: int = 5) -> Dict[str, Any]:
        """Complete RAG query with all intermediate outputs.

        Returns:
            Dict with `question`, `contexts`, `answer`, `retrieval_scores`
            and `sources` (the `source` metadata of each hit, "" if absent).
        """
        # Retrieve
        retrieved = self.retrieve(question, k=k)
        contexts = [r["content"] for r in retrieved]

        # Generate
        answer = self.generate(question, contexts)

        return {
            "question": question,
            "contexts": contexts,
            "answer": answer,
            "retrieval_scores": [r["score"] for r in retrieved],
            "sources": [r["metadata"].get("source", "") for r in retrieved]
        }

# Create RAG system with the default documents folder and models.
# Construction loads, chunks and embeds the documents, so this cell does the
# heavy lifting; later cells read `rag`.
rag = EvaluatableRAG()

In [None]:
def run_evaluation(
    rag: EvaluatableRAG,
    evaluator: RAGASEvaluator,
    dataset: List[EvaluationSample]
) -> Dict[str, Any]:
    """
    Answer every dataset question with the RAG system, then score the outputs.

    Args:
        rag: System under test; its `query` supplies contexts and the answer.
        evaluator: Judge used to score the enriched samples.
        dataset: Golden samples (question, ground truth, expected source).

    Returns:
        Dict with `metrics` (mean of each score over all samples), `results`
        (per-sample scores) and `samples` (inputs enriched with RAG outputs).
    """

    def attach_rag_output(item: EvaluationSample) -> EvaluationSample:
        # Copy of the golden sample with retrieved contexts and answer added.
        output = rag.query(item.question)
        return EvaluationSample(
            question=item.question,
            ground_truth=item.ground_truth,
            contexts=output["contexts"],
            answer=output["answer"],
            expected_source=item.expected_source,
        )

    print("Running RAG pipeline on evaluation samples...")
    enriched_samples = [attach_rag_output(item) for item in dataset]

    print("\nRunning RAGAS evaluation...")
    results = evaluator.evaluate_batch(enriched_samples, verbose=True)

    # Mean of each score across samples, keyed by attribute name.
    score_names = (
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "average",
    )
    metrics = {
        name: np.mean([getattr(scored, name) for scored in results])
        for name in score_names
    }

    return {"metrics": metrics, "results": results, "samples": enriched_samples}

# Run evaluation
# Constructing the evaluator sends a short test message to the judge model.
evaluator = RAGASEvaluator()
eval_output = run_evaluation(rag, evaluator, evaluation_dataset[:4])  # Subset for demo

# Print the aggregated (mean) score for every metric, three decimals each.
print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
for metric, value in eval_output["metrics"].items():
    print(f"{metric}: {value:.3f}")

## Exercise 4 Solution: Analyze Results and Identify Issues

**Task**: Deep-dive into evaluation results.

In [None]:
def analyze_evaluation_results(
    results: List[EvaluationResult],
    samples: List[EvaluationSample],
    threshold: float = 0.7,
    issue_rate: float = 0.3
) -> Dict[str, Any]:
    """
    Analyze evaluation results to identify issues.

    Args:
        results: Per-sample scores.
        samples: The evaluated samples, in the same order as `results`
            (pairs are matched positionally; extra items on either side are
            ignored).
        threshold: A score strictly below this value counts as "low".
        issue_rate: A recommendation is emitted for a metric when more than
            this fraction of `results` scored low on it.

    Returns:
        Dict with one list of flagged samples per metric (`low_faithfulness`,
        `low_relevancy`, `low_precision`, `low_recall`) and a
        `recommendations` list of advice strings.
    """
    analysis = {
        "low_faithfulness": [],  # Hallucination issues
        "low_relevancy": [],  # Answer quality issues
        "low_precision": [],  # Retrieval noise
        "low_recall": [],  # Missing information
        "recommendations": []
    }

    for result, sample in zip(results, samples):
        if result.faithfulness < threshold:
            analysis["low_faithfulness"].append({
                "question": sample.question,
                "score": result.faithfulness,
                "answer_preview": sample.answer[:100]
            })

        if result.answer_relevancy < threshold:
            analysis["low_relevancy"].append({
                "question": sample.question,
                "score": result.answer_relevancy,
                "answer_preview": sample.answer[:100]
            })

        if result.context_precision < threshold:
            analysis["low_precision"].append({
                "question": sample.question,
                "score": result.context_precision
            })

        if result.context_recall < threshold:
            analysis["low_recall"].append({
                "question": sample.question,
                "score": result.context_recall
            })

    # Generate recommendations: one advice string per metric, emitted when the
    # share of low-scoring samples exceeds `issue_rate`. Answer relevancy is
    # included so that every collected issue list can produce advice.
    advice_by_issue = (
        ("low_faithfulness",
         "High hallucination rate. Consider: stricter prompts, better context, smaller temperature."),
        ("low_relevancy",
         "Low answer relevancy. Consider: clearer prompts, examples in the prompt, or query rewriting."),
        ("low_precision",
         "Low retrieval precision. Consider: better chunking, reranking, or hybrid search."),
        ("low_recall",
         "Low context recall. Consider: increasing k, better embeddings, or query expansion."),
    )
    cutoff = len(results) * issue_rate
    for issue, advice in advice_by_issue:
        if len(analysis[issue]) > cutoff:
            analysis["recommendations"].append(advice)

    return analysis

# Analyze results: pair each per-sample score with its enriched sample.
analysis = analyze_evaluation_results(
    eval_output["results"],
    eval_output["samples"]
)

print("="*60)
print("EVALUATION ANALYSIS")
print("="*60)

# Number of samples flagged as "low" for each metric.
print(f"\nIssues Found:")
print(f"  Low faithfulness: {len(analysis['low_faithfulness'])} samples")
print(f"  Low relevancy: {len(analysis['low_relevancy'])} samples")
print(f"  Low precision: {len(analysis['low_precision'])} samples")
print(f"  Low recall: {len(analysis['low_recall'])} samples")

# Recommendations are only printed when at least one was generated.
if analysis["recommendations"]:
    print(f"\nRecommendations:")
    for rec in analysis["recommendations"]:
        print(f"  - {rec}")

## Quality Thresholds

In [None]:
# Reference text with suggested score targets and remediation steps.
# It is only printed; the numbers here are not read by any code in this file.
thresholds = """
RAGAS QUALITY THRESHOLDS
========================

Production-Ready (all metrics >= threshold):
-------------------------------------------
  Faithfulness:       >= 0.85  (minimize hallucinations)
  Answer Relevancy:   >= 0.80  (useful answers)
  Context Precision:  >= 0.75  (efficient retrieval)
  Context Recall:     >= 0.80  (complete information)

Acceptable (development/testing):
---------------------------------
  Faithfulness:       >= 0.70
  Answer Relevancy:   >= 0.65
  Context Precision:  >= 0.60
  Context Recall:     >= 0.65

Actions by Metric:
------------------
Low Faithfulness:
  - Add "only use provided context" to prompts
  - Reduce temperature
  - Add citation requirements

Low Answer Relevancy:
  - Improve prompt clarity
  - Add examples to prompt
  - Consider query rewriting

Low Context Precision:
  - Add reranking stage
  - Improve chunking strategy
  - Use hybrid search

Low Context Recall:
  - Increase retrieval k
  - Use query expansion
  - Improve embedding model
"""

print(thresholds)