<a href="https://colab.research.google.com/github/Sayali19-cell/RAG-Evaluation/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import logging
from sklearn.metrics import precision_score
from nltk.translate.bleu_score import sentence_bleu

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("RAG Evaluation")

ground_truth_docs = {"query1": "This is a relevant document.", "query2": "Another important document."}
retrieved_docs = {"query1": "This is a relevant document.", "query2": "An unrelated document."}

ground_truth_responses = {"query1": "The capital of France is Paris.", "query2": "Water boils at 100 degrees Celsius."}
generated_responses = {"query1": "The capital of France is Paris.", "query2": "Water boils at 150 degrees Celsius."}

# 1. Retrieval Evaluation (Precision)
def evaluate_retrieval(ground_truth_docs, retrieved_docs):
    """
    Evaluates retrieval precision.
    """
    true_positive = sum(1 for q in ground_truth_docs if ground_truth_docs[q] == retrieved_docs[q])
    total_retrieved = len(retrieved_docs)
    precision = true_positive / total_retrieved if total_retrieved > 0 else 0
    logger.info(f"Retrieval Precision: {precision:.2f}")
    return precision

# 2. Generation Evaluation (BLEU Score)
def evaluate_generation(ground_truth_responses, generated_responses):
    """
    Evaluates generation quality using BLEU scores.
    """
    bleu_scores = []
    for query, true_response in ground_truth_responses.items():
        generated = generated_responses.get(query, "")
        bleu = sentence_bleu([true_response.split()], generated.split())
        bleu_scores.append(bleu)
        logger.info(f"Query: {query}, BLEU Score: {bleu:.2f}")
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    logger.info(f"Average BLEU Score: {avg_bleu:.2f}")
    return avg_bleu

def main():
    logger.info("Starting RAG Evaluation...")
    retrieval_precision = evaluate_retrieval(ground_truth_docs, retrieved_docs)
    generation_bleu = evaluate_generation(ground_truth_responses, generated_responses)

    logger.info("RAG Evaluation Completed.")
    logger.info(f"Final Results - Retrieval Precision: {retrieval_precision:.2f}, Generation BLEU Score: {generation_bleu:.2f}")

if __name__ == "__main__":
    main()


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [11]:
!python3 /content/RAG.py


INFO:RAG Evaluation:Starting RAG Evaluation...
INFO:RAG Evaluation:Retrieval Precision: 0.50
INFO:RAG Evaluation:Query: query1, BLEU Score: 1.00
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
INFO:RAG Evaluation:Query: query2, BLEU Score: 0.00
INFO:RAG Evaluation:Average BLEU Score: 0.50
INFO:RAG Evaluation:RAG Evaluation Completed.
INFO:RAG Evaluation:Final Results - Retrieval Precision: 0.50, Generation BLEU Score: 0.50
