RAG BENCHMARK

Imports and Setup

In [None]:
# Probe for rapidfireai; only when the import fails are the packages installed.
try:
    import rapidfireai
    print(" rapidfireai installed")
except ImportError:
    # The leading `!` is IPython/Jupyter shell-escape syntax, so this cell is
    # only valid inside a notebook kernel, not as a plain Python script.
    !pip install rapidfireai datasets==3.6.0 langchain sentence-transformers PyPDF2
    # Invokes the rapidfireai command-line tool with `init --evals` right after install.
    !rapidfireai init --evals

In [None]:
import json
import math
import os
from pathlib import Path
from typing import List as listtype, Dict, Any
from collections import defaultdict

# Dataset and ML libraries
from datasets import load_dataset, Dataset
import pandas as pd

# LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader, JSONLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# RapidFire components
from rapidfireai import Experiment
from rapidfireai.automl import List, RFLangChainRagSpec, RFvLLMModelConfig, RFPromptManager, RFGridSearch

Configuration and Setup

In [None]:
# Run-wide settings: the RAGBench subset and split to evaluate, and the
# directory that receives every artifact written by later cells.
DATASET_NAME = "covidqa"  # Options: covidqa, cuad, finqa, hotpotqa, msmarco, etc.
DATASET_SPLIT = "train"
OUTPUT_DIR = Path("./ragbench_output")

# Opening banner.
print("=" * 80, "üöÄ RAGBench Evaluation Pipeline - Complete Setup", "=" * 80, sep="\n")

# Tolerates an existing directory; parent directories are not created.
OUTPUT_DIR.mkdir(exist_ok=True)

# Echo the effective settings so the notebook output records them.
print(
    f"\nüìã Configuration:",
    f"  Dataset: {DATASET_NAME}",
    f"  Split: {DATASET_SPLIT}",
    f"  Output Directory: {OUTPUT_DIR}",
    sep="\n",
)

<small>

================================================================================  
#### 🚀 RAGBench Evaluation Pipeline - Complete Setup  
================================================================================  

📋 Configuration:  
&nbsp;&nbsp;Dataset: covidqa  
&nbsp;&nbsp;Split: train  
&nbsp;&nbsp;Output Directory: ragbench_output  

</small>


Load RAG Bench Dataset

In [None]:
# Section banner.
print("\n" + "=" * 80, "üìÇ LOADING RAGBENCH DATASET", "=" * 80, sep="\n")

# Load the configured subset/split of the RAGBench dataset from the hub.
ragbench_dataset = load_dataset("rungalileo/ragbench", DATASET_NAME, split=DATASET_SPLIT)

print(f"‚úì Loaded {len(ragbench_dataset)} samples from {DATASET_NAME}")
print(f"\nDataset features: {list(ragbench_dataset.features.keys())}")

# Peek at the first record to show the fields the later cells rely on.
# Long text fields are truncated to 80 characters for display.
sample = ragbench_dataset[0]
print(
    f"\nüìã Sample 0 structure:",
    f"  - id: {sample['id']}",
    f"  - question: {sample['question'][:80]}...",
    f"  - documents: {len(sample['documents'])} documents",
    f"  - response: {sample['response'][:80]}...",
    f"  - documents_sentences: {len(sample['documents_sentences'])} document arrays",
    f"  - all_relevant_sentence_keys: {sample.get('all_relevant_sentence_keys', [])[:5]}",
    sep="\n",
)

<small>

================================================================================  
#### 📂 LOADING RAGBENCH DATASET  
================================================================================  

✓ Loaded 1252 samples from covidqa  

Dataset features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score']  

📋 Sample 0 structure:  
&nbsp;&nbsp;- id: 358  
&nbsp;&nbsp;- question: What role does T-cell count play in severe human adenovirus type 55 (HAdV-55) in...  
&nbsp;&nbsp;- documents: 4 documents  
&nbsp;&nbsp;- response: The T-cell count plays a crucial role in severe human adenovirus type 55 (HAdV-5...  
&nbsp;&nbsp;- documents_sentences: 4 document arrays  
&nbsp;&nbsp;- all_relevant_sentence_keys: ['0d', '0e', '0f', '1d', '1e']  

</small>


Create Corpus from RAG Bench

In [None]:
# Section banner for the corpus-building cell.
print("\n" + "=" * 80, "üî® CREATING CORPUS FROM RAGBENCH", "=" * 80, sep="\n")

def _split_sentence_entry(entry, doc_idx, sent_idx):
    """Return ``(sent_key, text)`` for one entry of ``documents_sentences``."""
    # RAGBench stores each sentence as a two-element [key, text] pair
    # (e.g. ['0a', 'Title: ...']); take both parts from the dataset itself.
    if isinstance(entry, (list, tuple)) and len(entry) == 2:
        key, text = entry
        return str(key), str(text)
    # Fallback for a bare sentence: derive the key from position, as before.
    return f"{doc_idx}{chr(97 + sent_idx)}", entry


def create_corpus_from_ragbench(dataset):
    """
    Convert RAGBench dataset to corpus format.

    Creates sentence-level chunks that match ground truth annotations.
    Each sentence gets a unique corpus_id and sent_key.

    Entries of ``documents_sentences`` that are ``[key, text]`` pairs are
    unpacked: ``text`` holds only the sentence string and ``sent_key`` is the
    key supplied by the dataset, which is what ``all_relevant_sentence_keys``
    refers to. Previously the whole pair was stored as ``text``, so the
    sentence key ended up inside the chunk content and every chunk had
    length 2.

    Args:
        dataset: HuggingFace Dataset from RAGBench (any sized iterable of
            dicts with 'id' and 'documents_sentences' works)

    Returns:
        corpus_dict: Dictionary mapping corpus_id to chunk info
        sample_to_corpus_mapping: Dictionary mapping sample_id to list of corpus_ids
    """
    corpus_dict = {}
    sample_to_corpus_mapping = {}

    print("  Processing samples...")

    total_samples = len(dataset)
    for sample_idx, sample in enumerate(dataset):
        # Progress line every 100 samples.
        if sample_idx % 100 == 0:
            print(f"    Processed {sample_idx}/{total_samples} samples")

        sample_id = sample['id']
        sample_corpus_ids = []

        # Process each document in the sample
        for doc_idx, doc_sentences in enumerate(sample['documents_sentences']):

            # Create a chunk for each sentence
            for sent_idx, sentence in enumerate(doc_sentences):
                # Positional ID, unique across the whole corpus
                corpus_id = f"{sample_id}_doc{doc_idx}_sent{sent_idx}"

                # RAGBench-style sentence key ("0a", "0b", "1a", ...) and the bare text
                sent_key, text = _split_sentence_entry(sentence, doc_idx, sent_idx)

                corpus_dict[corpus_id] = {
                    "text": text,
                    "sample_id": sample_id,
                    "doc_index": doc_idx,
                    "sent_index": sent_idx,
                    "sent_key": sent_key,
                    "_id": corpus_id,  # Add _id field for JSONLoader compatibility
                }

                sample_corpus_ids.append(corpus_id)

        sample_to_corpus_mapping[sample_id] = sample_corpus_ids

    print(f"  ‚úì Processed {total_samples} samples")
    return corpus_dict, sample_to_corpus_mapping

# Build the sentence-level corpus and the per-sample list of corpus IDs.
corpus_dict, sample_mapping = create_corpus_from_ragbench(ragbench_dataset)

print(
    f"\n‚úì Created corpus with {len(corpus_dict)} sentence-level chunks",
    f"  From {len(ragbench_dataset)} samples",
    f"  Average chunks per sample: {len(corpus_dict) / len(ragbench_dataset):.1f}",
    sep="\n",
)

# Length (via len()) of every chunk's 'text' field, for a quick sanity check.
sentence_lengths = [len(entry['text']) for entry in corpus_dict.values()]
print(
    f"\nChunk statistics:",
    f"  Mean length: {sum(sentence_lengths) / len(sentence_lengths):.1f} chars",
    f"  Min length: {min(sentence_lengths)} chars",
    f"  Max length: {max(sentence_lengths)} chars",
    sep="\n",
)

# Display the first corpus entry (dicts keep insertion order).
example_corpus_id = next(iter(corpus_dict))
example_chunk = corpus_dict[example_corpus_id]
print(
    f"\nüìã Example corpus entry:",
    f"  Corpus ID: {example_corpus_id}",
    f"  Sent Key: {example_chunk['sent_key']}",
    f"  Text: {example_chunk['text'][:100]}...",
    sep="\n",
)

<small>

================================================================================  
#### 🔨 CREATING CORPUS FROM RAGBENCH  
================================================================================  

&nbsp;&nbsp;Processing samples...  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 0/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 100/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 200/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 300/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 400/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 500/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 600/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 700/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 800/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 900/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1000/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1100/1252 samples  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1200/1252 samples  
&nbsp;&nbsp;✓ Processed 1252 samples  

✓ Created corpus with 23023 sentence-level chunks  
&nbsp;&nbsp;From 1252 samples  
&nbsp;&nbsp;Average chunks per sample: 18.4  

Chunk statistics:  
&nbsp;&nbsp;Mean length: 2.0 chars  
&nbsp;&nbsp;Min length: 2 chars  
&nbsp;&nbsp;Max length: 2 chars  

📋 Example corpus entry:  
&nbsp;&nbsp;Corpus ID: 358_doc0_sent0  
&nbsp;&nbsp;Sent Key: 0a  
&nbsp;&nbsp;Text: ['0a', 'Title: Emergent severe acute respiratory distress syndrome caused by adenovirus type 55 in immunocompetent adults in 2013: a prospective observational study']...  

</small>


Save corpus to JSONL

In [None]:
# Section banner.
print("\n" + "=" * 80, "üíæ SAVING CORPUS TO JSONL", "=" * 80, sep="\n")

corpus_file = OUTPUT_DIR / "corpus.jsonl"

# JSON Lines layout: one serialized chunk dict per line, in corpus_dict order.
with open(corpus_file, 'w') as f:
    f.writelines(json.dumps(chunk_info) + '\n' for chunk_info in corpus_dict.values())

print(f"‚úì Saved {len(corpus_dict)} corpus entries to {corpus_file}")

# Read the first record back as a quick round-trip check of the file.
with open(corpus_file, 'r') as f:
    first_line = json.loads(f.readline())
    print(
        f"\n  First line verification:",
        f"    _id: {first_line.get('_id')}",
        f"    text: {first_line.get('text')[:60]}...",
        sep="\n",
    )

<small>

================================================================================  
#### 💾 SAVING CORPUS TO JSONL  
================================================================================  

✓ Saved 23023 corpus entries to ragbench_output/corpus.jsonl  

&nbsp;&nbsp;First line verification:  
&nbsp;&nbsp;&nbsp;&nbsp;_id: 358_doc0_sent0  
&nbsp;&nbsp;&nbsp;&nbsp;text: ['0a', 'Title: Emergent severe acute respiratory distress syndrome caused by adenovirus type 55 in immunocompetent adults in 2013: a prospective observational study']...  

</small>


Create Queries and Qrels

In [None]:
# Section banner for the queries/QRELS cell.
print("\n" + "=" * 80, "üî® CREATING QUERIES AND QRELS", "=" * 80, sep="\n")

def create_queries_and_qrels(dataset, corpus_dict, sample_mapping):
    """
    Create queries and relevance judgments from RAGBench.

    Each sample yields one query plus one QRELS row per sentence of that
    sample: relevance 1 when the sentence's 'sent_key' is listed in the
    sample's 'all_relevant_sentence_keys', otherwise 0. Sentences belonging
    to other samples are never paired with the query.

    Args:
        dataset: HuggingFace Dataset from RAGBench (any sized iterable of
            dicts with 'id' and 'question' works)
        corpus_dict: Corpus dictionary; each value must carry 'sent_key'
        sample_mapping: Sample to corpus ID mapping

    Returns:
        queries_list: List of {"query_id", "query"} dictionaries
        qrels_rows: List of {"query_id", "corpus_id", "relevance"} entries
    """
    queries_list = []
    qrels_rows = []

    print("  Processing queries and QRELS...")

    total_samples = len(dataset)
    for sample_idx, sample in enumerate(dataset):
        # Progress line every 100 samples.
        if sample_idx % 100 == 0:
            print(f"    Processed {sample_idx}/{total_samples} queries")

        sample_id = sample['id']

        queries_list.append({
            "query_id": sample_id,
            "query": sample['question'],
        })

        # Relevant sentence keys (e.g. {"0a", "0d", "1b"}). A set gives O(1)
        # membership tests; a missing or None field means "nothing relevant".
        relevant_keys = set(sample.get('all_relevant_sentence_keys') or ())

        # Judge every sentence that belongs to this sample.
        for corpus_id in sample_mapping[sample_id]:
            is_relevant = corpus_dict[corpus_id]['sent_key'] in relevant_keys
            qrels_rows.append({
                "query_id": sample_id,
                "corpus_id": corpus_id,
                "relevance": 1 if is_relevant else 0
            })

    print(f"  ‚úì Processed {total_samples} queries")
    return queries_list, qrels_rows

# Derive the query list and relevance judgments from the loaded dataset.
queries_list, qrels_rows = create_queries_and_qrels(ragbench_dataset, corpus_dict, sample_mapping)

print(f"\n‚úì Created {len(queries_list)} queries")
print(f"‚úì Created {len(qrels_rows)} QRELS entries")

# Split the judgments into relevant (1) and everything else.
relevant_count = sum(row['relevance'] == 1 for row in qrels_rows)
irrelevant_count = len(qrels_rows) - relevant_count

print(
    f"\n  Class distribution:",
    f"    Relevant: {relevant_count} ({100*relevant_count/len(qrels_rows):.1f}%)",
    f"    Irrelevant: {irrelevant_count} ({100*irrelevant_count/len(qrels_rows):.1f}%)",
    sep="\n",
)

# Count relevant sentences per query. Queries with no relevant sentence never
# get a key here, so the average/min/max below cover only queries that have
# at least one relevant sentence.
relevant_per_query = defaultdict(int)
for qrel in (row for row in qrels_rows if row['relevance'] == 1):
    relevant_per_query[qrel['query_id']] += 1

avg_relevant = sum(relevant_per_query.values()) / len(relevant_per_query)
print(
    f"\n  Average relevant sentences per query: {avg_relevant:.1f}",
    f"  Min relevant: {min(relevant_per_query.values())}",
    f"  Max relevant: {max(relevant_per_query.values())}",
    sep="\n",
)

<small>

================================================================================  
#### 🔨 CREATING QUERIES AND QRELS  
================================================================================  

&nbsp;&nbsp;Processing queries and QRELS...  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 0/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 100/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 200/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 300/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 400/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 500/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 600/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 700/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 800/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 900/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1000/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1100/1252 queries  
&nbsp;&nbsp;&nbsp;&nbsp;Processed 1200/1252 queries  
&nbsp;&nbsp;✓ Processed 1252 queries  

✓ Created 1252 queries  
✓ Created 23023 QRELS entries  

&nbsp;&nbsp;Class distribution:  
&nbsp;&nbsp;&nbsp;&nbsp;Relevant: 6422 (27.9%)  
&nbsp;&nbsp;&nbsp;&nbsp;Irrelevant: 16601 (72.1%)  

&nbsp;&nbsp;Average relevant sentences per query: 5.3  
&nbsp;&nbsp;Min relevant: 1  
&nbsp;&nbsp;Max relevant: 20  

</small>


Create Dataframes

In [None]:
# Section banner.
print("\n" + "=" * 80, "üìä CREATING DATAFRAMES", "=" * 80, sep="\n")

# Queries table: both columns coerced to whitespace-stripped strings so that
# IDs compare equal regardless of their original dtype.
queries_df = pd.DataFrame(queries_list).assign(
    query_id=lambda frame: frame['query_id'].astype(str).str.strip(),
    query=lambda frame: frame['query'].astype(str).str.strip(),
)

print(f"‚úì Queries DataFrame: {len(queries_df)} rows")
print(f"  Columns: {queries_df.columns.tolist()}")

# QRELS table: string IDs normalized the same way, relevance forced to int.
qrels_df = pd.DataFrame(qrels_rows).assign(
    query_id=lambda frame: frame['query_id'].astype(str).str.strip(),
    corpus_id=lambda frame: frame['corpus_id'].astype(str).str.strip(),
    relevance=lambda frame: frame['relevance'].astype(int),
)

print(f"‚úì QRELS DataFrame: {len(qrels_df)} rows")
print(f"  Columns: {qrels_df.columns.tolist()}")

# Persist both tables next to the corpus for later reference.
for frame, filename in ((queries_df, "queries.csv"), (qrels_df, "qrels.csv")):
    frame.to_csv(OUTPUT_DIR / filename, index=False)
print(f"\n‚úì Saved queries and QRELS to {OUTPUT_DIR}")

<small>

================================================================================  
#### 📊 CREATING DATAFRAMES  
================================================================================  

✓ Queries DataFrame: 1252 rows  
&nbsp;&nbsp;Columns: ['query_id', 'query']  

✓ QRELS DataFrame: 23023 rows  
&nbsp;&nbsp;Columns: ['query_id', 'corpus_id', 'relevance']  

✓ Saved queries and QRELS to ragbench_output  

</small>


Validation Checks

In [None]:
# Section banner.
print("\n" + "=" * 80, "‚úÖ VALIDATION CHECKS", "=" * 80, sep="\n")

# (description, outcome) pairs; every outcome is evaluated eagerly right here.
checks = [
    ("Corpus not empty", bool(corpus_dict)),
    ("Queries not empty", len(queries_df) > 0),
    ("QRELS not empty", len(qrels_df) > 0),
    # Referential integrity between the QRELS table and its two sources.
    ("All query_ids in QRELS exist in queries",
     set(qrels_df['query_id']) <= set(queries_df['query_id'])),
    ("All corpus_ids in QRELS exist in corpus",
     set(qrels_df['corpus_id']) <= set(corpus_dict)),
    ("Class balance reasonable (10-90% relevant)",
     0.10 < relevant_count/len(qrels_rows) < 0.90),
    ("Corpus file exists", corpus_file.exists()),
]

# Report each check, then summarize.
for check_name, result in checks:
    print(f"  {'‚úì' if result else '‚úó'} {check_name}")

all_passed = all(outcome for _, outcome in checks)

print(
    "\n‚úÖ All validation checks passed!"
    if all_passed
    else "\n‚ö†Ô∏è  Some checks failed - review before proceeding"
)

<small>

================================================================================  
#### ✅ VALIDATION CHECKS  
================================================================================  

&nbsp;&nbsp;✓ Corpus not empty  
&nbsp;&nbsp;✓ Queries not empty  
&nbsp;&nbsp;✓ QRELS not empty  
&nbsp;&nbsp;✓ All query_ids in QRELS exist in queries  
&nbsp;&nbsp;✓ All corpus_ids in QRELS exist in corpus  
&nbsp;&nbsp;✓ Class balance reasonable (10-90% relevant)  
&nbsp;&nbsp;✓ Corpus file exists  

✅ All validation checks passed!  

</small>


RAG Configuration

In [None]:
print("\n" + "="*80)
print("‚öôÔ∏è  CONFIGURING RAG PIPELINE")
print("="*80)

batch_size = 32

# Named once so the configuration and the summary printed at the end of this
# cell cannot drift apart (the summary used to claim "top-5" retrieval while
# the search actually fetched k=10 and only the reranker kept 5).
EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"
RERANKER_MODEL = "BAAI/bge-reranker-base"
RETRIEVAL_K = 10   # candidates fetched by the similarity search
RERANK_TOP_N = 5   # candidates kept after reranking


def _corpus_metadata(record, metadata):
    """Build Document metadata from one corpus.jsonl record.

    The loader-supplied ``metadata`` argument is intentionally not merged in:
    only the fields needed for evaluation are kept.
    """
    return {
        "corpus_id": str(record.get("_id", "")).strip(),
        "sample_id": str(record.get("sample_id", "")).strip(),
        "sent_key": str(record.get("sent_key", "")).strip(),
        "doc_index": record.get("doc_index", 0),
        "sent_index": record.get("sent_index", 0),
    }


rag_config = RFLangChainRagSpec(
    # Load the sentence-level corpus written earlier, one Document per JSONL line.
    document_loader=DirectoryLoader(
        path=str(OUTPUT_DIR),
        glob="corpus.jsonl",
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "text",
            "metadata_func": _corpus_metadata,
            "json_lines": True,
            "text_content": False,
        },
    ),

    # CRITICAL: Use large chunk_size to prevent re-chunking sentences
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=100000,  # Much larger than sentence length
        chunk_overlap=0,
    ),

    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs={
        "model_name": EMBEDDING_MODEL,
        "model_kwargs": {"device": "cuda:0"},
        "encode_kwargs": {
            "normalize_embeddings": True,
            "batch_size": batch_size
        },
    },

    vector_store=None,
    search_type="similarity",
    search_kwargs={"k": RETRIEVAL_K},  # Retrieve top-k sentences

    # Optional: Add reranker for better results
    reranker_cls=CrossEncoderReranker,
    reranker_kwargs={
        "model_name": RERANKER_MODEL,
        "model_kwargs": {"device": "cuda:0"},
        "top_n": RERANK_TOP_N,  # Keep top-n after reranking
    },

    enable_gpu_search=True,
)


print("‚úì RAG configuration created")
print(f"  Embedding model: {EMBEDDING_MODEL}")
print(f"  Retrieval: top-{RETRIEVAL_K} similarity search, reranked to top-{RERANK_TOP_N}")
print(f"  Reranker: {RERANKER_MODEL}")


<small>

================================================================================  
#### ⚙️  CONFIGURING RAG PIPELINE  
================================================================================  

✓ RAG configuration created  
&nbsp;&nbsp;Embedding model: BAAI/bge-base-en-v1.5  
&nbsp;&nbsp;Retrieval: top-5 similarity search  
&nbsp;&nbsp;Reranker: BAAI/bge-reranker-base  

</small>


Preprocessing and Postprocessing Functions

In [None]:
# Section banner for the preprocessing/postprocessing cell.
print("\n" + "=" * 80, "üîß DEFINING PREPROCESSING AND POSTPROCESSING", "=" * 80, sep="\n")

def preprocess_fn(
    batch: Dict[str, List],
    rag: RFLangChainRagSpec,
    prompt_manager: RFPromptManager
) -> Dict[str, List]:
    """
    Turn a batch of queries into chat prompts grounded in retrieved context.

    Steps:
    1. Normalize each query to a stripped string
    2. Retrieve context documents for every query via ``rag.get_context``
    3. Collect the ``corpus_id`` metadata of each retrieved document
    4. Serialize the context and build a system + user message pair per query

    Args:
        batch: Batch of columns; must contain "query"
        rag: RAG specification used for retrieval and serialization
        prompt_manager: Prompt manager (unused here)

    Returns:
        Dictionary with "prompts", "retrieved_documents" and every column of
        the incoming batch copied as a list (batch columns are applied last,
        so a batch column with one of those two names would replace it)
    """
    system_text = (
        f"You are an expert assistant for {DATASET_NAME} questions. "
        "Answer questions based on the provided context documents. "
        "Be precise, cite specific information, and reference the source when possible."
    )

    # Clean query strings, used both for retrieval and in the prompt.
    queries = [str(raw).strip() for raw in batch["query"]]

    # Retrieval returns one group of documents per query.
    contexts = rag.get_context(batch_queries=queries, serialize=False)

    # Keep the retrieved corpus IDs for scoring against the ground truth.
    corpus_ids = []
    for docs in contexts:
        corpus_ids.append(
            [str(doc.metadata.get("corpus_id", "")).strip() for doc in docs]
        )

    # Text form of the same context, one entry per query.
    serialized = rag.serialize_documents(contexts)

    prompts = []
    for question, context in zip(queries, serialized):
        user_text = f"Context Documents:\n{context}\n\nQuestion: {question}\n\nAnswer:"
        prompts.append([
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ])

    result = {"prompts": prompts, "retrieved_documents": corpus_ids}
    # Carry the original columns through unchanged (materialized as lists).
    result.update((column, list(values)) for column, values in batch.items())
    return result

def postprocess_fn(batch: Dict[str, List]) -> Dict[str, List]:
    """
    Attach the ground-truth corpus IDs for each query in the batch.

    Looks up the module-level ``qrels_df`` and, per query, keeps only the
    rows judged relevant (relevance == 1).

    Args:
        batch: Batch with a "query_id" column; modified in place

    Returns:
        The same batch object with "ground_truth_documents" added
    """
    def relevant_ids(query_id):
        # IDs in qrels_df are stripped strings, so normalize the same way.
        key = str(query_id).strip()
        wanted = (qrels_df["query_id"] == key) & (qrels_df["relevance"] == 1)
        return qrels_df.loc[wanted, "corpus_id"].tolist()

    batch["ground_truth_documents"] = [relevant_ids(qid) for qid in batch["query_id"]]
    return batch

# Confirm both hooks are defined.
print("‚úì Preprocessing function defined", "‚úì Postprocessing function defined", sep="\n")


<small>

================================================================================  
#### 🔧 DEFINING PREPROCESSING AND POSTPROCESSING  
================================================================================  

✓ Preprocessing function defined  
✓ Postprocessing function defined  

</small>


Evaluation Metrics

In [None]:
# Section banner for the metrics cell.
print("\n" + "=" * 80, "üìè DEFINING EVALUATION METRICS", "=" * 80, sep="\n")

def compute_metrics_fn(batch: Dict[str, List]) -> Dict[str, Dict[str, Any]]:
    """
    Compute RAG retrieval metrics for one batch.

    Metrics (each averaged over the batch):
    - Precision: Fraction of distinct retrieved chunks that are relevant
    - Recall: Fraction of relevant chunks that were retrieved
    - F1: Harmonic mean of precision and recall
    - NDCG@k: Normalized discounted cumulative gain, k = number retrieved
    - MRR: Reciprocal rank of the first relevant retrieved chunk
    - Hit Rate: At least one relevant chunk retrieved

    NOTE: queries without ground truth add nothing to the per-metric sums,
    but the divisor is ``len(batch["query"])``, so they effectively score 0
    on every metric rather than being excluded from the average.

    Args:
        batch: Batch with "query", "retrieved_documents" and
            "ground_truth_documents"

    Returns:
        Dictionary mapping "Total" and each metric name to {"value": ...}
    """
    precisions, recalls, f1s, ndcgs, mrrs, hits = [], [], [], [], [], []

    for pred, gt in zip(batch["retrieved_documents"], batch["ground_truth_documents"]):
        # Normalize IDs once; `ranked` keeps retrieval order for MRR/NDCG.
        ranked = [str(p).strip() for p in pred]
        expected = {str(g).strip() for g in gt}

        if not expected:
            # Nothing to score against (see NOTE in the docstring).
            continue

        predicted = set(ranked)
        tp = len(predicted & expected)

        precision = tp / len(predicted) if predicted else 0
        recall = tp / len(expected)
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

        # Hit rate: Did we retrieve at least one relevant document?
        hits.append(1 if tp > 0 else 0)

        # MRR: reciprocal of the 1-based rank of the first relevant document.
        rr = next(
            (1 / rank for rank, doc in enumerate(ranked, start=1) if doc in expected),
            0,
        )
        mrrs.append(rr)

        # NDCG@k with binary gains; the ideal ranking places every relevant
        # document (at most k of them) at the top.
        k = len(ranked)
        dcg = sum(1 / math.log2(pos + 2) for pos, doc in enumerate(ranked) if doc in expected)
        idcg = sum(1 / math.log2(pos + 2) for pos in range(min(k, len(expected))))
        ndcgs.append(dcg / idcg if idcg > 0 else 0)

    total = len(batch["query"])

    def mean(values):
        # Average over the whole batch; guards the empty-batch case.
        return sum(values) / total if total > 0 else 0

    return {
        "Total": {"value": total},
        "Precision": {"value": mean(precisions)},
        "Recall": {"value": mean(recalls)},
        "F1_Score": {"value": mean(f1s)},
        "NDCG@k": {"value": mean(ndcgs)},
        "MRR": {"value": mean(mrrs)},
        "Hit_Rate": {"value": mean(hits)},
    }

def accumulate_metrics_fn(
    aggregated_metrics: Dict[str, List]
) -> Dict[str, Dict[str, Any]]:
    """
    Combine per-batch metrics into run-level metrics.

    Every metric is the average of its per-batch values weighted by the
    batch's "Total", i.e. sum(value * total) / sum(total).

    Args:
        aggregated_metrics: Maps "Total" and each metric name to the list of
            per-batch {"value": ...} dicts, all in the same batch order

    Returns:
        "Total" with the summed count, plus one entry per metric carrying
        the weighted value and the "is_algebraic" / "value_range" fields
    """
    batch_sizes = [entry["value"] for entry in aggregated_metrics["Total"]]
    grand_total = sum(batch_sizes)

    def weighted_mean(name):
        # With nothing counted there is nothing to average; report 0.
        if not grand_total > 0:
            return 0
        weighted_sum = sum(
            item["value"] * size
            for item, size in zip(aggregated_metrics[name], batch_sizes)
        )
        return weighted_sum / grand_total

    combined = {"Total": {"value": grand_total}}
    for name in ("Hit_Rate", "Precision", "Recall", "F1_Score", "NDCG@k", "MRR"):
        combined[name] = {
            "value": weighted_mean(name),
            "is_algebraic": True,
            "value_range": (0, 1),
        }
    return combined

# Confirm the metric hooks are defined and list what they report.
print(
    "‚úì Metrics functions defined",
    "  Metrics: Precision, Recall, F1, NDCG@k, MRR, Hit Rate",
    sep="\n",
)

<small>

================================================================================  
#### 📏 DEFINING EVALUATION METRICS  
================================================================================  

✓ Metrics functions defined  
&nbsp;&nbsp;Metrics: Precision, Recall, F1, NDCG@k, MRR, Hit Rate  

</small>


vLLM Model Configuration

In [None]:
# Section banner.
print("\n" + "=" * 80, "ü§ñ CONFIGURING vLLM MODEL", "=" * 80, sep="\n")

# Generator settings: engine/model options, sampling options, and the RAG
# specification built earlier that supplies retrieval for this model.
vllm_config = RFvLLMModelConfig(
    model_config=dict(
        model="Qwen/Qwen2.5-0.5B-Instruct",
        dtype="half",
        gpu_memory_utilization=0.3,
        enforce_eager=True,
        max_model_len=4096,
        disable_log_stats=True,
        tensor_parallel_size=1,
        distributed_executor_backend="mp",
    ),
    sampling_params=dict(
        temperature=0.7,
        top_p=0.95,
        max_tokens=256,  # Reasonable length for answers
    ),
    rag=rag_config,
)

print(
    "‚úì vLLM configuration created",
    f"  Model: Qwen/Qwen2.5-0.5B-Instruct",
    f"  Max tokens: 256",
    f"  Temperature: 0.7",
    sep="\n",
)

<small>

================================================================================  
#### 🤖 CONFIGURING vLLM MODEL  
================================================================================  

✓ vLLM configuration created  
&nbsp;&nbsp;Model: Qwen/Qwen2.5-0.5B-Instruct  
&nbsp;&nbsp;Max tokens: 256  
&nbsp;&nbsp;Temperature: 0.7  

</small>


Experiment Configuration

In [None]:
# Section banner.
print("\n" + "=" * 80, "üî¨ CONFIGURING EXPERIMENT", "=" * 80, sep="\n")

# One configuration: the generator, the batch size, the four user hooks
# defined above, and the options for the online aggregation strategy.
config_set = dict(
    vllm_config=vllm_config,
    batch_size=4,  # Process 4 queries at a time
    preprocess_fn=preprocess_fn,
    postprocess_fn=postprocess_fn,
    compute_metrics_fn=compute_metrics_fn,
    accumulate_metrics_fn=accumulate_metrics_fn,
    online_strategy_kwargs=dict(
        strategy_name="normal",
        confidence_level=0.95,
        use_fpc=True,
    ),
)

# Wrap the configuration for the experiment runner.
config_group = RFGridSearch(config_set)

print(
    "‚úì Experiment configuration created",
    f"  Batch size: 4",
    f"  Confidence level: 95%",
    sep="\n",
)

<small>

================================================================================  
#### 🔬 CONFIGURING EXPERIMENT  
================================================================================  

✓ Experiment configuration created  
&nbsp;&nbsp;Batch size: 4  
&nbsp;&nbsp;Confidence level: 95%  

</small>


Convert to HuggingFace Dataset

In [None]:
# Section banner.
print("\n" + "=" * 80, "üì¶ PREPARING DATASET FOR EVALUATION", "=" * 80, sep="\n")

# The experiment runner is given a HuggingFace Dataset, so convert the
# queries table built earlier.
queries_dataset = Dataset.from_pandas(queries_df)

print(
    f"‚úì Created HuggingFace Dataset with {len(queries_dataset)} queries",
    f"  Features: {list(queries_dataset.features.keys())}",
    sep="\n",
)


<small>

================================================================================  
#### 📦 PREPARING DATASET FOR EVALUATION  
================================================================================  

✓ Created HuggingFace Dataset with 1252 queries  
&nbsp;&nbsp;Features: ['query_id', 'query']  

</small>


Run Experiment

In [None]:
print("\n" + "="*80)
print("üöÄ STARTING RAGBench EVALUATION")
print("="*80)

# Recap of everything prepared so far, printed before the long-running step.
print(f"\nüìä Evaluation Summary:")
print(f"  Dataset: {DATASET_NAME} ({DATASET_SPLIT} split)")
print(f"  Queries: {len(queries_dataset)}")
print(f"  Corpus: {len(corpus_dict)} sentence-level chunks")
print(f"  QRELS entries: {len(qrels_df)}")
print(f"  Relevant entries: {relevant_count} ({100*relevant_count/len(qrels_rows):.1f}%)")
print(f"  Model: Qwen/Qwen2.5-0.5B-Instruct")
print("="*80 + "\n")

# Create experiment
experiment = Experiment(
    experiment_name=f"ragbench-{DATASET_NAME}-evaluation",
    mode="evals",
)

# Run evaluation
print("üèÉ Running evaluation... (this may take several minutes)\n")

try:
    results = experiment.run_evals(
        config_group=config_group,
        dataset=queries_dataset,
        num_actors=1,
        num_shards=4,
        seed=42
    )
finally:
    # Always end the experiment, even when the run raises or is interrupted;
    # otherwise it stays open and the next run has to forcibly end it.
    experiment.end()

print("\n" + "="*80)
print("‚úÖ EVALUATION COMPLETE")
print("="*80)


<small>

================================================================================  
#### 🚀 STARTING RAGBench EVALUATION  
================================================================================  

#### 📊 Evaluation Summary:  
#### &nbsp;&nbsp;Dataset: covidqa (train split)  
#### &nbsp;&nbsp;Queries: 1252  
#### &nbsp;&nbsp;Corpus: 23023 sentence-level chunks  
#### &nbsp;&nbsp;QRELS entries: 23023  
#### &nbsp;&nbsp;Relevant entries: 6422 (27.9%)  
#### &nbsp;&nbsp;Model: Qwen/Qwen2.5-0.5B-Instruct  
================================================================================  

The previously running experiment ragbench-covidqa-evaluation_1 was forcibly ended. Created a new experiment 'ragbench-covidqa-evaluation_2' with Experiment ID: 3 at /content/rapidfireai/rapidfire_experiments/ragbench-covidqa-evaluation_2  
🌐 Google Colab detected. Ray dashboard URL: https://8855-gpu-t4-hm-1uzxkq7hu4p2i-c.europe-west4-1.prod.colab.dev  
🌐 Google Colab detected. Dispatcher URL: https://8851-gpu-t4-hm-1uzxkq7hu4p2i-c.europe-west4-1.prod.colab.dev  
🏃 Running evaluation... (this may take several minutes)  

</small>


<small>

### === Preprocessing RAG Sources ===

| RAG Source ID | Status   | Duration | Details    |
|---------------|---------|----------|------------|
| 1             | Complete | 98.6s    | FAISS, GPU |

---

### === Multi-Config Experiment Progress ===

| Run ID | Model                        | Status    | Progress | Conf. Interval | search_type | rag_k | top_n | chunk_size   | chunk_overlap | sampling_params                                      | model_config                                                                                                                         | Precision                    | Recall                       | MRR                          | Throughput | Total | Samples Processed | F1_Score                    | Hit_Rate                    | NDCG@k                     | Processing Time   | Samples Per Second | model_name                    | run_id |
|--------|------------------------------|----------|---------|----------------|------------|-------|-------|--------------|---------------|-----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|-------------------------------|-------------------------------|-------------------------------|------------|-------|------------------|-------------------------------|-------------------------------|-----------------------------|-----------------|-----------------|-------------------------------|--------|
| 1      | Qwen/Qwen2.5-0.5B-Instruct  | COMPLETED | 4/4     | 0.000          | similarity | 10.00 | 5.00  | 100000.00    | 0.0000        | {'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 256} | {'dtype': 'half', 'gpu_memory_utilization': 0.3, 'enforce_eager': True, 'max_model_len': 4096, 'disable_log_stats': True, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp'} | 11.93% [11.93%, 11.93%]     | 29.34% [29.34%, 29.34%]     | 39.02% [39.02%, 39.02%]     | 0.6/s      | 1,252 | 1,252           | 0.1545 [0.1545, 0.1545]      | 0.6526 [0.6526, 0.6526]      | 0.2622 [0.2622, 0.2622]    | 2206.19 seconds | 0.57            | Qwen/Qwen2.5-0.5B-Instruct  | 1.00   |

**Note:** Experiment `ragbench-covidqa-evaluation_2` ended

</small>


Analyze Results

In [None]:
# Section banner.
print("\n" + "=" * 80, "üìä RESULTS ANALYSIS", "=" * 80, sep="\n")

# Results are automatically displayed by RapidFire
# Additional analysis can be done here if needed

print("\n‚úÖ Pipeline complete! Check the results above.")
print(f"üìÅ Output files saved to: {OUTPUT_DIR}")
# List the artifacts written by the earlier cells.
print("\n".join(f"   - {filename}" for filename in ("corpus.jsonl", "queries.csv", "qrels.csv")))

# Closing banner.
print(
    "\n" + "=" * 80,
    "üéâ RAGBench Evaluation Pipeline Finished Successfully!",
    "=" * 80,
    sep="\n",
)

#### 📊 RESULTS ANALYSIS
================================================================================

✅ Pipeline complete! Check the results above.  
📁 Output files saved to: `ragbench_output`  
- `corpus.jsonl`  
- `queries.csv`  
- `qrels.csv`  

---

#### Metrics (Example from experiment)

- **Precision**: 11.93%   
- **Recall**: 29.34%  
- **MRR**: 39.02%  
- **F1 Score**: 0.1545   
- **Hit Rate**: 0.6526  
- **NDCG@k**: 0.2622 
- **Throughput**: 0.6/s  
- **Processing Time**: 2206.19 seconds  
- **Samples Per Second**: 0.57  

---

#### 🎉 RAGBench Evaluation Pipeline Finished Successfully!
================================================================================
