Install & setup

In [1]:
!pip install PyPDF2



In [None]:
try:
    import rapidfireai
    print(" rapidfireai installed")
except ImportError:
    !pip install rapidfireai datasets==3.6.0 langchain sentence-transformers PyPDF2
    !rapidfireai init --evals

‚úÖ rapidfireai installed


Imports

In [3]:
import os
import math
import json
import pandas as pd
from pathlib import Path
from typing import List as listtype, Dict, Any
from datasets import Dataset
from collections import defaultdict
import re
import PyPDF2

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from rapidfireai import Experiment
from rapidfireai.automl import List, RFLangChainRagSpec, RFvLLMModelConfig, RFPromptManager, RFGridSearch

from langchain_community.document_loaders import DirectoryLoader, JSONLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker

DOCUGAMI FINANCIAL QA BENCHMARK

Setup and Load Dataset

In [4]:
# Directories
dataset_dir = Path("./financial_rag_benchmark")
dataset_dir.mkdir(exist_ok=True)

pdf_dir = Path("./pdfs")  # Directory containing your PDFs like "2022 Q3 AAPL.pdf"
# pdf_dir.mkdir(exist_ok=True)

print(" Loading Docugami Financial QA Benchmark...")

# Load your CSV file with the QA data
csv_file_path = "qna_data.csv"  # UPDATE THIS PATH

# If you need to upload:
# from google.colab import files
# uploaded = files.upload()
# csv_file_path = list(uploaded.keys())[0]

df = pd.read_csv(csv_file_path)

print(f" Loaded {len(df)} questions from benchmark")
print(f"\nDataset columns: {df.columns.tolist()}")

# Show sample
print("\n Sample question:")
print(df.head(1).to_dict('records'))

 Loading Docugami Financial QA Benchmark...
 Loaded 195 questions from benchmark

Dataset columns: ['Question', 'Source Docs', 'Question Type', 'Source Chunk Type', 'Answer']

 Sample question:
[{'Question': "How has Apple's total net sales changed over time?", 'Source Docs': '*AAPL*', 'Question Type': 'Multi-Doc RAG', 'Source Chunk Type': 'Table', 'Answer': "Based on the provided documents, Apple's total net sales have changed over time as follows:\n\n- For the quarterly period ended June 25, 2022, the total net sales were $82,959 million. (SOURCE: 2022 Q3 AAPL.pdf)\n- For the quarterly period ended December 31, 2022, the total net sales were $117,154 million. (SOURCE: 2023 Q1 AAPL.pdf)\n- For the quarterly period ended April 1, 2023, the total net sales were $94,836 million. (SOURCE: 2023 Q2 AAPL.pdf)\n- For the quarterly period ended July 1, 2023, the total net sales were $81,797 million. (SOURCE: 2023 Q3 AAPL.pdf)\n\nFrom these figures, it can be observed that there was an increase

Process PDF Documents into chunks

In [5]:
print("\nüî® Processing PDF documents into chunks...")

def extract_pdf_text(pdf_path):
    """Extract text from PDF file"""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                text += f"\n[Page {page_num + 1}]\n{page_text}"
            return text
    except Exception as e:
        print(f"    Error reading {pdf_path}: {e}")
        return ""

def parse_source_docs(source_docs_str):
    """
    Parse source docs string like '*2023 Q3 AAPL*' or '*MSFT*'
    Returns list of document identifiers
    """
    if pd.isna(source_docs_str) or not source_docs_str:
        return []

    # Remove asterisks and split by comma
    cleaned = source_docs_str.replace('*', '').strip()

    # Split by comma if multiple sources
    sources = [s.strip() for s in cleaned.split(',')]

    return sources

def match_pdf_files(source_pattern, pdf_files):
    """
    Match PDF files based on source pattern
    e.g., '*MSFT*' matches all MSFT files
    e.g., '*2023 Q3 AAPL*' matches that specific file
    """
    matched_files = []

    for pdf_file in pdf_files:
        pdf_name = pdf_file.stem  # Filename without extension

        for pattern in source_pattern:
            # Check if pattern matches
            if pattern in pdf_name:
                matched_files.append(pdf_file)
                break

    return matched_files

# Get all PDF files
if pdf_dir.exists():
    pdf_files = list(pdf_dir.glob("*.pdf"))
    print(f"   Found {len(pdf_files)} PDF files")
else:
    print(f"    PDF directory not found: {pdf_dir}")
    print(f"   Please create the directory and add your PDF files")
    pdf_files = []

# Create corpus from PDFs
corpus_dict = {}  # doc_id -> {text, metadata}
doc_to_source_mapping = {}  # Maps doc_id to source PDF name
doc_counter = 0

# Process each PDF
for pdf_file in pdf_files:
    print(f"   Processing: {pdf_file.name}")

    # Extract text
    full_text = extract_pdf_text(pdf_file)

    if not full_text or len(full_text) < 100:
        print(f"    Skipping {pdf_file.name} - insufficient text")
        continue

    # Split into chunks using RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Larger chunks for financial documents
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    chunks = text_splitter.split_text(full_text)

    # Create documents for each chunk
    source_name = pdf_file.stem  # e.g., "2023 Q3 AAPL"

    for chunk_idx, chunk_text in enumerate(chunks):
        if len(chunk_text.strip()) < 50:  # Skip very small chunks
            continue

        doc_id = f"doc_{doc_counter}"
        doc_counter += 1

        # Store chunk with metadata
        corpus_dict[doc_id] = {
            "text": chunk_text.strip(),
            "source_file": source_name,
            "chunk_index": chunk_idx,
            "total_chunks": len(chunks)
        }

        # Map doc_id to source
        doc_to_source_mapping[doc_id] = source_name

    print(f"   ‚úì Created {len(chunks)} chunks from {pdf_file.name}")

print(f"\n Total corpus documents (chunks): {len(corpus_dict)}")



üî® Processing PDF documents into chunks...
   Found 20 PDF files
   Processing: 2023 Q2 MSFT.pdf
   ‚úì Created 290 chunks from 2023 Q2 MSFT.pdf
   Processing: 2022 Q3 NVDA.pdf
   ‚úì Created 183 chunks from 2022 Q3 NVDA.pdf
   Processing: 2023 Q2 INTC.pdf
   ‚úì Created 178 chunks from 2023 Q2 INTC.pdf
   Processing: 2023 Q1 NVDA.pdf
   ‚úì Created 187 chunks from 2023 Q1 NVDA.pdf
   Processing: 2022 Q3 AMZN.pdf
   ‚úì Created 229 chunks from 2022 Q3 AMZN.pdf
   Processing: 2023 Q3 AMZN.pdf
   ‚úì Created 233 chunks from 2023 Q3 AMZN.pdf
   Processing: 2023 Q1 MSFT.pdf
   ‚úì Created 290 chunks from 2023 Q1 MSFT.pdf
   Processing: 2022 Q3 AAPL.pdf
   ‚úì Created 89 chunks from 2022 Q3 AAPL.pdf
   Processing: 2023 Q2 AAPL.pdf
   ‚úì Created 88 chunks from 2023 Q2 AAPL.pdf
   Processing: 2023 Q1 AMZN.pdf
   ‚úì Created 221 chunks from 2023 Q1 AMZN.pdf
   Processing: 2023 Q3 AAPL.pdf
   ‚úì Created 87 chunks from 2023 Q3 AAPL.pdf
   Processing: 2023 Q2 AMZN.pdf
   ‚úì Created 227 chun

Create Queries and Qrels

In [6]:
print("\nüî® Creating queries and relevance judgments (QRELS)...")

queries_list = []
qrels_rows = []

# Track question types
question_type_counts = defaultdict(int)

for i, row in df.iterrows():
    if i % 50 == 0:
        print(f"   Processing question {i}/{len(df)}...")

    query_id = f"q_{i}"
    question = str(row['Question']).strip()
    source_docs_str = str(row['Source Docs']).strip()
    question_type = str(row.get('Question Type', 'Unknown')).strip()

    # Parse source documents
    source_patterns = parse_source_docs(source_docs_str)

    if not source_patterns:
        print(f"    Skipping question {i} - no source docs")
        continue

    # Add query
    queries_list.append({
        "query_id": query_id,
        "query": question,
        "question_type": question_type
    })

    # Track question type
    question_type_counts[question_type] += 1

    # Find all relevant document chunks (relevance=1)
    relevant_doc_ids = []

    for doc_id, doc_info in corpus_dict.items():
        source_file = doc_info["source_file"]

        # Check if this chunk's source file matches any of the source patterns
        is_relevant = False
        for pattern in source_patterns:
            if pattern in source_file:
                is_relevant = True
                break

        if is_relevant:
            relevant_doc_ids.append(doc_id)

    # Add relevant documents to QRELS
    for doc_id in relevant_doc_ids:
        qrels_rows.append({
            "query_id": query_id,
            "corpus_id": doc_id,
            "relevance": 1
        })

    # Add some irrelevant documents (relevance=0) for realistic evaluation
    # Sample from documents NOT in the relevant set
    irrelevant_doc_ids = [
        doc_id for doc_id in corpus_dict.keys()
        if doc_id not in relevant_doc_ids
    ]

    # Sample up to 10 irrelevant documents
    import random
    random.seed(42 + i)
    num_negatives = min(10, len(irrelevant_doc_ids))

    if num_negatives > 0:
        sampled_negatives = random.sample(irrelevant_doc_ids, num_negatives)

        for neg_doc_id in sampled_negatives:
            qrels_rows.append({
                "query_id": query_id,
                "corpus_id": neg_doc_id,
                "relevance": 0
            })

print(f"\n Queries created: {len(queries_list)}")
print(f"\n Question Type Distribution:")
for q_type, count in sorted(question_type_counts.items()):
    print(f"   {q_type}: {count}")



üî® Creating queries and relevance judgments (QRELS)...
   Processing question 0/195...
   Processing question 50/195...
   Processing question 100/195...
   Processing question 150/195...

 Queries created: 195

 Question Type Distribution:
   Multi-Doc RAG: 65
   Single-Doc Multi-Chunk RAG: 54
   Single-Doc Single-Chunk RAG: 76


Save Data and Create Dataframes

In [7]:
print("\n Saving corpus and creating dataframes...")

# Save corpus
corpus_list = [
    {
        "_id": doc_id,
        "text": info["text"],
        "source_file": info["source_file"],
        "chunk_index": info["chunk_index"]
    }
    for doc_id, info in corpus_dict.items()
]

corpus_file = dataset_dir / "corpus_sampled.jsonl"
with open(corpus_file, "w") as f:
    for doc in corpus_list:
        f.write(json.dumps(doc) + "\n")

print(f" Saved corpus to {corpus_file}")

# Create dataframes
queries_df = pd.DataFrame(queries_list)
queries_df['query_id'] = queries_df['query_id'].astype(str).str.strip()
queries_df['query'] = queries_df['query'].astype(str).str.strip()

qrels_df = pd.DataFrame(qrels_rows)
qrels_df['query_id'] = qrels_df['query_id'].astype(str).str.strip()
qrels_df['corpus_id'] = qrels_df['corpus_id'].astype(str).str.strip()
qrels_df['relevance'] = qrels_df['relevance'].astype(int)


 Saving corpus and creating dataframes...
 Saved corpus to financial_rag_benchmark/corpus_sampled.jsonl


Dataset Statistics and Verification

In [8]:
print("\n" + "="*70)
print(" DOCUGAMI FINANCIAL QA BENCHMARK SUMMARY")
print("="*70)
print(f" Total Queries: {len(queries_df)}")
print(f" Total Corpus Documents (chunks): {len(corpus_list)}")
print(f" Total QRELS entries: {len(qrels_df)}")
print(f" Relevant documents (relevance=1): {len(qrels_df[qrels_df['relevance'] == 1])}")
print(f" Irrelevant documents (relevance=0): {len(qrels_df[qrels_df['relevance'] == 0])}")

# Calculate statistics
docs_per_query = qrels_df.groupby('query_id').size()
print(f"\n Average documents per query: {docs_per_query.mean():.1f}")

relevant_per_query = qrels_df[qrels_df['relevance'] == 1].groupby('query_id').size()
print(f" Average relevant documents per query: {relevant_per_query.mean():.1f}")
print(f" Min relevant docs: {relevant_per_query.min()}, Max: {relevant_per_query.max()}")

irrelevant_per_query = qrels_df[qrels_df['relevance'] == 0].groupby('query_id').size()
print(f" Average irrelevant documents per query: {irrelevant_per_query.mean():.1f}")

# Show sample for each question type
print("\n" + "="*70)
print(" SAMPLE QUESTIONS BY TYPE")
print("="*70)

for q_type in question_type_counts.keys():
    sample_query = queries_df[queries_df['question_type'] == q_type].iloc[0]
    sample_qid = sample_query['query_id']

    print(f"\n{q_type}:")
    print(f"   Question: {sample_query['query'][:150]}...")

    sample_qrels = qrels_df[qrels_df['query_id'] == sample_qid]
    relevant_docs = sample_qrels[sample_qrels['relevance'] == 1]['corpus_id'].tolist()

    print(f"   Relevant chunks: {len(relevant_docs)}")

    # Show which source files these chunks come from
    source_files = set()
    for doc_id in relevant_docs[:5]:  # Show first 5
        source_file = corpus_dict[doc_id]["source_file"]
        source_files.add(source_file)

    print(f"   Source files: {', '.join(list(source_files)[:3])}...")

print("="*70 + "\n")




 DOCUGAMI FINANCIAL QA BENCHMARK SUMMARY
 Total Queries: 195
 Total Corpus Documents (chunks): 4008
 Total QRELS entries: 80431
 Relevant documents (relevance=1): 78481
 Irrelevant documents (relevance=0): 1950

 Average documents per query: 412.5
 Average relevant documents per query: 402.5
 Min relevant docs: 87, Max: 1162
 Average irrelevant documents per query: 10.0

 SAMPLE QUESTIONS BY TYPE

Multi-Doc RAG:
   Question: How has Apple's total net sales changed over time?...
   Relevant chunks: 423
   Source files: 2022 Q3 AAPL...

Single-Doc Single-Chunk RAG:
   Question: What significant changes, if any, in accounting practices were reported by NVIDIA in its most recent 10-Q?...
   Relevant chunks: 207
   Source files: 2023 Q3 NVDA...

Single-Doc Multi-Chunk RAG:
   Question: How does Microsoft's revenue distribution across its various business segments in the latest 10-Q compare to the cost of sales for those segments?...
   Relevant chunks: 259
   Source files: 2023 Q3 MSFT...


Define RAG search space (retrieval-focused)

In [9]:

batch_size = 50

rag_config = RFLangChainRagSpec(
    document_loader=DirectoryLoader(
        path=str(dataset_dir),
        glob="corpus_sampled.jsonl",
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "text",
            "metadata_func": lambda record, metadata: {
                "corpus_id": str(record.get("_id", "")).strip(),
                "source_file": str(record.get("source_file", "")).strip(),
                "chunk_index": record.get("chunk_index", 0)
            },
            "json_lines": True,
            "text_content": False,
        },
        sample_seed=42,
    ),
    # Use moderate chunking since PDFs are already pre-chunked
    text_splitter= RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128
    ),
    # List([
    #         RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    #             encoding_name="gpt2", chunk_size=512, chunk_overlap=128
    #         ),
    #         RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    #             encoding_name="gpt2", chunk_size=128, chunk_overlap=32
    #         ),
    #     ],
    # ),

    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs={
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {"device": "cuda:0"}, #"cpu"
        "encode_kwargs": {
            "normalize_embeddings": True,
            "batch_size": batch_size
        },
    },
    vector_store=None,
    search_type="similarity",
    search_kwargs={"k": 8},  # Retrieve more for multi-doc questions
    reranker_cls=CrossEncoderReranker,
    reranker_kwargs={
        "model_name": "cross-encoder/ms-marco-MiniLM-L6-v2",
        "model_kwargs": {"device": "cuda:0"}, #"cpu"
        "top_n": 5,
        # "top_n": List([3, 5]),  # Keep top 8 after reranking
    },
    enable_gpu_search=True,
)



### ‚ö†Ô∏è Important Note

Running the code cell below **may trigger a pickling error** when `experiment.run_evals()` is executed.  
This happens due to the invocation of `rag_config.build_index()`.

---

## Recommended Usage

##### To test document retrieval only
- Uncomment the code cell below.
- Run it to verify that the correct documents are being retrieved for your queries.
- After testing, **restart the session and runtime**.

##### To run RapidFireAI experiments
- Keep the code cell below **commented out**.
- Run the rest of the notebook as-is.
- This avoids pickling issues during `experiment.run_evals()`.

---

### Summary Workflow
1. *(Optional)* Uncomment and run the cell to test RAG document retrieval.
2. Restart the session and runtime.
3. Comment the cell again.
4. Run the full notebook to execute RapidFireAI experiments safely.


In [1]:
# print("\n" + "="*70)
# print(" TESTING RAG CONFIGURATION")
# print("="*70)

# print("\n Building FAISS index...")
# rag_config.build_index()
# print(" Index built successfully")

# # Check vector store
# if hasattr(rag_config, 'retriever') and rag_config.retriever:
#     if hasattr(rag_config.retriever, 'vectorstore'):
#         vs = rag_config.retriever.vectorstore
#         print(f" Vector store contains {vs.index.ntotal} vectors")

# # Test retrieval for different question types
# for q_type in ['Single-Doc RAG', 'Multi-Doc RAG']:
#     type_queries = queries_df[queries_df['question_type'].str.contains(q_type, na=False)]

#     if len(type_queries) == 0:
#         continue

#     test_query_id = type_queries.iloc[0]['query_id']
#     test_query = type_queries.iloc[0]['query']

#     print(f"\n Testing {q_type}:")
#     print(f"   Query: '{test_query[:100]}...'")

#     test_result = rag_config.get_context(batch_queries=[test_query], serialize=False)

#     if test_result and test_result[0]:
#         print(f"    Retrieved {len(test_result[0])} documents")

#         # Show top 3
#         retrieved_sources = set()
#         for i, doc in enumerate(test_result[0][:3]):
#             corpus_id = doc.metadata.get('corpus_id', 'MISSING')
#             source_file = doc.metadata.get('source_file', 'UNKNOWN')
#             content_preview = doc.page_content[:100].replace('\n', ' ')

#             retrieved_sources.add(source_file)

#             print(f"\n   Rank {i+1}:")
#             print(f"      Doc ID: {corpus_id}")
#             print(f"      Source: {source_file}")
#             print(f"      Content: '{content_preview}...'")

#         # Check relevance
#         retrieved_ids = [doc.metadata.get('corpus_id', '') for doc in test_result[0]]
#         expected_docs = qrels_df[
#             (qrels_df['query_id'] == test_query_id) &
#             (qrels_df['relevance'] == 1)
#         ]['corpus_id'].tolist()

#         matches = set(retrieved_ids).intersection(set(expected_docs))

#         # Get expected sources
#         expected_sources = set()
#         for doc_id in expected_docs[:10]:
#             if doc_id in corpus_dict:
#                 expected_sources.add(corpus_dict[doc_id]["source_file"])

#         print(f"\n    Relevance Check:")
#         print(f"      Expected relevant chunks: {len(expected_docs)}")
#         print(f"      Expected sources: {', '.join(list(expected_sources)[:3])}")
#         print(f"      Retrieved relevant chunks: {len(matches)}")
#         print(f"      Retrieved sources: {', '.join(list(retrieved_sources))}")
#         print(f"      Precision: {len(matches)}/{len(retrieved_ids)} = {len(matches)/len(retrieved_ids):.2%}")

#         if matches:
#             print(f"       Successfully retrieved relevant documents!")
#         else:
#             print(f"       No relevant documents in top results")

# print("="*70 + "\n")

<small>

**Building FAISS index‚Ä¶**  

/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab
(https://huggingface.co/settings/tokens), set it as secret in your Google Colab
and restart your session.
Authentication is recommended but optional for public models or datasets.


**Index built successfully**  
Vector store contains **10,355 vectors**

---

### Testing Multi-Doc RAG

**Query:**  
> *How has Apple's total net sales changed over time?*

**Retrieved:** 5 documents

**Rank 1**
- **Doc ID:** doc_1733  
- **Source:** 2023 Q2 AAPL  
- **Content:** Services 20,907 ‚Üí 19,821 (+5%), Total net sales  
  \$94,836 ‚Üí \$97,278 (-3%), \$211,990 ‚Üí \$221,2‚Ä¶

**Rank 2**
- **Doc ID:** doc_2042  
- **Source:** 2023 Q3 AAPL  
- **Content:** Lower net sales of Mac. Apple Inc. | Q3 2023 Form 10-Q | Page 19‚Ä¶

**Rank 3**
- **Doc ID:** doc_1732  
- **Source:** 2023 Q2 AAPL  
- **Content:** Higher net sales of iPhone, offset by lower Mac sales‚Ä¶

---

### Relevance Check

- **Expected relevant chunks:** 423  
- **Expected sources:** 2022 Q3 AAPL  
- **Retrieved relevant chunks:** 5  
- **Retrieved sources:** 2023 Q2 AAPL, 2023 Q3 AAPL  

**Precision:**  
> **5 / 5 = 100.00%** ‚úÖ  
Successfully retrieved relevant documents!

</small>

Preprocess (retrieval-only focus)

In [10]:
def sample_preprocess_fn(
    batch: Dict[str, listtype],
    rag: RFLangChainRagSpec,
    prompt_manager: RFPromptManager
) -> Dict[str, listtype]:
    """Prepare inputs for the generator model"""

    INSTRUCTIONS = (
        "You are a financial analyst assistant. Answer questions about tech company "
        "financial performance based on quarterly 10-Q reports. Be precise, cite specific "
        "figures, and reference the source documents when possible."
    )

    # Ensure queries are clean strings
    batch_queries = [str(q).strip() for q in batch["query"]]

    # Perform retrieval
    all_context = rag.get_context(batch_queries=batch_queries, serialize=False)

    # Extract corpus IDs
    retrieved_documents = [
        [str(doc.metadata.get("corpus_id", "")).strip() for doc in docs]
        for docs in all_context
    ]

    # Serialize context
    serialized_context = rag.serialize_documents(all_context)

    return {
        "prompts": [
            [
                {"role": "system", "content": INSTRUCTIONS},
                {
                    "role": "user",
                    "content": f"Financial Report Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
                },
            ]
            for question, context in zip(batch_queries, serialized_context)
        ],
        "retrieved_documents": retrieved_documents,
        **{k: list(v) for k, v in batch.items()},
    }


Postprocess (attach ground truth)

In [11]:
def sample_postprocess_fn(batch: Dict[str, listtype]) -> Dict[str, listtype]:
    """Add ground truth documents to batch"""

    gt_docs = []
    for qid in batch["query_id"]:
        target_qid = str(qid).strip()
        # Get only RELEVANT documents (relevance = 1)
        relevant = qrels_df[
            (qrels_df["query_id"] == target_qid) &
            (qrels_df["relevance"] == 1)
        ]["corpus_id"].tolist()
        gt_docs.append(relevant)

    batch["ground_truth_documents"] = gt_docs
    return batch

Metrics (Precision / Recall / MRR / NDCG)

In [12]:
def compute_ndcg_at_k(retrieved_docs, expected_docs, k=5):
    """Compute NDCG@k for ranked retrieval"""
    relevance = [1 if doc in expected_docs else 0 for doc in list(retrieved_docs)[:k]]
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(relevance))

    ideal_length = min(k, len(expected_docs))
    idcg = sum(1 / math.log2(i + 2) for i in range(ideal_length))

    return dcg / idcg if idcg > 0 else 0.0


def sample_compute_metrics_fn(batch: Dict[str, listtype]) -> Dict[str, Dict[str, Any]]:
    """Compute retrieval metrics per batch"""

    precisions, recalls, f1s, ndcgs, rrs, hits = [], [], [], [], [], []
    total_queries = len(batch["query"])

    for pred, gt in zip(batch["retrieved_documents"], batch["ground_truth_documents"]):
        # Ensure string type and strip
        actual = set(str(p).strip() for p in pred)
        expected = set(str(g).strip() for g in gt)

        if not expected:
            precisions.append(0)
            recalls.append(0)
            f1s.append(0)
            ndcgs.append(0)
            rrs.append(0)
            hits.append(0)
            continue

        # True positives
        tp = len(actual.intersection(expected))

        # Metrics
        precision = tp / len(actual) if actual else 0
        precisions.append(precision)

        recall = tp / len(expected) if expected else 0
        recalls.append(recall)

        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0
        f1s.append(f1)

        ndcg = compute_ndcg_at_k(pred, expected, k=8)  # k=8 since we retrieve 8
        ndcgs.append(ndcg)

        hit = 1 if tp > 0 else 0
        hits.append(hit)

        rr = 0
        for j, p in enumerate(pred):
            if str(p).strip() in expected:
                rr = 1 / (j + 1)
                break
        rrs.append(rr)

    return {
        "Total": {"value": total_queries},
        "Hit_Rate": {"value": sum(hits) / total_queries if total_queries > 0 else 0},
        "Precision": {"value": sum(precisions) / total_queries if total_queries > 0 else 0},
        "Recall": {"value": sum(recalls) / total_queries if total_queries > 0 else 0},
        "F1_Score": {"value": sum(f1s) / total_queries if total_queries > 0 else 0},
        "NDCG@8": {"value": sum(ndcgs) / total_queries if total_queries > 0 else 0},
        "MRR": {"value": sum(rrs) / total_queries if total_queries > 0 else 0},
    }


def sample_accumulate_metrics_fn(
    aggregated_metrics: Dict[str, listtype]
) -> Dict[str, Dict[str, Any]]:
    """Accumulate metrics across all batches"""

    num_queries_per_batch = [m["value"] for m in aggregated_metrics["Total"]]
    total_queries = sum(num_queries_per_batch)
    metrics = ["Hit_Rate", "Precision", "Recall", "F1_Score", "NDCG@8", "MRR"]

    return {
        "Total": {"value": total_queries},
        **{
            m: {
                "value": sum(
                    v["value"] * queries
                    for v, queries in zip(aggregated_metrics[m], num_queries_per_batch)
                ) / total_queries if total_queries > 0 else 0,
                "is_algebraic": True,
                "value_range": (0, 1),
            }
            for m in metrics
        }
    }

vLLM Config + GridSearch

In [13]:
print(" Configuring vLLM model...")

vllm_config = RFvLLMModelConfig(
    model_config={
        "model": "Qwen/Qwen2.5-0.5B-Instruct",
        "dtype": "half",
        "gpu_memory_utilization": 0.25,
        "enforce_eager": True,
        "max_model_len": 4096,  # Longer for multi-doc questions
        "disable_log_stats": True,
        "tensor_parallel_size": 1,
        "distributed_executor_backend": "mp",
    },
    sampling_params={
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 512,  # Longer answers for complex financial questions
    },
    rag=rag_config,
)

config_set = {
    "vllm_config": vllm_config,
    "batch_size": 3,  # Smaller batch for longer contexts
    "preprocess_fn": sample_preprocess_fn,
    "postprocess_fn": sample_postprocess_fn,
    "compute_metrics_fn": sample_compute_metrics_fn,
    "accumulate_metrics_fn": sample_accumulate_metrics_fn,
    "online_strategy_kwargs": {
        "strategy_name": "normal",
        "confidence_level": 0.95,
        "use_fpc": True,
    },
}

config_group = RFGridSearch(config_set)

 Configuring vLLM model...


Run Experiment

In [14]:
print("\n" + "="*70)
print("üöÄ STARTING DOCUGAMI FINANCIAL QA BENCHMARK EVALUATION")
print("="*70)
print(f"   Dataset: {len(queries_df)} queries")
print(f"   Corpus: {len(corpus_list)} document chunks")
print(f"   QRELS: {len(qrels_df)} entries ({len(qrels_df[qrels_df['relevance']==1])} relevant)")
print(f"   Question Types: {len(question_type_counts)} types")
print("="*70 + "\n")

# Convert to HuggingFace Dataset
queries_dataset = Dataset.from_pandas(queries_df)

# Create experiment
experiment = Experiment(
    experiment_name="docugami-financial-rag-benchmark",
    mode="evals",
)

# Run evaluation
results = experiment.run_evals(
    config_group=config_group,
    dataset=queries_dataset,
    num_actors=1,
    num_shards=4,
    seed=42
)

experiment.end()


üöÄ STARTING DOCUGAMI FINANCIAL QA BENCHMARK EVALUATION
   Dataset: 195 queries
   Corpus: 4008 document chunks
   QRELS: 80431 entries (78481 relevant)
   Question Types: 3 types

An experiment with the same name already exists. Created a new experiment 'docugami-financial-rag-benchmark_1' with Experiment ID: 2 at /content/rapidfireai/rapidfire_experiments/docugami-financial-rag-benchmark_1
Created directory: /content/rapidfireai/logs/docugami-financial-rag-benchmark_1
üåê Google Colab detected. Ray dashboard URL: https://8855-gpu-t4-hm-7t8620vazzir-c.europe-west4-2.prod.colab.dev
üåê Google Colab detected. Dispatcher URL: https://8851-gpu-t4-hm-7t8620vazzir-c.europe-west4-2.prod.colab.dev


=== Preprocessing RAG Sources ===


RAG Source ID,Status,Duration,Details
1,Complete,35.9s,"FAISS, GPU"



=== Multi-Config Experiment Progress ===


Run ID,Model,Status,Progress,Conf. Interval,search_type,rag_k,top_n,chunk_size,chunk_overlap,sampling_params,model_config,Precision,Recall,MRR,Throughput,Total,Samples Processed,F1_Score,Hit_Rate,NDCG@8,Processing Time,Samples Per Second,model_name,run_id
1,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,8.0,5.0,512.0,128.0,"{'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 512}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 4096, 'disable_log_stats': True, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp'}","40.58% [40.58%, 40.58%]","0.95% [0.95%, 0.95%]","59.60% [59.60%, 59.60%]",0.2/s,195,195,"0.0181 [0.0181, 0.0181]","0.8821 [0.8821, 0.8821]","0.4169 [0.4169, 0.4169]",964.65 seconds,0.2,Qwen/Qwen2.5-0.5B-Instruct,1.0


Experiment docugami-financial-rag-benchmark_1 ended


Results table

In [15]:
print("\n" + "="*70)
print(" DOCUGAMI FINANCIAL QA BENCHMARK EVALUATION COMPLETE")
print("="*70)

if results:
    results_df = pd.DataFrame([
        {
            k: v['value'] if isinstance(v, dict) and 'value' in v else v
            for k, v in {**metrics_dict, 'run_id': run_id}.items()
        }
        for run_id, (_, metrics_dict) in results.items()
    ])

    print("\n OVERALL RESULTS:")
    print(results_df.to_string(index=False))

    print("\n METRIC INTERPRETATION:")
    print("   ‚Ä¢ Hit Rate: % of queries where at least 1 relevant chunk was retrieved")
    print("   ‚Ä¢ Precision: Fraction of retrieved chunks that are relevant")
    print("   ‚Ä¢ Recall: Fraction of relevant chunks that were retrieved")
    print("   ‚Ä¢ F1 Score: Harmonic mean of precision and recall")
    print("   ‚Ä¢ NDCG@8: Ranking quality considering position of relevant chunks")
    print("   ‚Ä¢ MRR: Mean reciprocal rank of first relevant chunk")

    print("\n" + "="*70)
    print(" BENCHMARK CHARACTERISTICS:")
    print("="*70)
    print(f"   ‚Ä¢ Multi-Document Support: Yes")
    print(f"   ‚Ä¢ Multi-Chunk Support:  Yes")
    print(f"   ‚Ä¢ Long-Form Documents:  Real 10-Q reports")
    print(f"   ‚Ä¢ Varying Difficulty:  Single-doc and multi-doc questions")
    print(f"   ‚Ä¢ Manual Curation:  Expert-reviewed answers")
    print("="*70)

else:
    print("\n No results returned from experiment")

print("="*70)


 DOCUGAMI FINANCIAL QA BENCHMARK EVALUATION COMPLETE

 OVERALL RESULTS:
 run_id                 model_name search_type  rag_k  top_n  chunk_size  chunk_overlap                                        sampling_params                                                                                                                                                                                model_config  Samples Processed Processing Time Samples Per Second  Total  Hit_Rate  Precision  Recall  F1_Score   NDCG@8      MRR
      1 Qwen/Qwen2.5-0.5B-Instruct  similarity      8      5         512            128 {'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 512} {'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 4096, 'disable_log_stats': True, 'tensor_parallel_size': 1, 'distributed_executor_backend': 'mp'}                195  964.65 seconds               0.20    195  0.882051   0.405769 0.00945  0.018132 0.416882 0.596007

 METRIC INTERPRETATION:
   ‚