Install & setup

In [1]:
try:
    import rapidfireai
    print("rapidfireai installed")
except ImportError:
    !pip install rapidfireai datasets==3.6.0 langchain sentence-transformers
    !rapidfireai init --evals

rapidfireai installed


Imports

In [2]:
import os
import math
import json
import random
import pandas as pd
from pathlib import Path
from typing import List as listtype, Dict, Any
from datasets import load_dataset

os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from rapidfireai import Experiment
from rapidfireai.automl import List, RFLangChainRagSpec, RFvLLMModelConfig, RFPromptManager, RFGridSearch

Load & prepare dataset: build corpus, queries, and qrels

In [3]:
import json
import random
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from collections import defaultdict

# Directory that receives the generated corpus file.
dataset_dir = Path("./electronics_rag")
dataset_dir.mkdir(exist_ok=True)

# English training split, restricted to rows whose product category
# mentions "electronics".
raw_dataset = load_dataset("buruzaemon/amazon_reviews_multi", "en", split="train")
electronics_data = raw_dataset.filter(
    lambda x: "electronics" in x["product_category"].lower()
)

# Seeded, fixed-size subsample so the corpus is reproducible across runs.
sample_size = 100
rseed = 42
random.seed(rseed)
sampled_data = electronics_data.shuffle(seed=rseed).select(range(sample_size))

# One pass over the sample: review body -> document, review title -> query,
# and remember which product every document and every query belongs to.
product_to_docs = defaultdict(list)
corpus_list = []
queries_list = []
query_products = []  # (query_id, product_id) pairs, in sample order

for i, row in enumerate(sampled_data):
    doc_id, query_id = f"doc_{i}", f"q_{i}"
    prod_id = str(row['product_id'])

    corpus_list.append({"_id": doc_id, "text": row["review_body"]})
    queries_list.append({"query_id": query_id, "query": row["review_title"]})
    product_to_docs[prod_id].append(doc_id)
    query_products.append((query_id, prod_id))

# Relevance judgements: a query is paired with every document that shares
# its product_id (which always includes the review the query came from).
qrels_rows = [
    {"query_id": query_id, "corpus_id": d_id, "relevance": 1}
    for query_id, prod_id in query_products
    for d_id in product_to_docs[prod_id]
]

# Persist the corpus as JSON Lines, one document per line.
corpus_file = dataset_dir / "corpus_sampled.jsonl"
with open(corpus_file, "w") as f:
    f.writelines(json.dumps(doc) + "\n" for doc in corpus_list)

# Both frames are cast to str so IDs compare as strings downstream.
electronics_dataset = pd.DataFrame(queries_list).astype(str)
qrels = pd.DataFrame(qrels_rows).astype(str)

print(f"Prepared {len(corpus_list)} documents.")
print(f"Expanded QRELS: {len(qrels)} relevance pairs (Multiple reviews per product).")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Prepared 100 documents.
Expanded QRELS: 100 relevance pairs (Multiple reviews per product).


Define RAG search space (retrieval-focused)

In [4]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker

# Number of texts passed to the embedding model per encode call.
batch_size = 50

# Loader for the JSONL corpus: the "text" field becomes the document content
# and the "_id" field is carried along as the string metadata key "corpus_id".
corpus_loader = DirectoryLoader(
    path=str(dataset_dir),
    glob="corpus_sampled.jsonl",
    loader_cls=JSONLoader,
    loader_kwargs=dict(
        jq_schema=".",
        content_key="text",
        metadata_func=lambda record, metadata: {
            "corpus_id": str(record.get("_id"))
        },
        json_lines=True,
        text_content=False,
    ),
    sample_seed=42,
)

# Search dimension 1: two chunk sizes, both with the same overlap.
chunking_options = List([
    RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=32)
    for size in (256, 128)
])

# Embedding model settings (GPU, normalised vectors).
embedding_options = {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "model_kwargs": {"device": "cuda:0"},
    "encode_kwargs": {"normalize_embeddings": True, "batch_size": batch_size},
}

# Search dimension 2: how many documents the CPU cross-encoder keeps.
reranker_options = {
    "model_name": "cross-encoder/ms-marco-MiniLM-L6-v2",
    "model_kwargs": {"device": "cpu"},
    "top_n": List([5, 10]),
}

rag_gpu = RFLangChainRagSpec(
    document_loader=corpus_loader,
    text_splitter=chunking_options,
    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs=embedding_options,
    vector_store=None,  # Defaults to FAISS
    search_type="similarity",
    search_kwargs={"k": 20},
    reranker_cls=CrossEncoderReranker,
    reranker_kwargs=reranker_options,
    enable_gpu_search=True,
)

Preprocess (retrieval-only focus)

In [5]:
def sample_preprocess_fn(batch: Dict[str, listtype], rag: RFLangChainRagSpec, prompt_manager: RFPromptManager) -> Dict[str, listtype]:
    """Retrieve context for a batch of queries and build chat prompts.

    Args:
        batch: Column-oriented batch; must contain a "query" column.
        rag: RAG spec used for retrieval (`get_context`) and for turning the
            retrieved documents into prompt text (`serialize_documents`).
        prompt_manager: Accepted for interface compatibility; not used here.

    Returns:
        A dict with "prompts" (one system + user message pair per query),
        "retrieved_documents" (one list of corpus-id strings per query), and
        every original batch column copied into a list.
    """
    INSTRUCTIONS = "Utilize your knowledge of electronics to answer the following question based on the provided reviews."

    # Normalise the queries to stripped strings before retrieval.
    questions = []
    for raw_query in batch["query"]:
        questions.append(str(raw_query).strip())

    # One list of retrieved documents per query.
    docs_per_query = rag.get_context(batch_queries=questions, serialize=False)

    # IDs are cast to stripped strings so they compare equal to the QRELS.
    # A document without "corpus_id" metadata contributes an empty string.
    retrieved_documents = []
    for docs in docs_per_query:
        doc_ids = []
        for doc in docs:
            doc_ids.append(str(doc.metadata.get("corpus_id", "")).strip())
        retrieved_documents.append(doc_ids)

    context_strings = rag.serialize_documents(docs_per_query)

    prompts = []
    for question, context in zip(questions, context_strings):
        prompts.append([
            {"role": "system", "content": INSTRUCTIONS},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ])

    output = {"prompts": prompts, "retrieved_documents": retrieved_documents}
    # Original columns are merged last, so they win on a key collision.
    for column, values in batch.items():
        output[column] = list(values)
    return output


Postprocess (attach ground truth)

In [6]:
def sample_postprocess_fn(batch: Dict[str, listtype]) -> Dict[str, listtype]:
    """Attach the ground-truth document IDs to every query in the batch.

    Reads the module-level ``qrels`` DataFrame (columns "query_id" and
    "corpus_id") without modifying it.

    Args:
        batch: Column-oriented batch; must contain a "query_id" column.

    Returns:
        The same ``batch`` dict, with a new "ground_truth_documents" column
        holding, per query, the list of relevant corpus IDs in qrels row
        order (an empty list when the query has no judgement).
    """
    # Normalise into local Series; the shared qrels frame stays untouched.
    query_ids = qrels["query_id"].astype(str).str.strip()
    corpus_ids = qrels["corpus_id"].astype(str).str.strip()

    # Build the query -> documents lookup once per batch instead of scanning
    # the whole frame for every query.
    relevant_by_query = {}
    for query_id, corpus_id in zip(query_ids, corpus_ids):
        relevant_by_query.setdefault(query_id, []).append(corpus_id)

    batch["ground_truth_documents"] = [
        list(relevant_by_query.get(str(qid).strip(), []))
        for qid in batch["query_id"]
    ]
    return batch

Metrics (Precision / Recall / MRR / NDCG)

In [7]:
def compute_ndcg_at_k(retrieved_docs, expected_docs, k=5):
    """Binary-relevance NDCG@k over document IDs.

    The retrieved ranking is de-duplicated (first occurrence wins) before it
    is cut at ``k``: several chunks of one document share a single ID, and
    counting each of them as a separate hit would let DCG exceed the ideal
    DCG and push the score above 1.

    Args:
        retrieved_docs: Iterable of hashable document IDs, best first.
        expected_docs: Iterable of relevant document IDs.
        k: Rank cutoff; values <= 0 yield 0.0.

    Returns:
        A float in [0, 1]; 0.0 when there are no expected documents.
    """
    if k <= 0:
        return 0.0

    expected = set(expected_docs)
    # dict.fromkeys keeps insertion order, i.e. the best rank of each ID.
    ranked_unique = list(dict.fromkeys(retrieved_docs))[:k]

    dcg = sum(
        1 / math.log2(position + 2)
        for position, doc in enumerate(ranked_unique)
        if doc in expected
    )
    ideal_length = min(k, len(expected))
    idcg = sum(1 / math.log2(position + 2) for position in range(ideal_length))
    return dcg / idcg if idcg > 0 else 0.0

def sample_compute_metrics_fn(batch: Dict[str, listtype]) -> Dict[str, Dict[str, Any]]:
    """Compute per-batch retrieval metrics.

    Args:
        batch: Column-oriented batch with "query", "retrieved_documents"
            (ranked IDs per query) and "ground_truth_documents" (relevant IDs
            per query).

    Returns:
        A dict of ``{"value": ...}`` entries for "Total" (number of queries),
        "Hit Rate", "Precision", "Recall", "NDCG@5" and "MRR". Every metric
        is the mean over the queries of the batch; queries without ground
        truth score 0 on all metrics. An empty batch yields 0.0 everywhere
        instead of dividing by zero.
    """
    total_queries = len(batch["query"])
    precisions, recalls, ndcgs, rrs, hits = [], [], [], [], []

    for pred, gt in zip(batch["retrieved_documents"], batch["ground_truth_documents"]):
        # Normalise once so that every metric, NDCG included, compares the
        # same stripped string IDs.
        ranked = [str(p).strip() for p in pred]
        actual = set(ranked)
        expected = {str(g).strip() for g in gt}

        if not expected:
            for scores in (precisions, recalls, ndcgs, rrs, hits):
                scores.append(0)
            continue

        tp = len(actual & expected)

        # Precision is taken over unique retrieved IDs.
        precisions.append(tp / len(actual) if actual else 0)
        recalls.append(tp / len(expected))
        ndcgs.append(compute_ndcg_at_k(ranked, expected, k=5))

        # Hit Rate: did we get at least one review for the right product?
        hits.append(1 if tp > 0 else 0)

        # Reciprocal rank of the first relevant ID (0 when none is found).
        rrs.append(next(
            (1 / rank for rank, doc_id in enumerate(ranked, start=1) if doc_id in expected),
            0,
        ))

    def _mean(scores):
        # Guard against an empty batch.
        return sum(scores) / total_queries if total_queries else 0.0

    return {
        "Total": {"value": total_queries},
        "Hit Rate": {"value": _mean(hits)},
        "Precision": {"value": _mean(precisions)},
        "Recall": {"value": _mean(recalls)},
        "NDCG@5": {"value": _mean(ndcgs)},
        "MRR": {"value": _mean(rrs)},
    }

def sample_accumulate_metrics_fn(aggregated_metrics: Dict[str, listtype]) -> Dict[str, Dict[str, Any]]:
    """Combine per-batch metric dicts into overall metrics.

    Each per-batch metric is a mean over that batch's queries, so batches are
    weighted by their "Total" when combined; a plain mean of batch means
    would over-weight small (e.g. trailing) batches.

    Args:
        aggregated_metrics: Maps each metric name to the list of per-batch
            ``{"value": ...}`` entries, all lists in the same batch order.

    Returns:
        "Total" with the summed query count, plus one entry per metric
        holding the query-weighted mean and ``"is_algebraic": True``.
        With no queries at all every metric is 0.0.
    """
    batch_sizes = [m["value"] for m in aggregated_metrics["Total"]]
    total_queries = sum(batch_sizes)
    metrics = ["Hit Rate", "Precision", "Recall", "NDCG@5", "MRR"]

    def _weighted_mean(entries):
        # Avoid dividing by zero when nothing has been processed yet.
        if not total_queries:
            return 0.0
        weighted_sum = sum(entry["value"] * size for entry, size in zip(entries, batch_sizes))
        return weighted_sum / total_queries

    return {
        "Total": {"value": total_queries},
        **{
            m: {
                "value": _weighted_mean(aggregated_metrics[m]),
                "is_algebraic": True
            } for m in metrics
        }
    }

Grid + Experiment

In [8]:
# Generator model settings handed to vLLM.
generator_settings = {
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "dtype": "half",
    "gpu_memory_utilization": 0.25,
    "enforce_eager": True,
    "max_model_len": 2048,
    "disable_log_stats": True,
}

# Decoding parameters for generation.
decoding_settings = {
    "temperature": 0.7,
    "top_p": 0.95,
    "max_tokens": 128,
}

vllm_config = RFvLLMModelConfig(
    model_config=generator_settings,
    sampling_params=decoding_settings,
    rag=rag_gpu,
)

# Everything one evaluation run needs: the model + RAG config, the eval batch
# size, the four user-defined callbacks, and the online aggregation settings.
config_set = dict(
    vllm_config=vllm_config,
    batch_size=4,
    preprocess_fn=sample_preprocess_fn,
    postprocess_fn=sample_postprocess_fn,
    compute_metrics_fn=sample_compute_metrics_fn,
    accumulate_metrics_fn=sample_accumulate_metrics_fn,
    online_strategy_kwargs=dict(
        strategy_name="normal",
        confidence_level=0.95,
        use_fpc=True,
    ),
)

In [9]:
# Expand the config set into a grid of runs and evaluate all of them.
config_group = RFGridSearch(config_set)
experiment = Experiment(experiment_name="amazon-electronics-rag-v2", mode="evals")

# Always end the experiment, even when run_evals raises or the cell is
# interrupted; otherwise the next run has to forcibly end the stale one.
try:
    results = experiment.run_evals(
        config_group=config_group,
        dataset=electronics_dataset,
        num_actors=1,
        num_shards=4,
        seed=42,
    )
finally:
    experiment.end()


Created directory: /content/rapidfireai/logs/amazon-electronics-rag-v2_2
The previously running experiment amazon-electronics-rag-v2_2 was forcibly ended. Created a new experiment 'amazon-electronics-rag-v2_3' with Experiment ID: 4 at /content/rapidfireai/rapidfire_experiments/amazon-electronics-rag-v2_3
🌐 Google Colab detected. Ray dashboard URL: https://8855-gpu-t4-hm-1lpb172kzqkzg-c.asia-southeast1-1.prod.colab.dev
🌐 Google Colab detected. Dispatcher URL: https://8851-gpu-t4-hm-1lpb172kzqkzg-c.asia-southeast1-1.prod.colab.dev


=== Preprocessing RAG Sources ===


RAG Source ID,Status,Duration,Details
1,Complete,23.2s,"FAISS, GPU"
2,Complete,23.8s,"FAISS, GPU"



=== Multi-Config Experiment Progress ===


Run ID,Model,Status,Progress,Conf. Interval,search_type,rag_k,top_n,chunk_size,chunk_overlap,sampling_params,model_config,Precision,Recall,NDCG@5,MRR,Throughput,Total,Samples Processed,Hit Rate,Processing Time,Samples Per Second,model_name,run_id
1,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,20.0,5.0,256.0,32.0,"{'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 2048, 'disable_log_stats': True}","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]",0.3/s,100,100,"0.0000 [0.0000, 0.0000]",438.55 seconds,0.23,Qwen/Qwen2.5-0.5B-Instruct,1.0
2,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,20.0,10.0,256.0,32.0,"{'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 2048, 'disable_log_stats': True}","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]",0.3/s,100,100,"0.0000 [0.0000, 0.0000]",351.66 seconds,0.28,Qwen/Qwen2.5-0.5B-Instruct,2.0
3,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,20.0,5.0,128.0,32.0,"{'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 2048, 'disable_log_stats': True}","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]",0.3/s,100,100,"0.0000 [0.0000, 0.0000]",325.98 seconds,0.31,Qwen/Qwen2.5-0.5B-Instruct,3.0
4,Qwen/Qwen2.5-0.5B-Instruct,COMPLETED,4/4,0.0,similarity,20.0,10.0,128.0,32.0,"{'temperature': 0.7, 'top_p': 0.95, 'max_tokens': 128}","{'dtype': 'half', 'gpu_memory_utilization': 0.25, 'enforce_eager': True, 'max_model_len': 2048, 'disable_log_stats': True}","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]","0.00% [0.00%, 0.00%]",0.3/s,100,100,"0.0000 [0.0000, 0.0000]",305.03 seconds,0.33,Qwen/Qwen2.5-0.5B-Instruct,4.0


Experiment amazon-electronics-rag-v2_3 ended


Results table

In [10]:
# Flatten the per-run results into one row per run: metric entries of the form
# {"value": ...} are unwrapped, plain values are kept, and the run id is added.
summary_rows = []
for rid, (_, m) in results.items():
    run_fields = {**m, 'run_id': rid}
    summary_rows.append({
        k: (v['value'] if isinstance(v, dict) else v)
        for k, v in run_fields.items()
    })

pd.DataFrame(summary_rows)

Unnamed: 0,run_id,model_name,search_type,rag_k,top_n,chunk_size,chunk_overlap,sampling_params,model_config,Samples Processed,Processing Time,Samples Per Second,Total,Hit Rate,Precision,Recall,NDCG@5,MRR
0,1,Qwen/Qwen2.5-0.5B-Instruct,similarity,20,5,256,32,"{'temperature': 0.7, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",100,438.55 seconds,0.23,100,0.0,0.0,0.0,0.0,0.0
1,2,Qwen/Qwen2.5-0.5B-Instruct,similarity,20,10,256,32,"{'temperature': 0.7, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",100,351.66 seconds,0.28,100,0.0,0.0,0.0,0.0,0.0
2,3,Qwen/Qwen2.5-0.5B-Instruct,similarity,20,5,128,32,"{'temperature': 0.7, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",100,325.98 seconds,0.31,100,0.0,0.0,0.0,0.0,0.0
3,4,Qwen/Qwen2.5-0.5B-Instruct,similarity,20,10,128,32,"{'temperature': 0.7, 'top_p': 0.95, 'max_token...","{'dtype': 'half', 'gpu_memory_utilization': 0....",100,305.03 seconds,0.33,100,0.0,0.0,0.0,0.0,0.0
