### Library imports

In [None]:
import json
import re
from typing import Any, Dict, List

from datasets import load_dataset

from rapidfireai.infer.experiment import Experiment

### Batch Size (model and sampling config are defined alongside the pipelines below)

In [None]:
# Shared batch size: passed to the embedding `encode_kwargs` of both RAG specs
# and used as the per-actor `batch_size` in both `run_evals` calls.
batch_size = 128

### Dataset

In [None]:
# Evaluate on the GSM8K test split (not train), matching the message printed below.
dataset = load_dataset("openai/gsm8k", "main", split="test")
print(f"Loaded {len(dataset)} test samples")

### RAG Implementation using `LangChainRagSpec`

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from rapidfireai.infer.rag.context_generator import ContextGenerator
from rapidfireai.infer.rag.rag_pipeline import LangChainRagSpec
from rapidfireai.infer.utils.config import VLLMModelConfig

#### CPU RAG

In [None]:
# RAG spec whose embedding model is placed on the CPU.
rag_cpu = LangChainRagSpec(
    # Corpus: *.txt files under ../data/gsm8k, searched recursively.
    document_loader=DirectoryLoader(
        path="../data/gsm8k",
        glob="*.txt",
        recursive=True,
        sample_seed=1337,
    ),
    # chunk_size=128 with chunk_overlap=32, sized with the "gpt2" tiktoken encoding.
    text_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="gpt2",
        chunk_size=128,
        chunk_overlap=32,
    ),
    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs={
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {"device": "cpu"},
        "encode_kwargs": {
            "normalize_embeddings": True,
            "batch_size": batch_size,
        },
    },
    retriever=None,
    vector_store=None,  # Uses FAISS CPU with HNSW approximate nearest neighbor search
    search_type="similarity",
    search_kwargs={"k": 3},  # three retrieved chunks per query
    reranker=None,
)

#### GPU RAG

In [None]:
# RAG spec whose embedding model is placed on cuda:0 and which enables GPU search.
rag_gpu = LangChainRagSpec(
    # Corpus: *.txt files under ../data/gsm8k, searched recursively.
    document_loader=DirectoryLoader(
        path="../data/gsm8k",
        glob="*.txt",
        recursive=True,
        sample_seed=1337,
    ),
    # chunk_size=128 with chunk_overlap=32, sized with the "gpt2" tiktoken encoding.
    text_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="gpt2",
        chunk_size=128,
        chunk_overlap=32,
    ),
    embedding_cls=HuggingFaceEmbeddings,
    embedding_kwargs={
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {"device": "cuda:0"},
        "encode_kwargs": {
            "normalize_embeddings": True,
            "batch_size": batch_size,
        },
    },
    retriever=None,
    vector_store=None,  # Uses FAISS GPU with indexed exact search when `enable_gpu_search=True` is set
    search_type="similarity",
    search_kwargs={"k": 3},  # three retrieved chunks per query
    reranker=None,
    enable_gpu_search=True,  # Search computation is performed on GPU
)

In [None]:
# Engine settings; the same dict object is handed to both pipelines below.
model_config = {
    "model": "Qwen/Qwen2.5-3B-Instruct",
    "dtype": "half",
    "gpu_memory_utilization": 0.7,
    "tensor_parallel_size": 1,
    "distributed_executor_backend": "mp",
    "enable_chunked_prefill": True,
    "enable_prefix_caching": True,
    "max_model_len": 2048,
    "disable_log_stats": True,  # Disable VLLM progress logging
}

# Generation settings, likewise shared by both pipelines.
sampling_params = {
    "temperature": 0.8,
    "top_p": 0.95,
    "max_tokens": 512,
}

# One ContextGenerator per RAG spec (CPU and GPU variants).
context_generator_cpu = ContextGenerator(rag_spec=rag_cpu)
context_generator_gpu = ContextGenerator(rag_spec=rag_gpu)

# The two pipelines differ only in the context generator they use.
pipeline_cpu = VLLMModelConfig(
    model_config=model_config,
    sampling_params=sampling_params,
    context_generator=context_generator_cpu,
)
pipeline_gpu = VLLMModelConfig(
    model_config=model_config,
    sampling_params=sampling_params,
    context_generator=context_generator_gpu,
)


### Utility, Preprocessor, Postprocessor, Compute Metrics

In [None]:
# "#### " followed by an optionally negative run of digits, dots and commas.
_FINAL_ANSWER_RE = re.compile(r"#### (-?[0-9.,]+)")


def extract_solution(answer: str) -> str:
    """Return the number that follows the first ``"#### "`` marker in *answer*.

    Args:
        answer: Text containing a final answer written as ``#### <number>``.

    Returns:
        The captured number as a string with thousands separators removed and
        any trailing period dropped (so ``"#### 18."`` yields ``"18"``, which
        can match a reference of ``"18"``). Returns ``"0"`` when no marker is
        found or when the captured text contains no digit.
    """
    match = _FINAL_ANSWER_RE.search(answer)
    if match is None:
        return "0"
    # The character class also accepts sentence punctuation: commas are
    # separators, and a trailing "." is the end of a sentence, not a decimal.
    number = match.group(1).replace(",", "").rstrip(".")
    if not any(ch.isdigit() for ch in number):
        # Captures such as "." or "-." carry no numeric value.
        return "0"
    return number

def preprocess_fn(batch: Dict[str, List], context_generator: ContextGenerator) -> Dict[str, List]:
    """Build one chat prompt per question, each augmented with retrieved context.

    Args:
        batch: Column-oriented batch; must contain a ``"question"`` list.
        context_generator: Object whose ``get_context`` is called once with the
            full list of questions and yields one context per question.

    Returns:
        A new dict with a ``"prompts"`` list (a system message plus a user
        message per question) followed by every entry of ``batch``.
    """
    questions = batch["question"]
    contexts = context_generator.get_context(questions)

    prompts = []
    for question, context in zip(questions, contexts):
        # Fresh message dicts per prompt so no two prompts share an object.
        system_message = {
            "role": "system",
            "content": 'Let\'s think step by step and output the final answer after "####".'
        }
        user_message = {
            "role": "user",
            "content": f'{question}. You can use the following context to answer the question:\n{context}'
        }
        prompts.append([system_message, user_message])

    # Batch columns are spread last, as in a plain dict merge.
    return {"prompts": prompts, **batch}

def postprocess_fn(batch: Dict[str, List]) -> Dict[str, List]:
    """Add extracted numeric answers for the generations and the references.

    Mutates ``batch`` in place: ``"model_answer"`` is derived from
    ``"generated_text"`` and ``"ground_truth"`` from ``"answer"``, both via
    ``extract_solution``. The same dict is returned.
    """
    generations = batch["generated_text"]
    batch["model_answer"] = list(map(extract_solution, generations))

    references = batch["answer"]
    batch["ground_truth"] = list(map(extract_solution, references))
    return batch

def compute_metrics_fn(batch: Dict[str, List]) -> Dict[str, Dict[str, Any]]:
    """Count exact string matches between model answers and ground truths.

    Returns:
        ``{"Correct": {"value": <matches>}, "Total": {"value": <answers>}}``,
        where ``Total`` is the number of entries in ``batch["model_answer"]``.
    """
    predictions = batch["model_answer"]
    pairs = zip(predictions, batch["ground_truth"])
    hits = [pred for pred, gt in pairs if pred == gt]
    return {
        "Correct": {"value": len(hits)},
        "Total": {"value": len(predictions)},
    }

def accumulate_metrics_fn(aggregated_metrics: Dict[str, List]) -> Dict[str, Dict[str, Any]]:
    """Combine per-batch metric entries into overall totals and accuracy.

    Args:
        aggregated_metrics: Maps a metric name to a list of per-batch entries,
            each a dict read through its ``"value"`` key, e.g.
            ``{"Correct": [{"value": 5}, {"value": 3}], "Total": [{"value": 10}, {"value": 8}]}``.
            Missing metrics and entries without ``"value"`` count as 0.

    Returns:
        ``Total``, ``Correct`` and ``Accuracy`` entries; accuracy is 0 when the
        total is not positive.
    """
    def _summed(metric_name):
        entries = aggregated_metrics.get(metric_name, [{}])
        return sum(entry.get("value", 0) for entry in entries)

    total = _summed("Total")
    correct = _summed("Correct")
    if total > 0:
        accuracy = correct / total
    else:
        accuracy = 0

    return {
        "Total": {"value": total},
        # Per-sample range: 0 (min) if not correct, 1 (max) if correct.
        "Correct": {"value": correct, "is_distributive": True, "value_range": (0, 1)},
        # Algebraic metric for online aggregation.
        "Accuracy": {"value": accuracy, "is_algebraic": True, "value_range": (0, 1)},
    }

### Create Experiment

In [None]:
# Experiment handle used by the `run_evals` and `end` calls in the cells below.
experiment = Experiment(experiment_name="trial-rag-langchain", num_actors=8)

### Run Experiment

#### CPU based execution

In [None]:
# Evaluate the CPU-retrieval pipeline on `dataset` with the hook functions defined above.
# NOTE(review): the GPU cell below rebinds `aggregated_results` and `metrics`;
# inspect or copy them first if the CPU results are needed afterwards.
aggregated_results, metrics = experiment.run_evals(
    pipeline_cpu,
    dataset,
    batch_size=batch_size,  # Per actor batch size
    preprocess_fn=preprocess_fn,
    postprocess_fn=postprocess_fn,
    compute_metrics_fn=compute_metrics_fn,
    accumulate_metrics_fn=accumulate_metrics_fn,
    online_strategy_kwargs={"strategy_name": "normal", "confidence_level": 0.95, "use_fpc": True},
)

#### GPU based execution

In [None]:
# Evaluate the GPU-search pipeline with the same dataset, hooks and settings as the CPU run.
# NOTE(review): this rebinds `aggregated_results` and `metrics`, replacing the
# values produced by the CPU run; the result cells below show this run only.
aggregated_results, metrics = experiment.run_evals(
    pipeline_gpu,
    dataset,
    batch_size=batch_size,  # Per actor batch size
    preprocess_fn=preprocess_fn,
    postprocess_fn=postprocess_fn,
    compute_metrics_fn=compute_metrics_fn,
    accumulate_metrics_fn=accumulate_metrics_fn,
    online_strategy_kwargs={"strategy_name": "normal", "confidence_level": 0.95, "use_fpc": True},
)

### End Experiment

In [None]:
# End the experiment now that the evaluation runs above have been issued.
experiment.end()

### View Results

In [None]:
# Pretty-print the metrics from the most recent `run_evals` call.
print("\nResults:")
print(json.dumps(metrics, indent=4))

In [None]:
# Show the prompt, raw generation and extracted answers for up to three samples.
print("\nFirst few examples:")
for i in range(min(3, metrics["Samples Processed"]["value"])):
    prompt = aggregated_results["prompts"][i]
    print(f"\nExample {i+1}:")
    # prompt[0] is the system message, prompt[1] the user message with context.
    print(f"System: \n{prompt[0]['content']}")
    print(f"User + Context: \n{prompt[1]['content']}")
    print(f"Model: \n{aggregated_results['generated_text'][i]}")
    print(f"Ground truth: \n{aggregated_results['ground_truth'][i]}")
    print(f"Model answer: \n{aggregated_results['model_answer'][i]}")
