### Library imports

In [None]:
import json
import re
from typing import Any, Dict, List

from datasets import load_dataset

from rapidfireai.infer.experiment import Experiment
from rapidfireai.infer.rag.context_generator import ContextGenerator

INFO 10-08 23:12:35 __init__.py:190] Automatically detected platform cuda.


INFO:datasets:PyTorch version 2.5.1+cu124 available.


### Model config and Sampling Config

In [None]:
from rapidfireai.infer.utils.config import VLLMModelConfig

pipeline = VLLMModelConfig(
    model_config={
        "model": "Qwen/Qwen2.5-3B-Instruct",
        "dtype": "half",
        "gpu_memory_utilization": 0.7,
        "tensor_parallel_size": 1,
        "distributed_executor_backend": "mp",
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "max_model_len": 2048,
        "disable_log_stats": True,  # Disable VLLM progress logging
    },
    sampling_params={
        "temperature": 0.8,
        "top_p": 0.95,
        "max_tokens": 512,
    },
    context_generator=None
)

### Dataset

In [3]:
# Use test split for evaluation (not train)
dataset = load_dataset("openai/gsm8k", "main", split="train")
print(f"Loaded {len(dataset)} test samples")

Loaded 7473 test samples


### Utility, Preprocessor, Postprocessor, Compute Metrics

In [None]:
def extract_solution(answer):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", answer)
    if solution is None:
        return "0"
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution

def preprocess_fn(batch: Dict[str, List], context_generator: ContextGenerator) -> Dict[str, List]:
    return {
        "prompts": [
            [
                {"role": "system", "content": 'Let\'s think step by step and output the final answer after "####".'},
                {"role": "user", "content": question}
            ]
            for question in batch["question"]
        ],
        **batch,
    }

def postprocess_fn(batch: Dict[str, List]) -> Dict[str, List]:
    batch["model_answer"] = [extract_solution(answer) for answer in batch["generated_text"]]
    batch["ground_truth"] = [extract_solution(answer) for answer in batch["answer"]]
    return batch

def compute_metrics_fn(batch: Dict[str, List]) -> Dict[str, Dict[str, Any]]:
    correct = sum(1 for pred, gt in zip(batch["model_answer"], batch["ground_truth"])
                  if pred == gt)
    total = len(batch["model_answer"])
    return {
        "Correct": {"value": correct},
        "Total": {"value": total},
    }

def accumulate_metrics_fn(aggregated_metrics: Dict[str, List]) -> Dict[str, Dict[str, Any]]:
    # aggregated_metrics is a dict of lists: {"Correct": [5, 3, 7], "Total": [10, 8, 12]}
    correct = sum(m.get("value", 0) for m in aggregated_metrics.get("Correct", [{}]))
    total = sum(m.get("value", 0) for m in aggregated_metrics.get("Total", [{}]))
    accuracy = correct / total if total > 0 else 0
    return {
        "Total": {"value": total},
        "Correct": {"value": correct, "is_distributive": True, "value_range": (0, 1)}, # 0 (min) if not correct, 1 if correct (max)
        "Accuracy": {"value": accuracy, "is_algebraic": True, "value_range": (0, 1)} # Algebraic metric for online aggregation
    }

### Create Experiment

In [None]:
experiment = Experiment(experiment_name="trial-infer", num_actors=8)

Ray cluster resources: {
    "object_store_memory": 114952562688.0,
    "memory": 268222646272.0,
    "node:172.31.6.151": 1.0,
    "node:__internal_head__": 1.0,
    "GPU": 8.0,
    "accelerator_type:T4": 1.0,
    "CPU": 96.0
}


### Run Experiment

In [6]:
aggregated_results, metrics = experiment.run_evals(
    pipeline,
    dataset,
    batch_size=128,  # Per actor batch size
    preprocess_fn=preprocess_fn,
    postprocess_fn=postprocess_fn,
    compute_metrics_fn=compute_metrics_fn,
    accumulate_metrics_fn=accumulate_metrics_fn,
    online_strategy_kwargs={"strategy_name": "normal", "confidence_level": 0.95, "use_fpc": True}
)

Creating 8 model actors with 1 GPU and 8 CPUs each
Split 7473 samples into 59 batches of size 128


Completing batches: 100%|██████████| 59/59 [05:28<00:00,  5.57s/batch], Live Metrics: Total=7473, Correct=293.0000±0.0000, Accuracy=0.0392±0.0000  

All batches completed.
Accumulating batch-level metrics offline

Shutting down actors





### End Experiment

In [7]:
experiment.end()

All actors shut down


### View Results

In [None]:
print(f"\nResults:")
print(json.dumps(metrics, indent=4))


Results:
{
    "Samples Processed": {
        "value": 7473,
        "is_algebraic": false
    },
    "Processing Time": {
        "value": "328.94 seconds",
        "is_algebraic": false
    },
    "Samples Per Second": {
        "value": "22.72",
        "is_algebraic": false
    },
    "Total": {
        "value": 7473
    },
    "Correct": {
        "value": 293,
        "is_distributive": true,
        "value_range": [
            0,
            1
        ]
    },
    "Accuracy": {
        "value": 0.03920781479994647,
        "is_algebraic": true,
        "value_range": [
            0,
            1
        ]
    }
}


In [9]:
print(f"\nFirst few examples:")
for i in range(min(3, metrics['Samples Processed']['value'])):
    print(f"\nExample {i+1}:")
    print(f"Question: {aggregated_results['question'][i]}")
    print(f"Ground truth: {aggregated_results['ground_truth'][i]}")
    print(f"Model answer: {aggregated_results['model_answer'][i]}")
    print(f"Generated text: {aggregated_results['generated_text'][i]}")


First few examples:

Example 1:
Question: Elysse can carry 3 bags of groceries into her home with each trip from the car. Her brother can carry the same amount. How many trips will it take them to carry 30 bags of groceries?
Ground truth: 5
Model answer: 0
Generated text: To find out how many trips Elysse and her brother need to carry 30 bags of groceries, we can follow these steps:

1. Determine how many bags they can carry together in one trip:
   Elysse can carry 3 bags per trip, and her brother can carry 3 bags per trip.
   Together, they can carry \( 3 + 3 = 6 \) bags per trip.

2. Calculate how many trips they need to carry 30 bags:
   Since they can carry 6 bags per trip, we divide the total number of bags by the number of bags they can carry per trip:
   \[
   \frac{30 \text{ bags}}{6 \text{ bags per trip}} = 5 \text{ trips}
   \]

Therefore, it will take them 5 trips to carry 30 bags of groceries.

Example 2:
Question: Jennifer has ten pears, 20 oranges, and twice as many app