### RapidFire AI Tutorial Use Case: GRPO for Math Reasoning

In [1]:
import os
os.environ["HF_HOME"] = "/dev/shm/huggingface"

In [2]:
from rapidfireai import Experiment
from rapidfireai.automl import List, RFGridSearch, RFModelConfig, RFLoraConfig, RFGRPOConfig

### Load Dataset and Specify Train and Eval Partitions

In [3]:
from datasets import load_dataset, Dataset

def get_gsm8k_questions(split = "train") -> Dataset:
    """Return one partition of the GSM8K dataset.

    Loads the ``main`` configuration of ``openai/gsm8k`` and indexes the
    result by ``split`` (e.g. ``"train"`` or ``"test"``).
    """
    all_splits = load_dataset('openai/gsm8k', 'main')
    return all_splits[split]

# Keep the demo small: take the first 128 train rows and the first 24 test rows,
# then shuffle each subset with a fixed seed.
train_dataset = get_gsm8k_questions(split="train").select(range(128)).shuffle(seed=42)
eval_dataset = get_gsm8k_questions(split="test").select(range(24)).shuffle(seed=42)

### Define Data Processing Function

In [4]:
def sample_formatting_function(row):
    """Function to preprocess each example from dataset"""

    def extract_hash_answer(text: str) -> str | None:
        if "####" not in text:
            return None
        answer = text.split("####")[1].strip()
        try:
            answer = answer.replace(",", "")
        except:
            return None
        return answer
        
    SYSTEM_PROMPT = """
    Respond in the following format:
    <reasoning>
    ...
    </reasoning>
    <answer>
    ...
    </answer>
    """
    return { # Return a conversation format dictionary
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': row['question']}
        ],
        'question': row['question'],
        'answer': extract_hash_answer(row['answer'])
    }

### Initialize Experiment

In [5]:
# Every experiment instance must be uniquely named.
# NOTE(review): the recorded output below shows the experiment was created as
# 'exp1-math-reasoning_51' after a previous one was forcibly ended, so the library
# appears to append a numeric suffix to reused names — confirm before relying on it.
experiment = Experiment(experiment_name="exp1-math-reasoning")

No active MLflow run to clear


2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/29 07:44:06 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/29 07:44:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/29 07:44:06 INFO mlflow.store.db.utils: Updating database tables
2026/01/29 07:44:06 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/29 07:44:06 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/29 07:44:06 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/29 07:44:06 INFO alembic.runtime

The previously running experiment exp1-math-reasoning_50 was forcibly ended. Created a new experiment with name 'exp1-math-reasoning_51' with Experiment ID: 55 and MLFlow Experiment ID: 55 saved at /home/palebluedot/rapidfireai/tutorial_notebooks/grpo_mathreasoning/rapidfire_experiments/exp1-math-reasoning_51


#### Define Custom Reward Functions

In [6]:
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward completions whose ``<answer>`` payload equals the reference answer.

    Args:
        prompts: Prompts for the batch. Accepted as part of the signature but
            not read by this function.
        completions: One entry per sample; the generated text is taken from
            ``completion[0]['content']``.
        answer: Reference answers, positionally aligned with ``completions``.
        **kwargs: Extra keyword arguments; ignored.

    Returns:
        One float per (completion, answer) pair: ``2.0`` on an exact string
        match of the extracted answer, otherwise ``0.0``.
    """

    def extract_xml_answer(text: str) -> str:
        """Return the stripped text between the last ``<answer>`` and the next ``</answer>``."""
        tail = text.split("<answer>")[-1]
        return tail.split("</answer>")[0].strip()

    # `prompts` is deliberately not indexed: nothing below depends on it, and
    # indexing `prompts[0]` would raise IndexError on an empty batch.
    extracted = [extract_xml_answer(completion[0]['content']) for completion in completions]
    return [2.0 if got == expected else 0.0 for got, expected in zip(extracted, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    """Give a small bonus when the ``<answer>`` payload is made only of digit characters.

    The generated text is read from ``completion[0]['content']``. The check is
    ``str.isdigit()``, so signed or decimal values (``"-3"``, ``"1.5"``) and
    empty answers score ``0.0``.

    Returns:
        ``0.5`` or ``0.0`` per completion, in input order.
    """

    def extract_xml_answer(text: str) -> str:
        """Return the stripped text between the last ``<answer>`` and the next ``</answer>``."""
        after_open = text.split("<answer>")[-1]
        before_close = after_open.split("</answer>")[0]
        return before_close.strip()

    rewards = []
    for completion in completions:
        candidate = extract_xml_answer(completion[0]['content'])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line tag layout.

    The whole completion must be ``<reasoning>``, the reasoning text,
    ``</reasoning>``, ``<answer>``, the answer text, ``</answer>``, each tag on
    its own line, with a newline after the closing ``</answer>``.

    Args:
        completions: One entry per sample; the generated text is taken from
            ``completion[0]["content"]``.
        **kwargs: Extra keyword arguments; ignored.

    Returns:
        ``0.5`` for each completion that matches the layout, otherwise ``0.0``.
    """
    import re
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it any reasoning or answer that
    # spans more than one line can never match and always scores 0.
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that contain the reasoning/answer tags, ignoring exact line layout.

    The completion must start with a ``<reasoning>...</reasoning>`` section
    followed (after optional whitespace) by an ``<answer>...</answer>`` section.
    ``re.match`` anchors at the start of the text, so anything before
    ``<reasoning>`` yields no reward; trailing text after ``</answer>`` is allowed.

    Args:
        completions: One entry per sample; the generated text is taken from
            ``completion[0]["content"]``.
        **kwargs: Extra keyword arguments; ignored.

    Returns:
        ``0.5`` for each completion that matches, otherwise ``0.0``.
    """
    import re
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines. Without it, a completion that puts a
    # newline after <reasoning> or <answer> (the layout the system prompt asks
    # for) can never match.
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Give partial credit for each correctly placed tag, minus a penalty for trailing text.

    Each of the four tag markers that occurs exactly once adds ``0.125``. For
    the two answer-tag markers, ``0.001`` per character is subtracted for the
    text found after the last closing-answer marker (one character is exempted
    in the ``"\\n</answer>"`` case). If the closing marker is missing, the
    split yields the whole text, so the penalty is based on its full length.

    Returns:
        One score per completion, read from ``completion[0]["content"]``.
    """

    def count_xml(text) -> float:
        score = 0.0
        # Opening and closing reasoning markers: a flat bonus each.
        for marker in ("<reasoning>\n", "\n</reasoning>\n"):
            if text.count(marker) == 1:
                score += 0.125
        if text.count("\n<answer>\n") == 1:
            score += 0.125
            trailing = text.split("\n</answer>\n")[-1]
            score -= len(trailing) * 0.001
        if text.count("\n</answer>") == 1:
            score += 0.125
            trailing = text.split("\n</answer>")[-1]
            score -= (len(trailing) - 1) * 0.001
        return score

    return [count_xml(completion[0]["content"]) for completion in completions]

### Define Multi-Config Knobs for Model, LoRA, and GRPO Trainer using RapidFire AI Wrapper APIs

In [7]:
# LoRA adapter settings; both model configs below pass this as peft_config.
lora_config = RFLoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
)

# Baseline GRPO trainer arguments; grpo_config2 below is derived from this object.
grpo_config1 = RFGRPOConfig(
    # vLLM settings
    use_vllm=True,
    vllm_mode="colocate",
    vllm_gpu_memory_utilization=0.1,
    vllm_tensor_parallel_size=1,
    # Learning rate, regularization and schedule
    learning_rate=5e-6,
    warmup_ratio=0.1,
    weight_decay=0.1,
    max_grad_norm=0.1,
    adam_beta1=0.9,
    adam_beta2=0.99,
    lr_scheduler_type="linear",
    # Batch size, generation count, optimizer, epochs and length limits
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    num_generations=4,
    optim="adamw_torch",
    num_train_epochs=1,
    max_prompt_length=1024,
    max_completion_length=1024,
    # Logging and evaluation cadence
    logging_steps=2,
    log_level="error",
    eval_steps=5,
    # Compilation and precision flags
    torch_compile=False,
    bf16=True,
    fp16=False,
    # FSDP settings
    fsdp="full_shard auto_wrap",
    fsdp_config={
        "backward_prefetch": "backward_pre",
        "forward_prefetch": True,
        "use_orig_params": True,
        "cpu_ram_efficient_loading": True,
        "offload_params": True,
        "sync_module_states": True,
        "min_num_params": 1000000,
        "limit_all_gathers": True,
        "sharding_strategy": "FULL_SHARD",
        "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    },
)

# Second variant: a copy of the baseline with only the learning rate changed.
grpo_config2 = grpo_config1.copy()
grpo_config2.learning_rate = 1e-5

# Reward functions handed to every config; each is defined in the
# "Define Custom Reward Functions" cell above.
reward_funcs = [
    correctness_reward_func,
    int_reward_func,
    strict_format_reward_func,
    soft_format_reward_func,
    xmlcount_reward_func,
]

# List of 2 separate configs: same base model, LoRA config, formatting function and
# reward functions; the first uses grpo_config1 and the second uses grpo_config2.
# NOTE(review): the two entries also differ in model_kwargs (device_map, torch_dtype,
# use_cache), so the runs differ in more than training_args — confirm this is intended.
config_set = List([
    RFModelConfig(
        model_name="Qwen/Qwen2.5-0.5B-Instruct",
        peft_config=lora_config,
        training_args=grpo_config1,
        formatting_func=sample_formatting_function,
        reward_funcs=reward_funcs,
        model_kwargs={"device_map": None, "torch_dtype": "bfloat16"},
        tokenizer_kwargs={"model_max_length": 2048, "padding_side": "left", "truncation": True}
    ),
    RFModelConfig(
        model_name="Qwen/Qwen2.5-0.5B-Instruct",
        peft_config=lora_config,
        training_args=grpo_config2,
        formatting_func=sample_formatting_function,
        reward_funcs=reward_funcs,
        model_kwargs={"device_map": "auto", "torch_dtype": "auto", "use_cache": False},
        tokenizer_kwargs={"model_max_length": 2048, "padding_side": "left", "truncation": True}
    )
])



#### Define Model Creation Function

In [8]:
def sample_create_model(model_config):
    """Build the model and tokenizer for a single config.

    Args:
        model_config: Mapping with ``"model_name"``, ``"model_kwargs"`` and
            ``"tokenizer_kwargs"`` entries.

    Returns:
        Tuple ``(model, tokenizer)``. The model is loaded with
        ``attn_implementation="eager"`` in addition to ``model_kwargs``.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Read every entry up front so a missing key fails before anything is loaded.
    checkpoint = model_config["model_name"]
    load_options = model_config["model_kwargs"]
    tok_options = model_config["tokenizer_kwargs"]

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        **load_options,
        attn_implementation="eager",
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, **tok_options)
    return (model, tokenizer)

#### Generate Config Group

In [None]:
# Simple grid search across all sets of config knob values.
# NOTE(review): config_set holds 2 RFModelConfig entries and no nested List(...) knobs
# are visible in them, so this should yield 2 combinations rather than 4 — confirm.
config_group = RFGridSearch(
    configs=config_set,
    trainer_type="GRPO",
)

: 

### Run Multi-Config Training

In [None]:
# Launch training of all configs in the config_group with swap granularity of 4 chunks.
# NOTE(review): the recorded output below shows Run 1 failing in the vLLM subprocess
# ("'LLMEngine' object has no attribute 'model_executor'"); this looks like a library
# version incompatibility — verify the installed TRL / vLLM versions.
experiment.run_fit(
    config_group,
    sample_create_model,
    train_dataset,
    eval_dataset,
    num_chunks=4,
    num_gpus=2,
    seed=42,
)

Started 2 worker processes successfully
Created workers


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.44it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.43it/s]
[0;36m(EngineCore_DP0 pid=1156003)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.80it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  7.79it/s]
[0;36m(EngineCore_DP0 pid=1156000)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:00<00:00, 56.85it/s][0;36m(EngineCore_DP0 pid=1156000)[0;0m 
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:00<00:00, 57.36it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:00<00:00, 61.07it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:00<00:00, 60.49it/

Run 1 has failed: vLLM subprocess error: 'LLMEngine' object has no attribute 'model_executor'Traceback (most recent call last):
  File "/home/palebluedot/rapidfireai/rapidfireai/backend/worker.py", line 498, in serve_forever
    self.run_fit(run_id, chunk_id, multi_worker_details, create_model_fn)
  File "/home/palebluedot/rapidfireai/rapidfireai/backend/worker.py", line 321, in run_fit
    trainer_instance.train()
  File "/home/palebluedot/rapidfireai/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2325, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/palebluedot/rapidfireai/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/palebluedot/rapidfireai/.venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 1168, in t

### End Current Experiment

In [None]:
# End the experiment created in the "Initialize Experiment" cell.
experiment.end()