# GRPO Reasoning Training with Unsloth

This notebook demonstrates how to train a model to "think" (Reasoning/Chain-of-Thought) using **GRPO (Group Relative Policy Optimization)**.

We adapt a standard Instruct model (e.g., `Qwen2.5-3B-Instruct`) to output reasoning traces within `<think>` tags via Reinforcement Learning, using the **Open R1** approach.

**Hardware**: Designed to run on a free **Colab T4 GPU (16 GB VRAM)** instance.

In [None]:
%%capture
# Dependency installation cell.
# `%%capture` swallows all stdout/stderr of this cell, so installation errors are
# NOT shown; remove it temporarily when debugging a failed install.
import os
# `uv` is installed first because the Colab branch and the final pins below use `uv pip`.
!pip install --upgrade -qqq uv
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    # NOTE(review): these two packages are unpinned here, unlike the Colab branch.
    !pip install unsloth vllm
else:
    # Specific versions for Colab T4 compatibility from Unsloth
    # Pin numpy/pillow to the versions already installed so the upgrade below does not
    # replace them; fall back to unpinned names if either import fails.
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    # Detect a Tesla T4 from the `nvidia-smi` output; any failure (e.g. no GPU or no
    # `nvidia-smi` binary) is treated as "not a T4".
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    
    # vLLM 0.9.2 is required for T4 to avoid 'fileno' and other issues
    # T4 -> vllm 0.9.2 + triton 3.2.0; any other GPU -> vllm 0.10.2 + unpinned triton.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    # `{name}` is IPython interpolation of the Python variables defined above.
    !uv pip install -qqq --upgrade \
        unsloth {get_vllm} {get_numpy} {get_pil} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
# The two pins below are at top level, so they run in BOTH the Colab and non-Colab paths.
# trl is installed with --no-deps so it does not alter the packages installed above.
!uv pip install transformers==4.56.2
!uv pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel, PatchFastRL
from unsloth import is_bfloat16_supported
import torch

# Patch GRPO for optimizations
# This call is placed before the model is loaded and before `trl` is imported in a
# later cell; keep that ordering when editing.
PatchFastRL("GRPO", FastLanguageModel)

# Tunable settings shared by the model loader and the LoRA wrapper below.
max_seq_length = 1024 # Can increase for longer reasoning chains
lora_rank = 32 # Larger rank = smarter, but slower
gpu_memory_utilization = 0.6 # Adjustable for T4 (0.6 is safe)

# Load Model - Using Qwen2.5-3B-Instruct for speed/memory efficiency on T4
# (You can switch to "unsloth/Qwen2.5-7B-Instruct-bnb-4bit" if you have A100/L4)
model_name = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit"

# Returns the base model and its tokenizer. `max_lora_rank` is set to the same
# `lora_rank` that is used for the adapter created below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = gpu_memory_utilization,
)

# Wrap the base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# wrapped model, so re-running this cell alone would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank, # alpha == r here
    use_gradient_checkpointing = "unsloth", # Enable long context support
    random_state = 3407, # Fixed seed for reproducibility
)

## Data Preparation
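We use the `open-r1/DAPO-Math-17k-Processed` dataset (GSM8K would work as an alternative). We define a system prompt that instructs the model to follow a strict output format: reasoning inside `<think>` tags, followed by the final answer.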

In [None]:
from datasets import load_dataset

# System prompt to encourage thinking. The reward functions defined later look for
# the literal <think> ... </think> tags requested here.
system_prompt = """You are a helpful assistant. You are given a problem.
You must think about the problem and provide your working out inside <think> and </think> tags.
Then, provide the final answer."""

# The dataset is published with several configurations; select the English one
# explicitly so that loading does not depend on a default configuration and the
# problems match the English system prompt above.
dataset = load_dataset("open-r1/DAPO-Math-17k-Processed", "en", split = "train")

# Turn one raw dataset row into the chat-style record consumed by GRPO
def process_data(x):
    """Build the training record for a single dataset row.

    Args:
        x: A row providing the problem text under "prompt" and the reference
            answer under "solution".

    Returns:
        A dict with "prompt" (system turn followed by user turn) and "answer"
        (the unmodified reference solution, used later for verification).
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": x["prompt"]}
    conversation = [system_turn, user_turn]
    return {"prompt": conversation, "answer": x["solution"]}

# Apply process_data to every row; its "prompt" value (a list of chat messages)
# replaces the original "prompt" column and an "answer" column is added.
dataset = dataset.map(process_data)
# Filter for length to avoid OOM
# The limit is 500 *characters* of the user message (index 1 of the chat list),
# not tokens, and it does not count the system prompt.
dataset = dataset.filter(lambda x: len(x["prompt"][1]["content"]) < 500)

## Reward Functions
GRPO uses reward functions to guide the model. We use two:
1. **Format Reward**: Gives 0.5 if the response contains a `<think>...</think>` block, otherwise 0.
2. **Correctness Reward**: Gives 1.0 if the last number after `</think>` matches the reference solution, otherwise 0.

In [None]:
import re

# 1. Format Reward: Reward for using <think> tags
def format_reward_func(completions, **kwargs):
    """Score each completion on whether it contains a <think>...</think> block.

    Args:
        completions: Sequence of completions; the text is read from
            ``completion[0]["content"]`` of each one.
        **kwargs: Ignored; accepted so extra columns can be passed through.

    Returns:
        A list with one float per completion: 0.5 if a ``<think>`` tag is
        followed (anywhere later, newlines included) by ``</think>``, else 0.0.
    """
    think_block = re.compile(r"<think>.*?</think>", re.DOTALL)
    rewards = []
    for completion in completions:
        text = completion[0]["content"]
        rewards.append(0.5 if think_block.search(text) else 0.0)
    return rewards

# 2. Correctness Reward: Check if the number in the response matches the solution
# This is a simplified regex matcher for numbers
def correctness_reward_func(prompts, completions, answer, **kwargs):
    """Score each completion on whether its final number equals the reference answer.

    The text after the last ``</think>`` is scanned for numbers and the last one
    found is compared numerically with the corresponding entry of ``answer``.

    Args:
        prompts: Unused; part of the reward-function signature.
        completions: Sequence of completions; the text is read from
            ``completion[0]["content"]`` of each one.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored; accepted so extra columns can be passed through.

    Returns:
        A list with one float per completion: 1.0 when the extracted number is
        within 1e-6 of the reference, 0.0 otherwise (including when there is no
        ``</think>``, no number after it, or the reference is not numeric).
    """
    responses = [completion[0]["content"] for completion in completions]

    # --- ADDED: Print Output for User ---
    # Guarded so an empty batch does not raise IndexError.
    if responses:
        print(f"\n\nGenerated Response:\n{responses[0]}\n" + "-" * 50)
    # ------------------------------------

    # The optional sign applies to integers as well as decimals, so "-5" is read
    # as -5 rather than 5. A sign directly after a digit or "." is treated as a
    # binary operator ("10-3" -> "10", "3"), not as part of the number.
    number_pattern = re.compile(r"(?:(?<![\d.])[-+])?(?:\d*\.\d+|\d+)")

    extracted_responses = []
    for r in responses:
        # Only the text after the last </think> counts as the final answer.
        _, closing_tag, final_part = r.rpartition("</think>")
        nums = number_pattern.findall(final_part) if closing_tag else []
        extracted_responses.append(nums[-1] if nums else None)

    scores = []
    for guess, true_ans in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0.0)
            continue

        # Very basic fuzzy match
        try:
            is_correct = abs(float(guess) - float(true_ans)) < 1e-6
        except (TypeError, ValueError):
            # Reference answer is missing or not a plain number: no reward.
            is_correct = False
        scores.append(1.0 if is_correct else 0.0)
    return scores

## Training Configuration

In [None]:
from trl import GRPOConfig, GRPOTrainer

# Hyperparameters for the GRPO run. Values are sized for a short demo run.
training_args = GRPOConfig(
    output_dir = "grpo_output", # Output directory for the run
    run_name = "grpo_thinking_run",
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    logging_steps = 1, # Log every step
    # Exactly one of bf16/fp16 is enabled, depending on hardware support.
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_generations = 4, # How many attempts per prompt (group size)
    # Prompt + completion budget (256 + 200) stays below max_seq_length (1024)
    # set when the model was loaded.
    max_prompt_length = 256,
    max_completion_length = 200, # Limit generation length for speed
    max_steps = 250, # Keep it short for demo
    save_steps = 50,
    max_grad_norm = 0.1,
    report_to = "none", # No external experiment tracker
    use_vllm = True, # Use vLLM for fast generation
    # NOTE(review): the model was already loaded with gpu_memory_utilization = 0.6;
    # confirm whether this separate 0.3 setting takes effect in this setup.
    vllm_gpu_memory_utilization = 0.3, # Reserved for vLLM
)

# Both reward functions are passed to the trainer for every completion.
# NOTE(review): the maximum combined reward appears to be 0.5 + 1.0; confirm how
# TRL combines multiple rewards.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [format_reward_func, correctness_reward_func],
    args = training_args,
    train_dataset = dataset,
)

# Runs the full training loop (max_steps = 250); correctness_reward_func prints a
# sample generation each time it is called.
trainer.train()

## Inference Test

In [None]:
# Test the trained model on a simple prompt, using the same system prompt as training.
text = tokenizer.apply_chat_template([
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "What is 15 * 7?"}
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)

# Save the trained LoRA adapter and hand it to vLLM for generation.
# With lora_request = None, fast_generate runs the base model WITHOUT the adapter,
# so the output would not reflect the GRPO training above.
lora_path = "grpo_saved_lora"
model.save_lora(lora_path)

output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = model.load_lora(lora_path),
)[0].outputs[0].text

print(output)