In [None]:
%%capture
# Skip restarting message in Colab
# Snapshot the names of all loaded modules, then drop every cached module whose
# name contains "PIL" or "google" from sys.modules.
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

# Install Unsloth and vLLM (no versions are pinned here).
!pip install unsloth vllm
# Upgrade Pillow to the newest available release.
!pip install --upgrade pillow
# Install TRL from one specific, pinned commit of the Hugging Face repository.
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b

### Unsloth

Call `PatchFastRL` before running any other code so that GRPO and other RL algorithms are patched!

In [None]:
from unsloth import FastLanguageModel, PatchFastRL
# Apply Unsloth's GRPO patch for FastLanguageModel; this must run before the
# model-loading and training cells that follow.
PatchFastRL("GRPO", FastLanguageModel)

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-13 05:18:07 __init__.py:190] Automatically detected platform cuda.


### Load the model

In [None]:
from unsloth import is_bfloat16_supported
import torch
# Shared settings reused by both the model loader and the LoRA wrapper below.
max_seq_length = 512 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

# Load the base model and its tokenizer in 4-bit, with vLLM fast inference enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

# Attach LoRA adapters to the attention (q/k/v/o) and MLP (gate/up/down)
# projection layers; lora_alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-bnb-4bit with actual GPU utilization = 59.59%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.61 GB. Also swap space = 2 GB.
INFO 02-13 05:18:58 config.py:542] This model supports multiple tasks: {'classify', 'embed', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 02-13 05:19:02 cuda.py:179] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 02-13 05:19:02 cuda.py:227] Using XFormers backend.
INFO 02-13 05:19:02 model_runner.py:1110] Starting to load model unsloth/meta-llama-3.1-8b-instruct-bnb-4bit...
INFO 02-13 05:19:02 loader.py:1102] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 02-13 05:19:03 weight_utils.py:252] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-13 05:20:27 model_runner.py:1115] Loading model weights took 5.3541 GB
INFO 02-13 05:20:27 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-13 05:20:39 worker.py:267] Memory profiling takes 11.26 seconds
INFO 02-13 05:20:39 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.60) = 8.78GiB
INFO 02-13 05:20:39 worker.py:267] model weights take 5.35GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.74GiB; the rest of the memory reserved for KV Cache is 2.64GiB.
INFO 02-13 05:20:39 executor_base.py:110] # CUDA blocks: 1353, # CPU blocks: 1024
INFO 02-13 05:20:39 executor_base.py:115] Maximum concurrency for 512 tokens per request: 42.28x
INFO 02-13 05:20:41 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occ

Capturing CUDA graph shapes: 100%|██████████| 23/23 [00:40<00:00,  1.76s/it]

INFO 02-13 05:21:22 model_runner.py:1562] Graph capturing finished in 40 secs, took 0.58 GiB
INFO 02-13 05:21:22 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 54.41 seconds





tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth 2025.2.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Dataset preparation

In [None]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
# System prompt placed in front of every question; it asks the model to wrap its
# output in <reasoning>...</reasoning> followed by <answer>...</answer>.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template for a fully formatted response with {reasoning} and {answer} placeholders.
# NOTE(review): not referenced by any other cell visible in this notebook.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text found between the answer tags of a completion.

    Everything after the last "<answer>" is kept, then cut at the first
    "</answer>" that follows. A missing tag simply leaves that side uncut, so
    text without any tags is returned whole (stripped).
    """
    # rpartition yields the full text as the tail when "<answer>" is absent.
    _, _, after_open = text.rpartition("<answer>")
    # partition yields the full input as the head when "</answer>" is absent.
    inner, _, _ = after_open.partition("</answer>")
    return inner.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# For 1-shot prompting, insert an example user/assistant exchange between the
# system and user messages built below.
def get_gsm8k_questions(split = "train") -> Dataset:
    """Load a GSM8K split and add chat-style 'prompt' and reference 'answer' columns.

    Args:
        split: name of the split to select from the 'openai/gsm8k' ('main') dataset.

    Returns:
        The selected split where each row has 'prompt' (system + user messages)
        and 'answer' (the text after "####" in the original answer, or None).
    """
    def _to_chat_example(row):
        # System prompt first, then the raw question as the user turn.
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': row['question']},
            ],
            'answer': extract_hash_answer(row['answer']),
        }

    gsm8k = load_dataset('openai/gsm8k', 'main') # type: ignore
    return gsm8k[split].map(_to_chat_example) # type: ignore

# Build the training set from the default ("train") split.
dataset = get_gsm8k_questions()

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

### Reward functions

In [None]:
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Give 2.0 to every completion whose extracted answer equals its reference answer.

    Args:
        prompts: chat prompts; only the last message of the first prompt is read (for logging).
        completions: one list of messages per sample; the first message's 'content' is scored.
        answer: reference answers, paired positionally with ``completions``.
        **kwargs: accepted and ignored.

    Returns:
        One float per (completion, answer) pair: 2.0 on exact string equality, else 0.0.
    """
    texts = [messages[0]['content'] for messages in completions]
    question = prompts[0][-1]['content']
    predictions = [extract_xml_answer(text) for text in texts]
    # Log the first sample of the batch so progress can be inspected by eye.
    print('-'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{texts[0]}", f"\nExtracted:\n{predictions[0]}")
    rewards = []
    for predicted, expected in zip(predictions, answer):
        rewards.append(2.0 if predicted == expected else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to every completion whose extracted answer consists only of digits.

    The check is ``str.isdigit`` on the extracted answer, so signs, decimal
    points, separators and empty answers all score 0.0.
    """
    rewards = []
    for messages in completions:
        candidate = extract_xml_answer(messages[0]['content'])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line XML layout.

    The whole completion must be ``<reasoning>``, the reasoning text,
    ``</reasoning>``, ``<answer>``, the answer text and ``</answer>``, each tag
    on its own line, ending with a newline. Reasoning and answer may span
    several lines.

    Args:
        completions: one list of messages per sample; the first message's "content" is checked.
        **kwargs: accepted and ignored.

    Returns:
        0.5 for each completion that matches the layout, otherwise 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it a completion whose reasoning
    # or answer spans more than one line could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that start with a reasoning block followed by an answer block.

    Looser than the strict check: the completion only has to begin with
    ``<reasoning>...</reasoning>``, optional whitespace, then
    ``<answer>...</answer>``. Newlines around and inside the tag contents are
    allowed, and text after ``</answer>`` is ignored.

    Args:
        completions: one list of messages per sample; the first message's "content" is checked.
        **kwargs: accepted and ignored.

    Returns:
        0.5 for each completion that matches, otherwise 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it the newline-wrapped layout
    # requested by the system prompt could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """Score how closely ``text`` follows the expected XML tag layout.

    Each of the four tag markers that occurs exactly once adds 0.125, so a
    fully well-formed completion scores 0.5. Text after the closing answer tag
    is penalised at 0.001 per character, once in the ``<answer>`` check and
    once in the ``</answer>`` check. A single character directly after
    ``\n</answer>`` (normally the final newline) is not penalised.

    Args:
        text: the completion text to score.

    Returns:
        The accumulated score; it can be negative when trailing text is long.
    """
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        # Only measure trailing text when the closing tag is really there.
        # Otherwise split() returns the whole completion and the penalty would
        # be the full text length (e.g. for output ending in "</answer>" with
        # no final newline, or for a truncated completion).
        if "\n</answer>\n" in text:
            count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        # The "- 1" forgives the newline after the tag; clamp at zero so a
        # completion without that newline does not receive a small bonus.
        count -= max(len(text.split("\n</answer>")[-1]) - 1, 0)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Score each completion's XML tag layout with ``count_xml``.

    Only the "content" of the first message of every completion is scored;
    extra keyword arguments are accepted and ignored.
    """
    scores = []
    for messages in completions:
        scores.append(count_xml(messages[0]["content"]))
    return scores

### Train the model

In [None]:
from trl import GRPOConfig, GRPOTrainer
# GRPO training configuration. Precision follows the hardware check: bf16 when
# is_bfloat16_supported() is true, fp16 otherwise (the two flags are never both set).
# max_prompt_length + max_completion_length = 456, which is below the
# max_seq_length of 512 used when loading the model.
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 200,
    save_steps = 200,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!

You might have to wait 150 to 200 steps before the reward starts to improve, and you'll probably get 0 reward for the first 100 steps. Please be patient!

In [None]:
# Build the GRPO trainer with the LoRA-wrapped model, the five reward functions
# defined above and the GSM8K training set, then start training.
# correctness_reward_func prints one sample question/answer/response per call.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 200
 "-____-"     Number of trainable parameters = 83,886,080


-------------------- Question:
Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers? 
Answer:
100 
Response:
</reasoning>Assume Ahmed's grade in the remaining 9 assignments is x. Emily's total grade in the first 9 assignments is 92. We can represent this as 92 + the grade on the final assignment. Since the final assignment is worth the same as the other assignments, we can say the grade on the final assignment is 0.9x, since it is 90% of the total of the other assignments. Now, we can write the equation for the total grades as follows: 

92 + 0.9x = 1x + 90 

Subtracting 92 from both sides gives us 0.9x = x - 2. 
Subtracting 0.9x from both sides gives us 0 = 0.1x - 2. Subse

Step,Training Loss,reward,reward_std,completion_length,kl
1,0.0,0.0,0.0,196.5,0.0
2,0.0,0.040667,0.099613,183.5,0.0
3,0.0,-0.0425,0.070648,131.166672,5e-06
4,0.0,0.780667,1.213525,188.166672,7e-06
5,0.0,0.020833,0.051031,195.0,7e-06
6,0.0,-0.097333,0.159873,144.333344,7e-06
7,0.0,-0.033667,0.10937,117.666672,8e-06
8,0.0,1.158667,1.224857,163.833344,6e-06
9,0.0,0.0,0.0,147.166672,7e-06
10,0.0,0.406833,0.996534,112.666672,1.6e-05


-------------------- Question:
The gauge on a water tank shows that the tank is 1/3 full of water. To fill the tank, 16 gallons of water are added. How many gallons of water does the tank hold when full? 
Answer:
24 
Response:
Let's break this problem down. 

If the tank is 1/3 full and 16 gallons are added to fill it, then the amount of water added is equal to the amount that was already in the tank (1/3 of the total capacity) plus the amount needed to fill it (2/3 of the total capacity). 

Let x be the capacity of the tank in gallons. 

The equation becomes x - (1/3)x + 16 = x.

To simplify the equation, we can combine the x terms: (2/3)x + 16 = x.

Next, subtract (2/3)x from both sides of the equation: (1/3)x = 16.

Now, multiply both sides of the equation by 3 to get rid of the fraction: x = 16 * 3

x = 48

The tank holds 48 gallons of water when full. 
Extracted:
Let's break this problem down. 

If the tank is 1/3 full and 16 gallons are added to fill it, then the amount of water 

Step,Training Loss,reward,reward_std,completion_length,kl
1,0.0,0.0,0.0,196.5,0.0
2,0.0,0.040667,0.099613,183.5,0.0
3,0.0,-0.0425,0.070648,131.166672,5e-06
4,0.0,0.780667,1.213525,188.166672,7e-06
5,0.0,0.020833,0.051031,195.0,7e-06
6,0.0,-0.097333,0.159873,144.333344,7e-06
7,0.0,-0.033667,0.10937,117.666672,8e-06
8,0.0,1.158667,1.224857,163.833344,6e-06
9,0.0,0.0,0.0,147.166672,7e-06
10,0.0,0.406833,0.996534,112.666672,1.6e-05


-------------------- Question:
Carolyn buys a silverware set with 6 knives, 12 forks, and three times as many spoons as knives. Then her friend trades her 10 knives for 6 spoons. What percentage of Carolyn's silverware is knives? 
Answer:
40 
Response:
Let's first determine the initial number of spoons Carolyn has. Since Carolyn has three times as many spoons as knives, and she has 6 knives, she has 6 * 3 = 18 spoons.

Initially, she has a total of 6 knives, 12 forks, and 18 spoons.

After the trade, she loses 4 knives and gains 6 spoons. Now, Carolyn has 2 knives, 12 forks, and 24 spoons.

The total number of pieces of silverware she has is 2 + 12 + 24 = 38.

The percentage of silverware that is knives is (number of knives / total number of pieces) * 100 = (2 / 38) * 100 = 5.26% (rounded to two decimal places).

<answer>
5.26%
</answer> 
Extracted:
5.26%
-------------------- Question:
James runs a TV show and there are 5 main characters and 4 minor characters.  He pays the minor chara

TrainOutput(global_step=200, training_loss=0.0010307767488005838, metrics={'train_runtime': 9303.809, 'train_samples_per_second': 0.021, 'train_steps_per_second': 0.021, 'total_flos': 0.0, 'train_loss': 0.0010307767488005838})

### Inference

First, generate without the GRPO-trained LoRA adapter (no system prompt, `lora_request = None`):

In [None]:
# Build a chat-formatted prompt containing only the user message (no system prompt).
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "Calculate pi."},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
# Sampling settings for generation.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
# lora_request = None: generate without applying any saved LoRA adapter.
# Keep only the text of the first output for the single prompt.
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:25<00:00, 25.45s/it, est. speed input: 1.53 toks/s, output: 18.62 toks/s]


"Calculating pi is a complex task that has been the subject of much mathematical study and experimentation over the centuries. Here are a few ways to calculate pi:\n\n**Method 1: Archimedes' Method (circa 250 BCE)**\n\nArchimedes approximated pi by inscribing and circumscribing polygons around a circle and calculating their perimeters. For example:\n\n* For a hexagon (6-sided polygon), the perimeter is 6 × (side length) = 6 × 2 × (radius) × (tan(π/6)) = 6 × 2 × (radius) × (sqrt(3) / 3)\n* The ratio of the perimeter of the hexagon to the diameter is (6 × 2 × (sqrt(3) / 3)) / (2 × radius) = (6 × sqrt(3) / 3) / (radius)\n* As the number of sides increases, the ratio approaches pi\n\nUsing this method, Archimedes approximated pi as being between 3 10/71 and 3 1/7.\n\n**Method 2: Leibniz Formula (1671)**\n\nGottfried Wilhelm Leibniz found a series expansion for pi:\n\nπ/4 = 1 - 1/3 + 1/5 - 1/7 + 1/9 - ...\n\nThis series is known as the Leibniz formula for pi.\n\n**Method 3: Taylor Series (1

In [None]:
# Save the LoRA adapter to the "llama-grpo_saved_lora" directory (relative path).
model.save_lora("llama-grpo_saved_lora")

Now we load the LoRA and test:

In [None]:
# Same question as before, now with the format-enforcing system prompt prepended.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Calculate pi."},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
# Sampling settings for generation.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
# Generate with the adapter loaded from the "llama-grpo_saved_lora" directory
# and keep only the text of the first output.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("llama-grpo_saved_lora"),
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:25<00:00, 25.82s/it, est. speed input: 2.36 toks/s, output: 17.51 toks/s]


"<reasoning>\n\nTo calculate pi (π), we can use the formula for the area of a circle, A = πr^2, where A is the area and r is the radius of the circle. However, since we don't know the exact value of pi, we can use an approximation method.\n\nOne way to approximate pi is by using the Taylor series expansion of the arctangent function. The series is given by:\n\narctan(x) = x - x^3/3 + x^5/5 - x^7/7 + ...\n\nWe can rearrange this series to solve for pi by using the fact that arctan(1) = π/4.\n\nπ/4 = arctan(1) = 1 - 1/3 + 1/5 - 1/7 + ...\n\nMultiplying both sides by 4, we get:\n\nπ = 4 * (1 - 1/3 + 1/5 - 1/7 + ...)\n\nThis series is an alternating series, which means that it converges to a finite limit. We can approximate pi by summing up the terms of the series.\n\nHowever, this is not a practical way to calculate pi. A more practical method is to use the Monte Carlo method, which involves generating random points inside and outside a circle and calculating the ratio of points inside th

In [None]:
# Arithmetic word problem with the format-enforcing system prompt prepended.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "At the Bertolli Farm, they grow 2073 tomatoes, 4112 cobs of corn, and 985 onions. How many fewer onions are grown than tomatoes and corn together? "},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
# Sampling settings for generation.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
# Generate with the adapter loaded from the "llama-grpo_saved_lora" directory
# and keep only the text of the first output.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("llama-grpo_saved_lora"),
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it, est. speed input: 14.46 toks/s, output: 16.55 toks/s]


'<reasoning>\nTo find the difference, we first need to calculate the total number of tomatoes and corn. We add the number of tomatoes and the number of corn: 2073 (tomatoes) + 4112 (corn) = 6185. \nNow, we need to find the difference between this total and the number of onions. To do that, we subtract the number of onions from the total: 6185 - 985 = 6200. \n</reasoning>\n<answer>\n6200\n</answer>'

The reasoning is now well structured, but the arithmetic is wrong: the model computed 6185 - 985 = 6200.

The correct final answer is 6185 - 985 = 5200.

### Push to Hub

In [None]:
from google.colab import userdata
from huggingface_hub import login, HfApi, create_repo

In [None]:
# Log in to the Hugging Face Hub with the token stored in the Colab secret
# named 'niru_hf_write', so no token is hard-coded in the notebook.
login(token=userdata.get('niru_hf_write'))

In [None]:
# Create the private model repository on the Hub. exist_ok=True keeps this cell
# re-runnable: an already existing repository is reused instead of raising.
create_repo("nirusanan/GRPO-llama3.1-reasoning", private=True, exist_ok=True)

RepoUrl('https://huggingface.co/nirusanan/GRPO-llama3.1-reasoning', endpoint='https://huggingface.co', repo_type='model', repo_id='nirusanan/GRPO-llama3.1-reasoning')

In [None]:
# Hub API client used for the upload in the next cell.
api = HfApi()

In [None]:
# Upload the LoRA adapter directory written by model.save_lora(...) earlier.
# The relative path matches the one used when saving and loading the adapter,
# so this cell no longer depends on the Colab-specific /content directory.
api.upload_folder(
    folder_path="llama-grpo_saved_lora",
    repo_id="nirusanan/GRPO-llama3.1-reasoning",
)