In [None]:
!pip install -q vllm

In [None]:
!pip install -q trl datasets peft

In [None]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer

# Load and prep dataset

# System message used for the chat prompts in this notebook: it instructs the
# model to put its working inside <reasoning> tags and its final result inside
# <answer> tags.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template with the same tag layout, exposing `reasoning` and `answer` as
# str.format placeholders.
# NOTE(review): not referenced by any of the visible cells - confirm before removing.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

INFO 02-16 11:38:26 __init__.py:190] Automatically detected platform cuda.


Import the GSM8K dataset and restructure it into a conversational prompt format.

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text enclosed by the last ``<answer>`` tag.

    Everything after the final ``<answer>`` marker is taken and then cut at the
    first ``</answer>`` that follows it. Missing markers are tolerated: without
    ``<answer>`` the whole of ``text`` is considered, and without ``</answer>``
    the result runs to the end of the string.
    """
    _, _, after_open = text.rpartition("<answer>")
    inside, _, _ = after_open.partition("</answer>")
    return inside.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# Each record becomes a two-message chat (system + user); for 1-shot prompting,
# an example exchange could be inserted between those two messages.
def get_gsm8k_questions(split = "train") -> Dataset:
    """Load one split of ``openai/gsm8k`` ("main" config) in chat-prompt form.

    Every example is mapped to:
      * ``prompt``: a system message holding ``SYSTEM_PROMPT`` followed by a
        user message holding the example's ``question``;
      * ``answer``: the result of ``extract_hash_answer`` on the example's
        original ``answer`` field.

    Args:
        split: key of the split to return (default ``"train"``).

    Returns:
        The mapped dataset for the requested split.
    """
    def _to_chat(example):
        system_msg = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_msg = {'role': 'user', 'content': example['question']}
        return {
            'prompt': [system_msg, user_msg],
            'answer': extract_hash_answer(example['answer']),
        }

    all_splits = load_dataset('openai/gsm8k', 'main')  # type: ignore
    return all_splits[split].map(_to_chat)  # type: ignore

# Build the training data with the function's default split ("train").
dataset = get_gsm8k_questions()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

# Reward Functions

`correctness_reward_func`: Compares extracted answers from model completions with the correct answer and assigns a reward of 2.0 for a match, otherwise 0.0.

`int_reward_func`: Checks whether the extracted answer consists only of digits (`str.isdigit`) and assigns a reward of 0.5 if so, otherwise 0.0.

`strict_format_reward_func`: Verifies if the response strictly follows a predefined XML format and assigns 0.5 if it matches, otherwise 0.0.

`soft_format_reward_func`: Loosely checks that the response begins with `<reasoning>...</reasoning>` followed by `<answer>...</answer>` (only whitespace is allowed between the two blocks) and assigns 0.5 if so, otherwise 0.0.

`count_xml`: Scores how closely the response follows the XML layout: each of the four tag markers that occurs exactly once adds 0.125, and a small penalty (0.001 per character) is subtracted for extra text after `</answer>`.

`xmlcount_reward_func`: Applies count_xml to each response in the completions and returns a list of XML format scores.

In [None]:
# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Give 2.0 to each completion whose extracted answer equals the reference.

    The text inside the ``<answer>`` tag of every completion (first message's
    ``content``) is compared, as an exact string, with the matching entry of
    ``answer``. As a debugging aid, the first question, reference answer,
    raw response and extracted answer of the batch are printed.

    Args:
        prompts: chat prompts; only the last message of the first prompt is read.
        completions: one list of messages per sample.
        answer: reference answers, aligned with ``completions``.

    Returns:
        One reward per completion: 2.0 on an exact match, otherwise 0.0.
    """
    responses = []
    for completion in completions:
        responses.append(completion[0]['content'])
    question = prompts[0][-1]['content']
    extracted = list(map(extract_xml_answer, responses))

    print('-'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted[0]}")

    rewards = []
    for got, expected in zip(extracted, answer):
        rewards.append(2.0 if got == expected else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Give 0.5 to each completion whose extracted answer is made of digits only.

    The answer is taken from the first message's ``content`` with
    ``extract_xml_answer`` and tested with ``str.isdigit``.

    Returns:
        One reward per completion: 0.5 or 0.0.
    """
    rewards = []
    for completion in completions:
        candidate = extract_xml_answer(completion[0]['content'])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line XML layout.

    The whole completion (first message's ``content``) must consist of
    ``<reasoning>``, the reasoning text, ``</reasoning>``, ``<answer>``, the
    answer text and ``</answer>``, each tag on its own line and with a final
    newline after ``</answer>``.

    Args:
        completions: one list of messages per sample.

    Returns:
        One reward per completion: 0.5 if the layout matches, otherwise 0.0.
    """
    # re.DOTALL lets `.` cross newlines; without it any reasoning or answer that
    # spans more than one line could never match and would always score 0.0.
    pattern = re.compile(
        r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$",
        flags=re.DOTALL,
    )
    responses = [completion[0]["content"] for completion in completions]
    return [0.5 if pattern.match(r) else 0.0 for r in responses]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that start with a reasoning block and an answer block.

    The completion (first message's ``content``) must begin with
    ``<reasoning>...</reasoning>``, followed by optional whitespace and
    ``<answer>...</answer>``. Text after ``</answer>`` is not checked.

    Args:
        completions: one list of messages per sample.

    Returns:
        One reward per completion: 0.5 if the layout matches, otherwise 0.0.
    """
    # re.DOTALL lets `.` cross newlines. Without it a completion in the requested
    # format (tags on their own lines) contains newlines inside the tags and
    # would never match.
    pattern = re.compile(
        r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>",
        flags=re.DOTALL,
    )
    responses = [completion[0]["content"] for completion in completions]
    return [0.5 if pattern.match(r) else 0.0 for r in responses]

def count_xml(text) -> float:
    """Score how closely ``text`` follows the reasoning/answer tag layout.

    Each of the four markers adds 0.125 when it occurs exactly once:
    ``"<reasoning>\\n"``, ``"\\n</reasoning>\\n"``, ``"\\n<answer>\\n"`` and
    ``"\\n</answer>"``.

    Two length penalties of 0.001 per character are applied:
      * with the ``<answer>`` marker: the length of whatever follows the last
        ``"\\n</answer>\\n"`` (the whole text if that closing marker is absent);
      * with the ``</answer>`` marker: the length of whatever follows it,
        minus one character.
    """
    score = 0.0

    for marker in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(marker) == 1:
            score += 0.125

    if text.count("\n<answer>\n") == 1:
        after_close = text.split("\n</answer>\n")[-1]
        score = score + 0.125 - len(after_close) * 0.001

    if text.count("\n</answer>") == 1:
        after_close = text.split("\n</answer>")[-1]
        score = score + 0.125 - (len(after_close) - 1) * 0.001

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the first message's ``content`` of each completion.

    Returns:
        One ``count_xml`` score per completion, in input order.
    """
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

# Model

In [None]:
# Checkpoint to fine-tune (alternative: "HuggingFaceTB/SmolLM2-135M-Instruct").
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Tokenizer: the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Model: loaded in half precision without a device map, then moved to the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map=None
)
model = model.to("cuda")

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Adapters are attached to all attention and MLP projection layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=[
        'q_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'k_proj', 'v_proj',
    ],
)

# From here on, `model` is the adapter-wrapped model.
model = get_peft_model(model, peft_config)

# Training arguments

In [None]:
# Alternative settings for the SmolLM2-135M model:
# output_dir="outputs/SmolLM2-135M-GRPO"
# run_name="HuggingFaceTB/SmolLM2-135M-Instruct-GRPO-gsm8k"

# Values passed to GRPOConfig as output_dir and run_name.
output_dir="outputs/Qwen2.5-0.5B-GRPO"
run_name="Qwen/Qwen2.5-0.5B-GRPO-gsm8k"

In [None]:
# GRPO training configuration: optimizer, learning-rate schedule, batch and
# generation sizes, checkpointing and logging.
training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    optim="adamw_torch_fused",
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.2, # increased weight-decay regularisation (TODO: record the rationale)
    warmup_ratio = 0.1,
    lr_scheduler_type='cosine',
    logging_steps=1,
    fp16=True,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=2,
    num_generations=6, # Decrease if out of memory
    max_prompt_length=128,
    max_completion_length=100,
    # num_train_epochs=1,
    max_steps = 150, # run length is set in steps; the epoch-based setting above is left disabled
    save_steps = 150, # same value as max_steps
    max_grad_norm=0, # disables gradient clipping
    log_on_each_node=False,
    use_vllm=False, # vLLM generation is switched off
    # NOTE(review): the two vllm_* settings below presumably have no effect while
    # use_vllm=False - confirm.
    vllm_gpu_memory_utilization=.3,
    vllm_device="cuda:0",
    report_to="tensorboard",
)

# Train

In [None]:
import os

os.environ["VLLM_DTYPE"] = "float16"

# Reward functions handed to the trainer, in this order.
reward_funcs = [
    xmlcount_reward_func,
    soft_format_reward_func,
    strict_format_reward_func,
    int_reward_func,
    correctness_reward_func,
]

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_funcs,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

# Inference

First, let's try the model without any GRPO training.

In [None]:
# At this point `model` is the LoRA-wrapped model that has just been trained.
# To show the behaviour of the untrained base model, generation runs with the
# adapters temporarily disabled.
text = tokenizer.apply_chat_template([
    {"role": "user", "content": "Calculate pi."},
], tokenize=False, add_generation_prompt=True)

inputs = tokenizer(text, return_tensors="pt").to(model.device)

with model.disable_adapter():
    output_ids = model.generate(
        **inputs,
        max_length=1024,
        temperature=0.8,
        top_p=0.95,
    )

output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Calculate pi.
assistant
As an AI language model, I can certainly provide information on how to calculate pi (π) in various programming languages and software packages.

In Python, you can use the `math` module's `pi()` function or directly import it from the `cmath` module if you prefer using the mathematical constant π:

```python
import math

# Using math.pi()
print(math.pi)

# Using cmath.sqrt(2)
print(cmath.sqrt(2))
```

This will output the value of π as approximately 3.14159.

However, it's important to note that computing π is not feasible with any kind of standard computation hardware today because π involves irrational numbers that cannot be represented exactly in binary or decimal form, even with high precision computing capabilities. Pi has been calculated up to a few digits past its decimal point, but more advanced calculations would require very large numbers.

If you want to compute π numeric

Now let's try the model we just trained!

In [None]:
# Reload the checkpoint saved at step 150 (save_steps in the training config).
# The path is built from `output_dir` rather than a hard-coded absolute Colab
# path, so the cell also works when the notebook is not run from /content.
model = AutoModelForCausalLM.from_pretrained(
    f"{output_dir}/checkpoint-150",
    torch_dtype=torch.float16,
    device_map=None
).to("cuda")

In [None]:
# Same question as before, this time with the format-instructing system prompt.
messages = [
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role": "user", "content": "Calculate pi."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(text, return_tensors="pt").to(model.device)

generation_kwargs = dict(max_length=1024, temperature=0.8, top_p=0.95)
output_ids = model.generate(**inputs, **generation_kwargs)

output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output)

system

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>

user
Calculate pi.
assistant
To calculate \(\pi\), which is approximately equal to \(3.141592653589793\), you can use a calculator or an online tool that performs mathematical calculations. However, I can also provide you with a detailed explanation of how to approximate \(\pi\) using the Monte Carlo method.

### Explanation of the Monte Carlo Method:

The Monte Carlo method involves generating random points within a unit square and determining how many of them fall inside a circle of radius 1 centered at the origin. The ratio of these points to the total number generated approximates the area of the circle divided by the area of the square. By scaling this ratio appropriately, we obtain an estimate for \(\pi\).

Here’s how it works in more detail:

1. **Square Creation**: Create a square with side length 1 (the diameter of the circle). Label one corner as the origin.
   
2. **Random Points Ge