# Group Relative Policy Optimization (GRPO)

Install the Hugging Face libraries to run this notebook.

In [1]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------------------------------- ----- 8.4/9.7 MB 39.8 MB/s eta 0:00:01
   ---------------------------------------- 9.7/9.7 MB 37.6 MB/s eta 0:00:00
Downloa


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Echo the selected device as the cell output.
device

'cpu'

Your goal is to fill in the `GRPOTrainer` class. You have two options (and you can do both):
* the "normal GRPO" with clipped surrogate objective
* or the "vanilla GRPO" with original objective

In [None]:
import torch
import torch.nn.functional as F

class GRPOTrainer:
    """Minimal Group Relative Policy Optimization (GRPO) trainer for a causal LM.

    For every prompt a *group* of completions is sampled from the current
    policy, each completion is scored, the scores are normalised inside the
    group to obtain advantages, and the policy is updated with either

    * the clipped surrogate objective ("normal GRPO", ``use_clipped=True``), or
    * the unclipped importance-weighted objective ("vanilla GRPO").

    No KL penalty against a reference model is applied.
    """

    def __init__(self,
                 model,
                 tokenizer,
                 learning_rate=1e-5,
                 temperature=1.0,
                 max_length=100,
                 device="cpu",
                 clip_epsilon=0.2,  # Clipping threshold used by "normal GRPO"
                 use_clipped=True,  # True: clipped GRPO, False: vanilla GRPO
                 group_size=4):     # Number of completions sampled per prompt
        """
        Store the policy, its tokenizer and the training hyper-parameters.

        Args:
            model: causal language model; moved to ``device``.
            tokenizer: tokenizer matching ``model``.
            learning_rate: AdamW learning rate.
            temperature: sampling temperature used by ``generate``.
            max_length: maximum total length (prompt + completion) in tokens.
            device: device string the model and tensors are placed on.
            clip_epsilon: ratio is clipped to ``[1 - eps, 1 + eps]``.
            use_clipped: selects the clipped or the vanilla objective.
            group_size: completions sampled per prompt in ``train_step``.
                With a single completion every advantage is zero, so at
                least 2 is needed for the policy to learn anything.

        Raises:
            ValueError: if ``group_size`` is smaller than 1.
        """
        if group_size < 1:
            raise ValueError("group_size must be at least 1")
        self.llm = model.to(device)
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(self.llm.parameters(), lr=learning_rate)
        self.device = device
        self.temperature = temperature
        self.max_length = max_length
        self.clip_epsilon = clip_epsilon
        self.use_clipped = use_clipped  # Toggle between normal and vanilla GRPO
        self.group_size = group_size

    def _sample_ids(self, input_ids):
        """Sample one continuation of ``input_ids``; returns prompt + completion ids."""
        with torch.no_grad():
            return self.llm.generate(
                input_ids,
                # Single, unpadded prompt: every position is a real token.
                attention_mask=torch.ones_like(input_ids),
                max_length=self.max_length,
                temperature=self.temperature,
                top_k=50,
                do_sample=True,
                # Explicit pad id for models (e.g. GPT-2) that define none.
                pad_token_id=self.tokenizer.eos_token_id,
            )

    def _completion_log_probs(self, sequence_ids, prompt_len):
        """
        Log-probabilities (with gradient) of the completion tokens only.

        ``sequence_ids`` has shape (1, T). The logits at position ``t - 1``
        predict token ``t``, so the first completion token (index
        ``prompt_len``) sits at index ``prompt_len - 1`` of the shifted tensor.
        """
        logits = self.llm(sequence_ids).logits
        log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
        targets = sequence_ids[:, 1:].unsqueeze(-1)
        token_log_probs = log_probs.gather(-1, targets).squeeze(-1)
        return token_log_probs[:, prompt_len - 1:]

    def generate(self, prompt):
        """
        Generate text from a prompt using the LLM.

        Returns the decoded prompt + completion as a string.
        """
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        output = self._sample_ids(input_ids)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)

    def calculate_reward(self, text):
        """
        Score ``text`` with the policy's own likelihood.

        The reward is the negative mean token NLL of ``text`` under the
        model, i.e. the log of the inverse perplexity (higher is better).
        """
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)

        with torch.no_grad():
            outputs = self.llm(input_ids, labels=input_ids)
            loss = outputs.loss  # Mean negative log-likelihood per token

        return -loss.item()

    def calculate_GRPO_advantages(self, rewards):
        """
        Compute the group-normalised advantages ``(r - mean) / std``.

        A group with fewer than two rewards has no spread (the sample
        standard deviation of one value is NaN), so its advantages are zero.
        """
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        if rewards.numel() < 2:
            return torch.zeros_like(rewards)
        mean_reward = rewards.mean()
        std_reward = rewards.std() + 1e-8  # Avoid division by zero
        return (rewards - mean_reward) / std_reward

    def train_step(self, prompt):
        """
        Perform one GRPO update on a single prompt.

        Returns the scalar loss as a float. Because the update is on-policy
        the probability ratio equals 1 at evaluation time, so the reported
        value is close to ``-mean(advantages)`` (about 0); the gradient is
        what carries the learning signal.
        """
        prompt_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        prompt_len = prompt_ids.shape[1]

        # Sample the group and score every member before touching the weights.
        sequences = [self._sample_ids(prompt_ids) for _ in range(self.group_size)]
        rewards = [
            self.calculate_reward(self.tokenizer.decode(seq[0], skip_special_tokens=True))
            for seq in sequences
        ]
        advantages = self.calculate_GRPO_advantages(rewards)

        # No learning signal (single sample or identical rewards): skip the
        # optimizer step so AdamW's weight decay does not move the weights.
        if not bool(torch.any(advantages != 0)):
            return 0.0

        per_sequence = []
        for seq, advantage in zip(sequences, advantages):
            # Use the sampled ids directly; re-tokenizing the decoded text is
            # not guaranteed to reproduce them or the prompt boundary.
            log_probs = self._completion_log_probs(seq, prompt_len)
            if log_probs.numel() == 0:  # nothing was generated
                continue
            # Old policy == current policy (one update per sample), so the
            # ratio is 1 in value but still carries the gradient of log_probs.
            ratio = torch.exp(log_probs - log_probs.detach())
            surrogate = ratio * advantage
            if self.use_clipped:
                clipped = torch.clamp(ratio,
                                      1.0 - self.clip_epsilon,
                                      1.0 + self.clip_epsilon)
                surrogate = torch.min(surrogate, clipped * advantage)
            per_sequence.append(surrogate.mean())

        if not per_sequence:
            return 0.0

        # Maximise the surrogate objective == minimise its negation.
        loss = -torch.stack(per_sequence).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()


In [None]:
class GRPOTrainer:
    """Vanilla (unclipped) GRPO trainer for a causal language model.

    A group of completions is sampled per prompt, each one is rewarded,
    rewards are normalised inside the group into advantages, and the policy
    is updated with the plain policy-gradient loss ``mean(A_i * NLL_i)``.

    Note: running this cell rebinds the name ``GRPOTrainer``; a class of the
    same name defined in an earlier cell is shadowed from here on.
    """

    def __init__(self,
                 model,
                 tokenizer,
                 learning_rate = 1e-5,
                 temperature = 1.0,
                 max_length = 100,
                 device = "cpu",
                 group_size = 4):
        """
        Args:
            model: causal language model; moved to ``device``.
            tokenizer: tokenizer matching ``model``.
            learning_rate: AdamW learning rate.
            temperature: sampling temperature.
            max_length: maximum total length (prompt + completion) in tokens.
            device: device string the model and tensors are placed on.
            group_size: completions sampled per prompt in ``train_step``
                (at least 2 is needed for non-zero advantages).

        Raises:
            ValueError: if ``group_size`` is smaller than 1.
        """
        if group_size < 1:
            raise ValueError("group_size must be at least 1")
        self.llm = model.to(device)
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(self.llm.parameters(), lr=learning_rate)
        self.device = device
        self.temperature = temperature
        self.max_length = max_length
        self.group_size = group_size

    def generate(self, prompt):
        """
        Sample one completion for ``prompt``.

        Returns:
            (loss, text): ``loss`` is the mean negative log-likelihood of the
            completion tokens under the current policy (a scalar tensor that
            keeps its gradient), ``text`` is the decoded prompt + completion.
        """
        prompt_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)

        with torch.no_grad():
            output = self.llm.generate(
                prompt_ids,
                # Single, unpadded prompt: every position is a real token.
                attention_mask=torch.ones_like(prompt_ids),
                max_length=self.max_length,
                temperature=self.temperature,
                top_k=50,
                do_sample=True,
                # Explicit pad id for models (e.g. GPT-2) that define none.
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Score only the completion: -100 is the label value ignored by the
        # cross-entropy loss, so prompt tokens do not contribute.
        labels = output.clone()
        labels[:, :prompt_ids.shape[1]] = -100
        loss = self.llm(output, labels=labels).loss

        text = self.tokenizer.decode(output[0])
        return loss, text

    def calculate_reward(self, output):
        """
        Compute the reward of a single output text.

        The reward is the negative mean token NLL of the text under the
        model (log of the inverse perplexity), evaluated without gradient.
        """
        input_ids = self.tokenizer(output, return_tensors="pt").input_ids.to(self.device)
        with torch.no_grad():
            nll = self.llm(input_ids, labels=input_ids).loss
        return -nll.item()

    def calculate_GRPO_advantages(self, outputs):
        """
        Compute the advantage of each output text in a group.

        Each text is rewarded with ``calculate_reward`` and the rewards are
        normalised as ``(r - mean) / std``. A group with fewer than two
        outputs has no spread, so its advantages are zero.
        """
        rewards = torch.tensor([self.calculate_reward(o) for o in outputs],
                               dtype=torch.float32, device=self.device)
        if rewards.numel() < 2:
            return torch.zeros_like(rewards)
        return (rewards - rewards.mean()) / (rewards.std() + 1e-8)

    def train_step(self, prompt):
        """
        Run one training step on a single prompt and return the loss as a float.
        """
        nlls, texts = [], []
        for _ in range(self.group_size):
            nll, text = self.generate(prompt)
            nlls.append(nll)
            texts.append(text)

        advantages = self.calculate_GRPO_advantages(texts)

        # No learning signal: skip the step so weight decay alone does not
        # move the weights.
        if not bool(torch.any(advantages != 0)):
            return 0.0

        # NLL = -mean log-prob, so minimising A * NLL raises the likelihood of
        # completions with positive advantage and lowers it for negative ones.
        loss = (advantages * torch.stack(nlls)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

In [None]:
# Checkpoint identifier shared by the tokenizer and the model weights.
model_name = "gpt2"

# The two loads are independent of each other: tokenizer first, then weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Build the trainer on the device selected earlier in the notebook instead of
# silently falling back to the constructor's "cpu" default.
trainer = GRPOTrainer(model, tokenizer, device=device)
prompts = ["The best way to learn coding is", "The future of AI is"]

for epoch in range(3): # Train for a few epochs
    loss = 0
    for prompt in prompts:
        # train_step works on ONE prompt string; pass the loop variable,
        # not the whole `prompts` list.
        loss += trainer.train_step(prompt)
    print(f"Epoch {epoch+1}, Loss: {loss / len(prompts)}")

In [None]:
# GRPOTrainer exposes `generate` (there is no `generate_text`), and it takes a
# single prompt string, so sample one completion per prompt.
[trainer.generate(prompt) for prompt in prompts]