# Groupe Relative Policy Optimization (GRPO)

Install the Hugging Face libraries to run this notebook.

In [10]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

Your goal is to fill in the `GRPOTrainer` class. You have two options (and you can do both):
* the "normal GRPO" with clipped surrogate objective
* or the "vanilla GRPO" with original objective

In [30]:
questions = [
    "A train takes 3 hours to travel from A to B at an average speed of 60 km/h. How long would the trip take if the train traveled at 80 km/h?",
    "A snail climbs a 10-meter wall. It climbs 3 meters during the day and slips 2 meters at night. How many days will it take to reach the top?",
    'If a liar says, "I always lie," is he telling the truth?',
    'Can we say that "this sentence is false"? Explain why.',
    "Paul is twice the age Pierre was when Paul was the age Pierre is today. If Pierre is 20 years old, how old is Paul?",
    "A father and his son together are 36 years old. The father is exactly three times the son's age. How old is the son?",
    "All the cats I have met so far were black. Can I conclude that all cats are black? Why?",
    "If all humans are mortal and Socrates is human, what can we conclude?",
    "If a shirt costs twice as much as a pair of pants and the pants cost 30€, how much does the shirt cost?",
    'Jean says: "All my friends are football players." Pierre is Jean’s friend. Can we conclude that Pierre is a football player?',
    "You are in a train and must choose between switching the direction of the train to avoid five people tied to one track, but in doing so, you will kill one person on the other track. What do you do and why?",
    "A doctor has five patients in need of organ transplants, and a perfectly healthy patient comes in for a routine check-up. Should the doctor sacrifice this patient to save the five others?",
    "What would happen if gravity on Earth were twice as strong?",
    "If humans could read minds, how would that change society?"
]

dataset = []
for question in questions:
    dataset.append(f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> <\answer> tags, respectively, i.e., <think> reasoning process here <\think> <answer> answer here <\answer>. User: {question}. Assistant:")

In [103]:
class GRPOConfiguration:
    def __init__(self,
                 model_name="gpt2",
                 learning_rate=1e-5,
                 temperature=0.9,
                 max_prompt_length=200,
                 max_output_length=200,
                 device="cpu",
                 num_generations=3,
                 num_iterations=2,
                 beta=0.1,
                 epsilon=1e-5,
                 reward_func=None,
                 print_outputs=False,
                 print_advantages=False):
        
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.temperature = temperature
        self.max_prompt_length = max_prompt_length
        self.max_output_length = max_output_length
        self.device = device
        self.num_generations = num_generations # number of generations per prompts
        self.num_iterations = num_iterations    # number of iterative optimization steps per prompts
        self.beta = beta # KL coefficient
        self.epsilon = epsilon
        self.reward_func = reward_func # reward function
        self.print_outputs = print_outputs
        self.print_advantages = print_advantages


In [106]:
#Inspired from HuggingFace
import re
import copy


class GRPOTrainer:
    def __init__(self, config: GRPOConfiguration):
        """
        Initialize a GRPO Trainer
        Args:
            config: GRPO Configuration
        """
        self.device = config.device
        self.model_name = config.model_name
        self.model = AutoModelForCausalLM.from_pretrained(config.model_name).to(self.device) # model to optimize
        self.ref_model = None # reference model
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name, padding=True, padding_side="left")
        self.tokenizer.pad_token = self.tokenizer.eos_token  #gpt2 n'a pas de padding token
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.learning_rate)
        self.temperature = config.temperature
        self.max_prompt_length = config.max_prompt_length
        self.max_output_length = config.max_output_length
        self.eps = config.epsilon
        self.num_generations = config.num_generations # num of generation per prompts
        self.num_iterations = config.num_generations
        self.beta = config.beta
        self.reward_func = config.reward_func if config.reward_func else self._default_reward_func
        self.print_outputs = config.print_outputs
        self.print_adantages = config.print_advantages

    
    def _default_reward_func(self, prompt, outputs, **kwargs):
        """Reward function that checks if the completion has a specific format."""
        
        pattern = r"^<think>.*?</think><answer>.*?</answer>$"

        matches = [re.match(pattern, content) for content in outputs]
        return [1.0 if match else 0.0 for match in matches]
        

    def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep):
        """
        Get the per-token log propabilities for the outputs.
        Args:
            model: model to compute per-token log probabilities
            input_ids: sequence of tokens #(G, L)
            attention_mak: mapping of tokens to keep to compute attention (excluding padding)
            logits_to_keep: number of logits to keep to compute the loss
        """
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        logits = logits[:,:-1,:] # (G, L-1, V) exclude the last logit corresponding to next token prediction

        input_ids = input_ids[:, -logits_to_keep:]
        logits = logits[:, -logits_to_keep:]
        log_probs = logits.log_softmax(-1) # softmax to get probabilities
        return torch.gather(log_probs, dim=-1, index=input_ids.unsqueeze(-1)).squeeze(-1) # keeping only input_ids
    

    def _generate_and_score_outputs(self, prompt: str):
        """
        Generate answers with old_model to the prompt, compute associated reward and advantages and those of ref_model.
        Args:
            prompt: the prompt
        """
        device = self.device

        # Tokenization
        prompt_inputs = self.tokenizer(prompt, return_tensors="pt", padding=False).to(device)
        prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]

        # Setting to prompt lenght
        if self.max_prompt_length is not None:
            prompt_ids = prompt_ids[:, -self.max_prompt_length :]
            prompt_mask = prompt_mask[:, -self.max_prompt_length :]

        # Generate answers with old_model
        with torch.inference_mode():
            prompt_output_ids = self.model.generate(
                prompt_ids,
                attention_mask=prompt_mask,
                num_return_sequences=self.num_generations, # number of generations
                do_sample=True,
                max_length=prompt_ids.size(1) + self.max_output_length,  # (P+O)
                eos_token_id=self.tokenizer.eos_token_id
            )

        if self.print_outputs:
            for i in range(prompt_output_ids.size(0)):
                decoded_sequence = self.tokenizer.convert_ids_to_tokens(prompt_output_ids[i, :].tolist())
                print(f"Réponse générée (génération {i}) de len {len(decoded_sequence)}: {' '.join(decoded_sequence)}")

        # Splitting prompt and outputs tokens
        prompt_length = prompt_ids.size(1)
        prompt_ids = prompt_output_ids[:, :prompt_length]  # (G, P)
        output_ids = prompt_output_ids[:, prompt_length:]  # (G, O)


        # Padding with EOS after first EOS in outputs
        is_eos = output_ids == self.tokenizer.eos_token_id  # (G, O)
        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)  # (G,)
        has_eos = is_eos.any(dim=1)  # (G,)
        first_eos_idx = is_eos.int().argmax(dim=1)  # Index of first EOS token (G,)
        eos_idx[has_eos] = first_eos_idx[has_eos]  # (G,)
        sequence_indices = torch.arange(is_eos.size(1), device=device).expand_as(is_eos)  # (G, O)
        output_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()  # (G, O) Mask for padding

        # Merging prompt and output  mask
        attention_mask = torch.cat([prompt_mask.expand(self.num_generations, -1), output_mask], dim=1)  # (G, P+O)

        logits_to_keep = output_ids.size(1) # Logits to keep for loss computation

        # Compute log-probabilities for the prompt with model_ref and model_old
        self.model.eval()
        with torch.inference_mode():
            # When using num_iterations == 1, old_per_token_logps == per_token_logps
            if self.num_iterations > 1:
                old_per_token_logps = self._get_per_token_logps(self.model, prompt_output_ids, attention_mask, logits_to_keep)
            else:
                old_per_token_logps = None

            ref_per_token_logps = self._get_per_token_logps(self.ref_model, prompt_output_ids, attention_mask, logits_to_keep)
        
        # Decoding outputs
        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        # Compute rewards
        output_rewards = self.reward_func(prompt=prompt, outputs=outputs)
        rewards = torch.tensor(output_rewards, dtype=torch.float32, device=self.device) #(G,)

        # Compute advantages
        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
        std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)

        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4) # (G,)

        return {
            "prompt_ids": prompt_ids,
            "prompt_mask": prompt_mask,
            "output_ids": output_ids,
            "output_mask": output_mask,
            "old_per_token_logps": old_per_token_logps, 
            "ref_per_token_logps": ref_per_token_logps,
            "advantages": advantages,}


    def _prepare_inputs(self, inputs):
        return self._generate_and_score_outputs(inputs) # Without buffering
    

    def compute_loss(self, model, inputs):
        """
        Compute Loss according to GRPO paper, using advantages, per_token probabilities and KL divergence approximator to reference model
        Args:
            model: enlever
            inputs: {"prompt_ids", "prompt_mask", "output_ids", "output_mask", "old_per_token_logps", "ref_per_token_logps", "advantages"}
        """

        # Compute the per-token log probabilities for the current model
        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
        output_ids, output_mask = inputs["output_ids"], inputs["output_mask"]
        input_ids = torch.cat([prompt_ids, output_ids], dim=1)
        
        prompt_mask = prompt_mask.expand(output_mask.size(0), -1)
        attention_mask = torch.cat([prompt_mask, output_mask], dim=1)
        logits_to_keep = output_ids.size(1)  # we only need to compute the logits for the completion tokens

        per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) # policy of current model

        # Compute the KL divergence between the current model and the reference model
        if self.beta != 0.0:
            ref_per_token_logps = inputs["ref_per_token_logps"]
            per_token_kl = (torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 ) # (G, O) KL differentiable approximator

        ### Compute the loss
        
        advantages = inputs["advantages"]
        if self.print_adantages:
            print( "advantages :", advantages)

        # When using num_iterations == 1, old_per_token_logps == per_token_logps
        old_per_token_logps = inputs["old_per_token_logps"] if self.num_iterations > 1 else per_token_logps.detach()
        policy_ratio = torch.exp(per_token_logps - old_per_token_logps) # (G, O)
        policy_ratio_clipped = torch.clamp(policy_ratio, 1 - self.eps, 1 + self.eps) #clipped ratio

        per_token_loss1 = policy_ratio * advantages.unsqueeze(1)
        per_token_loss2 = policy_ratio_clipped * advantages.unsqueeze(1)
        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)

        if self.beta != 0.0:
            per_token_loss = per_token_loss + self.beta * per_token_kl # adding KL

        loss = (per_token_loss * output_mask).sum() / output_mask.sum() # excluding padding in loss computation

        return loss
    

    def train(self, num_epochs: int, dataset: list):
        """
        Train the model on the dataset for specific number of epochs.
        Args:
            num_epochs: number of epochs 
            dataset: dataset
        """
        self.ref_model = copy.deepcopy(self.model)  # Initialize reference model

        for epoch in range(num_epochs):
            total_loss = 0
            for i, prompt in enumerate(dataset):
                print(f"Training on prompt {i}")
                loss = self.train_step(prompt)
                total_loss += loss
            
            avg_loss = total_loss / len(dataset)
            print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}") 
            self.ref_model = copy.deepcopy(self.model)


    def train_step(self, prompt: str):
        """
            Do a traing step on a specific prompt with multiple iteration steps.
            Args:
                prompt: prompt to optimize the model on it
        """
        
        self.model.train()

        # Generate outputs with old_model, compute log-probabilities for old_model and ref_model
        inputs = self._generate_and_score_outputs(prompt)

        # For self.num_iterations, iterativelt update the model on the same prompt
        for i in range(1, self.num_iterations): 
            
            print(f"GRPO Iteration {i}")

            loss = self.compute_loss(self.model, inputs)

            # Backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return loss.item()


config = GRPOConfiguration(num_generations=3, num_iterations=2, print_outputs=False, print_advantages=True)
trainer = GRPOTrainer(config)
trainer.train(1, dataset)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Training on prompt 0
GRPO Iteration 1
advantages : tensor([0., 0., 0.])


KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

        # On suppose que prompt_completion_ids a la forme (batch_size, num_generations, sequence_length)
        for i in range(batch_size):
            for j in range(self.num_generations):
                # Décoder chaque séquence de tokens (prompt + réponse)
                decoded_sequence = self.tokenizer.convert_ids_to_tokens(prompt_completion_ids[i, j, :].tolist())
                print(f"Réponse générée (batch {i}, génération {j}) de len {len(decoded_sequence)}: {' '.join(decoded_sequence)}")

# Charger le tokenizer GPT-2 avec padding à gauche
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")

# Ajouter un vrai token de padding
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Charger le modèle et adapter les embeddings (important pour qu'il reconnaisse [PAD])
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Exemple de phrases
sentences = [
    "Bonjour, comment ça va ?",
    "Je vais bien.",
    "Quel est ton modèle de langage préféré ?"
]

# Tokenization avec padding
encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Affichage
print("\nTokens après padding :")
for i, sent in enumerate(sentences):
    print(f"{i}: {tokenizer.convert_ids_to_tokens(encoded['input_ids'][i])}")



Tokens après padding :
0: ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'Bon', 'j', 'our', ',', 'Ġcomment', 'ĠÃ', '§', 'a', 'Ġva', 'Ġ?']
1: ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'Je', 'Ġv', 'ais', 'Ġb', 'ien', '.']
2: ['Q', 'uel', 'Ġest', 'Ġton', 'Ġmod', 'Ã¨', 'le', 'Ġde', 'Ġlang', 'age', 'Ġpr', 'Ã©', 'f', 'Ã©', 'rÃ©', 'Ġ?']


In [48]:
from transformers import AutoTokenizer
import torch

# Charger le tokenizer GPT-2 avec padding à gauche
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 n'a pas de token PAD par défaut
# Ajouter un vrai token de padding
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})


# Charger le modèle et adapter les embeddings (important pour qu'il reconnaisse [PAD])
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Exemple de phrases
sentences = [
    "Bonjour, comment ça va ?" + tokenizer.eos_token + "blablaba",  # Phrase normale
    "Je vais bien.",  # Phrase plus courte
    "Quel est ton modèle de langage préféré ?"  # Phrase plus longue
]

# Tokenization avec padding (longueur max auto)
encoded = tokenizer(sentences, padding=True, padding_side="left", truncation=True, return_tensors="pt")

# Récupérer les tokens
completion_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]
print("masques d'attention : ", attention_mask)

# Affichage
print("\nTokens (avant padding après EOS) :")
for i, sent in enumerate(sentences):
    print(f"{i}: {tokenizer.convert_ids_to_tokens(completion_ids[i])}")

# Détection des tokens EOS
eos_token_id = tokenizer.eos_token_id
is_eos = completion_ids == eos_token_id  

# Initialisation des indices EOS
eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=completion_ids.device)

# Trouver les séquences contenant au moins un <EOS>
has_eos = is_eos.any(dim=1)

# Trouver le premier <EOS>
first_eos_idx = is_eos.int().argmax(dim=1, keepdim=False)  

# Appliquer seulement aux séquences avec EOS
if has_eos.any():
    eos_idx[has_eos] = first_eos_idx[has_eos.nonzero(as_tuple=True)[0]]

# Créer le masque
sequence_indices = torch.arange(completion_ids.size(1), device=completion_ids.device).expand(completion_ids.size(0), -1)
completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()

# Appliquer le padding après EOS
padded_sequences = completion_ids * completion_mask + (1 - completion_mask) * eos_token_id

# Affichage final
print("\nTokens (après padding après EOS) :")
for i in range(len(sentences)):
    print(f"{i}: {tokenizer.convert_ids_to_tokens(padded_sequences[i])}")


masques d'attention :  tensor([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

Tokens (avant padding après EOS) :
0: ['<|endoftext|>', '<|endoftext|>', 'Bon', 'j', 'our', ',', 'Ġcomment', 'ĠÃ', '§', 'a', 'Ġva', 'Ġ?', '<|endoftext|>', 'bl', 'abl', 'aba']
1: ['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', 'Je', 'Ġv', 'ais', 'Ġb', 'ien', '.']
2: ['Q', 'uel', 'Ġest', 'Ġton', 'Ġmod', 'Ã¨', 'le', 'Ġde', 'Ġlang', 'age', 'Ġpr', 'Ã©', 'f', 'Ã©', 'rÃ©', 'Ġ?']

Tokens (après padding après EOS) :
0: ['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endof

In [None]:
config = GRPOConfiguration()
trainer = GRPOTrainer(config)
trainer.train(1, dataset, 2)

In [28]:
config = GRPOConfiguration()
trainer = GRPOTrainer(config)
trainer.train(1, dataset, 2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: too many indices for tensor of dimension 1

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(["Hello, how are you?", "Fine, thanks!"], return_tensors="pt", padding=True)

print(inputs)
print(tokenizer.eos_token_id)


{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102],
        [ 101, 2986, 1010, 4283,  999,  102,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}
None


In [None]:
class GRPOTrainer:
    def __init__(self, 
                 model,
                 tokenizer,
                 learning_rate = 1e-5, 
                 temperature = 1.0, 
                 max_length = 100, 
                 device = "cpu"):
        self.llm = model.to(device)
        self.tokenizer = tokenizer
        self.optimizer = torch.optim.AdamW(self.llm.parameters(), lr=learning_rate)
        self.device = device
        self.temperature = temperature
        self.max_length = max_length

    def generate(self, prompt):
        input = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        output = None
        loss = None

        text = self.tokenizer.decode(output[0])
        return loss, text

    def calculate_reward(self, output):
        """
            Calcule the reward of a single output
        """
        pass

    def calculate_GRPO_advantages(self, outputs):
        """
            Calculate the advantages of each output
        """
        pass 

    def train_step(self, prompt):
        """
            A training step on a single prompt
        """
        pass

In [10]:
model_name = "gpt2"
#model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, padding_side="left")
print(tokenizer.eos_token_id)

50256


In [None]:
trainer = GRPOTrainer(model, tokenizer)
prompts = ["The best way to learn coding is", "The future of AI is"]

for epoch in range(3): # Train for a few epochs
    loss = 0
    for prompt in prompts:
        loss += trainer.train_step(prompts)        
    print(f"Epoch {epoch+1}, Loss: {loss / len(prompts)}")

In [None]:
trainer.generate_text(prompts)