<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Define a pad token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define a policy model (identical to GPT-2 for PPO)
class PolicyModel(nn.Module):
    def __init__(self, base_model):
        super(PolicyModel, self).__init__()
        self.base_model = base_model

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids, attention_mask=attention_mask, return_dict=True)
        logits = outputs.logits  # Model logits (next-token predictions)
        return logits

policy_model = PolicyModel(model)
optimizer = AdamW(policy_model.parameters(), lr=5e-5)

# Define synthetic reward function
def compute_rewards(preferences, predictions):
    """
    Compute synthetic rewards for PPO. For simplicity, this example assigns higher rewards
    to predictions matching the preferred outputs.
    """
    rewards = []
    for pref, pred in zip(preferences, predictions):
        # Reward is higher for matching preferred responses
        reward = 1.0 if pref.strip() in pred.strip() else -1.0
        rewards.append(reward)
    return torch.tensor(rewards)

# Define PPO loss function
def ppo_loss(old_logits, new_logits, actions, advantages, epsilon=0.2):
    """
    PPO loss with clipped surrogate objective.
    - old_logits: logits from the previous policy.
    - new_logits: logits from the updated policy.
    - actions: actions (token indices) taken.
    - advantages: computed advantages (rewards - baseline).
    - epsilon: PPO clipping threshold.
    """
    probs = torch.softmax(new_logits, dim=-1)
    old_probs = torch.softmax(old_logits, dim=-1)

    action_probs = probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)
    # action_probs = probs.gather(-1, actions.unsqueeze(-1).expand(-1, probs.size(-1))).squeeze(-1)
    old_action_probs = old_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)

    ratio = action_probs / old_action_probs
    clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
    return loss

# Synthetic dataset
prompts = ["What is the capital of France?", "Tell me a joke.", "Explain gravity."]
preferred_responses = ["The capital of France is Paris.", "Why don't skeletons fight? They don't have the guts.", "Gravity is the force of attraction."]

class PPOEnvironment:
    def __init__(self, tokenizer, prompts, preferred_responses):
        self.tokenizer = tokenizer
        self.prompts = prompts
        self.preferred_responses = preferred_responses

    def step(self, model, prompt, preferred_response):
        # Generate prediction
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        with torch.no_grad():
            logits = model(input_ids).squeeze(0)
        predicted_tokens = torch.argmax(logits, dim=-1)
        predicted_text = self.tokenizer.decode(predicted_tokens)

        # Compute reward
        reward = 1.0 if preferred_response.strip() in predicted_text.strip() else -1.0
        return predicted_text, reward

environment = PPOEnvironment(tokenizer, prompts, preferred_responses)

# PPO Training Loop
epochs = 5
batch_size = 2
for epoch in range(epochs):
    total_loss = 0
    for batch_idx in range(0, len(prompts), batch_size):
        # Prepare batch data
        batch_prompts = prompts[batch_idx:batch_idx + batch_size]
        batch_responses = preferred_responses[batch_idx:batch_idx + batch_size]

        # Collect old logits and actions
        old_logits = []
        actions = []
        rewards = []
        for prompt, response in zip(batch_prompts, batch_responses):
            input_ids = tokenizer(prompt, return_tensors="pt", padding=True).input_ids
            with torch.no_grad():
                logits = policy_model(input_ids).squeeze(0)
            predicted_tokens = torch.argmax(logits, dim=-1)
            predicted_text = tokenizer.decode(predicted_tokens)

            reward = compute_rewards([response], [predicted_text])
            rewards.append(reward.item())
            old_logits.append(logits)
            actions.append(predicted_tokens[-1].item())  # Example: last token as action

        # Convert data to tensors
        # old_logits = torch.stack(old_logits)
        # Pad logits tensors to the same sequence length and then stack them
        old_logits = pad_sequence(old_logits, batch_first=True)

        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards)

        # Compute advantages (reward - baseline)
        baseline = rewards.mean()  # Simple baseline
        advantages = rewards - baseline

        # Forward pass with new policy
        optimizer.zero_grad()
        for prompt in batch_prompts:
            input_ids = tokenizer(prompt, return_tensors="pt").input_ids
            new_logits = policy_model(input_ids)

        # Compute PPO loss
        loss = ppo_loss(old_logits, new_logits, actions, advantages)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



RuntimeError: Index tensor must have the same number of dimensions as input tensor