<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/DPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# DPO 原理 https://blog.csdn.net/qq_27590277/article/details/142383849
# chat-GPT: https://chatgpt.com/c/66f04b1b-14cc-800e-931e-0e333c2418f8

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import time

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# The error occurs because the GPT-2 tokenizer does not have a default padding token defined. To resolve this, you need to explicitly assign a pad_token.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Freeze GPT-2 parameters (optional, for efficiency)
for param in model.transformer.parameters():
    param.requires_grad = False

class PreferenceModel(nn.Module):
    def __init__(self, base_model):
        super(PreferenceModel, self).__init__()
        self.base_model = base_model
        self.scoring_head = nn.Linear(self.base_model.config.hidden_size, 1)  # Output a score for preferences

    def forward(self, input_ids, attention_mask=None):
        # Enable hidden states output
        outputs = self.base_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # Get the last hidden state (second-to-last layer hidden states)
        last_hidden_state = outputs.hidden_states[-1]
        cls_embedding = last_hidden_state[:, 0, :]  # Use the [CLS] token embedding (first token)

        # Pass through the scoring head
        score = self.scoring_head(cls_embedding)
        return score


# Wrap GPT-2 with the preference model
preference_model = PreferenceModel(model)

# Define DPO loss function
def dpo_loss(preferred_score, non_preferred_score):
    return -torch.log(torch.sigmoid(preferred_score - non_preferred_score)).mean()

# Create synthetic data for demonstration
class PreferenceDataset(Dataset):
    def __init__(self, tokenizer, prompts, preferred_responses, non_preferred_responses):
        self.tokenizer = tokenizer
        self.prompts = prompts
        self.preferred_responses = preferred_responses
        self.non_preferred_responses = non_preferred_responses

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        preferred = self.preferred_responses[idx]
        non_preferred = self.non_preferred_responses[idx]

        preferred_input = self.tokenizer(
            prompt + preferred,
            return_tensors="pt",
            padding="max_length",
            max_length=50,
            truncation=True,
        )
        non_preferred_input = self.tokenizer(
            prompt + non_preferred,
            return_tensors="pt",
            padding="max_length",
            max_length=50,
            truncation=True,
        )

        return (
            preferred_input["input_ids"].squeeze(0),
            preferred_input["attention_mask"].squeeze(0),
            non_preferred_input["input_ids"].squeeze(0),
            non_preferred_input["attention_mask"].squeeze(0),
        )

# Example synthetic dataset
prompts = ["What is the capital of France?", "Tell me a joke.", "Explain gravity."]
preferred_responses = [" The capital of France is Paris.", " Why don't skeletons fight? They don't have the guts.", " Gravity is the force that attracts two bodies toward each other."]
non_preferred_responses = [" The capital is not available.", " I'm bad at jokes.", " It's magic."]

dataset = PreferenceDataset(tokenizer, prompts, preferred_responses, non_preferred_responses)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer
optimizer = AdamW(preference_model.parameters(), lr=5e-5)

# Training loop
for epoch in range(5):
    preference_model.train()
    total_loss = 0
    start_time = time.time()
    for batch in dataloader:
        preferred_input_ids, preferred_attention_mask, non_preferred_input_ids, non_preferred_attention_mask = [
            b for b in batch
        ]

        # Forward pass
        preferred_score = preference_model(preferred_input_ids, preferred_attention_mask)
        non_preferred_score = preference_model(non_preferred_input_ids, non_preferred_attention_mask)

        # Compute DPO loss
        loss = dpo_loss(preferred_score, non_preferred_score)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    t = round( time.time() - start_time)
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}, takes {t} seconds")

Epoch 1, Loss: 1.0369, takes 3 seconds
Epoch 2, Loss: 10.7698, takes 3 seconds
Epoch 3, Loss: 1.7340, takes 5 seconds
Epoch 4, Loss: 4.8445, takes 2 seconds
Epoch 5, Loss: 12.2094, takes 2 seconds
