## Toy sample prep

In [5]:
# Toy SFT corpus: every record pairs one prompt with the response the model should imitate.
sft_data = [
    dict(
        prompt="Explain rain to a child.",
        response="Rain is water falling from the clouds when they get too heavy.",
    ),
    dict(
        prompt="Define happiness simply.",
        response="Happiness is feeling joyful and good inside.",
    ),
]

# Toy preference corpus for the reward model: for one prompt, "preferred" is the
# response that should score higher than "other".
rm_data = [
    dict(
        prompt="Explain rain to a child.",
        preferred="Rain is water falling from the clouds when they get too heavy.",
        other="Rain happens when water evaporates from the ground.",
    ),
]

# Unseen prompt used in the PPO stage.
ppo_prompt = "What is sunshine?"


## 1. Supervised Fine-Tuning (SFT)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The pretrained GPT-2 tokenizer defines no padding token, but the tokenization
# helpers in this notebook pad every example with padding="max_length", which
# fails without one. Reusing EOS as the pad token is the usual GPT-2 workaround.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prepare SFT data
def tokenize_fn(example):
    """Encode one SFT record as a single fixed-length training sequence.

    The prompt and response are joined with one space, the tokenizer's EOS
    token is appended, and the text is truncated/padded to 50 tokens.

    Args:
        example: Mapping with string values under 'prompt' and 'response'.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined text.
    """
    pieces = (example['prompt'], example['response'])
    text = " ".join(pieces) + tokenizer.eos_token
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 50}
    return tokenizer(text, **encode_options)

# Encode every SFT record up front; the result is a list aligned with sft_data.
tokenized_sft_data = list(map(tokenize_fn, sft_data))

class SFTDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns tokenized records into causal-LM training items.

    Each record must provide 'input_ids' (a sequence of token ids). When a
    record also provides 'attention_mask', the mask is returned with the item
    and every position whose mask value is 0 (padding) gets the label
    ``IGNORE_INDEX`` so that it does not contribute to the language-modeling
    loss. Records without a mask are returned as before, with labels equal to
    the input ids.
    """

    # Label value that the cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, data):
        """Store the sequence of tokenized records (indexable, with a length)."""
        self.data = data

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the tensors for record ``idx``.

        Returns:
            Dict with 'input_ids' and 'labels' tensors, plus 'attention_mask'
            when the underlying record carries one.
        """
        record = self.data[idx]
        input_ids = torch.tensor(record['input_ids'])
        labels = input_ids.clone()
        item = {'input_ids': input_ids, 'labels': labels}

        # Records are expected to be dict-like; tolerate ones without a mask.
        mask_values = record.get('attention_mask') if hasattr(record, 'get') else None
        if mask_values is not None:
            attention_mask = torch.tensor(mask_values)
            # Padding must not be a prediction target.
            labels[attention_mask == 0] = self.IGNORE_INDEX
            item['attention_mask'] = attention_mask
        return item

# Trainer and TrainingArguments are used below but are not among the names
# imported at the top of this cell; import them here so the cell runs on a
# fresh kernel instead of raising NameError.
from transformers import Trainer, TrainingArguments

train_dataset = SFTDataset(tokenized_sft_data)

# SFT training: 3 epochs over the toy dataset, logging the loss at every step.
training_args = TrainingArguments(
    output_dir='./results', num_train_epochs=3, per_device_train_batch_size=2,
    learning_rate=5e-5, logging_steps=1
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

## 2. Reward Model

In [None]:

from transformers import GPT2ForSequenceClassification

# Reward model initialized from GPT-2 with a single-output (scalar) head
reward_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=1)

# The classification head reads its score from the last non-padding token of
# each row, which it locates through config.pad_token_id. When that id is unset
# it falls back to the final position of the row, and the reward-model inputs
# in this notebook are padded to a fixed length, so that position is padding.
if reward_model.config.pad_token_id is None:
    reward_model.config.pad_token_id = (
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    )

# Prepare RM data (pairwise preference)
def tokenize_rm(example):
    """Encode both candidate answers of one preference record.

    Each candidate is prefixed with the prompt and a single space, then
    truncated/padded to 50 tokens and returned as PyTorch tensors.

    Args:
        example: Mapping with string values under 'prompt', 'preferred' and 'other'.

    Returns:
        Dict with keys 'preferred' and 'other', each holding the tokenizer
        output for the corresponding prompt+answer text.
    """
    prefix = example['prompt'] + " "
    # Build both texts before encoding so a missing key fails before any tokenization.
    texts = {role: prefix + example[role] for role in ('preferred', 'other')}
    encode_options = dict(truncation=True, padding='max_length', max_length=50, return_tensors='pt')
    return {role: tokenizer(text, **encode_options) for role, text in texts.items()}

tokenized_rm_data = [tokenize_rm(d) for d in rm_data]

# RM training step (simplified): one optimizer update per preference pair.
optimizer = torch.optim.Adam(reward_model.parameters(), lr=1e-5)
reward_model.train()

for epoch in range(3):
    for sample in tokenized_rm_data:
        # Get tokenized inputs for preferred and other responses
        preferred_input = sample['preferred']['input_ids']
        other_input = sample['other']['input_ids']

        # Compute scalar reward scores. The attention mask is passed along with
        # the ids so that the padded positions are marked as padding.
        preferred_scores = reward_model(
            input_ids=preferred_input,
            attention_mask=sample['preferred']['attention_mask'],
        ).logits
        other_scores = reward_model(
            input_ids=other_input,
            attention_mask=sample['other']['attention_mask'],
        ).logits

        # Pairwise ranking loss: -log sigmoid(r_preferred - r_other).
        # logsigmoid computes the same value as log(sigmoid(x)) but stays
        # finite when the score gap is strongly negative.
        score_gap = preferred_scores - other_scores
        loss = -torch.nn.functional.logsigmoid(score_gap).mean()

        # update gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

## 3. Reinforcement Learning via PPO (RLHF)



In [None]:
import torch.nn.functional as F

import copy

# Initialize PPO policy from supervised fine-tuned model (SFT)
policy_model = model

# Frozen snapshot of the SFT weights, used as the reference policy in the KL
# penalty. It has to be a separate copy: policy_model and model are the same
# object, so comparing them with each other always yields zero divergence.
sft_reference_model = copy.deepcopy(model)
sft_reference_model.eval()
for reference_param in sft_reference_model.parameters():
    reference_param.requires_grad_(False)

policy_optimizer = torch.optim.Adam(policy_model.parameters(), lr=1e-5)
reward_model.eval()
# Disable dropout so sampling and the log-probabilities below come from the
# same deterministic forward pass; gradients still flow in eval mode.
policy_model.eval()

# The models may live on different devices (e.g. after Trainer moved `model`),
# so inputs are moved to the device of the model that consumes them.
policy_device = next(policy_model.parameters()).device
reward_device = next(reward_model.parameters()).device


def _response_log_probs(language_model, sequence_ids, prompt_length):
    """Return the log-probability of each response token in ``sequence_ids``.

    Args:
        language_model: Causal LM whose output exposes ``.logits``.
        sequence_ids: Tensor of shape (batch, seq_len) holding prompt + response ids.
        prompt_length: Number of leading prompt tokens (must be >= 1).

    Returns:
        Tensor of shape (batch, seq_len - prompt_length).
    """
    # Logits at position t score the token at position t + 1.
    logits = language_model(sequence_ids).logits[:, :-1, :]
    log_probs = F.log_softmax(logits, dim=-1)
    next_tokens = sequence_ids[:, 1:].unsqueeze(-1)
    token_log_probs = log_probs.gather(-1, next_tokens).squeeze(-1)
    return token_log_probs[:, prompt_length - 1:]


ppo_prompt = "What is sunshine?"
tokenized_prompt = tokenizer(ppo_prompt, return_tensors="pt")
prompt_ids = tokenized_prompt['input_ids'].to(policy_device)
prompt_mask = tokenized_prompt['attention_mask'].to(policy_device)
prompt_length = prompt_ids.shape[1]

# Generate response using current policy model π_φ(y|x). The response is
# sampled (not greedy) from the full distribution so it is a draw from π_φ.
output_ids = policy_model.generate(
    prompt_ids,
    attention_mask=prompt_mask,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    top_k=0,
    pad_token_id=tokenizer.eos_token_id,
)

# generate() returns prompt + continuation; decode only the continuation so the
# prompt is not duplicated when the reward-model input is assembled below.
response_ids = output_ids[0, prompt_length:]
generated_response = tokenizer.decode(response_ids, skip_special_tokens=True).strip()

# Tokenize complete prompt+response for reward calculation
tokenized_response = tokenizer(ppo_prompt + " " + generated_response, return_tensors="pt")

# Compute reward r_θ(x,y) using reward model
with torch.no_grad():
    reward_score = reward_model(
        tokenized_response['input_ids'].to(reward_device)
    ).logits.item()

# Log-probabilities of the sampled response under the current policy (with
# gradients) and under the frozen SFT reference (without).
policy_log_probs = _response_log_probs(policy_model, output_ids, prompt_length)
with torch.no_grad():
    sft_log_probs = _response_log_probs(sft_reference_model, output_ids, prompt_length)

# Sample estimate of KL(π_φ || π_SFT) on the generated response. It is zero on
# the very first update because the policy still equals the reference.
kl_divergence = (policy_log_probs.detach() - sft_log_probs).sum()

# Combine reward and KL-penalty into final reward as defined in PPO
beta = 0.01  # KL penalty coefficient
reward = reward_score - beta * kl_divergence.item()

# PPO clipped surrogate objective. The KL-penalized reward serves as the
# advantage (this toy example has no value baseline). The ratio compares the
# current policy with the policy that sampled the response; with a single
# update per sample it equals 1, but its gradient is what moves the policy.
clip_epsilon = 0.2
sequence_log_prob = policy_log_probs.sum()
ratio = torch.exp(sequence_log_prob - sequence_log_prob.detach())
clipped_ratio = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
loss = -torch.min(ratio * reward, clipped_ratio * reward)

# Policy update step (gradient descent)
policy_optimizer.zero_grad()
loss.backward()
policy_optimizer.step()