# In [None]:
# 15 - RLHF Basics with trl (HuggingFace)
# pip install "trl<0.12" datasets transformers accelerate
# (this demo uses the classic PPOTrainer.step() API, which later trl releases replaced)

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

# Minimal PPO fine-tuning demo using trl's classic PPOTrainer API
# (PPOConfig(model_name=...), PPOTrainer.step(queries, responses, scores)).

# Load a small model for demo (you can also use 'gpt2')
model_name = "lvwerra/gpt2-imdb"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token; reuse EOS so generation and the trainer's
# internal batching have a padding id to work with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# PPO needs a value head on top of the language model to estimate returns,
# so the policy must be the trl wrapper rather than a bare AutoModelForCausalLM.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)

config = PPOConfig(
    model_name=model_name,
    learning_rate=1.41e-5,
    log_with=None,
    batch_size=2,
    mini_batch_size=1,
)

# The third positional parameter of PPOTrainer is `ref_model`, so the tokenizer
# has to be passed by keyword. With ref_model=None the trainer builds its own
# frozen reference copy of the policy for the KL penalty.
ppo_trainer = PPOTrainer(config, model, ref_model=None, tokenizer=tokenizer)

# The trainer (via accelerate) decides where the model lives; inputs must
# be moved to the same device before generation.
device = ppo_trainer.accelerator.device

# PPO assumes responses are *sampled* from the current policy, so use plain
# sampling (no top-k / nucleus truncation) instead of greedy decoding.
generation_kwargs = {
    "max_new_tokens": 20,
    "do_sample": True,
    "top_k": 0,
    "top_p": 1.0,
    "pad_token_id": tokenizer.eos_token_id,
}

# Load a sample dataset
dataset = load_dataset("imdb", split="train[:10]")  # keep small for demo

# PPOTrainer.step() requires exactly `config.batch_size` examples per call,
# so rollouts are accumulated here until a full batch is available.
batch_prompts = []
batch_queries = []
batch_responses = []
batch_rewards = []

for example in dataset:
    prompt = example["text"][:100]  # limit input length
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; PPO must only be credited
    # for the tokens the policy actually produced, so slice the prompt off.
    prompt_len = inputs["input_ids"].shape[1]
    query_tensor = inputs["input_ids"][0]
    response_tensor = output[0, prompt_len:]
    response_text = tokenizer.decode(response_tensor, skip_special_tokens=True)

    # Reward function (very naive): reward longer generated text.
    # Scored on the response only, so the prompt length cannot dominate it.
    reward = torch.tensor(len(response_text) / 100.0)

    batch_prompts.append(prompt)
    batch_queries.append(query_tensor)
    batch_responses.append(response_tensor)
    batch_rewards.append(reward)

    if len(batch_queries) < config.batch_size:
        continue

    # Run PPO step: lists of 1-D token-id tensors plus one scalar reward each.
    ppo_trainer.step(batch_queries, batch_responses, batch_rewards)
    for trained_prompt, trained_reward in zip(batch_prompts, batch_rewards):
        print(
            f"Trained on: {trained_prompt[:30]}... "
            f"=> Reward: {trained_reward.item():.2f}"
        )

    batch_prompts.clear()
    batch_queries.clear()
    batch_responses.clear()
    batch_rewards.clear()

# A trailing partial batch cannot be passed to step(); report it rather than
# silently dropping it.
if batch_queries:
    print(
        f"Skipped {len(batch_queries)} leftover example(s): "
        f"fewer than batch_size={config.batch_size}"
    )
