In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from trl import AutoModelForCausalLMWithValueHead
from trl.experimental.ppo import PPOConfig, PPOTrainer

# 1. Configuration
# The trainer computes `batch_size` and `mini_batch_size` itself from the
# per-device batch size, gradient accumulation, the number of mini-batches and
# the world size, so values passed for those two fields are overwritten.
# Configure the fields they are derived from instead.
config = PPOConfig(
    output_dir="./ppo-results",
    learning_rate=1.41e-5,
    num_ppo_epochs=4,
    kl_coef=0.05,
    # Intended to yield a 128-sample rollout batch split into 8 mini-batches
    # of 16 on a single process -- TODO confirm the derived sizes against the
    # installed TRL version.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    num_mini_batches=8,
    bf16=False,
)

# 2. Model & Tokenizer Loading
model_id = "distilgpt2"
# Pad on the left so each prompt sits directly before the tokens generated
# after it; GPT-2 ships without a pad token, so reuse EOS.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Policy and reference model: plain causal LMs. They already carry a
# `generation_config` and return `.logits`, so no manual patching is needed
# (the value-head wrapper provides neither in the form the trainer expects).
model = AutoModelForCausalLM.from_pretrained(model_id)
ref_model = AutoModelForCausalLM.from_pretrained(model_id)

# Critic: a separate sequence-classification model with one scalar output.
# It must not be the policy object itself.
value_model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=1
)

# Reward model: receives the policy's token ids directly, so it has to share
# the policy's tokenizer/vocabulary and expose a single scalar head.
# NOTE(review): "lvwerra/distilbert-imdb" was dropped because it is a
# DistilBERT classifier with a different vocabulary. This stand-in has a
# randomly initialised head -- swap in a reward model trained on the
# distilgpt2 tokenizer before running real training.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=1
)


def _tokenize_prompts(batch):
    """Turn the first message of each conversation into prompt token ids.

    Assumes the dataset exposes a conversational `messages` column whose
    entries are dicts with a `content` key -- TODO confirm against the
    dataset card.
    """
    prompts = [messages[0]["content"] for messages in batch["messages"]]
    encoded = tokenizer(prompts, padding=False, truncation=True, max_length=256)
    return {"input_ids": encoded["input_ids"]}


# The trainer's default collator pads `input_ids`, so the raw text columns are
# tokenized up front and then removed.
raw_dataset = load_dataset("trl-lib/Capybara", split="train[:128]")
dataset = raw_dataset.map(
    _tokenize_prompts,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

# 3. Trainer Initialization
# Every argument is passed by keyword, so the grouping below is only for
# readability.
ppo_trainer = PPOTrainer(
    # Run configuration and tokenizer.
    args=config,
    processing_class=tokenizer,
    # Policy being trained and its reference model.
    model=model,
    ref_model=ref_model,
    # Critic and reward scorer.
    value_model=value_model,
    reward_model=reward_model,
    # Prompts used for rollouts.
    train_dataset=dataset,
)