<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/ppo_finetune_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install trl

In [None]:
#!/usr/bin/env python3
"""
ppo_finetune.py

Fine‑tune a causal language model with PPO using a pretrained reward model.
"""

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
import random
import numpy as np

# -----------------------
# Config & seeds
# -----------------------
# Checkpoints and file locations used by the rest of the script.
OUTPUT_DIR = "questioner_ppo"
UNLABELED_CONTEXTS_FILE = "unlabeled_contexts.jsonl"
REWARD_MODEL_DIR = "rm_final"
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# A single seed is fed to PyTorch, NumPy and Python's `random` module.
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Also seed the generators of every CUDA device.
    torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

# -----------------------
# Load policy & tokenizer
# -----------------------
# Tokenizer of the base checkpoint; its end-of-sequence token is reused as
# the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# Policy network: the base causal LM loaded through TRL's value-head wrapper,
# with `device_map="auto"` deciding where the weights are placed.
policy = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
)

# Gradient checkpointing is optional; it is enabled here to reduce memory use.
policy.gradient_checkpointing_enable()

# -----------------------
# Load reward model
# -----------------------
# Load the reward model from its local directory, then switch it to
# evaluation mode. `eval()` returns the module itself, so calling it as a
# separate statement leaves `reward_model` bound to the same object.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_DIR,
    device_map="auto",
)
reward_model.eval()

# -----------------------
# PPO config & trainer
# -----------------------
# NOTE(review): the training loop in this script drives the step-wise PPO API
# (`trainer.generate` / `trainer.step`), which is presumably the legacy TRL
# PPOTrainer (trl < 0.12) — confirm against the installed TRL version.
ppo_config = PPOConfig(
    batch_size=128,
    ppo_epochs=4,
    learning_rate=1.1e-6,
    # PPOConfig names the PPO clipping parameter `cliprange`; `clip_range` is
    # not a recognised field and makes the constructor raise TypeError.
    cliprange=0.2,
    target_kl=0.1,
)

# The step-wise PPOTrainer constructor takes no `reward_model` argument:
# rewards are computed by the caller and handed to `trainer.step`, so the
# reward model is deliberately not passed to the trainer.
trainer = PPOTrainer(
    config=ppo_config,
    model=policy,
    tokenizer=tokenizer,
)

# -----------------------
# Prompt formatting
# -----------------------
def format_prompt(example):
    """Build the question-generation prompt for one dataset record.

    Args:
        example: Mapping with a ``"context"`` entry; its value is inserted
            into the prompt using normal string formatting.

    Returns:
        The instruction line, a ``Context:`` section holding the record's
        context, and a trailing ``Question:`` cue for the model to complete.

    Raises:
        KeyError: If ``example`` has no ``"context"`` key.
    """
    context = example["context"]
    return (
        "You are an AI generating deep, open-ended questions.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:"
    )

# -----------------------
# Load unlabeled contexts
# -----------------------
# Read the JSON-lines file of contexts. Requesting the "train" split directly
# returns the same split that would otherwise be looked up by key.
dataset = load_dataset("json", data_files=UNLABELED_CONTEXTS_FILE, split="train")

# -----------------------
# PPO training loop
# -----------------------
# Every PPO step consumes exactly `ppo_config.batch_size` samples, so fail
# early with a clear message when the dataset cannot supply one full batch
# (the bare `random.sample` call would raise a less helpful ValueError).
if len(dataset) < ppo_config.batch_size:
    raise ValueError(
        f"Need at least {ppo_config.batch_size} contexts for one PPO batch, "
        f"but {UNLABELED_CONTEXTS_FILE} only has {len(dataset)}."
    )

for epoch in range(3):
    # Sample row indices rather than copying the whole dataset into a list
    # on every iteration.
    indices = random.sample(range(len(dataset)), ppo_config.batch_size)
    batch = [dataset[i] for i in indices]
    prompts = [format_prompt(b) for b in batch]

    # The step-wise TRL API (`trainer.generate` / `trainer.step`) works on one
    # 1-D tensor of token ids per sample, not on raw strings.
    query_tensors = [
        tokenizer(p, return_tensors="pt").input_ids.squeeze(0).to(trainer.accelerator.device)
        for p in prompts
    ]

    # Generate candidate questions.
    # - `return_prompt=False` keeps only the newly generated tokens, so the
    #   prompt is not repeated in the decoded question or in the PPO response.
    # - `do_sample=True` is required for `temperature` / `top_p` to take
    #   effect; without it generation is greedy.
    gen_outputs = trainer.generate(
        query_tensors,
        return_prompt=False,
        do_sample=True,
        max_new_tokens=64,
        temperature=1.0,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )

    questions = tokenizer.batch_decode(gen_outputs, skip_special_tokens=True)

    # Score with reward model
    rewards = []
    with torch.no_grad():
        for p, q in zip(prompts, questions):
            # NOTE(review): truncation to 128 tokens cuts from the tokenizer's
            # truncation side (presumably the right); for long contexts this
            # may drop the generated question entirely — confirm against how
            # the reward model was trained.
            inp = tokenizer(
                f"{p} {q}",
                return_tensors="pt",
                truncation=True,
                max_length=128,
            ).to(reward_model.device)  # inputs must live where the reward model does
            score = reward_model(**inp).logits.item()
            # `trainer.step` expects each score as a tensor, not a Python float.
            rewards.append(torch.tensor(score, dtype=torch.float32))

    # PPO update
    trainer.step(query_tensors, gen_outputs, rewards)

# -----------------------
# Save final model
# -----------------------
# Write the policy and the tokenizer into the same output directory, then
# report where they were saved.
policy.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("PPO‑tuned model saved to", OUTPUT_DIR)