In [None]:
!pip install transformers datasets peft trl torch

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertForSequenceClassification, BertTokenizer
from peft import get_peft_model
import torch

# Load the pre-trained baby LLM (e.g., GPT-2)
tokenizer = GPT2Tokenizer.from_pretrained("path_to_your_peft_baby_llm_tokenizer")
baby_llm = GPT2LMHeadModel.from_pretrained("path_to_your_peft_baby_llm_model")

# GPT-2 tokenizers define no padding token, so any call that pads
# (padding=True / padding="max_length", both used later in this notebook)
# raises unless one is assigned. Reusing EOS is the conventional choice.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Decoder-only models continue from the last position of each row, so batched
# prompts must be padded on the left for generate() to see the real prompt end.
tokenizer.padding_side = "left"

# Load the pre-trained reward model
reward_model = BertForSequenceClassification.from_pretrained("path_to_your_reward_model")
# NOTE(review): the reward tokenizer comes from the stock "bert-base-uncased"
# checkpoint rather than the reward-model directory; confirm the reward model
# was trained with that exact vocabulary.
tokenizer_reward = BertTokenizer.from_pretrained("bert-base-uncased")


In [None]:
from datasets import Dataset

# Toy records used to exercise the pipeline. Each record carries a "context"
# and a "question"; the cells in this notebook only read the "question" field.
data = [
    {
        "context": "Context 1",
        "question": "Question 1",
    },
    {
        "context": "Context 2",
        "question": "Question 2",
    },
    # Add more examples...
]

# Build a datasets.Dataset with one row per record above.
dataset = Dataset.from_list(data)

def tokenize_function(examples):
    """Tokenize the ``question`` column of a batch, for use with ``Dataset.map``.

    Args:
        examples: Mapping with a ``"question"`` entry holding the texts of
            the current batch.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and padding set to ``"max_length"``.
    """
    # GPT-2 tokenizers have no pad token; padding would raise without one.
    # The guard is a no-op when a pad token is already configured.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # No return_tensors here: Dataset.map stores results as plain columns, so
    # building torch tensors only adds a conversion round-trip. Use
    # dataset.set_format("torch") downstream if tensors are needed.
    return tokenizer(
        examples["question"],
        truncation=True,
        padding="max_length",
    )

# Apply tokenize_function to the dataset in batches (batched=True passes a
# dict of column lists to the function instead of one row at a time).
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)



In [None]:
from trl import PPOTrainer, PPOConfig

# Define PPO configuration
ppo_config = PPOConfig(
    batch_size=16,
    # batch_size must be a multiple of mini_batch_size (times gradient
    # accumulation steps). Some legacy TRL releases default mini_batch_size to
    # a value larger than 16, which fails config validation, so set it here.
    mini_batch_size=4,
    learning_rate=1.41e-5,
    log_with="tensorboard",
    # The tensorboard tracker requires an explicit logging directory; without
    # it tracker setup raises when the trainer is constructed.
    project_kwargs={"logging_dir": "./ppo_logs"},
)


In [None]:
# Initialize PPO Trainer
from trl import AutoModelForCausalLMWithValueHead

# The legacy PPOTrainer only accepts TRL's value-head wrappers, not a bare
# GPT2LMHeadModel. from_pretrained accepts an already-loaded model and wraps
# it, so `baby_llm` and `policy_model` share the same underlying weights.
if isinstance(baby_llm, AutoModelForCausalLMWithValueHead):
    policy_model = baby_llm
else:
    policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(baby_llm)

ppo_trainer = PPOTrainer(model=policy_model, config=ppo_config, tokenizer=tokenizer)


In [None]:
def generate_and_evaluate(batch):
    """Generate one completion per question and score it with the reward model.

    Args:
        batch: Mapping with a ``"question"`` entry holding a list of prompts.

    Returns:
        ``(generated_texts, rewards)``: the decoded generation output for each
        prompt and one float score per prompt, in the same order. Both lists
        are empty when the batch contains no questions.
    """
    questions = batch["question"]
    if len(questions) == 0:
        return [], []

    # GPT-2 tokenizers have no pad token; padding=True would raise without one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Decoder-only generation continues from the last position of each row, so
    # prompts of different lengths must be left-padded. The previous setting
    # is restored so other users of the shared tokenizer are unaffected.
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(questions, return_tensors="pt", padding=True, truncation=True)
    finally:
        tokenizer.padding_side = previous_side

    # Pass the attention mask and pad id explicitly so padded positions are
    # ignored instead of being treated as part of the prompt.
    outputs = baby_llm.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        max_length=100,
    )
    # NOTE(review): for a decoder-only model the output rows presumably start
    # with the prompt tokens, so each decoded text likely repeats the question;
    # callers that need only the continuation should strip it.
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    rewards = []
    for question, text in zip(questions, generated_texts):
        reward_input = tokenizer_reward(
            question, text, return_tensors="pt", truncation=True, padding=True
        )
        with torch.no_grad():
            # .item() requires a single logit, i.e. a reward model with one
            # output label — TODO confirm the checkpoint was trained that way.
            score = reward_model(**reward_input).logits.squeeze().item()
        rewards.append(score)

    return generated_texts, rewards


In [None]:
# Minimal hand-written batch in the shape generate_and_evaluate expects:
# a mapping whose "question" entry is a list of prompt strings.
sample_batch = {"question": ["Question 1", "Question 2"]}


In [None]:
# PPO training loop.
# The PPO config defines no `total_epochs` field, so default to a single pass
# over the data (and honour the attribute if a config does provide it).
num_epochs = getattr(ppo_config, "total_epochs", 1)
batch_size = ppo_config.batch_size

# ppo_trainer.step insists on exactly `batch_size` examples per call, so walk
# the dataset in full batches instead of re-sending the 2-item sample_batch.
all_questions = list(tokenized_dataset["question"])
steps_per_epoch = len(all_questions) // batch_size
if steps_per_epoch == 0:
    print(
        f"Dataset has {len(all_questions)} example(s) but batch_size is {batch_size}; "
        "no PPO steps will run. Add more examples or lower batch_size."
    )

for epoch in range(num_epochs):
    for step_index in range(steps_per_epoch):
        start = step_index * batch_size
        batch = {"question": all_questions[start:start + batch_size]}
        generated_texts, rewards = generate_and_evaluate(batch)

        # step() takes lists of 1-D token tensors and scalar reward tensors,
        # not raw strings and floats.
        query_tensors = []
        response_tensors = []
        for question, text in zip(batch["question"], generated_texts):
            # The decoded generation may repeat the prompt; keep only the
            # continuation as the response. Fall back to EOS so the response
            # is never an empty tensor.
            continuation = text[len(question):] if text.startswith(question) else text
            query_tensors.append(tokenizer(question, return_tensors="pt")["input_ids"][0])
            response_tensors.append(
                tokenizer(continuation or tokenizer.eos_token, return_tensors="pt")["input_ids"][0]
            )
        reward_tensors = [torch.tensor(reward, dtype=torch.float) for reward in rewards]

        ppo_trainer.step(query_tensors, response_tensors, reward_tensors)
