<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Combining_Reinforcement_Learning_with_Language_Models_(RLHF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers trl torch

In [None]:
pip install transformers trl torch --upgrade

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from trl import PPOTrainer, PPOConfig, create_reference_model
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

# Define a simple custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes raw strings on access.

    Args:
        texts: Sequence of strings; each string is one dataset item.
        tokenizer: Optional tokenizer callable. When omitted, a GPT-2
            tokenizer is loaded and given a ``[PAD]`` pad token. A supplied
            tokenizer is used as-is and is not modified.
        max_length: Length every item is padded or truncated to.
    """

    def __init__(self, texts, tokenizer=None, max_length=50):
        self.texts = texts
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            # add_special_tokens already puts '[PAD]' into the vocabulary,
            # so no separate add_tokens call is needed.
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            tokenizer.model_max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors without the batch axis."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can batch the items itself.
        return {key: val.squeeze(0) for key, val in encoded.items()}

# Collate function to pad sequences to the same length
def collate_fn(batch):
    """Merge a list of dataset items into one padded batch.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors.
    Shorter sequences are right-padded: ids with the module-level
    tokenizer's pad token id, masks with 0.
    """
    ids_list, mask_list = [], []
    for sample in batch:
        ids_list.append(sample['input_ids'])
        mask_list.append(sample['attention_mask'])
    return {
        'input_ids': pad_sequence(
            ids_list, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        'attention_mask': pad_sequence(
            mask_list, batch_first=True, padding_value=0
        ),
    }

# Example data
texts = ["Hello, how are you?", "This is an example text.", "Reinforcement learning with transformers."]
dataset = CustomDataset(texts)
# collate_fn reads the module-level `tokenizer` assigned below; that is safe
# because the DataLoader only calls collate_fn once iteration starts.
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add_special_tokens both registers '[PAD]' as the pad token and adds it to
# the vocabulary, so a follow-up add_tokens(["[PAD]"]) would be a no-op.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.model_max_length = 50
policy_model = GPT2LMHeadModel.from_pretrained("gpt2")
# The vocabulary grew by the pad token, so the embedding matrix must grow too.
policy_model.resize_token_embeddings(len(tokenizer))
ref_model = create_reference_model(policy_model)

# Configuration for PPO
ppo_config = PPOConfig(
    learning_rate=1e-5,
    batch_size=2,
    mini_batch_size=1,
    output_dir="./ppo_output"
)

# Mock reward model for the sake of example
class MockRewardModel(torch.nn.Module):
    """Placeholder reward model: ignores every input and scores it 1.0."""

    def forward(self, *args, **kwargs):
        """Return a one-element float tensor holding the constant reward 1.0."""
        constant_reward = torch.ones(1)
        return constant_reward

reward_model = MockRewardModel()

# Initialize PPOTrainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    policy=policy_model,
    ref_policy=ref_model,
    reward_model=reward_model,
    train_dataset=dataset,
    # NOTE(review): the policy LM is reused as the value model; TRL's PPO
    # presumably expects a model with a scalar value head here -- confirm
    # before calling ppo_trainer.train().
    value_model=policy_model,
    # The processing class is the tokenizer that matches the policy's
    # vocabulary (including the added pad token), not None.
    processing_class=tokenizer,
)

# Debugging: Ensure data is correctly passed
print("Dataset length:", len(data_loader.dataset))
for batch in data_loader:
    print("Batch input ids:", batch['input_ids'])
    print("Batch attention mask:", batch['attention_mask'])
    break  # Just inspect the first batch

# Training loop with manual loss calculation.
# NOTE: this is a plain supervised next-token objective driven by a separate
# Adam optimizer; ppo_trainer is only used for its accelerator device.
optimizer = torch.optim.Adam(policy_model.parameters(), lr=ppo_config.learning_rate)
device = ppo_trainer.accelerator.device

for epoch in range(3):  # Simulating 3 epochs
    for step, batch in enumerate(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # No `labels=` here: the loss is computed manually below, so the
        # model's built-in loss would be computed and thrown away.
        outputs = policy_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # Exclude padding positions from the loss (-100 is the ignore index).
        labels = batch['input_ids'].masked_fill(batch['attention_mask'] == 0, -100)
        # Logits at position t score the token at position t + 1, so drop the
        # last logit and the first label to align predictions with targets.
        shift_logits = outputs.logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        # reshape (not view): the sliced tensors are not contiguous.
        loss = F.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            ignore_index=-100,
        )

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}")

print("Reinforcement learning training completed.")