Code to train a reward model (RM) as part of an RLHF pipeline. Takes as input a pairwise preference dataset stored as a CSV file (loaded with pandas) with columns "prompt", "chosen", and "rejected". Trains the given RM on the dataset using the pairwise loss defined in https://arxiv.org/pdf/2203.02155 (Ouyang et al. 2022, Training language models to follow instructions with human feedback). Runs in Google Colab.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Edit Model Names and Input/Output Paths Here:

In [None]:
# Name of model in transformers library (used to load the tokenizer and in the plot title)
model_name = ""
# Name of model that was used to create calibrated dataset (only shown in the plot title)
training_name = ""
# Directory of untrained model (if applicable)
model_dir = ""
# Path of the dataset CSV file (passed directly to pd.read_csv)
data_dir = ""
# Path prefix for the trained model; "_model" and "_tokenizer" are appended when saving
output_dir = ""
# Path of the CSV file the per-step loss values are written to
loss_dir = ""

In [None]:
# Load in tokenizer from transformers library (identified by model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token; preprocess() pads with padding="max_length"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize and format dataset for training
def preprocess(sample, max_length=None):
    """Tokenize one preference pair into tensors for reward-model training.

    The prompt is concatenated (without a separator) with the chosen and the
    rejected completion, and each concatenation is tokenized with the
    module-level ``tokenizer``.

    Args:
        sample: Mapping-like row (for example a pandas Series) providing
            "prompt", "chosen" and "rejected". Every value is converted
            with ``str`` before tokenization.
        max_length: Optional token limit forwarded to the tokenizer for
            truncation and padding. ``None`` (the default) leaves the limit
            to the tokenizer's own default, exactly as before.

    Returns:
        dict with the keys "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each holding the
        tokenizer output with its leading batch axis removed.
    """
    prompt = str(sample["prompt"])
    chosen = str(sample["chosen"])
    rejected = str(sample["rejected"])

    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "pt",
    }
    chosen_input = tokenizer(prompt + chosen, **tokenizer_kwargs)
    rejected_input = tokenizer(prompt + rejected, **tokenizer_kwargs)

    # return_tensors="pt" adds a leading batch axis of size 1. Drop only that
    # axis: a bare squeeze() would also collapse a length-1 sequence into a
    # 0-d tensor, which can no longer be batched with the other samples.
    return {
        "input_ids_chosen": chosen_input["input_ids"].squeeze(0),
        "attention_mask_chosen": chosen_input["attention_mask"].squeeze(0),
        "input_ids_rejected": rejected_input["input_ids"].squeeze(0),
        "attention_mask_rejected": rejected_input["attention_mask"].squeeze(0),
    }

# Load in dataframe (data_dir is handed straight to pd.read_csv, so it must name the CSV file)
df = pd.read_csv(data_dir)
# axis=1 calls preprocess once per row
processed = df.apply(preprocess, axis=1)

# Materialize the per-row results as a plain Python list
# NOTE(review): presumably one dict of tensors per row — confirm pandas does not expand the dicts into columns
dataset = list(processed)

In [None]:
# Custom reward model class. Forward class requires modification for different LLM output pipelines
class RewardModel(nn.Module):
    """Causal-LM backbone with a scalar value head for pairwise reward modelling.

    ``forward`` scores a sequence by applying ``v_head`` to the final hidden
    state of the last attended token, so each sequence receives exactly one
    scalar reward.
    """

    # Checkpoint loaded when no ``load_dir`` is supplied.
    DEFAULT_MODEL_NAME = "Ray2333/GRM-Gemma-2B-sftreg"

    def __init__(self, load_dir=None):
        """Load the backbone and create (or restore) the value head.

        Args:
            load_dir: Directory (or model identifier) to load the backbone
                from. ``None`` or an empty string selects
                ``DEFAULT_MODEL_NAME``. When given, ``<load_dir>/v_head.pt``
                is loaded into the value head if that file exists.
        """
        super().__init__()

        # An empty path is treated like "not provided" instead of being
        # passed on to from_pretrained.
        checkpoint = load_dir if load_dir else self.DEFAULT_MODEL_NAME
        self.model = AutoModelForCausalLM.from_pretrained(checkpoint)

        self.model.gradient_checkpointing_enable()
        self.config = self.model.config
        self.v_head = nn.Linear(self.model.config.hidden_size, 1)

        if load_dir:
            v_head_path = f"{load_dir}/v_head.pt"
            try:
                state_dict = torch.load(v_head_path, map_location="cpu")
            except FileNotFoundError:
                print("No saved v_head found — starting from scratch.")
            else:
                self.v_head.load_state_dict(state_dict)
                print("Loaded v_head from:", v_head_path)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return one scalar reward per input sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same layout as ``input_ids``; non-zero marks real
                tokens, zero marks padding.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            Tensor of shape (batch,) holding the reward of each sequence.
        """
        # Run only the transformer body; the language-model head would
        # produce vocabulary logits, which are not a reward.
        hidden = self.model.base_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # Index of the last attended token in every row. Weighting the mask
        # by position and taking argmax works for left and right padding.
        positions = torch.arange(attention_mask.shape[1], device=attention_mask.device)
        last_token = (attention_mask.long() * positions).argmax(dim=1)

        batch_index = torch.arange(hidden.shape[0], device=hidden.device)
        summary = hidden[batch_index, last_token.to(hidden.device)]

        # Match the head's dtype in case the backbone runs in reduced precision.
        reward = self.v_head(summary.to(self.v_head.weight.dtype)).squeeze(-1)
        return reward

    def save_pretrained(self, save_directory):
        """Save the backbone and the value head.

        Args:
            save_directory: Target directory. The backbone is written with its
                own ``save_pretrained``; the value head's state dict is stored
                next to it as ``v_head.pt``.
        """
        self.model.save_pretrained(save_directory)
        torch.save(self.v_head.state_dict(), f"{save_directory}/v_head.pt")
        print("Model and v_head saved to", save_directory)

In [None]:
# Pairwise loss as defined by RLHF pipeline
def pairwise_loss(chosen_rewards, rejected_rewards):
    """Return the mean pairwise ranking loss -log(sigmoid(chosen - rejected)).

    Args:
        chosen_rewards: Tensor of rewards for the preferred completions.
        rejected_rewards: Tensor of rewards for the rejected completions;
            must broadcast against ``chosen_rewards``.

    Returns:
        Scalar tensor with the loss averaged over all elements.
    """
    # logsigmoid evaluates log(sigmoid(x)) in a numerically stable way.
    # Computing sigmoid first underflows to 0 for strongly negative margins,
    # and the subsequent log then yields an infinite loss.
    margin = chosen_rewards - rejected_rewards
    return -nn.functional.logsigmoid(margin).mean()

In [None]:
# Initialize reward model and use dataloader to format dataset
# Start from the checkpoint in model_dir when one is configured; an empty
# model_dir falls back to RewardModel's built-in default checkpoint.
model = RewardModel(model_dir or None).to("cuda")
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
# One preference pair per step, visited in random order
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Training loop - requires accelerator and/or peft for bigger models
losses = []
model.train()
# Follow whatever device the model lives on instead of repeating a device string
device = next(model.parameters()).device
for i, batch in enumerate(dataloader):
    # Score the preferred and the rejected completion with the same model
    chosen_rewards = model(
        batch["input_ids_chosen"].to(device),
        batch["attention_mask_chosen"].to(device),
    )
    rejected_rewards = model(
        batch["input_ids_rejected"].to(device),
        batch["attention_mask_rejected"].to(device),
    )

    loss = pairwise_loss(chosen_rewards, rejected_rewards)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() copies the value to the host; read it once and reuse it
    loss_value = loss.item()
    losses.append(loss_value)
    print(f"{i}: Loss: {loss_value}")

    # Return cached GPU blocks between steps to limit memory pressure
    if device.type == "cuda":
        torch.cuda.empty_cache()

In [None]:
# Save models
model.save_pretrained(f"{output_dir}_model")
# The f prefix is required: without it the tokenizer is written to a directory
# literally named "{output_dir}_tokenizer" instead of next to the model.
tokenizer.save_pretrained(f"{output_dir}_tokenizer")

In [None]:
# Plot the training loss
# Draw the per-step loss curve on a single 10x6 inch axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(losses, label="Training Loss")
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.set_title(f"Training Loss: {model_name} trained on {training_name}")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Save loss as pandas dataframe
# One row per training step: zero-based step number and the loss recorded at it
step_column = list(range(len(losses)))
loss_df = pd.DataFrame(data={"step": step_column, "loss": losses})
# loss_dir is used as the output CSV path; the dataframe index is not written
loss_df.to_csv(path_or_buf=loss_dir, index=False)