In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.optim import AdamW
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fix missing padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS as padding token

model = AutoModelForCausalLM.from_pretrained(model_name)
old_model = AutoModelForCausalLM.from_pretrained(model_name)  # Reference model (θ_old)

# Resize embeddings after adding new tokens
model.resize_token_embeddings(len(tokenizer))

# Move models to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
old_model.to(device);

In [3]:
# https://huggingface.co/datasets/yitingxie/rlhf-reward-datasets
preference_data = load_dataset("yitingxie/rlhf-reward-datasets")

# Check available dataset splits
print(preference_data)

train_dataset = preference_data['train'].select(range(1000))
test_dataset = preference_data['test'].select(range(10))
print(train_dataset[0])

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 76256
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5103
    })
})
{'prompt': "\n\nHuman: Is it legal to have a dog that is half wolf?\n\nAssistant: It sounds like you’re asking whether it’s legal to have a dog that is partially wolf, as opposed to all wolf.  There’s a bit of ambiguity to your question.  What’s the context of your interest in the law?\n\nHuman: I'd like to get a puppy that is half wolf and half husky.\n\nAssistant: Ah, now I understand.  We can look up some details on the breed of wolf and husky dogs and see if they’re considered “hybrids”.\n\nHuman: They are called wolf dogs. It doesn't matter if you want to call them hybrids though.\n\nAssistant: That makes it sound like you’re seeking information to get an opinion on whether your dog is legal or not.\n\nHuman: Yes, does it depend on the state?\n\n", 'chosen': 'As

In [4]:
def tokenize_texts(prompt, response):
    """Tokenizes prompt + response pairs."""
    inputs = tokenizer(prompt, response, return_tensors="pt", padding=True, truncation=True)
    return inputs.input_ids, inputs.attention_mask


In [5]:
def compute_log_prob(model, tokenizer, prompt, response):
    """Computes log probability of response given a prompt using the model."""
    inputs = tokenizer(prompt, response, return_tensors="pt", padding=True, truncation=True)

    # Ensure attention mask is used
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)  # Fix missing attention mask

    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits[:, :-1, :]  # Ignore last token logits
    # print("logits:", logits)
    # Compute log probabilities
    log_probs = F.log_softmax(logits, dim=-1)
    # print("log_probs:", log_probs)
    log_probs = torch.clamp(log_probs, min=-10, max=10)  # Prevent log(0)
    
    response_log_probs = log_probs.gather(2, input_ids[:, 1:].unsqueeze(-1)).squeeze(-1).sum(dim=1)

    return response_log_probs.mean()  # Now works correctly with padding

def dpo_loss(model, old_model, tokenizer, prompt, preferred, rejected, beta=1.0):
    """Computes the Direct Preference Optimization (DPO) loss."""
    
    # Compute log probabilities of preferred and rejected responses
    log_prob_w = compute_log_prob(model, tokenizer, prompt, preferred)
    log_prob_l = compute_log_prob(model, tokenizer, prompt, rejected)

    # Compute log probabilities under the reference model
    log_prob_w_old = compute_log_prob(old_model, tokenizer, prompt, preferred)
    log_prob_l_old = compute_log_prob(old_model, tokenizer, prompt, rejected)

    # Compute winning and losing ratios
    winning_ratio = beta * (log_prob_w - log_prob_w_old)
    loss_ratio = beta * (log_prob_l - log_prob_l_old)

    # Compute final DPO loss
    loss = -torch.log(torch.sigmoid(winning_ratio - loss_ratio))
    
    return loss

In [6]:
optimizer = AdamW(model.parameters(), lr=1e-5)

# Define gradient accumulation steps
gradient_accumulation_steps = 16  # Adjust to control memory usage

num_epochs = 3  # Increase for better results
batch_size = 4  # Mini-batch size before accumulation
print_interval = 128

for epoch in range(num_epochs):
    total_loss = 0
    
    for i in range(0, len(train_dataset), batch_size):
        
        batch = train_dataset[i:i+batch_size]
        optimizer.zero_grad() if (i % (batch_size * gradient_accumulation_steps)) == 0 else None

        # Compute DPO loss
        loss = dpo_loss(model, old_model, tokenizer, batch["prompt"], batch["chosen"], batch["rejected"])
        loss = loss.mean()  # Ensure loss is a scalar

        # Normalize loss by accumulation steps
        loss = loss / gradient_accumulation_steps  
        loss.backward()  # Accumulate gradients

        # Perform optimizer step after accumulation steps
        if (i // batch_size + 1) % gradient_accumulation_steps == 0:
            optimizer.step()  # Update model weights
            optimizer.zero_grad()  # Clear gradients

        # Print loss occasionally
        if i % print_interval == 0:
            print(f"Epoch {epoch}, Iteration {i}: loss - {loss.item()}")

        total_loss += loss.item() * gradient_accumulation_steps  # Rescale loss

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataset):.4f}")

Epoch 0, Iteration 0: loss - 0.043321698904037476
Epoch 0, Iteration 128: loss - 0.0017726789228618145
Epoch 0, Iteration 256: loss - 0.0
Epoch 0, Iteration 384: loss - 0.00018086351337842643
Epoch 0, Iteration 512: loss - 0.0
Epoch 0, Iteration 640: loss - 0.0010004452196881175
Epoch 0, Iteration 768: loss - 1.2535934448242188
Epoch 0, Iteration 896: loss - 0.03670274466276169
Epoch 1, Loss: 0.8186
Epoch 1, Iteration 0: loss - 5.960467319710006e-08
Epoch 1, Iteration 128: loss - 0.01660657487809658
Epoch 1, Iteration 256: loss - 0.11035837233066559
Epoch 1, Iteration 384: loss - 0.006150395609438419
Epoch 1, Iteration 512: loss - 0.0
Epoch 1, Iteration 640: loss - 0.00031707261223345995
Epoch 1, Iteration 768: loss - 1.1229934692382812
Epoch 1, Iteration 896: loss - 0.017250964418053627
Epoch 2, Loss: 0.4754
Epoch 2, Iteration 0: loss - 2.980239344196889e-07
Epoch 2, Iteration 128: loss - 6.162431964185089e-05
Epoch 2, Iteration 256: loss - 0.0
Epoch 2, Iteration 384: loss - 0.0001873

In [7]:
def generate_response(prompt):
    """Generates a response from the fine-tuned model."""
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    output = model.generate(input_ids, max_length=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the fine-tuned model
prompt = test_dataset[0]["prompt"]
preferred = test_dataset[0]["chosen"]
print("Prompt:", prompt)
print("Chosen:", preferred)
print("Generated Response:", generate_response(prompt))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: 

Human: Do you know why turkeys became the official food of thanksgiving?


Chosen: Assistant: To be honest, I don’t know anything about that. I know that I’m meant to know a lot about history and current events, but I haven’t been programmed with those particular facts, sorry.
Generated Response: 

Human: Do you know why turkeys became the official food of thanksgiving?


Turkeys: Because they were the first to eat the food of thanksgiving. They were the first to eat the food of thanksgiving. They were
