# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code 
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

Therefore the final dataset object should contain these 3 entries if you use the default DPODataCollatorWithPadding data collator. 

The entries should be named:
- prompt
- chosen
- rejected

In [1]:
import os
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments,
)

from typing import Dict, Optional
from trl import DPOTrainer, DPOConfig

In [2]:
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# 1. Load and preprocess the dataset

In [None]:
def extract_prompt(prompt_and_response):
    """Extract the prompt from a prompt and response pair."""
    search_term = "\nAssistant:" # Define the search term to locate the Assistant's response
    search_term_idx = prompt_and_response.rfind(search_term) # Find the last occurrence of the search term in the text
    assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'" # Ensure the search term is found
    return prompt_and_response[: search_term_idx + len(search_term)]

def get_rm_static(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset:
    """Load the Dahoas/rm-static dataset and preprocess it."""
    dataset = load_dataset("Dahoas/rm-static", split=split, cache_dir=cache_dir)
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 1000)))

    def split_prompt_and_responses(sample) -> Dict[str, str]:
        if "prompt" not in sample:
            raise ValueError("Sample does not contain 'prompt' key.")
        prompt = extract_prompt(sample["prompt"])
        return {
            "prompt": prompt,
            "chosen": sample["chosen"],
            "rejected": sample["rejected"],
        }

    # Apply the transformation
    dataset = dataset.map(split_prompt_and_responses)

    # Verify the dataset has the correct columns
    if not all(col in dataset.column_names for col in ["prompt", "chosen", "rejected"]):
        raise ValueError("Dataset is missing required columns after transformation.")

    # Remove the 'response' column
    if 'response' in dataset.column_names:
        dataset = dataset.remove_columns('response')

    # Filter out invalid samples
    dataset = dataset.filter(lambda x: all(key in x for key in ['prompt', 'chosen', 'rejected']))

    return dataset

In [4]:
sanity_check = True
train_dataset = get_rm_static("train", sanity_check=sanity_check)
eval_dataset = get_rm_static("test", sanity_check=sanity_check)

In [5]:
train_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [6]:
print("Train dataset example:")
print(train_dataset[0])

Train dataset example:
{'prompt': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:', 'chosen': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.', 'rejected': ' Yes, you can spray it directly onto the cloth.'}


In [7]:
eval_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [8]:
print("Eval dataset example:")
print(eval_dataset[0])

Eval dataset example:
{'prompt': "\n\nHuman: I am trying to write a fairy tale. What is the most popular plot?\n\nAssistant: The most popular plot might be “a princess goes to a faraway land, falls in love, and has a magic genie that grants her wishes”.  We can find plenty of examples of this if we search for fairy tales using the search engine Google.\n\nHuman: You can't look anything up on google.\n\nAssistant: OK, so let’s think about some other popular fairy tales.\n\nHuman: Do you like the plot of Hansel and Gretel?\n\nAssistant: Not particularly.\n\nHuman: what don't you like?\n\nAssistant: I feel like the children do very stupid things.\n\nHuman: Should I include a witch on my story?\n\nAssistant: I think a witch is a fairly common fairy tale element, so yes.  However, what kind of plotline would you like to pursue?\n\nHuman: I was thinking I might pursue a feminist plot for a more modern fairytale.\n\nAssistant: I would be happy to help you make your story a more feminist one. 

# 2. Initialize model and tokenizer

In [None]:
model_name_or_path = "gpt2"
ignore_bias_buffers = False

model = AutoModelForCausalLM.from_pretrained(model_name_or_path) # Load the model from pre-trained weights
if ignore_bias_buffers: # Ignore boolean buffers (e.g., bias) during distributed training
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path) # Load a reference copy of the model for comparison
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) # Load the tokenizer for GPT-2
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token # Set the padding token to the EOS token if not already defined

# 3. Initialize training arguments

In [None]:
learning_rate = 1e-3
per_device_train_batch_size = 4 # Reduced batch size for memory efficiency
gradient_accumulation_steps = 4 # Gradients accumulate over 4 steps
max_length= 512 
max_prompt_length = 128 # for input
max_target_length =128 # for output
label_pad_token_id = 100 # Padding token ID used for labels during training
max_steps = 400 # Maximum number of training steps
# instrumentation
sanity_check = True
report_to = None # Specify where to report metrics, if any
gradient_checkpointing = False # Whether to use gradient checkpointing to save memory
beta = 0.1 # Hyperparameter for DPO (direct preference optimization)

In [None]:
training_args = DPOConfig(
    per_device_train_batch_size=per_device_train_batch_size,
    max_steps=max_steps,
    remove_unused_columns=False, # Keep all columns
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    eval_strategy="steps",
    logging_first_step=True,
    logging_steps=5,  # match results in blog post; log every 5 steps
    eval_steps=500,
    output_dir="./dpo_model", # Directory to save the model and outputs
    optim="rmsprop",
    warmup_steps=150, # Warmup for the first 150 steps
    report_to=report_to, # Reporting destination
    bf16=False,
    gradient_checkpointing=gradient_checkpointing,
    # TODO: uncomment that on the next transformers release
    # gradient_checkpointing_kwargs=gradient_checkpointing_kwargs,
)

# 4. Initialize the DPO trainer

In [12]:
# !pip install trl==0.12
# !pip install transformers==4.46

In [13]:
dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args = training_args,
    beta = beta,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    max_length = max_length,
    max_target_length = max_target_length,
    max_prompt_length = max_prompt_length,
    generate_during_eval = True,
)


Deprecated positional argument(s) used in DPOTrainer, please use the DPOConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


# 5. Training

In [14]:
dpo_trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mprapatsorn-along[0m ([33mprapatsorn-along-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=400, training_loss=3.1107826730229133, metrics={'train_runtime': 430.4389, 'train_samples_per_second': 14.869, 'train_steps_per_second': 0.929, 'total_flos': 0.0, 'train_loss': 3.1107826730229133, 'epoch': 6.4})

# 6. Save and push the model to Hugging Face Hub

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer from checkpoint-400
model = AutoModelForCausalLM.from_pretrained("./dpo_model/checkpoint-400")
tokenizer = AutoTokenizer.from_pretrained("./dpo_model/checkpoint-400")



In [None]:
# from huggingface_hub import login

# # Replace with your token
# login(token="")

In [None]:
# Hugging Face repository information
huggingface_username = "prapatsorn456" 
model_name = "dahoas-rm-static-dpo-gpt2"  # Model name for the Hugging Face repository
repo_name = f"{huggingface_username}/{model_name}"

# Push the model and tokenizer to Hugging Face Hub
print(f"Pushing model and tokenizer to Hugging Face Hub as '{repo_name}'")

# Make sure the repository is public by setting `private=False`
model.push_to_hub(repo_name, private=False)
tokenizer.push_to_hub(repo_name, private=False)

# Print the link
model_url = f"https://huggingface.co/{repo_name}"
print(f"Model and tokenizer successfully pushed to Hugging Face Hub: {model_url}")

Pushing model and tokenizer to Hugging Face Hub as 'prapatsorn456/dahoas-rm-static-dpo-gpt2'


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model and tokenizer successfully pushed to Hugging Face Hub: https://huggingface.co/prapatsorn456/dahoas-rm-static-dpo-gpt2
