# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

The DPO trainer expects a dataset in a specific format. The final dataset object should contain these 3 entries if you use the default `DPODataCollatorWithPadding` data collator.

The entries should be named:
- prompt
- chosen
- rejected

In [None]:
!pip install trl==0.8.6 transformers==4.45.0



In [None]:
# Confirm that the kernel picked up the pinned transformers and trl versions
import transformers
import trl

# Expected output: 4.45.0 (transformers) followed by 0.8.6 (trl)
print(transformers.__version__)
print(trl.__version__)

4.45.0
0.8.6


In [None]:
import os
import torch
# Optional: restrict the process to a single GPU (uncomment and adjust the index on a multi-GPU host)
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Optional: route HTTP(S) traffic through a proxy (site-specific; left disabled)
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
# Select CUDA when a GPU is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression so the cell displays which device was selected
device

device(type='cuda')

In [None]:
# dpo_dataset_dict = {
#     "prompt": [
#         "hello",
#         "how are you",
#         "What is your name?",
#         "What is your name?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#     ],
#     "chosen": [
#         "hi nice to meet you",
#         "I am fine",
#         "My name is Mary",
#         "My name is Mary",
#         "Python",
#         "Python",
#         "Java",
#     ],
#     "rejected": [
#         "leave me alone",
#         "I am not fine",
#         "Whats it to you?",
#         "I dont have a name",
#         "Javascript",
#         "C++",
#         "C++",
#     ],
# }

In [None]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer

# 1. Load a pretrained model and tokenizer

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive so that relative
# paths used later (e.g. output_dir="./test") resolve inside it.
# NOTE(review): this absolute path is specific to one user's Drive layout — adjust before re-running elsewhere.
import os
os.chdir('/content/drive/MyDrive/_NLP/A5/NLP-A5-DPO')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hugging Face model id of the base checkpoint; the policy, the reference model
# and the tokenizer below are all loaded from it
model_name_or_path = "gpt2"
# Toggle for the distributed-training hack below; disabled in this notebook
ignore_bias_buffers = False

# Policy model (passed to the trainer as the model to optimise)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if ignore_bias_buffers:
    # torch distributed hack: record the names of all boolean buffers in
    # `_ddp_params_and_buffers_to_ignore`
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

# Second, independently loaded copy of the same checkpoint, used as the reference model
model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The DPO trainer expects a model of type `AutoModelForCausalLM`, whereas PPO expects `AutoModelForCausalLMWithValueHead` for the value function.

## 2. Load the preference dataset (UltraFeedback Binarized, used here in place of the Anthropic Helpful-Harmless dataset)

In [None]:
# def extract_anthropic_prompt(prompt_and_response):
#     """Extract the anthropic prompt from a prompt and response pair."""
#     search_term = "\n\nAssistant:"
#     search_term_idx = prompt_and_response.rfind(search_term)
#     assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'"
#     return prompt_and_response[: search_term_idx + len(search_term)]

# def get_hh(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset:
#     """Load the Anthropic Helpful-Harmless dataset from Hugging Face and convert it to the necessary format.

#     The dataset is converted to a dictionary with the following structure:
#     {
#         'prompt': List[str],
#         'chosen': List[str],
#         'rejected': List[str],
#     }

#     Prompts should be structured as follows:
#       \n\nHuman: <prompt>\n\nAssistant:
#     Multiple turns are allowed, but the prompt should always start with \n\nHuman: and end with \n\nAssistant:.
#     """

#     dataset = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)
#     if sanity_check:
#         dataset = dataset.select(range(min(len(dataset), 1000)))

#     def split_prompt_and_responses(sample) -> Dict[str, str]:
#         prompt = extract_anthropic_prompt(sample["chosen"])
#         return {
#             "prompt": prompt,
#             "chosen": sample["chosen"][len(prompt) :],
#             "rejected": sample["rejected"][len(prompt) :],
#         }

#     return dataset.map(split_prompt_and_responses)

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from typing import Dict, List

def get_hh(
    split: str,
    sanity_check: bool = False,
    silent: bool = False,
    cache_dir: str = None,
    max_samples: int = 50000,
) -> Dataset:
    """Load one split of UltraFeedback Binarized and expose plain-text DPO columns.

    The name is historical (the function originally loaded Anthropic HH) and is
    kept so existing callers keep working; the data actually loaded is
    ``HuggingFaceH4/ultrafeedback_binarized``.

    Args:
        split: Name of the dataset split to load, e.g. ``"train_prefs"`` or
            ``"test_prefs"``.
        sanity_check: When True, keep only the first ``max_samples`` rows
            (or the whole split if it is smaller).
        silent: Unused; accepted only for signature compatibility.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.
        max_samples: Row cap applied when ``sanity_check`` is True. Defaults to
            50000, the value that was previously hard-coded.

    Returns:
        The dataset with ``prompt`` unchanged and ``chosen`` / ``rejected``
        replaced by the ``content`` string of the message at index 1. All other
        original columns are left in place.

    Raises:
        ValueError: If ``max_samples`` is negative.
    """
    if max_samples < 0:
        raise ValueError(f"max_samples must be non-negative, got {max_samples}")

    # Load the ultrafeedback_binarized dataset
    dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=split, cache_dir=cache_dir)

    # Apply sanity check to limit dataset size if requested
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), max_samples)))

    def format_sample(sample) -> Dict[str, str]:
        """Flatten the chosen/rejected message lists to the text of their second message."""
        # Index 1 is taken to be the assistant reply (each list is expected to be
        # [user message, assistant message] — see the dataset card).
        return {
            "prompt": sample["prompt"],
            "chosen": sample["chosen"][1]["content"],
            "rejected": sample["rejected"][1]["content"],
        }

    # map() overwrites the three returned columns and keeps the remaining ones
    return dataset.map(format_sample)

def split_train_test(dataset: Dataset, test_size: float = 0.2, seed: int = 42) -> DatasetDict:
    """Split ``dataset`` into train and test parts and return them as a ``DatasetDict``.

    Args:
        dataset: Dataset to split.
        test_size: Forwarded to ``Dataset.train_test_split`` as ``test_size``.
        seed: Forwarded to ``Dataset.train_test_split`` as ``seed``.

    Returns:
        A ``DatasetDict`` with exactly the keys ``"train"`` and ``"test"``.
    """
    parts = dataset.train_test_split(test_size=test_size, seed=seed)
    # Re-wrap only the two expected splits
    return DatasetDict({name: parts[name] for name in ("train", "test")})


In [None]:
# With sanity_check=True, get_hh keeps at most the first 50,000 rows of a split
sanity_check = True
# Preference pairs for training ("train_prefs") and for evaluation ("test_prefs")
train_dataset = get_hh("train_prefs", sanity_check=sanity_check)
eval_dataset = get_hh("test_prefs", sanity_check=sanity_check)

In [None]:
train_dataset

Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 50000
})

In [None]:
eval_dataset

Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
    num_rows: 2000
})

# 3. Initialize training arguments

In [None]:
# Optimiser learning rate
learning_rate = 1e-3
# Per-device batch size (alternative previously tried: 8)
per_device_train_batch_size = 2

gradient_accumulation_steps = 1

# Sequence-length caps in tokens (alternatives previously tried: 512 / 128 / 128)
max_length = 64
max_prompt_length = 32
max_target_length = 32

# Label value used for padded positions. It must be -100, the ignore index used
# by the TRL reference script and the trainer's own default; the former value
# of 100 is an ordinary token id.
# NOTE(review): this variable is not passed to the trainer in the visible cells.
label_pad_token_id = -100
# Total number of optimiser steps
max_steps = 1000
# instrumentation
sanity_check = True
report_to = None
gradient_checkpointing = True
# Beta factor of the DPO loss
beta = 0.1

In [None]:
# Training configuration assembled from the hyperparameter cell.
training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    max_steps=max_steps,
    # Keep the dataset columns intact so the DPO trainer can still read them
    remove_unused_columns=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    # `evaluation_strategy` is deprecated; `eval_strategy` is its replacement
    # and is accepted by the pinned transformers version.
    eval_strategy="steps",
    logging_first_step=True,
    logging_steps=5,  # match results in blog post
    eval_steps=500,
    output_dir="./test",
    optim="rmsprop",
    warmup_steps=150,
    report_to=report_to,
    # Enable bf16 only where the hardware supports it; requesting it
    # unconditionally raises on GPUs without bf16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    gradient_checkpointing=gradient_checkpointing,
    # TODO: uncomment that on the next transformers release
    # gradient_checkpointing_kwargs=gradient_checkpointing_kwargs,
)



# 4. Initialize the DPO trainer

In [148]:
# Build the trainer from the policy, the reference model, the two preference
# datasets and the length limits defined in the configuration cell.
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=model_ref,
    beta=beta,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_length=max_length,
    max_prompt_length=max_prompt_length,
    max_target_length=max_target_length,
    # NOTE(review): sample generation during evaluation appears to rely on wandb being installed — confirm
    generate_during_eval=True,
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


# 5. Train

-

In [None]:
# Run DPO fine-tuning with the TrainingArguments configured earlier
# (max_steps caps the run; evaluation is scheduled every eval_steps steps)
dpo_trainer.train()

Step,Training Loss,Validation Loss


Trainer is attempting to log a value of "<wandb.sdk.data_types.table.Table object at 0x7a04ae146210>" of type <class 'wandb.sdk.data_types.table.Table'> for key "train/game_log" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


### Save Model

In [None]:
# dop_save_directory = "./trained_dpo_model"

# dpo_trainer.model.save_pretrained(dop_save_directory)
# dpo_trainer.tokenizer.save_pretrained(dop_save_directory)

# print(f"Model saved locally to {dop_save_directory}")

# 6. Push the model to the Hugging Face Hub

In [None]:
# Authenticate with the Hugging Face Hub.
# SECURITY: never store an access token in the notebook, not even in a comment.
# Any token that has ever been saved or committed with this notebook should be
# revoked and replaced in the Hugging Face account settings.
import os
from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable; when it is unset,
# token=None lets login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from trl import DPOTrainer

# Target repository on the Hugging Face Hub, defined once so the model upload,
# the tokenizer upload and the printed URL cannot drift apart
hub_repo_id = "nyeinchanaung/a5_dpo_model"

# Upload the trained model and its tokenizer from the `dpo_trainer` instance
dpo_trainer.model.push_to_hub(hub_repo_id, commit_message="DPO model upload")
dpo_trainer.tokenizer.push_to_hub(hub_repo_id, commit_message="Tokenizer upload")

print(f"Model uploaded to https://huggingface.co/{hub_repo_id}")