# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

Therefore the final dataset object should contain these 3 entries if you use the default DPODataCollatorWithPadding data collator.

The entries should be named:
- prompt
- chosen
- rejected

In [1]:
!pip install trl==0.8.6 transformers==4.45.0



In [2]:
# check transformer and trl version
import transformers
import trl

# should be trl==0.8.6 transformers==4.45.0
print(transformers.__version__)
print(trl.__version__)

4.45.0
0.8.6


In [3]:
import os
import torch
# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# dpo_dataset_dict = {
#     "prompt": [
#         "hello",
#         "how are you",
#         "What is your name?",
#         "What is your name?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#         "Which is the best programming language?",
#     ],
#     "chosen": [
#         "hi nice to meet you",
#         "I am fine",
#         "My name is Mary",
#         "My name is Mary",
#         "Python",
#         "Python",
#         "Java",
#     ],
#     "rejected": [
#         "leave me alone",
#         "I am not fine",
#         "Whats it to you?",
#         "I dont have a name",
#         "Javascript",
#         "C++",
#         "C++",
#     ],
# }

In [5]:
# !pip install bitsandbytes

In [6]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    BitsAndBytesConfig
)

from typing import Dict, Optional
from trl import DPOTrainer

# 1. load a pretrained model and tokenizer

In [7]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/_NLP/A5/NLP-A5-DPO')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Load model and tokenizer
model_name = "Qwen/Qwen2-1.5B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # Native BF16
    device_map="auto"  # Auto-map to GPU
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The DPO trainer expects a model of AutoModelForCausalLM, compared to PPO that expects AutoModelForCausalLMWithValueHead for the value function.

## 2. Load the Anthropic Helpful-Harmless dataset

In [9]:
# def extract_anthropic_prompt(prompt_and_response):
#     """Extract the anthropic prompt from a prompt and response pair."""
#     search_term = "\n\nAssistant:"
#     search_term_idx = prompt_and_response.rfind(search_term)
#     assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'"
#     return prompt_and_response[: search_term_idx + len(search_term)]

# def get_hh(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset:
#     """Load the Anthropic Helpful-Harmless dataset from Hugging Face and convert it to the necessary format.

#     The dataset is converted to a dictionary with the following structure:
#     {
#         'prompt': List[str],
#         'chosen': List[str],
#         'rejected': List[str],
#     }

#     Prompts should be structured as follows:
#       \n\nHuman: <prompt>\n\nAssistant:
#     Multiple turns are allowed, but the prompt should always start with \n\nHuman: and end with \n\nAssistant:.
#     """

#     dataset = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)
#     if sanity_check:
#         dataset = dataset.select(range(min(len(dataset), 1000)))

#     def split_prompt_and_responses(sample) -> Dict[str, str]:
#         prompt = extract_anthropic_prompt(sample["chosen"])
#         return {
#             "prompt": prompt,
#             "chosen": sample["chosen"][len(prompt) :],
#             "rejected": sample["rejected"][len(prompt) :],
#         }

#     return dataset.map(split_prompt_and_responses)

In [20]:
from datasets import load_dataset, DatasetDict
from typing import Dict, List

def prepare_dataset(split: str = "train_prefs", test_size: float = 0.2, seed: int = 42, limit: int = None) -> DatasetDict:
    dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=split)

    def format_sample(example: Dict) -> Dict:
        # Handle None or empty chosen/rejected lists
        chosen_list = example["chosen"] if example["chosen"] is not None else []
        rejected_list = example["rejected"] if example["rejected"] is not None else []

        # Extract content with fallbacks
        chosen_content = next(
            (item["content"] for item in chosen_list if item.get("role") == "assistant" and item.get("content") is not None),
            next((item["content"] for item in chosen_list if item.get("content") is not None), "") if chosen_list else ""
        )
        rejected_content = " ".join(
            item["content"] for item in rejected_list if item.get("role") == "assistant" and item.get("content") is not None
        ) or (next((item["content"] for item in rejected_list if item.get("content") is not None), "") if rejected_list else "")

        # Ensure all fields are valid strings
        prompt = example["prompt"] if example["prompt"] is not None else ""
        chosen = chosen_content if chosen_content is not None else ""
        rejected = rejected_content if rejected_content is not None else ""

        return {
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected
        }

    # Apply formatting and filter
    formatted_dataset = dataset.map(format_sample)
    formatted_dataset = formatted_dataset.filter(
        lambda x: x["prompt"] is not None and x["chosen"] is not None and x["rejected"] is not None and
                  x["prompt"] != "" and x["chosen"] != "" and x["rejected"] != ""
    )

    if limit:
        formatted_dataset = formatted_dataset.select(range(min(limit, len(formatted_dataset))))

    train_test_split = formatted_dataset.train_test_split(test_size=test_size, seed=seed)
    return DatasetDict({
        "train": train_test_split["train"],
        "test": train_test_split["test"]
    })

In [21]:
dataset_dict = prepare_dataset(split="train_prefs", test_size=0.2, seed=42)

# Access train and eval datasets
train_dataset = dataset_dict["train"]
eval_dataset = dataset_dict["test"]

# Verify
print("Train dataset size:", len(train_dataset))
print("Eval dataset size:", len(eval_dataset))
print("First train example:", train_dataset[0])



Map:   0%|          | 0/61135 [00:00<?, ? examples/s]

Filter:   0%|          | 0/61135 [00:00<?, ? examples/s]

Train dataset size: 48892
Eval dataset size: 12223
First train example: {'prompt': "How did Neil Young's collaborations with musicians from different genres and backgrounds impact his music?", 'prompt_id': '08ad82a121278baf47d8ed3851e061a3bf302528fc986593cd10f174e6f010f3', 'chosen': 'Neil Young\'s collaborations with musicians from different genres and backgrounds have had a significant impact on his music throughout his long and storied career. These collaborations have not only influenced his sound but have also helped him to explore new musical territories and expand his musical horizons.\n\nOne of the most notable examples of Neil Young\'s collaborations is his work with the band Crazy Horse. This partnership has resulted in some of his most iconic and powerful rock songs, including "Cinnamon Girl," "Down by the River," and "Hey Hey, My My (Into the Black)." The raw, electric sound of Crazy Horse has been a major influence on Young\'s music, and their partnership has helped him to 

In [22]:
# Debug: Check for None or empty values
for i, example in enumerate(train_dataset):
    if any(v is None or v == "" for v in [example["prompt"], example["chosen"], example["rejected"]]):
        print(f"Invalid entry at train index {i}: {example}")
        break
else:
    print("No invalid entries in train_dataset")

for i, example in enumerate(eval_dataset):
    if any(v is None or v == "" for v in [example["prompt"], example["chosen"], example["rejected"]]):
        print(f"Invalid entry at eval index {i}: {example}")
        break
else:
    print("No invalid entries in eval_dataset")

No invalid entries in train_dataset
No invalid entries in eval_dataset


In [12]:
# # Optional: Convert to dictionary format
# dpo_dataset_dict = {
#     "prompt": train_dataset["prompt"],
#     "chosen": train_dataset["chosen"],
#     "rejected": train_dataset["rejected"]
# }

In [13]:
# print("\nDictionary format sample:")
# print("Prompt:", dpo_dataset_dict["prompt"][0])
# print("Chosen:", dpo_dataset_dict["chosen"][0])
# print("Rejected:", dpo_dataset_dict["rejected"][0])

# 3. initialize training arguments:

In [14]:
# learning_rate = 1e-3
# #per_device_train_batch_size = 8
# per_device_train_batch_size = 2

# gradient_accumulation_steps = 1

# #max_length= 512
# max_length= 64

# # max_prompt_length = 128
# # max_target_length =128
# max_prompt_length = 32
# max_target_length =32

# label_pad_token_id = 100
# #max_steps = 1000
# max_steps = 1000
# # instrumentation
# sanity_check = True
# report_to = None
# gradient_checkpointing = True
# beta = 0.1

In [23]:
training_args = TrainingArguments(
    output_dir="./qwen2_dpo_output",
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Smallest possible batch size
    gradient_accumulation_steps=2,  # Effective batch size = 2
    learning_rate=5e-5,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=1,
    bf16=True,  # T4 supports BF16 natively
    gradient_checkpointing=True,  # Memory-efficient
    remove_unused_columns=False,
    optim="adamw_torch",
    warmup_steps=50,
    report_to="none",
    per_device_eval_batch_size=1,  # Low eval batch size
    max_grad_norm=0.5,  # Gradient clipping
    # Reduce sequence lengths further to save memory
    # These are passed to DPOTrainer, not TrainingArguments
)



# 4. initialize the DPO trainer

In [24]:
# Initialize DPOTrainer
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    beta=0.1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_length=32,  # Reduced from 64
    max_prompt_length=16,  # Reduced from 32
    max_target_length=16,  # Reduced from 32
    generate_during_eval=True,
)



Map:   0%|          | 0/48892 [00:00<?, ? examples/s]

Map:   0%|          | 0/12223 [00:00<?, ? examples/s]

# 5. Train

-

In [25]:
dpo_trainer.train()

TypeError: 'NoneType' object cannot be interpreted as an integer

### Save Model

In [None]:
# dop_save_directory = "./trained_dpo_model"

# dpo_trainer.model.save_pretrained(dop_save_directory)
# dpo_trainer.tokenizer.save_pretrained(dop_save_directory)

# print(f"Model saved locally to {dop_save_directory}")

# 6.  Pushing the Model to Hugging Face Hub

In [None]:
# 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
from huggingface_hub import login
login(token="f0048382317eafb2d30d9f8319dc825416765a07") # remove due to github limitation

In [None]:
from trl import DPOTrainer

# Assuming `trainer` is your DPOTrainer instance
dpo_trainer.model.push_to_hub("nyeinchanaung/a5_dpo_qwen2", commit_message="DPO model upload")
dpo_trainer.tokenizer.push_to_hub("nyeinchanaung/a5_dpo_qwen2", commit_message="Tokenizer upload")

print(f"Model uploaded to https://huggingface.co/nyeinchanaung/a5_dpo_qwen2")