# Optimization Human Preference[Direct Preference Optimization:(DPO)]

### Reference Code
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

The entries name are:
- prompt
- chosen
- rejected

In [1]:
pip install datasets



In [2]:
!pip cache purge
!pip install --no-cache-dir transformers==4.45.0 trl==0.8.6


[0mFiles removed: 0


In [3]:
import os
import torch
# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
dpo_dataset_dict = {
    "prompt": [
        "hello",
        "how are you",
        "What is your name?",
        "What is your name?",
        "Which is the best programming language?",
        "Which is the best programming language?",
        "Which is the best programming language?",
    ],
    "chosen": [
        "hi nice to meet you",
        "I am fine",
        "My name is Mary",
        "My name is Mary",
        "Python",
        "Python",
        "Java",
    ],
    "rejected": [
        "leave me alone",
        "I am not fine",
        "Whats it to you?",
        "I dont have a name",
        "Javascript",
        "C++",
        "C++",
    ],
}

In [5]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer

# load a pretrained model and tokenizer

In [6]:
model_name_or_path = "gpt2"
ignore_bias_buffers = False

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if ignore_bias_buffers:
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The DPO trainer expects a model of AutoModelForCausalLM, compared to PPO that expects AutoModelForCausalLMWithValueHead for the value function.

## Load the Anthropic Helpful-Harmless dataset

In [7]:
from datasets import load_dataset

ds = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")
print(ds)


DatasetDict({
    train: Dataset({
        features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
        num_rows: 60917
    })
})


In [8]:
from datasets import load_dataset
from typing import Dict
from sklearn.model_selection import train_test_split
from datasets import Dataset

def extract_text(content):
    """Extracts text from a list of dictionaries or returns the string as-is."""
    if isinstance(content, list):
        return " ".join([c.get("content", "") for c in content]).strip()
    if isinstance(content, dict):
        return content.get("content", "").strip()
    return str(content).strip()

def remove_prompt_from_response(prompt: str, response: str) -> str:
    """Removes the prompt from the beginning of the response, if present."""
    if response.startswith(prompt):
        return response[len(prompt):].strip()  # Remove prompt from response
    return response.strip()  # Return as-is if no match

def load_and_process_dataset():
    """Load and process the dataset, ensuring correct extraction of prompt, chosen, and rejected."""

    dataset = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned", split="train")

    def split_prompt_and_responses(sample) -> Dict[str, str]:
        # Use the 'prompt' column directly (already a clean string)
        prompt_text = str(sample["prompt"]).strip()

        # Extract chosen and rejected responses
        chosen_text = extract_text(sample["chosen"])
        rejected_text = extract_text(sample["rejected"])

        # Remove duplicated prompt from responses
        chosen_text_cleaned = remove_prompt_from_response(prompt_text, chosen_text)
        rejected_text_cleaned = remove_prompt_from_response(prompt_text, rejected_text)

        return {
            "prompt": prompt_text,
            "chosen": chosen_text_cleaned,
            "rejected": rejected_text_cleaned,
        }

    # Process dataset and remove unused columns
    dataset = dataset.map(split_prompt_and_responses)
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["prompt", "chosen", "rejected"]])

    return dataset

# Load processed dataset
dataset = load_and_process_dataset()

# Convert to Pandas DataFrame for splitting
df = dataset.to_pandas()

# Split into train (80%) and test (20%)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert back to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Print dataset info
print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

# Verify first 3 samples
for i in range(2):
    print(f"Sample {i+1}:")
    print(f"Prompt:\n{train_dataset[i]['prompt']}\n")
    print(f"Chosen Response:\n{train_dataset[i]['chosen']}\n")
    print(f"Rejected Response:\n{train_dataset[i]['rejected']}\n")
    print("="*80)


Train size: 48733, Test size: 12184
Sample 1:
Prompt:
Please provide an updated report on the current state of air pollution in urban areas of the Philippines under Aquino's governance.

Chosen Response:
I can provide you with the most accurate information available until aquino's governance. during aquino's governance, air pollution in urban areas of the philippines was a significant issue. the main sources of air pollution were transportation, industrial activities, and residential burning of solid waste. according to the world health organization, in 2015, manila, the capital city of the philippines, was ranked as one of the most polluted cities in the world. the air quality index (aqi) in urban areas often exceeded the allowed maximum levels, causing respiratory illnesses and other health problems.

despite having introduced several measures to reduce air pollution, such as the clean air act of 1999 and the establishment of the national clean air program, aquino's governance failed

In [10]:
# sanity_check = True
# train_dataset = get_hh("train", sanity_check=sanity_check)
# eval_dataset = get_hh("test", sanity_check=sanity_check)

In [11]:
train_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected', '__index_level_0__'],
    num_rows: 48733
})

In [12]:
test_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected', '__index_level_0__'],
    num_rows: 12184
})

In [13]:
# Convert back to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Ensure unwanted columns are removed
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in ["prompt", "chosen", "rejected"]])
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names if col not in ["prompt", "chosen", "rejected"]])

# Print dataset info to confirm fix
print(train_dataset)
print(test_dataset)


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 48733
})
Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 12184
})


In [14]:
# Convert the original processed dataset to a dictionary
original_dict = dataset.to_dict()

# Print first 3 samples from the original dataset
for i in range(min(3, len(original_dict["prompt"]))):
    print(f">>>>>>>>Prompt: {original_dict['prompt'][i]}")
    print(f">>>>>>>>?Chosen: {original_dict['chosen'][i]}")
    print(f">>>>>>>>Rejected: {original_dict['rejected'][i]}")
    print("=" * 50)



>>>>>>>>Prompt: Can you write a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea? Here's some starter code to help you out:
#include <iostream>
#include <string>
using namespace std;
int main() {
    string country;
    // prompt user for input
    cout << "Enter the name of a country: ";
    cin >> country;
    // check if country borders the Mediterranean Sea
    // [C++ code]
    return 0;
}
>>>>>>>>?Chosen: Here's a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea:

#include <iostream>
#include <string>
#include <set>
#include <map>
#include <algorithm>

using namespace std;

int main() {
    // store countries and their bordering seas in a map
    map<string, set<string>> countries;
    countries["Algeria"] = {"Mediterranean Sea", "North African Coast"};
    countries["France"] = {"Mediterranean Sea", "English Channel"};
    countries["Italy"] = {"Mediterra

In [15]:
# Convert the train dataset to a dictionary
train_dict = train_dataset.to_dict()

# Print first 3 samples
for i in range(min(3, len(train_dict["prompt"]))):
    print(f">>>>Prompt: {train_dict['prompt'][i]}")
    print(f">>>>Chosen: {train_dict['chosen'][i]}")
    print(f">>>>Rejected: {train_dict['rejected'][i]}")
    print("=" * 50)


>>>>Prompt: Please provide an updated report on the current state of air pollution in urban areas of the Philippines under Aquino's governance.
>>>>Chosen: I can provide you with the most accurate information available until aquino's governance. during aquino's governance, air pollution in urban areas of the philippines was a significant issue. the main sources of air pollution were transportation, industrial activities, and residential burning of solid waste. according to the world health organization, in 2015, manila, the capital city of the philippines, was ranked as one of the most polluted cities in the world. the air quality index (aqi) in urban areas often exceeded the allowed maximum levels, causing respiratory illnesses and other health problems.

despite having introduced several measures to reduce air pollution, such as the clean air act of 1999 and the establishment of the national clean air program, aquino's governance failed to significantly reduce the levels of air pollu

# initialize training arguments:

In [None]:
# learning_rate = 1e-3
# per_device_train_batch_size = 4 #8
# gradient_accumulation_steps = 1
# max_length= 128 #512
# max_prompt_length = 64 #128
# max_target_length = 64 #128
# label_pad_token_id = 100
# max_steps = 1000 #1000
# # instrumentation
# sanity_check = True
# report_to = None
# gradient_checkpointing = None
# beta = 0.1

In [None]:

# training_args = TrainingArguments(
#     per_device_train_batch_size=per_device_train_batch_size,
#     max_steps=max_steps,
#     remove_unused_columns=False,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     learning_rate=learning_rate,
#     evaluation_strategy="steps",
#     logging_first_step=True,
#     logging_steps=5,  # match results in blog post
#     eval_steps=500,
#     output_dir="./A5_DPO",
#     optim="rmsprop",
#     warmup_steps=150,
#     report_to=report_to,
#     bf16=True,
#     gradient_checkpointing=gradient_checkpointing,
#     # TODO: uncomment that on the next transformers release
#     # gradient_checkpointing_kwargs=gradient_checkpointing_kwargs,
# )



# initialize the DPO trainer

In [17]:

# dpo_trainer = DPOTrainer(
#     model,
#     model_ref,
#     args=training_args,
#     beta=beta,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
#     max_length=max_length,
#     max_target_length=max_target_length,
#     max_prompt_length=max_prompt_length,
#     generate_during_eval=True,
# )

In [20]:
import torch
import os
import json
from transformers import TrainingArguments
from trl import DPOTrainer



In [26]:
import os

# Define base path for saving files
base_path = "/content/drive/MyDrive/Colab_Notebooks/A5"

# Ensure output directories exist
os.makedirs(os.path.join(base_path, "saved_models"), exist_ok=True)
os.makedirs(os.path.join(base_path, "saved_logs"), exist_ok=True)

# Define hyperparameter configurations for testing
hyperparameter_configs = [
    {"learning_rate": 5e-4, "batch_size": 4, "beta": 0.1, "optimizer": "adamw_torch"},
    {"learning_rate": 1e-3, "batch_size": 8, "beta": 0.2, "optimizer": "rmsprop"},
    {"learning_rate": 2e-3, "batch_size": 8, "beta": 0.3, "optimizer": "adamw_torch"},
]

# Define max_steps for training (controls how long training runs)
max_steps = 500


In [27]:
# Track best model performance
best_model = None
best_eval_loss = float("inf")
best_config = None

# Store results for logging
training_results = []


# Train

for config in hyperparameter_configs:
    print(f"\n Training with config: {config}")

    # Set up training arguments
    training_args = TrainingArguments(
        per_device_train_batch_size=config["batch_size"],
        max_steps=max_steps,
        remove_unused_columns=False,
        gradient_accumulation_steps=1,
        learning_rate=config["learning_rate"],
        evaluation_strategy="steps",
        logging_first_step=True,
        logging_steps=5,  # Log every 5 steps
        eval_steps=500,    # Evaluate every 500 steps
        output_dir=os.path.join(base_path, f"A5_DPO_{config['learning_rate']}_{config['batch_size']}"),
        optim=config["optimizer"],
        warmup_steps=150,
        report_to=None,
        bf16=torch.cuda.is_available(),  # Use bf16 if GPU supports it
        gradient_checkpointing=None,
    )

    # Initialize DPOTrainer
    dpo_trainer = DPOTrainer(
        model,
        model_ref,
        args=training_args,
        beta=config["beta"],
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        max_length=128,  # Adjust max sequence length
        max_target_length=64,
        max_prompt_length=64,
        generate_during_eval=True,
    )

    # Train model
    train_result = dpo_trainer.train()
    #165e1c569ee88c1aeab0a4f5aae6f7b7da27c76b


In [29]:
    # Evaluate model
    eval_metrics = dpo_trainer.evaluate()
    print(f"📊 Evaluation metrics: {eval_metrics}")

    # Save results
    config_result = {
        "config": config,
        "eval_metrics": eval_metrics,
    }
    training_results.append(config_result)

    # Track the best model based on evaluation loss
    if eval_metrics["eval_loss"] < best_eval_loss:
        best_eval_loss = eval_metrics["eval_loss"]
        best_model = dpo_trainer.model
        best_config = config

    # Save model for this run
    model_save_path = os.path.join(base_path, "saved_models", f"dpo_model_{config['learning_rate']}_{config['batch_size']}")
    log_save_path = os.path.join(base_path, "saved_logs", "training_results.json")
    dpo_trainer.save_model(model_save_path)
    print(f"Model saved to {model_save_path}")


Trainer is attempting to log a value of "<wandb.sdk.data_types.table.Table object at 0x7dd8e45c9410>" of type <class 'wandb.sdk.data_types.table.Table'> for key "train/game_log" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


📊 Evaluation metrics: {'eval_loss': 7.654656410217285, 'eval_runtime': 142.723, 'eval_samples_per_second': 85.368, 'eval_steps_per_second': 10.671, 'eval_rewards/chosen': -94.2978286743164, 'eval_rewards/rejected': -95.195068359375, 'eval_rewards/accuracies': 0.5385751724243164, 'eval_rewards/margins': 0.8972288966178894, 'eval_logps/rejected': -476.4890441894531, 'eval_logps/chosen': -479.5237121582031, 'eval_logits/rejected': -13.32876205444336, 'eval_logits/chosen': -13.282124519348145, 'epoch': 0.08207485226526592}
Model saved to /content/drive/MyDrive/Colab_Notebooks/A5/saved_models/dpo_model_0.002_8


In [30]:
# Save all training results
with open(log_save_path, "w") as f:
    json.dump(training_results, f, indent=4)

print(f"\n Training results saved to {log_save_path}")



 Training results saved to /content/drive/MyDrive/Colab_Notebooks/A5/saved_logs/training_results.json


In [31]:
# Save the best-performing model
if best_model:
    best_model_path = os.path.join(base_path, "saved_models", "dpo_best_model")
    best_model.save_pretrained(best_model_path)
    print(f"\n Best model saved at {best_model_path} with config: {best_config}")

# Save best config settings
best_config_path = os.path.join(base_path, "saved_logs", "best_config.json")
with open(best_config_path, "w") as f:
    json.dump(best_config, f, indent=4)

print(f"\n📂 Best model config saved to {best_config_path}")



 Best model saved at /content/drive/MyDrive/Colab_Notebooks/A5/saved_models/dpo_best_model with config: {'learning_rate': 0.002, 'batch_size': 8, 'beta': 0.3, 'optimizer': 'adamw_torch'}

📂 Best model config saved to /content/drive/MyDrive/Colab_Notebooks/A5/saved_logs/best_config.json


In [None]:
# # 165e1c569ee88c1aeab0a4f5aae6f7b7da27c76b

# # Now train your model
# dpo_trainer.train()



Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
500,2.3359,2.013917,-15.25818,-14.933399,0.482026,-0.324781,-245.629364,-250.116531,2.566696,2.52175


Trainer is attempting to log a value of "<wandb.sdk.data_types.table.Table object at 0x7d7e8d1de610>" of type <class 'wandb.sdk.data_types.table.Table'> for key "train/game_log" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=500, training_loss=2.2247859020233154, metrics={'train_runtime': 117.6121, 'train_samples_per_second': 8.503, 'train_steps_per_second': 4.251, 'total_flos': 0.0, 'train_loss': 2.2247859020233154, 'epoch': 0.02051955513604465})

## Testing


In [25]:
# Set model to evaluation mode
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [26]:
def generate_response(prompt, max_tokens=100):
    """
    Generates a response from the fine-tuned model for a given prompt.

    Parameters:
    - prompt (str): The input text for the model.
    - max_tokens (int): The maximum number of new tokens to generate.

    Returns:
    - response (str): The model-generated response.
    """
    try:
        # Format input as a structured dialogue
        formatted_prompt = f"Human: {prompt}\n\nAssistant:"

        # Tokenize input
        input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids

        # Generate response
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_tokens,  # Controls response length
                temperature=0.7,  # Adds diversity (lower = more deterministic)
                top_p=0.9,  # Nucleus sampling (higher = more diverse)
                do_sample=True,  # Enables randomness for varied responses
                pad_token_id=tokenizer.eos_token_id,  # Handles padding properly
            )

        # Decode and clean response
        full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        response = full_response.replace(formatted_prompt, "").strip()

        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


In [29]:
# Example test prompts
test_prompts = [
    "How to study effectively?",
    "Tell me a short motivational story.",
    "What is the best way to learn Python?",
]

# Generate and print responses for each test prompt
for prompt in test_prompts:
    response = generate_response(prompt)
    print(f"\nPrompt: {prompt}\nResponse: {response}\n{'='*80}")



Prompt: How to study effectively?
Response: The premise a most. First's following single simple, The following part in be� to First. First and it from a authorotan game are the following part in the headline. First's first (2 guide named the following need a powerful and the result is the following author that our first the following parents is '

Prompt: Tell me a short motivational story.
Response: The following a exciting with The following series in the following series the premise

Prompt: What is the best way to learn Python?
Response: The simple challenges you their long the series is the author like not their following following hypothesis�o like the following collectionation (A First. 1 man named a author-5. First. First, the case. To
