# Preference-Based Fine-Tuning: DPO vs ORPO

This notebook provides templates and examples for two popular preference-based fine-tuning methods:
- **DPO (Direct Preference Optimization)**: Optimizes language models directly on preference data without requiring a reward model
- **ORPO (Odds Ratio Preference Optimization)**: A reference-free approach that combines SFT and preference alignment in a single stage

## Overview

Preference-based fine-tuning methods aim to align language models with human preferences by training on pairs of responses where one is preferred over another. These methods have become crucial for creating more helpful, harmless, and honest AI systems.


In [None]:
# Model and Training Configuration
# torch is imported in this cell because the quantization settings below read
# torch.float16; without it the cell fails with NameError whenever it runs
# before the notebook's main import cell.
import torch


class Config:
    """Hyperparameters, quantization flags and output paths for the notebook.

    Every setting is a plain class attribute, so values can be read either
    from the class itself or from the module-level ``config`` instance.
    """

    # Model settings
    model_name = "microsoft/DialoGPT-medium"  # Change to your preferred base model
    model_max_length = 512

    # LoRA settings
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.1
    # NOTE(review): these are LLaMA-style projection names. GPT-2 based
    # checkpoints such as the default model_name above appear to use
    # different attention module names (e.g. c_attn / c_proj) -- confirm the
    # list against the base model that is actually loaded.
    lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]

    # Training settings
    learning_rate = 5e-5
    batch_size = 4
    gradient_accumulation_steps = 4
    num_epochs = 3
    warmup_steps = 100
    logging_steps = 10
    save_steps = 500
    eval_steps = 500

    # DPO specific
    dpo_beta = 0.1  # Temperature parameter for DPO

    # ORPO specific
    orpo_alpha = 1.0  # Weight for the SFT loss
    orpo_beta = 0.1   # Weight for the preference loss

    # Output directories
    output_dir = "./results"
    logging_dir = "./logs"

    # Quantization (for memory efficiency)
    use_4bit = True
    bnb_4bit_compute_dtype = torch.float16
    bnb_4bit_use_double_quant = True
    bnb_4bit_quant_type = "nf4"


# Shared instance used by the rest of the notebook.
config = Config()


In [None]:
# Install required packages
# !pip install transformers datasets torch accelerate trl peft bitsandbytes wandb

import torch
import torch.nn.functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments,
    BitsAndBytesConfig
)
from datasets import Dataset, load_dataset
from trl import DPOTrainer, ORPOTrainer
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple
import logging
import wandb

# Configure INFO-level logging and create this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Select the compute device; when CUDA is present also report the GPU's
# name and total memory (bytes converted to GB via 1024**3)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    device = torch.device("cpu")
    print(f"Using device: {device}")


## Data Preparation

Both DPO and ORPO require preference datasets with the following structure:
- **prompt**: The input/question
- **chosen**: The preferred response
- **rejected**: The less preferred response


In [None]:
def create_sample_preference_dataset() -> Dataset:
    """
    Build a three-example preference dataset for demonstration purposes.

    Each record carries a "prompt", a "chosen" (preferred) response and a
    "rejected" (less preferred) response. Replace this with real preference
    data for actual training runs.
    """
    # (prompt, chosen, rejected) triples; expanded into dict records below
    triples = [
        (
            "What's the best way to learn programming?",
            "Start with the basics like variables and control structures, then practice with small projects. Choose a beginner-friendly language like Python, and work through tutorials while building your own projects.",
            "Just memorize all the syntax and you'll be fine. Programming is just about knowing all the commands.",
        ),
        (
            "How do I stay motivated while learning?",
            "Set small, achievable goals and celebrate your progress. Find a community of learners, work on projects you're passionate about, and remember that everyone learns at their own pace.",
            "Just force yourself to study 12 hours a day. If you're not exhausted, you're not trying hard enough.",
        ),
        (
            "What's the most important skill for a programmer?",
            "Problem-solving is crucial. Programming is fundamentally about breaking down complex problems into smaller, manageable pieces and finding efficient solutions.",
            "Typing speed. The faster you can type, the better programmer you are.",
        ),
    ]

    records = [
        {"prompt": question, "chosen": preferred, "rejected": dispreferred}
        for question, preferred, dispreferred in triples
    ]
    return Dataset.from_list(records)

def load_preference_dataset(dataset_name: Optional[str] = None) -> Dataset:
    """
    Return a preference dataset for training.

    When ``dataset_name`` is given, its "train" split is fetched with
    ``load_dataset``; popular choices include:
    - Anthropic/hh-rlhf
    - Intel/orca_dpo_pairs
    - argilla/ultrafeedback-binarized-preferences

    When ``dataset_name`` is omitted (or otherwise falsy), the built-in
    demonstration dataset is returned instead.
    """
    # No name supplied: fall back to the bundled sample data
    if not dataset_name:
        return create_sample_preference_dataset()

    # Otherwise pull the training split from the HuggingFace Hub
    return load_dataset(dataset_name, split="train")

def preprocess_dataset(
    dataset: Dataset,
    tokenizer,
    *,
    prepend_prompt_to_responses: bool = True,
) -> Dataset:
    """Format prompts and responses with the notebook's dialogue template.

    No tokenization happens here: every prompt is wrapped as
    ``"Human: {prompt}\\nAssistant: "`` and the text columns are rewritten
    through a batched ``dataset.map`` call.

    Args:
        dataset: Dataset providing "prompt", "chosen" and "rejected" columns.
        tokenizer: Not used by this function; kept so existing call sites
            keep working.
        prepend_prompt_to_responses: When True (the default, matching the
            original behavior) "chosen" and "rejected" hold the formatted
            prompt followed by the response. Pass False to leave both as
            bare completions next to the formatted "prompt" column, which is
            the layout TRL documents for explicit-prompt preference data. A
            trainer that joins prompt and response itself would otherwise
            see the prompt twice.

    Returns:
        The result of ``dataset.map`` with "prompt", "chosen" and "rejected"
        replaced by their formatted versions.
    """
    def format_batch(examples):
        # Wrap every raw prompt in the Human/Assistant template
        prompts = [f"Human: {prompt}\nAssistant: " for prompt in examples["prompt"]]

        if prepend_prompt_to_responses:
            chosen = [f"{prompt}{response}" for prompt, response in zip(prompts, examples["chosen"])]
            rejected = [f"{prompt}{response}" for prompt, response in zip(prompts, examples["rejected"])]
        else:
            # Completion-only responses; the prompt lives solely in "prompt"
            chosen = list(examples["chosen"])
            rejected = list(examples["rejected"])

        return {"prompt": prompts, "chosen": chosen, "rejected": rejected}

    return dataset.map(format_batch, batched=True)

# Load the preference data (sample dataset by default) and preview it
print("Loading preference dataset...")
train_dataset = load_preference_dataset()
print(f"Dataset size: {len(train_dataset)}")
# Header and first record are printed on separate lines
print("Sample entry:", train_dataset[0], sep="\n")
