In [2]:
from datasets import load_dataset

# Load the Dahoas/full-hh-rlhf dataset
dataset = load_dataset("Dahoas/full-hh-rlhf")


In [1]:
import os
import torch
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 112052
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 12451
    })
})


In [11]:
print(dataset["train"].column_names)
print(dataset["train"][0])

['prompt', 'response', 'chosen', 'rejected']
{'prompt': '\n\nHuman: Should you buy a case to protect your cell phone?\n\nAssistant: It depends on your circumstances.  If you carry your phone in a pocket or a purse then you probably want a case.  But if you only need a phone for quick interactions, a case may actually cause more harm than good.  What do you need the phone for?  Are you a parent, or do you work from home?\n\nHuman: What harm could it do?\n\nAssistant: A phone case can damage the screen, for one thing.  It can also get you in trouble if you have your phone turned off for some reason.  Then you will turn it back on and it won’t do anything.  If you can afford to replace it, then you need a case to protect it.  The problem is that most people aren’t able to afford to replace their phones all the time.\n\nHuman: Thanks for letting me know.\n\nAssistant:', 'response': ' You’re welcome.', 'chosen': ' You’re welcome.', 'rejected': ' It sounds like you’ve got the basics down.  A

In [3]:
def preprocess_data(example):
    return {
        "prompt": example["prompt"],
        "chosen": example["chosen"],
        "rejected": example["rejected"]
    }

# Apply preprocessing properly
processed_dataset = dataset.map(preprocess_data, remove_columns=["response"])


In [16]:
print(processed_dataset["train"][0])


{'prompt': '\n\nHuman: Should you buy a case to protect your cell phone?\n\nAssistant: It depends on your circumstances.  If you carry your phone in a pocket or a purse then you probably want a case.  But if you only need a phone for quick interactions, a case may actually cause more harm than good.  What do you need the phone for?  Are you a parent, or do you work from home?\n\nHuman: What harm could it do?\n\nAssistant: A phone case can damage the screen, for one thing.  It can also get you in trouble if you have your phone turned off for some reason.  Then you will turn it back on and it won’t do anything.  If you can afford to replace it, then you need a case to protect it.  The problem is that most people aren’t able to afford to replace their phones all the time.\n\nHuman: Thanks for letting me know.\n\nAssistant:', 'response': ' You’re welcome.', 'chosen': ' You’re welcome.', 'rejected': ' It sounds like you’ve got the basics down.  Any further questions or concerns?  You can se

In [17]:
# Tokenize

In [4]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments
)

from typing import Dict, Optional

# Implements Direct Preference Optimization for fine-tuning models based on human preference data
from trl import DPOTrainer

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Model configuration
model_name_or_path = "gpt2"
ignore_bias_buffers = False

# Load pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

if ignore_bias_buffers:
    # Ignore boolean bias buffers in torch distributed training
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]

# Load reference model for DPO training
model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# Load tokenizer and ensure padding token exists
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [6]:
from datasets import load_dataset

def extract_anthropic_prompt(prompt_and_response):
    """Extract the anthropic prompt from a prompt and response pair."""
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    if search_term_idx == -1:
        return None  # Return None if no valid prompt structure found
    return prompt_and_response[: search_term_idx + len(search_term)]

def get_hh(split: str, sanity_check: bool = False, cache_dir: str = None):
    """Load the Anthropic Helpful-Harmless dataset from Hugging Face and convert it to the necessary format."""
    dataset = load_dataset("Dahoas/full-hh-rlhf", split=split, cache_dir=cache_dir)

    # Limit dataset size for quick testing
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 1000)))

    def split_prompt_and_responses(sample):
        prompt = extract_anthropic_prompt(sample["chosen"])
        if prompt is None:
            return None  # Skip invalid samples

        return {
            "prompt": prompt,
            "chosen": sample["chosen"][len(prompt) :].strip(),
            "rejected": sample["rejected"][len(prompt) :].strip(),
        }

    processed_dataset = dataset.map(split_prompt_and_responses, remove_columns=["chosen", "rejected"])
    
    return processed_dataset




In [26]:
# Example usage
processed_dataset = get_hh("train", sanity_check=True)
print(processed_dataset[0])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'prompt': '\n\nHuman: Should you buy a case to protect your cell phone?\n\nAssistant: It depends on your circumstances.  If you carry your phone in a pocket or a purse then you probably want a case.  But if you only need a phone for quick interactions, a case may actually cause more harm than good.  What do you need the phone for?  Are you a parent, or do you work from home?\n\nHuman: What harm could it do?\n\nAssistant: A phone case can damage the screen, for one thing.  It can also get you in trouble if you have your phone turned off for some reason.  Then you will turn it back on and it won’t do anything.  If you can afford to replace it, then you need a case to protect it.  The problem is that most people aren’t able to afford to replace their phones all the time.\n\nHuman: Thanks for letting me know.\n\nAssistant:', 'response': ' You’re welcome.', 'chosen': ' You’re welcome.', 'rejected': ' It sounds like you’ve got the basics down.  Any further questions or concerns?  You can se

In [7]:
sanity_check = True
train_dataset = get_hh("train", sanity_check=sanity_check)
eval_dataset = get_hh("test", sanity_check=sanity_check)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 1000
})

In [10]:
eval_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 1000
})

In [None]:
learning_rate = 5e-5  
per_device_train_batch_size = 8  
gradient_accumulation_steps = 2  
max_length = 512  
max_prompt_length = 128  # Standard for DPO training
max_target_length = 128  # Standard response length
label_pad_token_id = -100  # For ignored token loss
max_steps = 100  
sanity_check = True 
report_to = "wandb" 
gradient_checkpointing = True  # Saves memory
beta = 0.1  # Standard for DPO


In [9]:
from trl import DPOTrainer, DPOConfig


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig

training_args = DPOConfig(
    output_dir="./gpt2_dpo",
    per_device_train_batch_size=2,  
    per_device_eval_batch_size=2,  
    gradient_accumulation_steps=8, 
    learning_rate=5e-5,
    num_train_epochs=3,
    max_steps=1000,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=50,
    fp16=True,  # Mixed Precision
    gradient_checkpointing=True,  # Saves memory
    padding_value=tokenizer.pad_token_id,
    max_prompt_length=384,  
    max_completion_length=384,  
)


# ✅ 4. Initialize DPOTrainer
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer  
)





In [14]:
train_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 1000
})

In [11]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,0.7936,0.754662,-0.187384,-0.3503,0.559,0.162916,-229.084061,-214.057678,-108.798828,-109.278664
2,0.328,0.845747,-0.649858,-0.906046,0.568,0.256188,-233.708817,-219.615128,-111.854401,-112.525116
3,0.1234,1.139908,-2.600197,-2.979456,0.582,0.379259,-253.212219,-240.349228,-126.323257,-127.757065
4,0.0116,1.207658,-3.487255,-3.968986,0.579,0.481731,-262.082794,-250.244568,-120.031448,-120.94474
5,0.0061,1.588167,-5.523031,-6.025922,0.573,0.502891,-282.440521,-270.813873,-124.085159,-125.178078
6,0.0074,1.638352,-5.922364,-6.440679,0.566,0.518315,-286.433868,-274.961456,-121.233566,-122.241165
7,0.0072,1.651491,-6.012239,-6.533665,0.565,0.521427,-287.332611,-275.891327,-117.790222,-118.703743
8,0.0062,1.708938,-6.328568,-6.852242,0.565,0.523674,-290.495911,-279.077087,-115.711937,-116.539482
9,0.0062,1.732372,-6.474491,-7.001331,0.565,0.52684,-291.955139,-280.567993,-113.815544,-114.587425
10,0.0062,1.758846,-6.628852,-7.157539,0.567,0.528687,-293.498749,-282.130096,-112.411179,-113.151146


TrainOutput(global_step=1000, training_loss=0.07009943208098411, metrics={'train_runtime': 1648.3889, 'train_samples_per_second': 9.706, 'train_steps_per_second': 0.607, 'total_flos': 0.0, 'train_loss': 0.07009943208098411, 'epoch': 15.88})

In [12]:
# Save the model and tokenizer locally
output_dir = "./gpt2_dpo_finetuned"

trainer.save_model(output_dir)  # Saves the model
tokenizer.save_pretrained(output_dir)  # Saves the tokenizer


('./gpt2_dpo_finetuned/tokenizer_config.json',
 './gpt2_dpo_finetuned/special_tokens_map.json',
 './gpt2_dpo_finetuned/vocab.json',
 './gpt2_dpo_finetuned/merges.txt',
 './gpt2_dpo_finetuned/added_tokens.json',
 './gpt2_dpo_finetuned/tokenizer.json')