In [1]:
# Setup

!pip -q install bitsandbytes peft datasets trl accelerate transformers tqdm matplotlib

import os, torch, random, json
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType
from trl import DPOTrainer, DPOConfig
import matplotlib.pyplot as plt
from google.colab import drive

# Attach Google Drive; the model and output paths used later live under it.
drive.mount("/content/drive")

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Running on:", device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Running on: cuda


In [2]:
# Paths and seeds

# Checkpoint directory loaded later as both the policy and the reference model.
BASE_INST_MODEL = "/content/drive/MyDrive/llm_project/inst_qlora_pytorch_gpt2_medium_dolly"
# Directory the DPO run uses as its output_dir and where the final model and
# tokenizer are saved.
OUT_DIR = "/content/drive/MyDrive/llm_project/dpo_rlhf_gpt2m"
os.makedirs(OUT_DIR, exist_ok=True)

# Seed Python's and PyTorch's random number generators.
seed = 42
random.seed(seed)
# torch.manual_seed returns a Generator; the trailing semicolon stops IPython
# from echoing its repr as the cell output.
torch.manual_seed(seed);

<torch._C.Generator at 0x7c85c40fdf90>

In [3]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_INST_MODEL)

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

# If the checkpoint ships without a pad token, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Load Policy + Ref Models (QLoRA)

# Pick the 4-bit compute dtype with the same rule the DPOConfig cell uses for
# its fp16/bf16 flags, so the quantized layers and the trainer's mixed
# precision agree: bfloat16 where the GPU supports it, float16 otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# 4-bit NF4 quantization with double quantization, shared by both models.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Policy: the quantized checkpoint wrapped with LoRA adapters.
policy_model = AutoModelForCausalLM.from_pretrained(
    BASE_INST_MODEL,
    quantization_config=bnb_cfg,
    device_map="auto",
)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],
    task_type=TaskType.CAUSAL_LM,
)
policy_model = get_peft_model(policy_model, lora_cfg)
policy_model.print_trainable_parameters()

# Reference: a second copy of the same quantized checkpoint, without adapters.
ref_model = AutoModelForCausalLM.from_pretrained(
    BASE_INST_MODEL,
    quantization_config=bnb_cfg,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable params: 4,325,376 || all params: 359,148,544 || trainable%: 1.2043


In [5]:
# Load dataset

# Number of examples kept from the shuffled train / test splits.
n_train, n_val = 4000, 400

raw = load_dataset("Anthropic/hh-rlhf")
train_shuffled = raw["train"].shuffle(seed=seed)
val_shuffled = raw["test"].shuffle(seed=seed)
train = train_shuffled.select(range(n_train))
val = val_shuffled.select(range(n_val))

# DPOTrainer can infer implicit prompts from chosen/rejected pairs
print("Columns:", train.column_names)
print("Train size:", len(train), "Val size:", len(val))


Columns: ['chosen', 'rejected']
Train size: 4000 Val size: 400


In [6]:
# Initialize DPOTrainer

# Decide the mixed-precision mode once: bf16 when the GPU supports it,
# fp16 on other GPUs, and neither when no GPU is available.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
use_fp16 = has_cuda and not use_bf16

dpo_config = DPOConfig(
    output_dir=OUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=25,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=100,
    warmup_ratio=0.05,
    remove_unused_columns=False,
    beta=0.1,
    report_to="none",               # disable wandb logging
    # Forward the notebook-level seed so changing it in the "Paths and seeds"
    # cell also changes the seed the trainer uses.
    seed=seed,
    fp16=use_fp16,
    bf16=use_bf16,
)

In [7]:
# Trainer

# Collect the constructor arguments first, then build the trainer from them.
trainer_kwargs = dict(
    model=policy_model,
    ref_model=ref_model,
    args=dpo_config,
    train_dataset=train,
    eval_dataset=val,
    processing_class=tokenizer,
)
trainer = DPOTrainer(**trainer_kwargs)

Extracting prompt in train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1687 > 1024). Running this sequence through the model will result in indexing errors


Extracting prompt in eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

In [8]:
# Train + save

trainer.train()

# Write the trained model first, then the tokenizer, into the same directory.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(OUT_DIR)
print("Model saved to:", OUT_DIR)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.9179,0.97325,-1.516138,-1.391199,0.4675,-0.124939,-168.040344,-181.980408,-100.45507,-101.052269
200,0.9072,0.942582,-1.366279,-1.234098,0.46,-0.132181,-166.541779,-180.409393,-99.663811,-100.217873
300,0.8393,0.900298,-1.224969,-1.111575,0.47,-0.113394,-165.128677,-179.184158,-99.125946,-99.612755
400,0.8392,0.884806,-1.163852,-1.056481,0.4625,-0.10737,-164.517502,-178.63324,-98.930817,-99.376541
500,0.9248,0.879402,-1.137846,-1.029467,0.46,-0.108379,-164.257446,-178.363083,-98.833519,-99.262619


Model saved to: /content/drive/MyDrive/llm_project/dpo_rlhf_gpt2m


In [9]:
# Evaluate

# Run the trainer's evaluation on the eval dataset it was constructed with.
metrics = trainer.evaluate()

# NOTE(review): in the output recorded with this notebook, eval_rewards/accuracies
# is ~0.46 and eval_rewards/margins is negative — presumably meaning the policy
# does not yet prefer chosen over rejected responses; confirm before relying on
# this checkpoint.
print("Eval metrics:", metrics)

Eval metrics: {'eval_loss': 0.8794019818305969, 'eval_runtime': 257.9098, 'eval_samples_per_second': 1.551, 'eval_steps_per_second': 0.388, 'eval_rewards/chosen': -1.1378464698791504, 'eval_rewards/rejected': -1.0294673442840576, 'eval_rewards/accuracies': 0.46000000834465027, 'eval_rewards/margins': -0.10837908089160919, 'eval_logps/chosen': -164.2574462890625, 'eval_logps/rejected': -178.3630828857422, 'eval_logits/chosen': -98.8335189819336, 'eval_logits/rejected': -99.26261901855469, 'epoch': 1.0}


In [10]:
# Export core dependencies with installed versions

import importlib.metadata as md
import os

# Packages whose installed versions are recorded in requirements.txt.
core_packages = [
    "torch",
    "transformers",
    "datasets",
    "peft",
    "bitsandbytes",
    "accelerate",
    "trl",
    "tqdm",
    "matplotlib"
]

# The file is written to the current working directory.
output_file = os.path.join(os.getcwd(), "requirements.txt")

with open(output_file, "w", encoding="utf-8") as f:
    for pkg in core_packages:
        try:
            version = md.version(pkg)
            f.write(f"{pkg}=={version}\n")
        except md.PackageNotFoundError:
            # Keep a placeholder so a missing package is visible in the file.
            f.write(f"# {pkg} not installed\n")

print(f"requirements.txt created at: {output_file}\n")

# Echo the file that was just written by its full path, instead of shelling
# out to `cat` with a name relative to the working directory.
with open(output_file, encoding="utf-8") as f:
    print(f.read(), end="")

requirements.txt created at: /content/requirements.txt

torch==2.8.0+cu126
transformers==4.57.1
datasets==4.0.0
peft==0.17.1
bitsandbytes==0.48.2
accelerate==1.11.0
trl==0.25.0
tqdm==4.67.1
matplotlib==3.10.0
