In [1]:
!pip install -q unsloth trl transformers datasets accelerate bitsandbytes comet_ml --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.8/306.8 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m729.6/729.6 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os

# Hugging Face Token (optional, only if downloading gated models or pushing)
os.environ["HF_TOKEN"] = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Comet ML API key (optional, for experiment tracking)
os.environ["COMET_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxx"

# =======================================
# 3️⃣ Imports
# =======================================
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

import torch
from datasets import load_dataset
from transformers import TextStreamer
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import DPOConfig, DPOTrainer

In [6]:
# =======================================
# Load base model in QLoRA mode
# =======================================

max_seq_length = 2048  # total length prompt+response

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",  # change to any HF model
    max_seq_length=max_seq_length,
    load_in_4bit=True,   # QLoRA
)


==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
# =======================================
# 5️⃣ Add LoRA adapters for training
# =======================================
model = FastLanguageModel.get_peft_model(
    model,
    r=16,               # lower rank to save memory
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

Unsloth 2025.8.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
# =======================================
# 6️⃣ Load preference dataset
# Using a small sample for demo
# =======================================
dataset = load_dataset("trl-internal-testing/hh-rlhf-helpful-base-trl-style", split="train")


README.md:   0%|          | 0.00/964 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43835 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2354 [00:00<?, ? examples/s]

In [12]:
EOS_TOKEN = tokenizer.eos_token

def format_chat(messages):
    """
    Convert list of {'role': ..., 'content': ...} dicts into a single dialogue string.
    """
    formatted = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]

        if role == "user":
            formatted += f"User: {content}\n"
        elif role == "assistant":
            formatted += f"Assistant: {content}\n"
        else:
            formatted += f"{role.capitalize()}: {content}\n"

    return formatted.strip()

def format_samples(example):
    chosen_text = format_chat(example["chosen"]) + EOS_TOKEN
    rejected_text = format_chat(example["rejected"]) + EOS_TOKEN
    prompt_text = example["prompt"]

    return {
        "prompt": prompt_text,
        "chosen": chosen_text,
        "rejected": rejected_text
    }

dataset = dataset.map(format_samples)
dataset = dataset.train_test_split(test_size=0.05)


Map:   0%|          | 0/43835 [00:00<?, ? examples/s]

In [19]:
dataset['train']

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 41643
})

In [20]:
# =======================================
# 7️⃣ DPO Training
# =======================================
trainer = DPOTrainer(
    model=model,
    ref_model=None,  # use frozen base as reference
    tokenizer=tokenizer,
    beta=0.1,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    max_length=max_seq_length//2,      # prompt+answer
    max_prompt_length=max_seq_length//2, # prompt only
    args=DPOConfig(
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="dpo_model",
        logging_steps=1,
        save_strategy="epoch",
        report_to="none",
    ),
)

Extracting prompt in train dataset (num_proc=2):   0%|          | 0/41643 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/41643 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/41643 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=2):   0%|          | 0/2192 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/2192 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/2192 [00:00<?, ? examples/s]

In [None]:
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 41,643 | Num Epochs = 1 | Total steps = 10,411
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.6931,0.0,0.0,0.0,0.0,-325.023682,-267.546448,-0.402622,-0.447314,0,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-634.695557,-608.648804,-0.426893,-0.458256,No Log,No Log,No Log,No Log
3,0.696,-0.005666,-4.7e-05,0.0,-0.005618,-324.894165,-203.732819,-0.215756,-0.365919,No Log,No Log,No Log,No Log
4,0.6919,0.010101,0.007529,0.5,0.002572,-591.918396,-562.184998,-0.252125,-0.299921,No Log,No Log,No Log,No Log
5,0.6965,0.001722,0.008302,0.25,-0.006581,-388.538513,-480.392029,-0.175203,-0.097295,No Log,No Log,No Log,No Log
6,0.6907,0.018526,0.013607,0.5,0.004919,-477.920105,-460.909485,-0.285446,-0.334634,No Log,No Log,No Log,No Log
7,0.6935,0.019245,0.020013,0.5,-0.000768,-398.13266,-380.368652,-0.397063,-0.347084,No Log,No Log,No Log,No Log
8,0.6872,0.030431,0.018347,0.5,0.012083,-359.774841,-368.299225,-0.273062,-0.39425,No Log,No Log,No Log,No Log
9,0.6934,0.02663,0.027099,0.5,-0.000469,-267.890778,-330.256287,-0.387924,-0.375438,No Log,No Log,No Log,No Log
10,0.6986,0.001127,0.011995,0.0,-0.010869,-201.720078,-228.612366,-0.40548,-0.372467,No Log,No Log,No Log,No Log
