<a href="https://colab.research.google.com/github/SanjaySaatyaki/hf_smol_course/blob/main/Preference_Alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install "transformers>=4.56.1" "trl>=0.23.0" "datasets>=4.1.0" "torch>=2.8.0"
!pip install "accelerate>=1.10.1" "peft>=0.17.0" "trackio"

Collecting trl>=0.23.0
  Downloading trl-0.23.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=4.1.0
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets>=4.1.0)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading trl-0.23.1-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets, trl
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
   

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig
import json

# Pick the compute backend: CUDA first, then Apple MPS, otherwise plain CPU.
cuda_ready = torch.cuda.is_available()
# MPS is only probed when CUDA is absent; hasattr guards torch builds without the backend.
mps_ready = (
    not cuda_ready
    and hasattr(torch.backends, 'mps')
    and torch.backends.mps.is_available()
)

if cuda_ready:
    device = "cuda"
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    # total_memory is reported in bytes; shown here in decimal gigabytes
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {total_gb:.1f}GB")
elif mps_ready:
    device = "mps"
    print("Using Apple MPS")
else:
    device = "cpu"
    print("Using CPU - you will need HF Jobs with GPU for actual training")

# Authenticate with Hugging Face
# Called with no arguments; in this notebook the call renders an interactive
# login widget (see the VBox in the cell output) rather than taking a token inline.
from huggingface_hub import login
login()  # Required for HF Jobs and model uploads

Using CUDA GPU: Tesla T4
GPU memory: 15.8GB


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Hub access token used by the dataset/model loading cells.
# The token is never written into the notebook: it is read from the HF_TOKEN
# environment variable. A missing or empty variable is normalised to None so that
# an empty string is never sent as a credential; with None the Hub libraries are
# expected to fall back to the credential stored by the login step.
import os

hf_token = os.environ.get("HF_TOKEN") or None

In [None]:
# Fetch the training split of the preference dataset and report its size and columns
dataset = load_dataset("Anthropic/hh-rlhf", token=hf_token, split="train")
print(f"Dataset size: {len(dataset)}")
print("Dataset features:", dataset.features.keys())

# Preview the first preference pair, truncated to 200 characters per side
sample = dataset[0]
for heading, column in (
    ("Chosen (Preferred)", "chosen"),
    ("Rejected (Non-preferred)", "rejected"),
):
    print(f"\n{heading}: {sample[column][:200]}...")

# DPO is trained to favour the "chosen" text over the "rejected" one

In [None]:
# Keep only the first 1000 preference pairs so local experiments stay quick
small_dataset = dataset.select(range(1000))

# Hub id of the checkpoint to align
model_name = "sanjay-saatyaki/smol-train"

# Tokenizer for that checkpoint; padding reuses the end-of-sequence token
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token

# Model weights requested in bfloat16 and placed on the device selected earlier
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    dtype=torch.bfloat16,
    device_map=device,
)

# DPO hyperparameters for the local trial run
training_args = DPOConfig(
    # --- Output, checkpoints and logging ---
    output_dir="./smollm3-dpo-aligned",
    logging_steps=50,
    save_steps=250,
    report_to="trackio",

    # --- Hub upload ---
    push_to_hub=True,
    hub_model_id="your-username/smollm3-dpo-aligned",  # placeholder: replace with your own namespace

    # --- Preference objective and sequence limits ---
    beta=0.1,
    max_prompt_length=512,   # cap on prompt tokens
    max_length=1024,         # cap on prompt + completion tokens

    # --- Batch size and schedule ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 x 8 = 16 samples per optimizer step per device
    max_steps=1000,
    learning_rate=5e-7,
    warmup_steps=100,
    lr_scheduler_type="cosine",

    # --- Memory / precision ---
    gradient_checkpointing=True,
    bf16=True,

    # --- Dataset columns ---
    # Automatic column pruning is switched OFF: every dataset column is kept.
    remove_unused_columns=False,
)

# Build the trainer only; training itself is deferred to HF Jobs to save local resources.
# The tokenizer is supplied through `processing_class`: the TRL releases this notebook
# installs (trl>=0.23.0) do not accept a `tokenizer=` keyword on DPOTrainer, so passing
# it under the old name fails at construction time.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=small_dataset,
    processing_class=tokenizer,
)

print("Local DPO trainer configured successfully!")
print("Ready to scale to HF Jobs for full training...")

In [15]:
# `!export` only sets the variable inside a throwaway subshell, so the kernel process
# never sees it. Assigning through os.environ applies it to this Python process.
# NOTE(review): PyTorch presumably reads this setting when its CUDA allocator starts,
# so this cell likely has to run before any cell that touches the GPU - confirm, and
# move it above the model-loading cell if so.
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"