In [None]:
# =============================================================================
# COLAB 3: REINFORCEMENT LEARNING WITH DPO (Direct Preference Optimization)
# - Robust HH-RLHF parsing to (prompt, chosen, rejected)
# - Hard token truncation (prompt≤256, responses≤256) to avoid >512 errors
# - W&B disabled by default
# - 4-bit + LoRA r=16 with Unsloth DPO patch
# =============================================================================

# Cell 1: Install Dependencies
# -----------------------------------------------------------------------------
# NOTE: the `!pip ...` lines are IPython shell escapes, so this cell only runs
# inside a notebook (e.g. Colab), not as a plain .py script.
print("📦 Installing Unsloth and DPO dependencies...")
# Step 1: released Unsloth package from PyPI.
!pip install -q unsloth
# Step 2: overlay the current GitHub build; --no-deps leaves the dependencies
# installed by step 1 untouched.
!pip install -q --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Step 3: training stack. Only a lower bound is given for trl and the other
# packages are unpinned.
# NOTE(review): consider pinning versions from a known-good run for reproducibility.
!pip install -q "trl>=0.7.10" datasets transformers accelerate bitsandbytes

print("✅ Installation complete!")

# Cell 2: Import Libraries & Env
# -----------------------------------------------------------------------------
import os, torch, typing
from unsloth import FastLanguageModel, PatchDPOTrainer
from datasets import load_dataset
from transformers import set_seed
from trl import DPOTrainer, DPOConfig

# Make W&B optional (off by default).
# This variable is read back when DPOConfig is built (its `report_to` argument),
# so flipping it here is the single switch for W&B logging in this notebook.
# NOTE(review): the recorded run prints a deprecation warning for
# WANDB_DISABLED; passing `report_to` explicitly is the suggested replacement.
os.environ["WANDB_DISABLED"] = "true"
# If you ever hit dynamo/fused-loss traces, uncomment:
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Patch TRL's DPOTrainer for Unsloth compatibility.
# Called before any DPOTrainer is constructed.
PatchDPOTrainer()

# Environment report: torch version, CUDA availability and the name of GPU 0
# (the device name is only queried when CUDA is available).
print(f"🔥 PyTorch: {torch.__version__}")
print(f"🎮 CUDA: {torch.cuda.is_available()}  |  GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Cell 3: Configuration
# -----------------------------------------------------------------------------
# Token budget. The prompt and response caps sum to max_seq_length
# (256 + 256 = 512), so one truncated prompt plus one truncated response fits.
max_seq_length = 512
max_prompt_tokens = 256
max_response_tokens = 256

# Both values are forwarded unchanged to FastLanguageModel.from_pretrained.
# NOTE(review): dtype=None presumably lets Unsloth choose the dtype — confirm
# against the Unsloth documentation.
dtype = None
load_in_4bit = True

# LoRA for DPO
lora_r = 16
lora_alpha = 16
# NOTE(review): the recorded run logs that Unsloth's fast patching only covers
# dropout = 0 and that a non-zero value causes a performance hit.
lora_dropout = 0.05

# DPO
beta = 0.1             # strength of preference optimization

# Training
# Effective batch size = batch_size * gradient_accumulation_steps = 8.
batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 1    # ignored when max_steps > 0
learning_rate = 5e-5    # lower LR = more stable DPO
max_steps = 50
seed = 3407

# Seed the RNGs up front; the same seed is also passed to LoRA init and DPOConfig.
set_seed(seed)

# Echo the effective configuration so it is captured in the cell output.
print(f"""
🔧 Config:
 • LoRA r/α/dropout: {lora_r}/{lora_alpha}/{lora_dropout}
 • DPO beta: {beta}
 • Max seq: {max_seq_length} (prompt≤{max_prompt_tokens}, response≤{max_response_tokens})
 • BS: {batch_size} | Grad Accum: {gradient_accumulation_steps}
 • LR: {learning_rate} | Steps: {max_steps} | Seed: {seed}
""")

# Cell 4: Load Base Model
# -----------------------------------------------------------------------------
print("📥 Loading SmolLM2-135M...")
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Load the quantized base model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Ensure PAD/EOS/truncation are explicit & aligned.
# Fall back to EOS only when the tokenizer has no pad token at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Mirror the tokenizer's pad id into the model config. Using the pad id (not
# the EOS id) keeps the two in agreement even when the tokenizer's pad token
# is a different token from EOS.
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.model_max_length = max_seq_length
tokenizer.truncation_side = "right"
tokenizer.padding_side = "right"

print("✅ Model loaded.")

# Cell 5: Apply LoRA
# -----------------------------------------------------------------------------
print("🔧 Applying LoRA adapters for DPO...")
# Wrap the base model with LoRA adapters on every attention and MLP projection.
# Rank, alpha, dropout and seed all come from the configuration cell.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",      # attention
        "gate_proj", "up_proj", "down_proj",         # MLP
    ],
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=seed,
)

# Report trainable vs. total parameter counts based on `numel()`.
# NOTE(review): in the recorded run this total (86.32M) is lower than the
# 139,399,488 reported by the trainer banner — presumably because 4-bit
# weights are stored packed, which would make this percentage an overestimate.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"✅ LoRA applied: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
      f"({100*trainable_params/total_params:.2f}% trainable)")

# Cell 6: Load HH-RLHF dataset (preference pairs)
# -----------------------------------------------------------------------------
print("📚 Loading Anthropic HH-RLHF subset (500 pairs)...")
# The slice takes the first 500 rows of the train split (no shuffling).
# NOTE(review): the default config combines several subsets (harmless/helpful);
# the first 500 rows presumably all come from one of them — confirm if the
# data mix matters.
raw = load_dataset("Anthropic/hh-rlhf", split="train[:500]")
print("✅ Loaded:", len(raw))

# Cell 7: Convert HH-RLHF into (prompt, chosen, rejected)
# -----------------------------------------------------------------------------
# Each 'chosen'/'rejected' is a full conversation string made of turns like
# "\n\nHuman: ...\n\nAssistant: ...". The two strings of a pair share every
# turn except the FINAL assistant reply, so we:
#  • take everything before the last "Assistant:" as the prompt
#  • take the text after the last "Assistant:" as the response
#  • standardize the prompt to end with "\n\nAssistant:" (the same shape as the
#    inference prompts, so generation continues naturally)

def split_conv(sample_text: str):
    """Split a conversation into (prompt, final assistant reply).

    The split happens at the LAST "Assistant:" tag. Splitting at the first tag
    would return the shared opening reply for multi-turn conversations, making
    the chosen and rejected responses of a pair identical.

    Args:
        sample_text: Raw conversation string containing "Human:" and
            "Assistant:" turns.

    Returns:
        A tuple ``(prompt, reply)``. ``prompt`` is the stripped conversation
        history followed by "\\n\\nAssistant:"; ``reply`` is the stripped text
        after the last tag (possibly an empty string). Returns
        ``(None, None)`` when the text contains no "Assistant:" tag.
    """
    history, tag, reply = sample_text.rpartition("Assistant:")
    if not tag:
        # No assistant turn at all: nothing to learn from this sample.
        return None, None
    prompt = history.strip() + "\n\nAssistant:"
    return prompt, reply.strip()

def format_hh_to_dpo(examples):
    """Batched ``datasets.map`` callback: HH-RLHF rows -> DPO triples.

    Args:
        examples: Batch dict with parallel "chosen" and "rejected" lists of
            raw conversation strings.

    Returns:
        Dict with parallel "prompt", "chosen" and "rejected" lists. Rows are
        dropped when either conversation cannot be split, when any part is
        empty, or when the chosen and rejected replies are identical (such a
        pair carries no preference signal for DPO).
    """
    prompts, chosens, rejecteds = [], [], []
    for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
        prompt, chosen_reply = split_conv(chosen_text)
        rejected_prompt, rejected_reply = split_conv(rejected_text)
        if not (prompt and chosen_reply and rejected_prompt and rejected_reply):
            continue
        if chosen_reply == rejected_reply:
            # Identical responses give a zero reward margin; skip them.
            continue
        # Use the prompt from chosen (the pair shares its conversation history).
        prompts.append(prompt)
        chosens.append(chosen_reply)
        rejecteds.append(rejected_reply)
    return {"prompt": prompts, "chosen": chosens, "rejected": rejecteds}

print("🔄 Parsing HH-RLHF to DPO format...")
# Batched map: the callback may return fewer rows than it receives, which is
# why all original columns are removed and replaced by the new triple.
dpo_ds = raw.map(format_hh_to_dpo, batched=True, remove_columns=raw.column_names)
# Per-row safety net: keep only rows where all three fields are non-empty.
dpo_ds = dpo_ds.filter(lambda x: len(x["prompt"]) > 0 and len(x["chosen"]) > 0 and len(x["rejected"]) > 0)
print("✅ Parsed:", len(dpo_ds))

# Cell 8: Hard truncate by token count (prompt≤256, response≤256)
# -----------------------------------------------------------------------------
def truncate_by_tokens(text: str, max_tokens: int, keep: str = "start") -> str:
    """Cap ``text`` at ``max_tokens`` tokens of the notebook's tokenizer.

    Args:
        text: String to truncate.
        max_tokens: Maximum number of tokens to keep.
        keep: "start" keeps the first ``max_tokens`` tokens (the default,
            matching the previous behaviour); "end" keeps the last ones.

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded
        truncated token sequence.

    Raises:
        ValueError: If ``keep`` is neither "start" nor "end".
    """
    if keep not in ("start", "end"):
        raise ValueError(f"keep must be 'start' or 'end', got {keep!r}")
    toks = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(toks) <= max_tokens:
        return text
    if keep == "end":
        # Index from the front so that max_tokens == 0 yields an empty list
        # (toks[-0:] would return everything).
        toks = toks[len(toks) - max_tokens:]
    else:
        toks = toks[:max_tokens]
    return tokenizer.decode(toks, skip_special_tokens=True)

def truncate_record(batch):
    """Batched ``datasets.map`` callback that enforces the token budget.

    Prompts keep their END: the most recent turns and the trailing
    "Assistant:" tag are what the response is conditioned on, so cutting them
    off would leave the model answering a dangling earlier turn. Responses
    keep their START.

    Args:
        batch: Batch dict with parallel "prompt", "chosen" and "rejected"
            lists of strings.

    Returns:
        Dict with the same three keys; rows where any field is blank after
        truncation are dropped.
    """
    prompts, chosens, rejecteds = [], [], []
    for p, c, r in zip(batch["prompt"], batch["chosen"], batch["rejected"]):
        p_t = truncate_by_tokens(p, max_prompt_tokens, keep="end")
        c_t = truncate_by_tokens(c, max_response_tokens)
        r_t = truncate_by_tokens(r, max_response_tokens)
        # keep non-empty after truncation
        if p_t.strip() and c_t.strip() and r_t.strip():
            prompts.append(p_t)
            chosens.append(c_t)
            rejecteds.append(r_t)
    return {"prompt": prompts, "chosen": chosens, "rejected": rejecteds}

print("✂️ Truncating by tokens to ensure ≤512 combined...")
# Batched map again, because truncate_record may drop rows.
dpo_ds = dpo_ds.map(truncate_record, batched=True, remove_columns=dpo_ds.column_names)
# Sanity check: show the first 200 characters of each field of the first row.
print("🔎 Example after truncation:")
print("Prompt:", dpo_ds[0]["prompt"][:200], "...")
print("Chosen:", dpo_ds[0]["chosen"][:200], "...")
print("Rejected:", dpo_ds[0]["rejected"][:200], "...")

# Cell 9: DPO training configuration
# -----------------------------------------------------------------------------
print("⚙️ Building DPOConfig...")
# All tunables come from the configuration cell; literals here (warmup,
# logging/save cadence, weight decay, scheduler) are fixed for this notebook.
args = DPOConfig(
    output_dir="./dpo_finetuned_smollm2",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=5,
    max_steps=max_steps,                 # takes precedence over epochs
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    # Exactly one of fp16/bf16 is enabled, chosen by hardware bf16 support.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=5,
    optim="adamw_bnb_8bit",             # 8-bit AdamW from bitsandbytes
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=seed,
    save_strategy="steps",
    save_steps=25,                      # with max_steps=50: checkpoints at 25 and 50
    beta=beta,                          # DPO-specific
    # W&B reporting is enabled only when WANDB_DISABLED is set to something
    # other than "true" (an unset variable counts as disabled).
    report_to=[] if os.environ.get("WANDB_DISABLED","true").lower()=="true" else ["wandb"],
)

print(f"✅ DPO config ready (beta={beta})")

# Cell 10: Initialize DPO Trainer
# -----------------------------------------------------------------------------
print("🏋️ Initializing DPOTrainer...")
# NOTE(review): `tokenizer=`, `max_length=` and `max_prompt_length=` are
# passed to the trainer directly. Whether plain TRL still accepts them here
# depends on the installed version (the install cell only sets a lower bound)
# — confirm against the TRL release in use.
dpo_trainer = DPOTrainer(
    model=model,
    # No explicit reference model is supplied.
    # NOTE(review): the original comment attributes reference handling to
    # PatchDPOTrainer — confirm how the reference log-probs are obtained.
    ref_model=None,               # Unsloth handles reference model internally via PatchDPOTrainer
    args=args,
    train_dataset=dpo_ds,
    tokenizer=tokenizer,
    max_length=max_seq_length,    # total max sequence length (prompt+response)
    max_prompt_length=max_prompt_tokens,
)
print("✅ DPOTrainer ready!")

# Cell 11: Train
# -----------------------------------------------------------------------------
print("🚀 Starting DPO training...")
print("=" * 60)
train_out = dpo_trainer.train()
metrics = train_out.metrics or {}
print("=" * 60)

# The completed step count is an attribute of the train() result, not an entry
# in its metrics dict; reading it from the dict alone always printed "N/A".
steps_done = getattr(train_out, "global_step", None)
if steps_done is None:
    # Fall back to the metrics dict for trainers that only report it there.
    steps_done = metrics.get("train_steps", metrics.get("global_step", "N/A"))

print("✅ DPO training complete!")
print("📊 Stats:")
print("   • Steps:", steps_done)
print("   • Train loss:", metrics.get("train_loss", "N/A"))
print("   • Time (s):", metrics.get("train_runtime", "N/A"))

# Cell 12: Save adapters and merged model
# -----------------------------------------------------------------------------
# Adapters and tokenizer are always saved; this is the artifact that must succeed.
print("💾 Saving adapters...")
model.save_pretrained("smollm2_dpo_adapters")
tokenizer.save_pretrained("smollm2_dpo_adapters")
print("✅ Adapters → ./smollm2_dpo_adapters")

# The merged export is best-effort: two strategies are tried in turn and any
# failure is reported but never aborts the notebook (hence the broad excepts).
print("\n💾 Saving merged model (optional)...")
merged_ok = False
try:
    # Preferred helper when available:
    model.save_pretrained_merged(
        "smollm2_dpo_merged",
        tokenizer,
        save_method="merged_16bit",
    )
    merged_ok = True
except Exception as e:
    print("   save_pretrained_merged unavailable, trying manual merge:", repr(e))
    try:
        # NOTE(review): confirm FastLanguageModel.merge_lora_weights exists in
        # the installed Unsloth; if not, this fallback always fails and is
        # only reported by the handler below.
        FastLanguageModel.merge_lora_weights(model)
        model.save_pretrained("smollm2_dpo_merged")
        tokenizer.save_pretrained("smollm2_dpo_merged")
        merged_ok = True
    except Exception as e2:
        print("   Manual merge failed (not critical):", repr(e2))

print("✅ Merged → ./smollm2_dpo_merged" if merged_ok else "ℹ️ Skipping merge; adapters saved and usable.")

# Cell 13: Inference — compare responses
# -----------------------------------------------------------------------------
print("\n🧪 Inference — DPO-trained model\n")
# `inference_mode` is used by infer() below. `_FLM` is a second alias of the
# FastLanguageModel class already imported at the top of the notebook.
from torch import inference_mode
from unsloth import FastLanguageModel as _FLM

# Switch the trained model to Unsloth's inference configuration before generating.
_FLM.for_inference(model)

def infer(prompt, max_new_tokens=150):
    """Sample one completion for ``prompt`` and return only the new text.

    Args:
        prompt: Text to condition on (expected to end with "Assistant:").
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation, decoded without special tokens and
        stripped of surrounding whitespace. The prompt is never included.
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    # Remember how many tokens belong to the prompt so they can be sliced off
    # the output. Matching the decoded text against `prompt` is fragile: if
    # the tokenizer round-trip changes the prompt text, the prefix check fails
    # and the prompt leaks into the returned response.
    prompt_len = inputs["input_ids"].shape[1]
    with inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    new_tokens = out[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Smoke-test prompts in the same "Human: ...\n\nAssistant:" shape that the
# training prompts are standardized to end with.
tests = [
    "Human: How can I improve my productivity?\n\nAssistant:",
    "Human: What's the best way to learn programming?\n\nAssistant:",
    "Human: Should I exercise regularly?\n\nAssistant:",
]

print("=" * 60)
print("INFERENCE RESULTS (DPO-Trained)")
print("=" * 60)
for i, p in enumerate(tests, 1):
    print(f"\n[Test {i}]")
    # Show only the human turn; the trailing "Assistant:" tag is cut off for display.
    print("Prompt:", p.split("Assistant:")[0].strip())
    # Sampling is enabled in infer(), so responses vary between runs.
    print("Response:", infer(p))
    print("-" * 60)

# Cell 14: Summary
# -----------------------------------------------------------------------------
# Values are interpolated from the configuration and the prepared dataset so
# the summary cannot drift from what was actually run. Literal braces in the
# tips are doubled because this is an f-string.
print(f"""
╔════════════════════════════════════════════════════════════╗
║                DPO TRAINING COMPLETE — SUMMARY             ║
╚════════════════════════════════════════════════════════════╝
• Base: {model_name}
• Method: DPO (β = {beta}), LoRA r={lora_r} over attention+MLP
• Data: {len(dpo_ds)} HH-RLHF preference pairs
• Safety: hard token truncation (prompt≤{max_prompt_tokens}, response≤{max_response_tokens}, total≤{max_seq_length})
• Saved:
    - Adapters: ./smollm2_dpo_adapters
    - Merged (optional): ./smollm2_dpo_merged
• Tips:
    - Try β∈{{0.1, 0.3, 0.5}}
    - Increase steps and data for better alignment
    - Keep LR ≤ 5e-5 for stability
""")


📦 Installing Unsloth and DPO dependencies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.7/348.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.6/273.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:



🦥 Unsloth Zoo will now patch everything to make training faster!
🔥 PyTorch: 2.8.0+cu126
🎮 CUDA: True  |  GPU: Tesla T4

🔧 Config:
 • LoRA r/α/dropout: 16/16/0.05
 • DPO beta: 0.1
 • Max seq: 512 (prompt≤256, response≤256)
 • BS: 2 | Grad Accum: 4
 • LR: 5e-05 | Steps: 50 | Seed: 3407

📥 Loading SmolLM2-135M...
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

HuggingFaceTB/SmolLM2-135M-Instruct does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


✅ Model loaded.
🔧 Applying LoRA adapters for DPO...


Unsloth 2025.10.12 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ LoRA applied: 4.88M / 86.32M (5.66% trainable)
📚 Loading Anthropic HH-RLHF subset (500 pairs)...


README.md: 0.00B [00:00, ?B/s]

harmless-base/train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

helpful-base/train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

helpful-online/train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

helpful-rejection-sampled/train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

harmless-base/test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

helpful-base/test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

helpful-online/test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

helpful-rejection-sampled/test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

✅ Loaded: 500
🔄 Parsing HH-RLHF to DPO format...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Parsed: 500
✂️ Truncating by tokens to ensure ≤512 combined...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

🔎 Example after truncation:
Prompt: Human: What are some cuss words in english?
Assistant: ...
Chosen: Here’s an incomplete list.

Ass, dick, bugger, crap, fuck, shit, bitch, turd, shithead, shitbag, scrotum, cunt, whore, fucker, shit-eating, cum, cumbucket, fucknugget, butthole, poop, jackass, cocksuc ...
Rejected: Here’s an incomplete list.

Ass, dick, bugger, crap, fuck, shit, bitch, turd, shithead, shitbag, scrotum, cunt, whore, fucker, shit-eating, cum, cumbucket, fucknugget, butthole, poop, jackass, cocksuc ...
⚙️ Building DPOConfig...
✅ DPO config ready (beta=0.1)
🏋️ Initializing DPOTrainer...


Extracting prompt in train dataset (num_proc=6):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=6):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=6):   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


✅ DPOTrainer ready!
🚀 Starting DPO training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,399,488 (3.50% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
5,0.693,-0.000586,-0.000819,0.1,0.000233,-147.566376,-176.306915,4.80432,4.90616,0,0,0
10,0.6931,-0.001436,-0.001557,0.15,0.000121,-149.836624,-166.352753,4.809131,4.70404,No Log,No Log,No Log
15,0.6925,0.000931,-0.000457,0.125,0.001388,-160.687378,-171.602203,4.90753,4.995065,No Log,No Log,No Log
20,0.6938,-0.000421,0.000885,0.1,-0.001306,-141.577316,-137.22345,3.844711,4.061056,No Log,No Log,No Log
25,0.6922,0.005132,0.003206,0.15,0.001926,-156.439713,-156.138336,4.933416,4.808567,No Log,No Log,No Log
30,0.6913,0.003636,-0.000268,0.275,0.003904,-159.318024,-188.629303,5.008609,5.307027,No Log,No Log,No Log
35,0.6926,0.005938,0.004733,0.15,0.001205,-156.480682,-160.336899,5.330862,5.30291,No Log,No Log,No Log
40,0.6916,-0.002783,-0.005985,0.225,0.003201,-159.917725,-166.511169,5.101692,5.076849,No Log,No Log,No Log
45,0.6938,-0.005918,-0.004681,0.075,-0.001237,-173.456787,-181.8741,4.774917,4.766663,No Log,No Log,No Log
50,0.6927,0.003544,0.002537,0.1,0.001006,-167.43335,-183.016479,4.936091,4.906396,No Log,No Log,No Log


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ DPO training complete!
📊 Stats:
   • Steps: N/A
   • Train loss: 0.6926441049575806
   • Time (s): 148.3295
💾 Saving adapters...
✅ Adapters → ./smollm2_dpo_adapters

💾 Saving merged model (optional)...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `smollm2_dpo_merged`: 100%|██████████| 1/1 [00:00<00:00,  2.27it/s]


Successfully copied all 1 files from cache to `smollm2_dpo_merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:00<00:00, 7013.89it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


Unsloth: Merge process complete. Saved to `/content/smollm2_dpo_merged`
✅ Merged → ./smollm2_dpo_merged

🧪 Inference — DPO-trained model

INFERENCE RESULTS (DPO-Trained)

[Test 1]
Prompt: Human: How can I improve my productivity?
Response: I'd say that one of the most common distractions is when we're not making progress. That's a tough one to overcome. To improve your productivity, I'd recommend setting a goal, breaking down the task into smaller chunks, and setting specific goals, such as completing a project, working on a project, or achieving a deadline. This helps you stay focused and make progress on the task at hand. Additionally, setting a timer, and doing the task that's most challenging or frustrating, can help you stay motivated. Whether it's reading a book, taking a class, or doing some homework, setting a timer and keeping at it, you can achieve your goal.

It's also important to prioritize your tasks, so you focus on the
---------------------------------------------------