# DPO Training - 411 Clean Gricean Pairs

**Important Instructions:**
1. Enable GPU: Settings → Accelerator → GPU T4 x2
2. Add your dataset: + Add Data → gricebench-clean-dpo
3. Run Cell 1, then RESTART KERNEL
4. Run Cell 2 (this does everything and saves automatically)

In [None]:
# ============================================
# CELL 1: INSTALL PACKAGES
# After this cell, RESTART THE KERNEL!
# Runtime ‚Üí Restart session
# ============================================

# Install without breaking Kaggle's environment
!pip install -q trl==0.8.6 peft==0.10.0 bitsandbytes accelerate --no-deps
!pip install -q safetensors huggingface_hub

print("="*50)
print("‚úÖ INSTALLATION COMPLETE!")
print("="*50)
print("\n‚ö†Ô∏è  NOW RESTART THE KERNEL:")
print("    Runtime ‚Üí Restart session")
print("\nThen run Cell 2")

In [None]:
# ============================================
# CELL 2: COMPLETE TRAINING PIPELINE
# This cell does EVERYTHING:
# - Loads data
# - Loads model
# - Trains DPO
# - Saves model
# - Zips for download
# ============================================

import os
import json
import torch
import shutil
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import DPOTrainer

print("✅ All imports successful!")

# Report the accelerator up front: the notebook instructions ask for a GPU
# (T4 x2), and the training configuration further down enables fp16.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️  No GPU detected - enable one via Settings → Accelerator → GPU T4 x2")

# ============================================
# STEP 1: LOAD DATA
# ============================================
print("\n" + "="*50)
print("STEP 1: Loading data...")
print("="*50)

# The mount directory depends on the name the dataset was added under, so
# probe the known candidates in priority order and take the first that exists.
possible_paths = [
    "/kaggle/input/gricebench-clean-dpo/clean_dpo_pairs.json",
    "/kaggle/input/clean-dpo-pairs/clean_dpo_pairs.json",
    "/kaggle/input/gricebench/clean_dpo_pairs.json",
]

DATA_PATH = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

if DATA_PATH is None:
    print("❌ ERROR: Could not find clean_dpo_pairs.json")
    print("Available datasets:")
    # Guard the diagnostic listing so that a missing input root does not
    # replace the helpful error below with an unrelated listdir failure.
    input_root = "/kaggle/input"
    if os.path.isdir(input_root):
        for item in os.listdir(input_root):
            print(f"  - /kaggle/input/{item}")
    else:
        print(f"  (none - {input_root} does not exist)")
    raise FileNotFoundError("Please check your dataset path!")

print(f"Found data at: {DATA_PATH}")

with open(DATA_PATH, 'r', encoding='utf-8') as data_file:
    clean_pairs = json.load(data_file)

print(f"Loaded {len(clean_pairs)} clean DPO pairs")

# Fail early, naming the offending record, if a pair lacks a required field.
required_fields = ('prompt', 'chosen', 'rejected')
for index, pair in enumerate(clean_pairs):
    missing = [field for field in required_fields if field not in pair]
    if missing:
        raise KeyError(f"Pair {index} is missing field(s): {missing}")

# Keep only the prompt/chosen/rejected fields; any other keys present in the
# JSON records are dropped.
formatted = [
    {field: pair[field] for field in required_fields}
    for pair in clean_pairs
]

# Hold out 10% of the pairs for evaluation; the fixed seed keeps the split
# reproducible across runs.
dataset = Dataset.from_list(formatted)
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
eval_dataset = split['test']

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# ============================================
# STEP 2: LOAD MODEL
# ============================================
print("\n" + "="*50)
print("STEP 2: Loading model...")
print("="*50)

MODEL_NAME = "HuggingFaceTB/SmolLM2-360M-Instruct"

# 4-bit NF4 quantization with double quantization for memory efficiency.
# The compute dtype is float16, in line with fp16=True in the training
# arguments configured in STEP 4.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# device_map="auto" lets the loader decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# PEFT helper that readies a quantized model for adapter training.
model = prepare_model_for_kbit_training(model)

# Load tokenizer; fall back to EOS as the padding token when the tokenizer
# does not define one, so batches can be padded.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"✅ Model loaded: {MODEL_NAME}")

# ============================================
# STEP 3: CONFIGURE LORA
# ============================================
print("\n" + "="*50)
print("STEP 3: Configuring LoRA...")
print("="*50)

# Rank-16 adapters (alpha 32, dropout 0.05) on the four attention projection
# modules only; bias terms are left untouched. The config is handed to the
# DPO trainer in STEP 5 rather than applied to the model here.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

print("✅ LoRA config ready")

# ============================================
# STEP 4: CONFIGURE TRAINING
# ============================================
print("\n" + "="*50)
print("STEP 4: Configuring training...")
print("="*50)

# Checkpoints written during training land here.
OUTPUT_DIR = "/kaggle/working/dpo_411"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-device batch of 2 with 4 accumulation steps, i.e. 8 pairs per device
# per optimizer step. Evaluation and checkpointing both happen every 50
# steps, and only the 2 most recent checkpoints are kept.
# NOTE(review): `eval_strategy` is the newer spelling of
# `evaluation_strategy`; transformers is not pinned in Cell 1 - confirm the
# installed version accepts it.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    fp16=True,
    report_to="none",  # no external experiment tracker
    remove_unused_columns=False,  # keep the prompt/chosen/rejected columns
    dataloader_pin_memory=False,
)

print("✅ Training config ready")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")

# ============================================
# STEP 5: INITIALIZE DPO TRAINER
# ============================================
print("\n" + "="*50)
print("STEP 5: Initializing DPO Trainer...")
print("="*50)

# NOTE: ref_model=None when using peft_config
# DPO will use the base model as reference automatically, so no second copy
# of the model has to be loaded.
# max_length / max_prompt_length cap the full sequence and the prompt part.
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,  # Required when using peft_config
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=lora_config,
    beta=0.1,
    max_length=512,
    max_prompt_length=256,
)

print("✅ DPO Trainer initialized!")

# ============================================
# STEP 6: TRAIN
# ============================================
print("\n" + "="*50)
print("STEP 6: Starting training...")
print("="*50)

# Runs the full training loop configured above; the returned object is kept
# in `train_result` for later inspection.
train_result = dpo_trainer.train()

print("\n" + "="*50)
print("✅ TRAINING COMPLETE!")
print("="*50)

# ============================================
# STEP 7: SAVE MODEL (IMMEDIATELY!)
# ============================================
print("\n" + "="*50)
print("STEP 7: Saving model...")
print("="*50)

FINAL_DIR = "/kaggle/working/dpo_411_final"
os.makedirs(FINAL_DIR, exist_ok=True)

# Save the trained model and the tokenizer side by side so the directory can
# be loaded on its own later.
# NOTE(review): with a PEFT-wrapped model this presumably stores the adapter
# weights rather than a merged full model - confirm before deploying.
dpo_trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)

print(f"✅ Model saved to {FINAL_DIR}")

# List saved files (sorted for a stable, easy-to-scan report).
print("\nSaved files:")
for entry in sorted(os.listdir(FINAL_DIR)):
    size_kb = os.path.getsize(os.path.join(FINAL_DIR, entry)) / 1024
    print(f"   {entry}: {size_kb:.1f} KB")

# ============================================
# STEP 8: ZIP FOR DOWNLOAD
# ============================================
print("\n" + "="*50)
print("STEP 8: Creating zip file...")
print("="*50)

# Base name only: make_archive appends the ".zip" extension itself and
# returns the path of the archive it wrote, so use that return value instead
# of rebuilding the file name by hand.
ZIP_PATH = "/kaggle/working/dpo_411_model"
archive_file = shutil.make_archive(ZIP_PATH, 'zip', FINAL_DIR)

zip_size = os.path.getsize(archive_file) / 1024 / 1024
print(f"\n✅ ZIP CREATED: {archive_file} ({zip_size:.1f} MB)")

# ============================================
# DONE!
# ============================================
print("\n" + "="*50)
print("🎉 ALL DONE!")
print("="*50)
print("\nDownload your model:")
print("1. Click on the folder icon (📁) on the left")
print("2. Navigate to /kaggle/working/")
print("3. Download 'dpo_411_model.zip'")
print("\nOr go to Output tab after saving the notebook.")

In [None]:
# ============================================
# OPTIONAL: TEST THE MODEL
# ============================================

def generate(prompt, max_tokens=100):
    """Sample a completion for ``prompt`` from the notebook's global ``model``.

    Args:
        prompt: Text to condition on; tokenized with the global ``tokenizer``.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The first generated sequence, decoded to text with special tokens
        stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # After training the model may still be in train mode, which would keep
    # dropout (including LoRA dropout) active while sampling. Switch to eval
    # mode for generation and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
            )
    finally:
        model.train(was_training)

    return tokenizer.decode(out[0], skip_special_tokens=True)

# Quick smoke test: show one sample prompt, then the text sampled for it.
test_prompt = (
    "Context: [agent_1]: Do you like Star Wars?\n"
    "Evidence: Personal Knowledge\n"
    "\n"
    "Generate a cooperative response:"
)
print(f"Prompt: {test_prompt}")
response = generate(test_prompt)
print(f"\nResponse: {response}")