# DPO Training on 411 Clean Gricean Pairs

**Purpose:** Establish baseline with conflict-free cooperative data

This notebook trains a DPO model on 411 pairs (split 90/10 into train and eval sets) where ALL 4 Gricean maxim margins are positive:
- Quantity > 0
- Quality > 0  
- Relation > 0
- Manner > 0

**Expected outcome:** Small but consistent improvement in manner without regression on other maxims.

## 1. Setup & Installation

In [None]:
# Install required packages
# NOTE(review): no versions are pinned, so the transformers / peft / trl APIs used
# in the cells below depend on whatever is current at install time — consider
# pinning known-good versions for reproducibility.
!pip install -q transformers datasets accelerate peft trl bitsandbytes
!pip install -q huggingface_hub

import os
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer

# Environment report: library version, CUDA visibility and, when a CUDA device
# is present, the name of device 0.
for report_line in (
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
):
    print(report_line)
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Load Clean Dataset

⚠️ **Before running:** Upload `clean_dpo_pairs.json` as a Kaggle dataset and attach it to this notebook so that it appears under `/kaggle/input/` (see `DATA_PATH` below)

In [None]:
# Location of the uploaded preference pairs.
# Adjust this path based on your Kaggle dataset name
DATA_PATH = "/kaggle/input/gricebench-clean-dpo/clean_dpo_pairs.json"

# Read the whole file as UTF-8 text and parse it as JSON.
with open(DATA_PATH, 'r', encoding='utf-8') as pairs_file:
    clean_pairs = json.loads(pairs_file.read())

print(f"Loaded {len(clean_pairs)} clean DPO pairs")

# Preview the first record (truncated texts plus its four per-maxim margins);
# skipped entirely when nothing was loaded.
if clean_pairs:
    sample = clean_pairs[0]
    print(f"\nSample pair:")
    print(f"  Prompt: {sample['prompt'][:100]}...")
    print(f"  Chosen: {sample['chosen'][:80]}...")
    print(f"  Rejected: {sample['rejected'][:80]}...")
    print(f"  Margins: qty={sample['margins']['quantity']:.4f}, qlt={sample['margins']['quality']:.4f}, rel={sample['margins']['relation']:.4f}, man={sample['margins']['manner']:.4f}")

## 3. Prepare Dataset for DPO

In [None]:
def prepare_dpo_dataset(pairs):
    """Build a ``Dataset`` holding only the three text fields of each pair.

    Args:
        pairs: Iterable of mappings, each providing ``'prompt'``, ``'chosen'``
            and ``'rejected'`` entries. Any other keys are dropped.

    Returns:
        The result of ``Dataset.from_list`` over the trimmed records.

    Raises:
        KeyError: If a pair lacks one of the three required keys.
    """
    wanted = ('prompt', 'chosen', 'rejected')
    records = [{field: pair[field] for field in wanted} for pair in pairs]
    return Dataset.from_list(records)

# Convert the loaded pairs into a Dataset with prompt/chosen/rejected columns.
dataset = prepare_dpo_dataset(clean_pairs)
print(f"Dataset size: {len(dataset)}")

# Hold out 10% for evaluation; the fixed seed keeps the split reproducible.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split['train'], split['test']

print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

## 4. Load Base Model with Quantization

In [None]:
# Base checkpoint to fine-tune.
MODEL_NAME = "HuggingFaceTB/SmolLM2-360M-Instruct"  # Small model for quick training
# Alternative: "microsoft/DialoGPT-medium" or "facebook/opt-350m"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
quant_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quant_settings)

# Load the quantized policy model, letting the library place it on devices.
print(f"Loading {MODEL_NAME}...")
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

# Load the matching tokenizer; fall back to EOS as the padding token when the
# checkpoint does not define one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded! Vocab size: {len(tokenizer)}")

## 5. Configure LoRA for Efficient Training

In [None]:
# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-16 adapters (alpha 32, 5% dropout, no bias terms)
# attached to the attention projection layers of the causal LM.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Reference model for DPO.
# No separate frozen copy is loaded. Because a `peft_config` is passed to the
# DPO trainer, the reference log-probabilities can be computed from the same
# base weights with the LoRA adapter disabled. A second quantized copy only
# costs memory, and some TRL releases raise an error when both an explicit
# reference model and a `peft_config` are supplied.
ref_model = None

print("LoRA config ready!")

## 6. Training Configuration

In [None]:
# Checkpoints and trainer state are written here; create the directory up front.
OUTPUT_DIR = "/kaggle/working/dpo_411_clean"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Training arguments - optimized for small dataset, grouped by concern.
# NOTE(review): recent TRL releases appear to expect a `DPOConfig` rather than
# plain `TrainingArguments` — confirm against the installed TRL version.
schedule_kwargs = dict(
    num_train_epochs=5,           # More epochs for small dataset
    learning_rate=5e-5,
    warmup_ratio=0.1,
)
batching_kwargs = dict(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 per device x 4 accumulation steps = 8
)
bookkeeping_kwargs = dict(
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    report_to="none",
)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    remove_unused_columns=False,
    **schedule_kwargs,
    **batching_kwargs,
    **bookkeeping_kwargs,
)

# Echo the key settings back from the constructed arguments object.
print(f"Training config ready!")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")

## 7. Initialize DPO Trainer

In [None]:
# Initialize DPO trainer
# NOTE(review): the `tokenizer=`, `beta=`, `max_length=` and `max_prompt_length=`
# keyword arguments match older TRL releases; newer ones appear to move these
# into `DPOConfig` / `processing_class=` — confirm against the installed version.
dpo_trainer = DPOTrainer(
    model=model,
    # With `peft_config` set, the reference log-probabilities come from the same
    # base weights with the LoRA adapter disabled, so no separate reference model
    # is passed (some TRL releases reject an explicit one in this setup).
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=lora_config,
    beta=0.1,              # DPO beta: higher = stay closer to the reference model, lower = allow more deviation
    max_length=512,        # cap on prompt + completion length, in tokens
    max_prompt_length=256  # cap on the prompt portion, in tokens
)

print("DPO Trainer initialized!")
print(f"  Beta (temperature): 0.1")
print(f"  Max length: 512")

## 8. Train the Model

In [None]:
# Horizontal rule printed before and after the training logs.
banner = "="*50

print("Starting DPO training on 411 clean pairs...")
print(banner)

# Run the full training loop; the returned object carries the step count and
# the training loss reported below.
train_result = dpo_trainer.train()

print(banner)
print("Training complete!")
print(f"  Total steps: {train_result.global_step}")
print(f"  Final loss: {train_result.training_loss:.4f}")

## 9. Save the Model

In [None]:
# Write the trained model and the tokenizer files into one directory.
FINAL_MODEL_DIR = "/kaggle/working/dpo_411_clean_final"
dpo_trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

print(f"Model saved to {FINAL_MODEL_DIR}")

# Report every entry in the output directory with its size in MiB.
import os
bytes_per_mb = 1024 * 1024
for f in os.listdir(FINAL_MODEL_DIR):
    entry_path = os.path.join(FINAL_MODEL_DIR, f)
    size = os.path.getsize(entry_path) / bytes_per_mb
    print(f"  {f}: {size:.2f} MB")

## 10. Quick Validation - Generate Sample Responses

In [None]:
# Test the trained model
def generate_response(model, tokenizer, prompt, max_new_tokens=100):
    """Sample a completion for ``prompt`` and return only the newly generated text.

    The model is switched to evaluation mode for the duration of the call, so
    dropout (including LoRA dropout) does not perturb generation right after
    training. The previous train/eval mode is restored afterwards.

    Args:
        model: Causal language model exposing ``generate``, ``device`` and the
            usual ``torch.nn.Module`` ``train``/``eval`` switches.
        tokenizer: Tokenizer matching ``model``; must provide ``pad_token_id``.
        prompt: Text to condition on.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded continuation with special tokens removed. The prompt
        itself is not included.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # A causal LM's generate() output starts with the prompt tokens; remember
    # how many there are so they can be sliced off before decoding.
    prompt_len = inputs["input_ids"].shape[-1]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id
            )
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

# Two hand-written prompts used as a smoke test of the trained model.
test_prompts = [
    "Context: [agent_1]: Do you like Star Wars? [agent_2]: Yes I love the original trilogy!\nEvidence: Personal Knowledge\n\nGenerate a cooperative response:",
    "Context: [agent_1]: What do you think about cats? [agent_2]: I have two cats myself.\nEvidence: Personal Knowledge\n\nGenerate a cooperative response:"
]

heavy_rule = "="*50
light_rule = "-"*50

print("Sample generations from trained model:")
print(heavy_rule)
# Number the tests from 1 and show a truncated prompt next to each generation.
for i, prompt in enumerate(test_prompts, start=1):
    response = generate_response(model, tokenizer, prompt)
    print(f"\nTest {i}:")
    print(f"Prompt: {prompt[:80]}...")
    print(f"Response: {response}")
    print(light_rule)

## 11. Download Instructions

After training completes:

1. **Download the model files** from `/kaggle/working/dpo_411_clean_final/`
2. **Key files to download:**
   - `adapter_config.json`
   - `adapter_model.safetensors` (or `.bin`)
   - `tokenizer_config.json`
   - `special_tokens_map.json`

3. **Next steps based on results:**
   - If manner improved → Relax manner threshold to get more data
   - If weak improvement → Use synthetic generation
   - If no effect → Skip to synthetic generation

In [None]:
# Bundle the saved model directory into a single zip archive for download.
import shutil

ZIP_PATH = "/kaggle/working/dpo_411_clean_model.zip"
# make_archive appends the ".zip" extension itself, so pass the path without it.
archive_base, _zip_ext = os.path.splitext(ZIP_PATH)
shutil.make_archive(archive_base, 'zip', FINAL_MODEL_DIR)

print(f"\n✅ Model zipped to: {ZIP_PATH}")
print("Download this file from the Output tab!")