### Barebones DPO (For setup-testing only)

In [None]:
# Import required libraries for DPO (Direct Preference Optimization) training
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub identifier used for both the tokenizer and the model weights
model_name = "Qwen/Qwen2-0.5B-Instruct"

# Fetch the tokenizer first, then the causal-LM weights, from the same identifier
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Load and prepare datasets (small subsets keep this setup test fast)
dataset_name = "trl-lib/ultrafeedback_binarized"
num_train_samples = 800   # rows taken from the start of the "train" split
num_eval_samples = 200    # rows taken from the start of the "test" split

# First `num_train_samples` rows of the training split
train_dataset = load_dataset(
    dataset_name,
    split="train",
).select(range(num_train_samples))

# First `num_eval_samples` rows of the test split
eval_dataset = load_dataset(
    dataset_name,
    split="test",
).select(range(num_eval_samples))

# Configure DPO training parameters
training_args = DPOConfig(
    output_dir="Qwen2-0.5B-DPO",           # Directory for run outputs (checkpoints, logs)
    per_device_train_batch_size=4,         # Batch size per GPU/device
    max_steps=50,                          # Total training steps
    eval_strategy="steps",                 # Evaluate on a fixed step interval
    eval_steps=10,                         # Evaluate every 10 steps
    # Set explicitly so this 50-step run always emits training metrics: the
    # base transformers TrainingArguments default is 500 steps, which would
    # never trigger here on library versions that keep that default.
    logging_steps=10,
    report_to="tensorboard",               # Log metrics to TensorBoard
)

# Gather the trainer inputs: model, config, tokenizer, and both data subsets
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "processing_class": tokenizer,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = DPOTrainer(**trainer_kwargs)

# Run training, bracketed by progress messages
print("Starting DPO training...")
trainer.train()
print("Training completed!")