# 🎯 PHASE 2: Triangulated DPO Evaluation

## Gold-Standard Validation Pipeline

This notebook answers the core question:
> **"Did DPO actually change model preferences in the intended way, without regressions?"**

### Three Independent Validations:
1. **Preference Accuracy** — Quantitative, on held-out data
2. **Behavioral Evaluation** — Qualitative, generated outputs
3. **Failure Comparison** — Direct comparison to original failures

### Requirements:
- **Datasets:** `final_dpo_dataset.json`, `dpo_merged_model/`
- **GPU:** T4 (for fast inference)
- **Runtime:** ~30-45 minutes

---

In [None]:
# Cell 1: Environment Setup
# Installs the dependencies, silences library noise, and prints the PyTorch /
# CUDA configuration so GPU availability is visible before any model loads.
import os
# Set before `transformers` is imported (that import happens in Cell 2).
# Presumably restricts transformers logging to errors — confirm against the
# library's logging documentation.
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

# IPython shell escape: valid only inside a notebook, not in a plain .py file.
!pip install -q transformers accelerate torch

import warnings
# NOTE(review): this suppresses every Python warning for the whole session,
# including deprecation and dtype/device warnings from the libraries below.
warnings.filterwarnings('ignore')

import torch
import json
import random
from tqdm.auto import tqdm

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Index 0 is the first visible CUDA device.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

print("\n‚úÖ Environment ready")

In [None]:
# Cell 2: Load Models & Data
# Locates the DPO-merged model and the preference dataset among the Kaggle
# inputs, loads the DPO and base models in eval mode, and splits the dataset
# into human-authored and synthetic pairs.
from transformers import AutoTokenizer, AutoModelForCausalLM

print("="*80)
print("LOADING MODELS & DATA")
print("="*80)

KAGGLE_INPUT_DIR = "/kaggle/input/"

# Candidate locations, checked in order; the first existing path wins.
MODEL_CANDIDATES = [
    "/kaggle/input/dpo-model/dpo_merged_model",
    "/kaggle/input/dpo-merged-model/dpo_merged_model",
    "/kaggle/input/aligned-model/dpo_merged_model",
]
DATA_CANDIDATES = [
    "/kaggle/input/final-dpo-dataset/final_dpo_dataset.json",
    "/kaggle/input/dpo-dataset/final_dpo_dataset.json",
]

MODEL_PATH = next((p for p in MODEL_CANDIDATES if os.path.exists(p)), None)
DATA_PATH = next((p for p in DATA_CANDIDATES if os.path.exists(p)), None)

if not MODEL_PATH:
    print("Available inputs:")
    # Only list the directory when it exists; otherwise os.listdir would raise
    # its own error and hide the actionable message below.
    if os.path.isdir(KAGGLE_INPUT_DIR):
        for item in os.listdir(KAGGLE_INPUT_DIR):
            print(f"  {item}")
    else:
        print(f"  ({KAGGLE_INPUT_DIR} does not exist)")
    raise FileNotFoundError("Upload dpo_merged_model folder!")

if not DATA_PATH:
    raise FileNotFoundError("Upload final_dpo_dataset.json!")

print(f"\nüìÇ Model: {MODEL_PATH}")
print(f"üìÇ Data: {DATA_PATH}")

# Load DPO-aligned model
print(f"\nüì• Loading DPO-aligned model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse EOS as the padding token and pad on the left so generated tokens
# directly follow the prompt.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# NOTE(review): the notebook header targets a T4 GPU; bfloat16 may not be
# hardware-accelerated there and float16 could be faster — confirm.
dpo_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
dpo_model.eval()
print(f"‚úÖ DPO model loaded on {dpo_model.device}")

# Load base model for comparison. The same tokenizer (loaded from MODEL_PATH)
# is used for both models in later cells.
print(f"\nüì• Loading base model (for comparison)...")
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
base_model.eval()
print(f"‚úÖ Base model loaded")

# Load data; the encoding is explicit so the result does not depend on the
# platform's default locale.
print(f"\nüì• Loading evaluation data...")
with open(DATA_PATH, encoding="utf-8") as f:
    all_data = json.load(f)

print(f"‚úÖ Loaded {len(all_data)} pairs")

# Split into sources: items tagged 'human_clean' vs. everything else
# (including items with no 'source' key).
human_data = [d for d in all_data if d.get('source') == 'human_clean']
synth_data = [d for d in all_data if d.get('source') != 'human_clean']

print(f"   Human pairs: {len(human_data)}")
print(f"   Synthetic pairs: {len(synth_data)}")

In [None]:
# Cell 3: Helper Functions
# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 80,
    "SETTING UP EVALUATION FUNCTIONS",
    "=" * 80,
    sep="\n",
)

def get_response_logprob(model, tokenizer, prompt, response, max_length=512):
    """Return the mean per-token log-probability of ``response`` given ``prompt``.

    The scored text is ``f"{prompt}\\n\\nResponse: {response}"``. Only the
    response tokens contribute to the score: the labels of the prompt part
    (everything up to and including ``"Response:"``) are set to -100, the
    ignore index used by the Hugging Face causal-LM loss. Averaging over the
    prompt as well would dilute the score differently for responses of
    different lengths.

    Args:
        model: Causal LM exposing ``.device`` and accepting ``labels=``.
        tokenizer: Tokenizer callable returning ``input_ids`` (and any other
            model inputs) as 2-D tensors when ``return_tensors="pt"``.
        prompt: Prompt text.
        response: Candidate response text.
        max_length: Maximum number of tokens fed to the model. When the text
            is longer, tokens are dropped from the *start* so the response
            being scored is kept.

    Returns:
        float: Negative mean cross-entropy over the response tokens, or
        ``float("-inf")`` when no response token can be scored.
    """
    prefix = f"{prompt}\n\nResponse:"
    text = f"{prefix} {response}"

    inputs = tokenizer(text, return_tensors="pt")
    # Assumes the tokenization of `prefix` is a prefix of the tokenization of
    # `text` (no trailing special token, no merge across the boundary) —
    # TODO confirm for tokenizers other than the one used here.
    prefix_len = tokenizer(prefix, return_tensors="pt")["input_ids"].shape[1]

    # Left-truncate: keep the last `max_length` tokens so the response survives.
    overflow = inputs["input_ids"].shape[1] - max_length
    if overflow > 0:
        inputs = {k: v[:, overflow:] for k, v in inputs.items()}
        prefix_len = max(prefix_len - overflow, 0)

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    labels = inputs["input_ids"].clone()
    labels[:, :prefix_len] = -100

    # The first position has no preceding context and is never a prediction
    # target; if nothing else is unmasked the loss would be undefined (NaN).
    if not (labels[:, 1:] != -100).any():
        return float("-inf")

    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
    # Negative mean loss = mean log-probability of the scored tokens.
    return -outputs.loss.item()

def generate_response(model, tokenizer, prompt, max_length=100):
    """Sample a continuation of ``prompt`` and return only the newly generated text.

    The prompt is tokenized (truncated to 256 tokens), moved to the model's
    device, and passed to ``model.generate`` with nucleus sampling
    (``temperature=0.7``, ``top_p=0.9``). ``max_length`` is forwarded as
    ``max_new_tokens``, i.e. it bounds the generated part only. The prompt
    tokens are sliced off before decoding and the result is whitespace-stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_token_count = model_inputs['input_ids'].shape[1]

    sampling_kwargs = dict(
        max_new_tokens=max_length,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        top_p=0.9,
    )

    with torch.no_grad():
        generated = model.generate(**model_inputs, **sampling_kwargs)

    # Keep only the tokens produced after the prompt.
    new_tokens = generated[0][prompt_token_count:]
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return decoded.strip()

# Confirm which helpers this cell defined.
print(
    "‚úÖ Helper functions defined",
    "   - get_response_logprob(): Computes log-prob of response",
    "   - generate_response(): Generates text from model",
    sep="\n",
)

In [None]:
# Cell 4: VALIDATION 1 — Preference Accuracy (Quantitative)
# Scores sampled (prompt, chosen, rejected) pairs with the DPO model and counts
# how often the chosen response gets the higher log-probability score.
print("\n" + "="*80)
print("üìä VALIDATION 1: PREFERENCE ACCURACY")
print("="*80)
print("\nGoal: Verify model prefers 'chosen' over 'rejected' on held-out data")
print("Method: Compare log-probabilities")
print("Success: ‚â•95% on human data, ‚â•85% on synthetic data\n")


def _score_pairs(pairs, desc):
    """Return ``(n_correct, margins)`` for ``pairs`` under the DPO model.

    A margin is ``logp(chosen) - logp(rejected)``; a pair counts as correct
    when its margin is strictly positive.
    """
    margins = [
        get_response_logprob(dpo_model, tokenizer, item['prompt'], item['chosen'])
        - get_response_logprob(dpo_model, tokenizer, item['prompt'], item['rejected'])
        for item in tqdm(pairs, desc=desc)
    ]
    return sum(1 for m in margins if m > 0), margins


def _pct(part, whole):
    """Percentage of ``part`` in ``whole``; 0.0 when ``whole`` is zero."""
    return 100 * part / whole if whole else 0.0


def _mean(values):
    """Arithmetic mean; 0.0 for an empty sequence."""
    return sum(values) / len(values) if values else 0.0


# Sample the evaluation pairs (human first, then synthetic, so the seeded
# draw order is stable).
random.seed(42)  # Reproducibility
human_sample = random.sample(human_data, min(100, len(human_data)))
synth_sample = random.sample(synth_data, min(200, len(synth_data)))

print(f"Evaluating: {len(human_sample)} human + {len(synth_sample)} synthetic pairs\n")

# Evaluate human data
print("üîç Evaluating on HUMAN data...")
human_correct, human_margins = _score_pairs(human_sample, "Human pairs")
human_accuracy = _pct(human_correct, len(human_sample))
human_avg_margin = _mean(human_margins)

if not human_sample:
    # An empty group would otherwise divide by zero; report it explicitly.
    print("\n   (no human pairs available - accuracy reported as 0.0%)")
print(f"\n   ‚úÖ Human Accuracy: {human_accuracy:.1f}%")
print(f"   Average margin: {human_avg_margin:.4f}")

# Evaluate synthetic data
print("\nüîç Evaluating on SYNTHETIC data...")
synth_correct, synth_margins = _score_pairs(synth_sample, "Synthetic pairs")
synth_accuracy = _pct(synth_correct, len(synth_sample))
synth_avg_margin = _mean(synth_margins)

if not synth_sample:
    print("\n   (no synthetic pairs available - accuracy reported as 0.0%)")
print(f"\n   ‚úÖ Synthetic Accuracy: {synth_accuracy:.1f}%")
print(f"   Average margin: {synth_avg_margin:.4f}")

# Overall results
total_correct = human_correct + synth_correct
total_samples = len(human_sample) + len(synth_sample)
overall_accuracy = _pct(total_correct, total_samples)

print(f"\n" + "-"*40)
print(f"üìä PREFERENCE ACCURACY SUMMARY")
print(f"-"*40)
print(f"   Human pairs: {human_accuracy:.1f}% ({human_correct}/{len(human_sample)})")
print(f"   Synthetic pairs: {synth_accuracy:.1f}% ({synth_correct}/{len(synth_sample)})")
print(f"   OVERALL: {overall_accuracy:.1f}% ({total_correct}/{total_samples})")

# Success check against the thresholds announced above (95% human, 85% synthetic).
if human_accuracy >= 95:
    print(f"\n   ‚úÖ PASSED: Human accuracy ‚â• 95%")
else:
    print(f"\n   ‚ö†Ô∏è WARNING: Human accuracy < 95%")

if synth_accuracy >= 85:
    print(f"   ‚úÖ PASSED: Synthetic accuracy ‚â• 85%")
else:
    print(f"   ‚ö†Ô∏è WARNING: Synthetic accuracy < 85%")

In [None]:
# Cell 5: VALIDATION 2 — Behavioral Evaluation (Qualitative)
# Generates a response from the base and DPO models for each fixed prompt,
# prints them side by side, and compares word counts to detect a verbosity
# blow-up after alignment.
print("\n" + "="*80)
print("üîç VALIDATION 2: BEHAVIORAL EVALUATION")
print("="*80)
print("\nGoal: Ensure alignment improved responses without regressions")
print("Method: Compare Base model vs DPO model on same prompts")
print("Check: Relevance, Clarity, Cooperation, No verbosity explosion\n")

# Test prompts from different categories
test_prompts = [
    # From original dataset
    "Context: [agent_1]: Do you follow politics? [agent_2]: Sometimes, the electoral system is interesting.\nEvidence: FS2\n\nGenerate a cooperative response:",
    
    "Context: [agent_1]: What's your favorite movie? [agent_2]: I love sci-fi. Star Wars is classic.\nEvidence: FS1\n\nGenerate a cooperative response:",
    
    "Context: [agent_1]: I'm learning guitar. [agent_2]: That's cool! Music is therapeutic.\nEvidence: Personal Knowledge\n\nGenerate a cooperative response:",
    
    # Edge cases
    "Context: [agent_1]: Hi [agent_2]: Hello!\nEvidence: Personal Knowledge\n\nGenerate a cooperative response:",
    
    "Context: [agent_1]: Did you know sharks have no bones? [agent_2]: That's fascinating!\nEvidence: FS3\n\nGenerate a cooperative response:"
]

behavioral_results = []

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{'='*60}")
    print(f"TEST {i}")
    print(f"{'='*60}")
    
    # Show only the conversational context (text before "Evidence:").
    context = prompt.split("Evidence:")[0].strip()
    print(f"\nüìù Context: {context[:100]}...")
    
    # Generate from both models; generation is sampled, so outputs vary by run.
    base_response = generate_response(base_model, tokenizer, prompt)
    dpo_response = generate_response(dpo_model, tokenizer, prompt)
    
    print(f"\nüî¥ BASE MODEL:")
    print(f"   {base_response[:200]}")
    
    print(f"\nüü¢ DPO MODEL:")
    print(f"   {dpo_response[:200]}")
    
    # Quick metrics: whitespace-delimited word counts.
    base_len = len(base_response.split())
    dpo_len = len(dpo_response.split())
    
    print(f"\nüìä Metrics:")
    print(f"   Base length: {base_len} words")
    print(f"   DPO length: {dpo_len} words")
    print(f"   Ratio: {dpo_len/max(base_len,1):.2f}x")
    
    behavioral_results.append({
        'prompt': prompt,
        'base_response': base_response,
        'dpo_response': dpo_response,
        'base_len': base_len,
        'dpo_len': dpo_len
    })

# Summary
avg_base_len = sum(r['base_len'] for r in behavioral_results) / len(behavioral_results)
avg_dpo_len = sum(r['dpo_len'] for r in behavioral_results) / len(behavioral_results)

# If every base response was empty the ratio is undefined; treat it as
# infinite so the check below flags it instead of raising ZeroDivisionError.
length_ratio = avg_dpo_len / avg_base_len if avg_base_len else float("inf")

print(f"\n{'='*60}")
print(f"BEHAVIORAL SUMMARY")
print(f"{'='*60}")
print(f"   Average base length: {avg_base_len:.1f} words")
print(f"   Average DPO length: {avg_dpo_len:.1f} words")
print(f"   Length ratio: {length_ratio:.2f}x")

if length_ratio < 2.0:
    print(f"\n   ‚úÖ No verbosity explosion detected")
else:
    print(f"\n   ‚ö†Ô∏è Warning: DPO responses may be too verbose")

In [None]:
# Cell 6: VALIDATION 3 — Failure Comparison (Most Important)
# For a small sample of synthetic pairs, generates a fresh DPO-model response
# and checks whether the DPO model scores it above the pair's rejected response.
print("\n" + "="*80)
print("üéØ VALIDATION 3: DIRECT FAILURE COMPARISON")
print("="*80)
print("\nGoal: Prove alignment actually FIXED the original problems")
print("Method: Compare DPO output to original failed responses")
print("Check: Is DPO more cooperative? Is meaning preserved?\n")

# Sample pairs for comparison. A dedicated seeded generator makes the selection
# independent of how much the global RNG was consumed by earlier cells or by
# re-running this one.
comparison_rng = random.Random(42)
comparison_sample = comparison_rng.sample(synth_data, min(10, len(synth_data)))

comparison_results = []

for i, item in enumerate(comparison_sample, 1):
    print(f"\n{'='*70}")
    print(f"COMPARISON {i}")
    print(f"{'='*70}")
    
    prompt = item['prompt']
    original_failed = item['rejected']  # Original failed response
    synthetic_chosen = item['chosen']   # What we trained as "good"
    
    # Generate DPO model's response
    dpo_generated = generate_response(dpo_model, tokenizer, prompt)
    
    # Show the last 150 characters of the context (text before "Evidence:").
    context = prompt.split("Evidence:")[0].strip()[-150:]
    print(f"\nüìù Context: ...{context}")
    
    print(f"\n‚ùå ORIGINAL FAILED:")
    print(f"   {original_failed[:150]}")
    
    print(f"\n‚úÖ TRAINING TARGET (chosen):")
    print(f"   {synthetic_chosen[:150]}")
    
    print(f"\nüü¢ DPO MODEL GENERATED:")
    print(f"   {dpo_generated[:150]}")
    
    # Compute preference scores with the same model that produced the text.
    failed_logp = get_response_logprob(dpo_model, tokenizer, prompt, original_failed)
    dpo_logp = get_response_logprob(dpo_model, tokenizer, prompt, dpo_generated)
    
    prefers_generated = dpo_logp > failed_logp
    
    print(f"\nüìä Model Preference:")
    print(f"   Failed logprob: {failed_logp:.4f}")
    print(f"   Generated logprob: {dpo_logp:.4f}")
    print(f"   ‚Üí Model prefers: {'‚úÖ Generated' if prefers_generated else '‚ùå Failed'}")
    
    comparison_results.append({
        'prompt': prompt,
        'original_failed': original_failed,
        'dpo_generated': dpo_generated,
        'prefers_generated': prefers_generated
    })

# Summary
prefers_count = sum(1 for r in comparison_results if r['prefers_generated'])
n_compared = len(comparison_results)
# With no synthetic pairs there is nothing to compare; report 0% rather than
# dividing by zero.
prefers_pct = 100 * prefers_count / n_compared if n_compared else 0.0
prefers_fraction = prefers_count / n_compared if n_compared else 0.0

print(f"\n{'='*70}")
print(f"FAILURE COMPARISON SUMMARY")
print(f"{'='*70}")
print(f"   Model prefers generated over failed: {prefers_count}/{n_compared} ({prefers_pct:.0f}%)")

if prefers_fraction >= 0.8:
    print(f"\n   ‚úÖ PASSED: DPO model consistently prefers cooperative responses")
else:
    print(f"\n   ‚ö†Ô∏è WARNING: Model may not be fully aligned")

In [None]:
# Cell 7: FINAL REPORT
# Re-applies the three pass/fail thresholds to the metrics computed in
# Cells 4-6 and prints a consolidated verdict.
print("\n" + "="*80)
print("üèÜ FINAL EVALUATION REPORT")
print("="*80)

# Guarded derived metrics: an empty comparison set or all-empty base responses
# must produce a "needs review" verdict, not a ZeroDivisionError.
report_length_ratio = avg_dpo_len / avg_base_len if avg_base_len else float("inf")
report_n_compared = len(comparison_results)
report_prefers_pct = 100 * prefers_count / report_n_compared if report_n_compared else 0.0
report_prefers_fraction = prefers_count / report_n_compared if report_n_compared else 0.0

print(f"\nüìä VALIDATION 1: PREFERENCE ACCURACY")
print(f"   Human pairs: {human_accuracy:.1f}%")
print(f"   Synthetic pairs: {synth_accuracy:.1f}%")
print(f"   Overall: {overall_accuracy:.1f}%")
v1_pass = human_accuracy >= 95 and synth_accuracy >= 85
print(f"   Status: {'‚úÖ PASSED' if v1_pass else '‚ö†Ô∏è NEEDS REVIEW'}")

print(f"\nüìä VALIDATION 2: BEHAVIORAL EVALUATION")
print(f"   Average response length: {avg_dpo_len:.1f} words")
print(f"   Length ratio (vs base): {report_length_ratio:.2f}x")
v2_pass = report_length_ratio < 2.0
print(f"   Status: {'‚úÖ PASSED' if v2_pass else '‚ö†Ô∏è NEEDS REVIEW'}")

print(f"\nüìä VALIDATION 3: FAILURE COMPARISON")
print(f"   Prefers generated over failed: {report_prefers_pct:.0f}%")
v3_pass = report_prefers_fraction >= 0.8
print(f"   Status: {'‚úÖ PASSED' if v3_pass else '‚ö†Ô∏è NEEDS REVIEW'}")

print(f"\n{'='*60}")
all_pass = v1_pass and v2_pass and v3_pass
if all_pass:
    print(f"\nüéâ ALL VALIDATIONS PASSED!")
    print(f"\nConclusion:")
    print(f"   ‚úÖ DPO training successfully aligned the model")
    print(f"   ‚úÖ Model now prefers Gricean-cooperative responses")
    print(f"   ‚úÖ No significant regressions detected")
    print(f"   ‚úÖ Ready for production use or further training")
else:
    print(f"\n‚ö†Ô∏è SOME VALIDATIONS NEED REVIEW")
    print(f"\nRecommendations:")
    if not v1_pass:
        print(f"   - Preference accuracy below target")
    if not v2_pass:
        print(f"   - Response length may be too long")
    if not v3_pass:
        print(f"   - Model doesn't consistently prefer cooperative responses")

print(f"\n{'='*80}")
print(f"‚ú® PHASE 2 EVALUATION COMPLETE")
# Closing rule matches the 80-character opening rule (a stray '=' made it 81).
print(f"{'='*80}")

In [None]:
# Cell 8: Save Results
# Collects the metrics and pass/fail flags from Cells 4-7 into one dictionary
# and writes it as JSON to the Kaggle working directory.
import json

print("\nüíæ SAVING EVALUATION RESULTS")
print("="*80)

# Derived metrics are undefined when their denominator is zero; store None
# (JSON null) instead of raising ZeroDivisionError or emitting non-standard
# Infinity/NaN tokens.
saved_length_ratio = avg_dpo_len / avg_base_len if avg_base_len else None
saved_prefers_pct = (
    100 * prefers_count / len(comparison_results) if comparison_results else None
)

# Compile all results
evaluation_report = {
    'validation_1_preference_accuracy': {
        'human_accuracy': human_accuracy,
        'synthetic_accuracy': synth_accuracy,
        'overall_accuracy': overall_accuracy,
        'human_avg_margin': human_avg_margin,
        'synth_avg_margin': synth_avg_margin,
        'passed': v1_pass
    },
    'validation_2_behavioral': {
        'avg_base_length': avg_base_len,
        'avg_dpo_length': avg_dpo_len,
        'length_ratio': saved_length_ratio,
        'passed': v2_pass,
        'samples': behavioral_results
    },
    'validation_3_failure_comparison': {
        'prefers_generated_pct': saved_prefers_pct,
        'passed': v3_pass,
        'samples': comparison_results
    },
    'overall': {
        'all_passed': all_pass,
        'total_pairs_evaluated': total_samples,
        'model_path': MODEL_PATH
    }
}

# Save report. `default=str` is a fallback that stringifies any value json
# cannot serialize natively.
report_path = "/kaggle/working/evaluation_report.json"
with open(report_path, 'w') as f:
    json.dump(evaluation_report, f, indent=2, default=str)

print(f"\n‚úÖ Report saved: {report_path}")
print(f"\nDownload this file for your records.")