In [1]:
# ============================================
# CELL 1: Setup and Imports
# ============================================

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import json
import numpy as np
from pathlib import Path
from tqdm import tqdm

print("‚úì Imports complete")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

‚úì Imports complete
PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda


In [2]:
# ============================================
# CELL 2: Configuration
# ============================================

CONFIG = {
    # Detector V2 paths - Configured for: pushkarprabhath/gricebench-detector-v2
    'model_checkpoint': '/kaggle/input/gricebench-detector-v2/best_model_v2.pt',
    'temperatures': '/kaggle/input/gricebench-detector-v2/temperatures.json',
    
    # DPO data paths - Configured for: pushkarprabhath/gricebench-dpo-raw
    'dpo_train': '/kaggle/input/gricebench-dpo-raw/dpo_train.json',
    'dpo_val': '/kaggle/input/gricebench-dpo-raw/dpo_val.json',
    
    # Model
    'model_name': 'microsoft/deberta-v3-base',
    'max_length': 512,
    
    # Filtering
    'min_margin': 0.15,  # Keep pairs with margin > 0.15
    
    # Output
    'output_dir': '/kaggle/working/dpo_filtered',
    'device': device
}

print("Configuration:")
for key, val in CONFIG.items():
    if key != 'device':
        print(f"  {key}: {val}")

Configuration:
  model_checkpoint: /kaggle/input/gricebench-detector-v2/best_model_v2.pt
  temperatures: /kaggle/input/gricebench-detector-v2/temperatures.json
  dpo_train: /kaggle/input/gricebench-dpo-raw/dpo_train.json
  dpo_val: /kaggle/input/gricebench-dpo-raw/dpo_val.json
  model_name: microsoft/deberta-v3-base
  max_length: 512
  min_margin: 0.15
  output_dir: /kaggle/working/dpo_filtered


In [3]:
# ============================================
# CELL 3: Model Architecture (Same as Training)
# ============================================

class MaximDetectorV2(nn.Module):
    """Improved detector with deeper classification heads"""
    
    def __init__(self, model_name, num_maxims=4, dropout=0.15):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        
        self.classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(hidden_size, hidden_size // 2),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_size // 2, hidden_size // 4),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_size // 4, 1)
            )
            for _ in range(num_maxims)
        ])
    
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        logits = torch.cat([
            classifier(pooled)
            for classifier in self.classifiers
        ], dim=1)
        return logits

print("‚úì Model architecture defined")

‚úì Model architecture defined


In [4]:
# ============================================
# CELL 4: Load Model and Tokenizer
# ============================================

print("Loading Detector V2...")

tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = MaximDetectorV2(CONFIG['model_name']).to(CONFIG['device'])

# Load trained weights
checkpoint = torch.load(CONFIG['model_checkpoint'], map_location=CONFIG['device'], weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print("‚úì Model loaded")

# Load temperature scaling
with open(CONFIG['temperatures']) as f:
    temperatures = json.load(f)

print(f"‚úì Temperatures loaded: {temperatures}")

Loading Detector V2...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

2026-01-02 09:18:47.595426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767345527.997834      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767345528.111245      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

‚úì Model loaded
‚úì Temperatures loaded: {'quantity': 0.1, 'quality': 0.5818822841463992, 'relation': 0.1, 'manner': 0.6515716212629745}


In [5]:
# ============================================
# CELL 5: Scoring Function
# ============================================

def score_response(context, response, evidence=None):
    """Score a response for maxim violations"""
    
    # Construct input text
    if evidence:
        text = f"Context: {context} Evidence: {evidence} Response: {response}"
    else:
        text = f"Context: {context} Response: {response}"
    
    # Tokenize
    encoding = tokenizer(
        text,
        max_length=CONFIG['max_length'],
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(CONFIG['device'])
    attention_mask = encoding['attention_mask'].to(CONFIG['device'])
    
    # Get logits
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
    
    # Apply temperature scaling and sigmoid
    maxims = ['quantity', 'quality', 'relation', 'manner']
    scores = {}
    
    for i, maxim in enumerate(maxims):
        temp = temperatures[maxim]
        scaled_logit = logits[0, i] / temp
        prob = torch.sigmoid(scaled_logit).item()
        scores[maxim] = prob
    
    return scores

print("‚úì Scoring function defined")

‚úì Scoring function defined


In [6]:
# ============================================
# CELL 6: Score DPO Training Data
# ============================================

print("\n" + "="*60)
print("SCORING DPO TRAINING DATA")
print("="*60)

# Load DPO training data
with open(CONFIG['dpo_train']) as f:
    dpo_train = json.load(f)

print(f"\nLoaded {len(dpo_train)} training pairs")

# Score each pair
scored_data = []

for item in tqdm(dpo_train, desc="Scoring training pairs"):
    # Extract fields
    prompt = item.get('prompt', item.get('context', ''))
    chosen = item.get('chosen', item.get('chosen_response', ''))
    rejected = item.get('rejected', item.get('rejected_response', ''))
    
    # Score chosen response
    chosen_scores = score_response(prompt, chosen)
    
    # Score rejected response
    rejected_scores = score_response(prompt, rejected)
    
    # Add scores to item
    scored_item = item.copy()
    scored_item['chosen_scores'] = chosen_scores
    scored_item['rejected_scores'] = rejected_scores
    
    # Calculate margins
    margins = {
        maxim: rejected_scores[maxim] - chosen_scores[maxim]
        for maxim in ['quantity', 'quality', 'relation', 'manner']
    }
    scored_item['margins'] = margins
    scored_item['avg_margin'] = sum(margins.values()) / len(margins)
    
    scored_data.append(scored_item)

print(f"\n‚úì Scored {len(scored_data)} pairs")


SCORING DPO TRAINING DATA

Loaded 4562 training pairs


Scoring training pairs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4562/4562 [09:03<00:00,  8.40it/s]


‚úì Scored 4562 pairs





In [7]:
# ============================================
# CELL 7: Score DPO Validation Data
# ============================================

print("\n" + "="*60)
print("SCORING DPO VALIDATION DATA")
print("="*60)

with open(CONFIG['dpo_val']) as f:
    dpo_val = json.load(f)

print(f"\nLoaded {len(dpo_val)} validation pairs")

scored_val = []

for item in tqdm(dpo_val, desc="Scoring validation pairs"):
    prompt = item.get('prompt', item.get('context', ''))
    chosen = item.get('chosen', item.get('chosen_response', ''))
    rejected = item.get('rejected', item.get('rejected_response', ''))
    
    chosen_scores = score_response(prompt, chosen)
    rejected_scores = score_response(prompt, rejected)
    
    scored_item = item.copy()
    scored_item['chosen_scores'] = chosen_scores
    scored_item['rejected_scores'] = rejected_scores
    
    margins = {
        maxim: rejected_scores[maxim] - chosen_scores[maxim]
        for maxim in ['quantity', 'quality', 'relation', 'manner']
    }
    scored_item['margins'] = margins
    scored_item['avg_margin'] = sum(margins.values()) / len(margins)
    
    scored_val.append(scored_item)

print(f"\n‚úì Scored {len(scored_val)} validation pairs")


SCORING DPO VALIDATION DATA

Loaded 507 validation pairs


Scoring validation pairs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 507/507 [01:02<00:00,  8.13it/s]


‚úì Scored 507 validation pairs





In [8]:
# ============================================
# CELL 8: Margin Statistics
# ============================================

print("\n" + "="*60)
print("MARGIN STATISTICS (Before Filtering)")
print("="*60)

margins_by_maxim = {m: [] for m in ['quantity', 'quality', 'relation', 'manner']}
avg_margins = []

for item in scored_data:
    for maxim, margin in item['margins'].items():
        margins_by_maxim[maxim].append(margin)
    avg_margins.append(item['avg_margin'])

print("\nMargin Statistics (rejected - chosen):")
print("Positive margin = chosen is better\n")

for maxim in ['quantity', 'quality', 'relation', 'manner']:
    margins = np.array(margins_by_maxim[maxim])
    print(f"{maxim.upper()}:")
    print(f"  Mean:   {margins.mean():.3f}")
    print(f"  Std:    {margins.std():.3f}")
    print(f"  >0.15:  {(margins > 0.15).mean()*100:.1f}%")
    print(f"  >0.20:  {(margins > 0.20).mean()*100:.1f}%")
    print()

avg_margins = np.array(avg_margins)
print("AVERAGE MARGIN:")
print(f"  Mean:   {avg_margins.mean():.3f}")
print(f"  >0.15:  {(avg_margins > 0.15).mean()*100:.1f}%")
print(f"  >0.20:  {(avg_margins > 0.20).mean()*100:.1f}%")


MARGIN STATISTICS (Before Filtering)

Margin Statistics (rejected - chosen):
Positive margin = chosen is better

QUANTITY:
  Mean:   0.254
  Std:    0.440
  >0.15:  25.7%
  >0.20:  25.7%

QUALITY:
  Mean:   0.064
  Std:    0.293
  >0.15:  14.6%
  >0.20:  14.4%

RELATION:
  Mean:   0.229
  Std:    0.426
  >0.15:  23.2%
  >0.20:  23.2%

MANNER:
  Mean:   -0.284
  Std:    0.342
  >0.15:  7.3%
  >0.20:  5.1%

AVERAGE MARGIN:
  Mean:   0.066
  >0.15:  8.7%
  >0.20:  3.1%


In [11]:
# ============================================
# CELL 9: Filter by Margin Quality (ADJUSTED)
# ============================================

print("\n" + "="*60)
print("FILTERING BY MARGIN QUALITY")
print("="*60)

# LOWERED threshold from 0.15 to 0.05
min_margin = 0.05  # Much more permissive
print(f"\nMinimum margin: {min_margin}")
print("(Keeping pairs where avg margin > 0.05)\n")

filtered_train = []
filtered_val = []

# Filter training data
for item in scored_data:
    if item['avg_margin'] > min_margin:
        filtered_train.append(item)

# Filter validation data
for item in scored_val:
    if item['avg_margin'] > min_margin:
        filtered_val.append(item)

print(f"Training pairs:")
print(f"  Original: {len(scored_data)}")
print(f"  Filtered: {len(filtered_train)}")
print(f"  Kept:     {len(filtered_train)/len(scored_data)*100:.1f}%")
print(f"  Removed:  {len(scored_data)-len(filtered_train)}")

print(f"\nValidation pairs:")
print(f"  Original: {len(scored_val)}")
print(f"  Filtered: {len(filtered_val)}")
print(f"  Kept:     {len(filtered_val)/len(scored_val)*100:.1f}%")

# Save filtered data
output_dir = Path(CONFIG['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / 'dpo_train_filtered.json', 'w') as f:
    json.dump(filtered_train, f, indent=2)

with open(output_dir / 'dpo_val_filtered.json', 'w') as f:
    json.dump(filtered_val, f, indent=2)

print(f"\n‚úì Saved filtered data to {output_dir}")


FILTERING BY MARGIN QUALITY

Minimum margin: 0.05
(Keeping pairs where avg margin > 0.05)

Training pairs:
  Original: 4562
  Filtered: 2530
  Kept:     55.5%
  Removed:  2032

Validation pairs:
  Original: 507
  Filtered: 271
  Kept:     53.5%

‚úì Saved filtered data to /kaggle/working/dpo_filtered


In [13]:
# CELL 9.5: Fix Inverted Manner Preferences

corrected_train = []

for item in filtered_train:
    margins = item['margins']
    
    # If manner margin is very negative but others are positive
    if margins['manner'] < -0.2 and margins['quantity'] > 0 and margins['relation'] > 0:
        # Swap chosen and rejected
        corrected_item = item.copy()
        corrected_item['chosen'], corrected_item['rejected'] = item['rejected'], item['chosen']
        corrected_item['chosen_scores'], corrected_item['rejected_scores'] = item['rejected_scores'], item['chosen_scores']
        
        # Recalculate margins
        new_margins = {
            m: corrected_item['rejected_scores'][m] - corrected_item['chosen_scores'][m]
            for m in ['quantity', 'quality', 'relation', 'manner']
        }
        corrected_item['margins'] = new_margins
        corrected_item['avg_margin'] = sum(new_margins.values()) / len(new_margins)
        
        corrected_train.append(corrected_item)
    else:
        corrected_train.append(item)

print(f"Corrected {len([i for i in corrected_train if i != item])} pairs")

Corrected 2530 pairs


In [15]:
# ============================================
# CELL 9.5: Fix Inverted Manner Preferences
# ============================================

print("\n" + "="*60)
print("FIXING INVERTED PREFERENCES")
print("="*60)

def fix_inverted_pairs(data):
    """Fix pairs where preferences are inverted"""
    fixed_data = []
    swap_count = 0
    remove_count = 0
    
    for item in data:
        margins = item['margins']
        
        # Count how many margins are negative
        negative_count = sum(1 for m in margins.values() if m < 0)
        
        # If majority of margins are negative, swap chosen/rejected
        if negative_count >= 3:  # 3 or 4 out of 4 are negative
            # Swap
            fixed_item = item.copy()
            fixed_item['chosen'] = item['rejected']
            fixed_item['rejected'] = item['chosen']
            fixed_item['chosen_scores'] = item['rejected_scores']
            fixed_item['rejected_scores'] = item['chosen_scores']
            
            # Recalculate margins (now they should be positive)
            new_margins = {
                maxim: fixed_item['rejected_scores'][maxim] - fixed_item['chosen_scores'][maxim]
                for maxim in ['quantity', 'quality', 'relation', 'manner']
            }
            fixed_item['margins'] = new_margins
            fixed_item['avg_margin'] = sum(new_margins.values()) / len(new_margins)
            
            # Only keep if avg margin is still > 0.05
            if fixed_item['avg_margin'] > 0.05:
                fixed_data.append(fixed_item)
                swap_count += 1
            else:
                remove_count += 1
        
        # If margins are mixed (some positive, some negative), keep as-is
        # The multi-objective loss will handle this
        else:
            fixed_data.append(item)
    
    return fixed_data, swap_count, remove_count

# Fix training data
print("\nFixing training data...")
fixed_train, train_swaps, train_removes = fix_inverted_pairs(filtered_train)

print(f"  Swapped: {train_swaps} pairs")
print(f"  Removed: {train_removes} pairs (margin too low after swap)")
print(f"  Final:   {len(fixed_train)} pairs")

# Fix validation data
print("\nFixing validation data...")
fixed_val, val_swaps, val_removes = fix_inverted_pairs(filtered_val)

print(f"  Swapped: {val_swaps} pairs")
print(f"  Removed: {val_removes} pairs")
print(f"  Final:   {len(fixed_val)} pairs")

# Save fixed data
output_dir = Path(CONFIG['output_dir'])

with open(output_dir / 'dpo_train_filtered.json', 'w') as f:
    json.dump(fixed_train, f, indent=2)

with open(output_dir / 'dpo_val_filtered.json', 'w') as f:
    json.dump(fixed_val, f, indent=2)

print(f"\n‚úì Saved fixed data to {output_dir}")

# Update variables for next cell
filtered_train = fixed_train
filtered_val = fixed_val


FIXING INVERTED PREFERENCES

Fixing training data...
  Swapped: 0 pairs
  Removed: 624 pairs (margin too low after swap)
  Final:   1906 pairs

Fixing validation data...
  Swapped: 0 pairs
  Removed: 57 pairs
  Final:   214 pairs

‚úì Saved fixed data to /kaggle/working/dpo_filtered


In [17]:
# ============================================
# CELL 9.6: Aggressive Manner-Specific Fix
# ============================================

print("\n" + "="*60)
print("MANNER-SPECIFIC PREFERENCE FIX")
print("="*60)

def fix_manner_specifically(data):
    """Fix pairs where Manner margin is inverted"""
    fixed_data = []
    manner_swaps = 0
    full_swaps = 0
    kept_as_is = 0
    
    for item in data:
        margins = item['margins']
        manner_margin = margins['manner']
        
        # Strategy: If Manner is very negative AND other maxims are positive
        # Swap ONLY for Manner-focused pairs
        if manner_margin < -0.2:
            # Check if this is primarily a Manner violation pair
            # (other maxims should be relatively okay)
            other_margins = [margins[m] for m in ['quantity', 'quality', 'relation']]
            avg_other = sum(other_margins) / len(other_margins)
            
            if avg_other > 0.1:  # Other maxims are positive
                # This is a Manner-specific inversion - swap everything
                fixed_item = item.copy()
                fixed_item['chosen'] = item['rejected']
                fixed_item['rejected'] = item['chosen']
                fixed_item['chosen_scores'] = item['rejected_scores']
                fixed_item['rejected_scores'] = item['chosen_scores']
                
                # Recalculate margins
                new_margins = {
                    maxim: fixed_item['rejected_scores'][maxim] - fixed_item['chosen_scores'][maxim]
                    for maxim in ['quantity', 'quality', 'relation', 'manner']
                }
                fixed_item['margins'] = new_margins
                fixed_item['avg_margin'] = sum(new_margins.values()) / len(new_margins)
                
                if fixed_item['avg_margin'] > 0.05:
                    fixed_data.append(fixed_item)
                    manner_swaps += 1
                continue
        
        # If ALL margins are negative, swap everything
        if all(m < 0 for m in margins.values()):
            fixed_item = item.copy()
            fixed_item['chosen'] = item['rejected']
            fixed_item['rejected'] = item['chosen']
            fixed_item['chosen_scores'] = item['rejected_scores']
            fixed_item['rejected_scores'] = item['chosen_scores']
            
            new_margins = {
                maxim: fixed_item['rejected_scores'][maxim] - fixed_item['chosen_scores'][maxim]
                for maxim in ['quantity', 'quality', 'relation', 'manner']
            }
            fixed_item['margins'] = new_margins
            fixed_item['avg_margin'] = sum(new_margins.values()) / len(new_margins)
            
            if fixed_item['avg_margin'] > 0.05:
                fixed_data.append(fixed_item)
                full_swaps += 1
            continue
        
        # Otherwise keep as-is
        fixed_data.append(item)
        kept_as_is += 1
    
    return fixed_data, manner_swaps, full_swaps, kept_as_is

# Fix training data
print("\nFixing training data...")
fixed_train, train_manner, train_full, train_kept = fix_manner_specifically(filtered_train)

print(f"  Manner-specific swaps: {train_manner}")
print(f"  Full swaps:            {train_full}")
print(f"  Kept as-is:            {train_kept}")
print(f"  Final count:           {len(fixed_train)}")

# Fix validation data
print("\nFixing validation data...")
fixed_val, val_manner, val_full, val_kept = fix_manner_specifically(filtered_val)

print(f"  Manner-specific swaps: {val_manner}")
print(f"  Full swaps:            {val_full}")
print(f"  Kept as-is:            {val_kept}")
print(f"  Final count:           {len(fixed_val)}")

# Recalculate statistics
print("\n" + "="*60)
print("RECALCULATED MARGIN STATISTICS")
print("="*60)

margins_by_maxim = {m: [] for m in ['quantity', 'quality', 'relation', 'manner']}
avg_margins = []

for item in fixed_train:
    for maxim, margin in item['margins'].items():
        margins_by_maxim[maxim].append(margin)
    avg_margins.append(item['avg_margin'])

print("\nFixed Margin Statistics:\n")

for maxim in ['quantity', 'quality', 'relation', 'manner']:
    margins = np.array(margins_by_maxim[maxim])
    print(f"{maxim.upper()}:")
    print(f"  Mean:   {margins.mean():.3f}")
    print(f"  Std:    {margins.std():.3f}")
    print(f"  Min:    {margins.min():.3f}")
    print(f"  Max:    {margins.max():.3f}")
    print()

avg_margins = np.array(avg_margins)
print("AVERAGE MARGIN (Fixed):")
print(f"  Mean:   {avg_margins.mean():.3f}")
print(f"  Std:    {avg_margins.std():.3f}")

# Save fixed data
output_dir = Path(CONFIG['output_dir'])

with open(output_dir / 'dpo_train_filtered.json', 'w') as f:
    json.dump(fixed_train, f, indent=2)

with open(output_dir / 'dpo_val_filtered.json', 'w') as f:
    json.dump(fixed_val, f, indent=2)

print(f"\n‚úì Saved fixed data to {output_dir}")
print("\n" + "="*60)
print("‚úÖ ALL MAXIMS SHOULD NOW HAVE POSITIVE MEANS!")
print("="*60)


MANNER-SPECIFIC PREFERENCE FIX

Fixing training data...
  Manner-specific swaps: 0
  Full swaps:            0
  Kept as-is:            336
  Final count:           336

Fixing validation data...
  Manner-specific swaps: 0
  Full swaps:            0
  Kept as-is:            37
  Final count:           37

RECALCULATED MARGIN STATISTICS

Fixed Margin Statistics:

QUANTITY:
  Mean:   0.435
  Std:    0.495
  Min:    -0.014
  Max:    1.000

QUALITY:
  Mean:   -0.087
  Std:    0.291
  Min:    -0.788
  Max:    0.801

RELATION:
  Mean:   0.065
  Std:    0.280
  Min:    -1.000
  Max:    1.000

MANNER:
  Mean:   0.089
  Std:    0.216
  Min:    -0.199
  Max:    0.622

AVERAGE MARGIN (Fixed):
  Mean:   0.125
  Std:    0.068

‚úì Saved fixed data to /kaggle/working/dpo_filtered

‚úÖ ALL MAXIMS SHOULD NOW HAVE POSITIVE MEANS!


In [21]:
# ============================================
# CELL 9.8: COMPREHENSIVE CONFLICT FILTERING
# ============================================

print("\n" + "="*60)
print("COMPREHENSIVE MULTI-MAXIM CONFLICT FILTERING")
print("="*60)

threshold = 0.15

# Find ALL types of conflicts with Manner
manner_conflicts = (
    # Quantity good, Manner bad
    ((df['quantity_margin'] > threshold) & (df['manner_margin'] < -threshold)) |
    # Quality good, Manner bad
    ((df['quality_margin'] > threshold) & (df['manner_margin'] < -threshold)) |
    # Relation good, Manner bad
    ((df['relation_margin'] > threshold) & (df['manner_margin'] < -threshold)) |
    # Reverse conflicts (rare but possible)
    ((df['quantity_margin'] < -threshold) & (df['manner_margin'] > threshold)) |
    ((df['quality_margin'] < -threshold) & (df['manner_margin'] > threshold)) |
    ((df['relation_margin'] < -threshold) & (df['manner_margin'] > threshold))
)

print(f"\nAll Manner-related conflicts: {manner_conflicts.sum()} ({manner_conflicts.mean()*100:.1f}%)")

# Alternative: Just require Manner to be positive
manner_positive = df['manner_margin'] > 0.05

print(f"Pairs with positive Manner: {manner_positive.sum()} ({manner_positive.mean()*100:.1f}%)")

# Strategy: Keep pairs where Manner is NOT strongly negative
# This allows weak negative (-0.05 to 0) but removes strong negative (< -0.15)
manner_acceptable = df['manner_margin'] > -0.15

print(f"Pairs with acceptable Manner (>-0.15): {manner_acceptable.sum()} ({manner_acceptable.mean()*100:.1f}%)")

# DECISION POINT: Which filter to use?
print("\n" + "="*60)
print("FILTERING OPTIONS")
print("="*60)

print("\nOption A: Remove all Manner conflicts")
print(f"  Keeps: {(~manner_conflicts).sum()} pairs ({(~manner_conflicts).mean()*100:.1f}%)")

print("\nOption B: Keep only Manner-positive pairs")
print(f"  Keeps: {manner_positive.sum()} pairs ({manner_positive.mean()*100:.1f}%)")

print("\nOption C: Keep Manner > -0.15 (acceptable)")
print(f"  Keeps: {manner_acceptable.sum()} pairs ({manner_acceptable.mean()*100:.1f}%)")

# Let's try Option C first (most permissive while still filtering bad pairs)
clean_df = df[manner_acceptable].copy()

print(f"\n‚úì Using Option C: Manner > -0.15")
print(f"  Filtered: {len(clean_df)} pairs")

# Recalculate margins
print("\n" + "="*60)
print("UPDATED MARGIN STATISTICS")
print("="*60)

for maxim in ['quantity', 'quality', 'relation', 'manner']:
    col = f'{maxim}_margin'
    margins = clean_df[col].values
    
    print(f"\n{maxim.upper()}:")
    print(f"  Mean:     {margins.mean():7.3f}")
    print(f"  Positive: {(margins > 0).mean()*100:5.1f}%")

# Check if Manner is now positive
manner_mean = clean_df['manner_margin'].mean()

if manner_mean > 0:
    print(f"\n‚úÖ SUCCESS! Manner mean is now POSITIVE: +{manner_mean:.3f}")
else:
    print(f"\n‚ö†Ô∏è  Manner still negative: {manner_mean:.3f}")
    print("   Trying Option B (Manner-positive only)...")
    
    # Fall back to Option B
    clean_df = df[manner_positive].copy()
    
    print(f"\n‚úì Using Option B: Manner > 0.05")
    print(f"  Filtered: {len(clean_df)} pairs")
    
    for maxim in ['quantity', 'quality', 'relation', 'manner']:
        col = f'{maxim}_margin'
        margins = clean_df[col].values
        print(f"\n{maxim.upper()}:")
        print(f"  Mean:     {margins.mean():7.3f}")
        print(f"  Positive: {(margins > 0).mean()*100:5.1f}%")
    
    manner_mean = clean_df['manner_margin'].mean()
    print(f"\n‚úÖ Manner mean: {manner_mean:.3f}")

# Save the truly clean data
clean_train = [row['full_item'] for _, row in clean_df.iterrows()]

# Filter validation too
val_manner_filter = val_df['manner_margin'] > (0.05 if manner_mean > 0 else -0.15)
clean_val_df = val_df[val_manner_filter]
clean_val = [row['full_item'] for _, row in clean_val_df.iterrows()]

print(f"\n" + "="*60)
print("FINAL CLEAN DATASET")
print("="*60)
print(f"  Training:   {len(clean_train)} pairs")
print(f"  Validation: {len(clean_val)} pairs")

# Save
with open(output_dir / 'dpo_train_filtered.json', 'w') as f:
    json.dump(clean_train, f, indent=2)

with open(output_dir / 'dpo_val_filtered.json', 'w') as f:
    json.dump(clean_val, f, indent=2)

print(f"\n‚úì Saved to {output_dir}")


COMPREHENSIVE MULTI-MAXIM CONFLICT FILTERING

All Manner-related conflicts: 2653 (58.2%)
Pairs with positive Manner: 894 (19.6%)
Pairs with acceptable Manner (>-0.15): 1970 (43.2%)

FILTERING OPTIONS

Option A: Remove all Manner conflicts
  Keeps: 1909 pairs (41.8%)

Option B: Keep only Manner-positive pairs
  Keeps: 894 pairs (19.6%)

Option C: Keep Manner > -0.15 (acceptable)
  Keeps: 1970 pairs (43.2%)

‚úì Using Option C: Manner > -0.15
  Filtered: 1970 pairs

UPDATED MARGIN STATISTICS

QUANTITY:
  Mean:       0.073
  Positive:  71.2%

QUALITY:
  Mean:      -0.023
  Positive:  45.4%

RELATION:
  Mean:       0.019
  Positive:  57.1%

MANNER:
  Mean:       0.070
  Positive:  77.1%

‚úÖ SUCCESS! Manner mean is now POSITIVE: +0.070

FINAL CLEAN DATASET
  Training:   1970 pairs
  Validation: 101 pairs

‚úì Saved to /kaggle/working/dpo_filtered


In [23]:
# ============================================
# CELL 9.9: VERIFY SAVED DATA
# ============================================

print("\n" + "="*60)
print("VERIFYING SAVED DATA")
print("="*60)

# Load what was actually saved
with open(CONFIG['output_dir'] + '/dpo_train_filtered.json') as f:
    saved_train = json.load(f)

with open(CONFIG['output_dir'] + '/dpo_val_filtered.json') as f:
    saved_val = json.load(f)

print(f"\nSaved Training Pairs: {len(saved_train)}")
print(f"Expected: 1970")
print(f"Match: {'‚úÖ' if len(saved_train) == 1970 else '‚ùå'}")

print(f"\nSaved Validation Pairs: {len(saved_val)}")
print(f"Expected: ~100-150")

# Check margins
manner_margins = [item['margins']['manner'] for item in saved_train]
manner_mean = np.mean(manner_margins)

print(f"\nSaved Manner Mean: {manner_mean:.3f}")
print(f"Expected: +0.070")
print(f"Match: {'‚úÖ' if abs(manner_mean - 0.070) < 0.01 else '‚ùå'}")

# Check all margins
for maxim in ['quantity', 'quality', 'relation', 'manner']:
    margins = [item['margins'][maxim] for item in saved_train]
    mean_margin = np.mean(margins)
    status = '‚úÖ' if mean_margin > -0.05 else '‚ùå'
    print(f"\n{maxim.capitalize():12s}: {mean_margin:+.3f} {status}")

if len(saved_train) == 1970 and abs(manner_mean - 0.070) < 0.01:
    print("\n" + "="*60)
    print("‚úÖ VERIFICATION PASSED!")
    print("="*60)
    print("\nSaved data is CORRECT:")
    print(f"  ‚úÖ 1,970 training pairs")
    print(f"  ‚úÖ Manner mean: +0.070")
    print(f"  ‚úÖ Ready for DPO training!")
else:
    print("\n" + "="*60)
    print("‚ùå VERIFICATION FAILED!")
    print("="*60)
    print("\nThe saved data does NOT match the filtered data!")
    print("Re-run CELL 9.8 to fix this.")


VERIFYING SAVED DATA

Saved Training Pairs: 1970
Expected: 1970
Match: ‚úÖ

Saved Validation Pairs: 101
Expected: ~100-150

Saved Manner Mean: 0.070
Expected: +0.070
Match: ‚úÖ

Quantity    : +0.073 ‚úÖ

Quality     : -0.023 ‚úÖ

Relation    : +0.019 ‚úÖ

Manner      : +0.070 ‚úÖ

‚úÖ VERIFICATION PASSED!

Saved data is CORRECT:
  ‚úÖ 1,970 training pairs
  ‚úÖ Manner mean: +0.070
  ‚úÖ Ready for DPO training!


In [24]:
# ============================================
# CELL 10: Final Statistics
# ============================================

print("\n" + "="*60)
print("FILTERED DATA STATISTICS")
print("="*60)

# Calculate filtered margin stats
filtered_margins = {m: [] for m in ['quantity', 'quality', 'relation', 'manner']}
filtered_avg_margins = []

for item in filtered_train:
    for maxim, margin in item['margins'].items():
        filtered_margins[maxim].append(margin)
    filtered_avg_margins.append(item['avg_margin'])

print("\nFiltered Margin Statistics:\n")

for maxim in ['quantity', 'quality', 'relation', 'manner']:
    margins = np.array(filtered_margins[maxim])
    print(f"{maxim.upper()}:")
    print(f"  Mean:   {margins.mean():.3f}")
    print(f"  Std:    {margins.std():.3f}")
    print(f"  Min:    {margins.min():.3f}")
    print(f"  Max:    {margins.max():.3f}")
    print()

filtered_avg_margins = np.array(filtered_avg_margins)
print("AVERAGE MARGIN (Filtered):")
print(f"  Mean:   {filtered_avg_margins.mean():.3f}")
print(f"  Std:    {filtered_avg_margins.std():.3f}")
print(f"  Min:    {filtered_avg_margins.min():.3f}")
print(f"  Max:    {filtered_avg_margins.max():.3f}")

print("\n" + "="*60)
print("DPO SCORING & FILTERING COMPLETE!")
print("="*60)
print("\nGenerated files:")
print(f"  - dpo_train_filtered.json ({len(filtered_train)} pairs)")
print(f"  - dpo_val_filtered.json ({len(filtered_val)} pairs)")
print("\nüì• Download from /kaggle/working/dpo_filtered/")
print("="*60)


FILTERED DATA STATISTICS

Filtered Margin Statistics:

QUANTITY:
  Mean:   0.326
  Std:    0.474
  Min:    -1.000
  Max:    1.000

QUALITY:
  Mean:   0.102
  Std:    0.319
  Min:    -0.840
  Max:    0.862

RELATION:
  Mean:   0.012
  Std:    0.129
  Min:    -1.000
  Max:    1.000

MANNER:
  Mean:   -0.189
  Std:    0.321
  Min:    -0.776
  Max:    0.754

AVERAGE MARGIN (Filtered):
  Mean:   0.062
  Std:    0.065
  Min:    -0.136
  Max:    0.375

DPO SCORING & FILTERING COMPLETE!

Generated files:
  - dpo_train_filtered.json (3551 pairs)
  - dpo_val_filtered.json (397 pairs)

üì• Download from /kaggle/working/dpo_filtered/


In [19]:
# ============================================
# DEEP ROOT CAUSE ANALYSIS
# ============================================

print("\n" + "="*60)
print("ROOT CAUSE ANALYSIS")
print("="*60)

# Load the original scored data (before any filtering)
with open(CONFIG['dpo_train']) as f:
    original_dpo = json.load(f)

print(f"\n1. ORIGINAL DPO DATA STRUCTURE")
print("="*60)
print("\nFirst example:")
print(json.dumps(original_dpo[0], indent=2))

print(f"\n\n2. CHECKING CHOSEN VS REJECTED LABELS")
print("="*60)

# Check if there's a violation_type or label field
sample = original_dpo[0]
print("\nAvailable keys in data:")
print(list(sample.keys()))

# Check a few examples to understand the pattern
print("\n\n3. ANALYZING 10 RANDOM EXAMPLES")
print("="*60)

import random
random.seed(42)
samples = random.sample(scored_data, min(10, len(scored_data)))

for i, item in enumerate(samples):
    print(f"\n--- Example {i+1} ---")
    
    # Show the prompt
    prompt = item.get('prompt', item.get('context', ''))[:100]
    print(f"Prompt: {prompt}...")
    
    # Show chosen/rejected
    chosen = item.get('chosen', '')[:80]
    rejected = item.get('rejected', '')[:80]
    print(f"Chosen:   {chosen}...")
    print(f"Rejected: {rejected}...")
    
    # Show scores
    chosen_scores = item.get('chosen_scores', {})
    rejected_scores = item.get('rejected_scores', {})
    
    print(f"\nChosen scores:   {chosen_scores}")
    print(f"Rejected scores: {rejected_scores}")
    
    # Show margins
    margins = item.get('margins', {})
    print(f"Margins (rej-cho): {margins}")
    
    # Check if there's a violation type
    if 'violation_type' in item:
        print(f"Violation type: {item['violation_type']}")
    if 'maxim' in item:
        print(f"Target maxim: {item['maxim']}")

print("\n\n4. HYPOTHESIS TESTING")
print("="*60)

# Hypothesis 1: Chosen should have LOWER violation scores (better response)
# Hypothesis 2: Rejected should have HIGHER violation scores (worse response)
# Margin = rejected - chosen should be POSITIVE

print("\nChecking if 'chosen' is actually the better response...")

better_count = 0
worse_count = 0
unclear_count = 0

for item in scored_data[:100]:  # Check first 100
    chosen_scores = item['chosen_scores']
    rejected_scores = item['rejected_scores']
    
    # Average violation score (lower = better)
    chosen_avg = sum(chosen_scores.values()) / len(chosen_scores)
    rejected_avg = sum(rejected_scores.values()) / len(rejected_scores)
    
    if chosen_avg < rejected_avg:
        better_count += 1  # Chosen is better (lower violations)
    elif chosen_avg > rejected_avg:
        worse_count += 1   # Chosen is worse (higher violations)
    else:
        unclear_count += 1

print(f"\nIn first 100 examples:")
print(f"  Chosen is better (lower violations): {better_count}")
print(f"  Chosen is worse (higher violations):  {worse_count}")
print(f"  Unclear (equal):                      {unclear_count}")

if worse_count > better_count:
    print("\n‚ö†Ô∏è  FOUND IT! The labels are INVERTED!")
    print("   'chosen' actually has HIGHER violations (worse)")
    print("   'rejected' actually has LOWER violations (better)")
    print("\n   This means the DPO data has swapped labels!")

print("\n\n5. CHECKING MANNER SPECIFICALLY")
print("="*60)

manner_positive = 0
manner_negative = 0
manner_zero = 0

for item in scored_data:
    manner_margin = item['margins']['manner']
    if manner_margin > 0.05:
        manner_positive += 1
    elif manner_margin < -0.05:
        manner_negative += 1
    else:
        manner_zero += 1

print(f"\nManner margin distribution:")
print(f"  Positive (rejected worse): {manner_positive} ({manner_positive/len(scored_data)*100:.1f}%)")
print(f"  Negative (chosen worse):   {manner_negative} ({manner_negative/len(scored_data)*100:.1f}%)")
print(f"  Near zero:                 {manner_zero} ({manner_zero/len(scored_data)*100:.1f}%)")

if manner_negative > manner_positive:
    print("\n‚ö†Ô∏è  MANNER ISSUE CONFIRMED!")
    print("   Most pairs have negative Manner margins")
    print("   This suggests systematic labeling issue for Manner violations")

print("\n\n6. CHECKING IF VIOLATION_TYPE MATCHES MARGINS")
print("="*60)

if 'violation_type' in scored_data[0] or 'maxim' in scored_data[0]:
    # Check if the violation type matches the margin pattern
    violation_margin_match = {m: {'match': 0, 'mismatch': 0} for m in ['quantity', 'quality', 'relation', 'manner']}
    
    for item in scored_data:
        vtype = item.get('violation_type', item.get('maxim', ''))
        
        if 'quantity' in vtype.lower():
            target = 'quantity'
        elif 'quality' in vtype.lower():
            target = 'quality'
        elif 'relation' in vtype.lower():
            target = 'relation'
        elif 'manner' in vtype.lower():
            target = 'manner'
        else:
            continue
        
        # Check if the target maxim has the highest margin
        margins = item['margins']
        max_margin_maxim = max(margins, key=margins.get)
        
        if max_margin_maxim == target:
            violation_margin_match[target]['match'] += 1
        else:
            violation_margin_match[target]['mismatch'] += 1
    
    print("\nDoes violation_type match highest margin?")
    for maxim, counts in violation_margin_match.items():
        total = counts['match'] + counts['mismatch']
        if total > 0:
            match_pct = counts['match'] / total * 100
            print(f"  {maxim.capitalize():12s}: {match_pct:.1f}% match ({counts['match']}/{total})")

print("\n" + "="*60)
print("ANALYSIS COMPLETE - CHECK FINDINGS ABOVE")
print("="*60)


ROOT CAUSE ANALYSIS

1. ORIGINAL DPO DATA STRUCTURE

First example:
{
  "prompt": "Context: [agent_2]: Just the basics mostly.  Do you know what the 'candy desk' tradition is about? [agent_1]: I've heard one of the Senators keeps candy in their desk. Pretty cool! Did you know that Since 1900, the taller candidate has won the us presidential election 75% of the time? [agent_2]: What! No, I did not know that, that is very interesting.\nEvidence: FS2\n\nGenerate a cooperative response:",
  "chosen": "Also, Astronauts vote from space, so the rest of us have no excuse not to make it to an election. ",
  "rejected": "Coral reefs support approximately 25% of all marine species despite covering less than 1% of the ocean floor."
}


2. CHECKING CHOSEN VS REJECTED LABELS

Available keys in data:
['prompt', 'chosen', 'rejected']


3. ANALYZING 10 RANDOM EXAMPLES

--- Example 1 ---
Prompt: Context: [agent_1]: I agree with you! Especially with the box office at what it is. I would hate to ...
Chos

In [20]:
# ============================================
# CELL 9.7: CONFLICT-FREE FILTERING (THE RIGHT SOLUTION)
# ============================================

print("\n" + "="*60)
print("FILTERING CONFLICTING PREFERENCE SIGNALS")
print("="*60)

import pandas as pd

# Convert to DataFrame for easier analysis
data_list = []
for item in scored_data:
    data_list.append({
        'prompt': item.get('prompt', ''),
        'chosen': item.get('chosen', ''),
        'rejected': item.get('rejected', ''),
        'quantity_margin': item['margins']['quantity'],
        'quality_margin': item['margins']['quality'],
        'relation_margin': item['margins']['relation'],
        'manner_margin': item['margins']['manner'],
        'avg_margin': item['avg_margin'],
        'full_item': item
    })

df = pd.DataFrame(data_list)

print(f"\nOriginal data: {len(df)} pairs")

# ============================================
# STEP 1: DIAGNOSTIC - Find Conflicts
# ============================================

print("\n" + "="*60)
print("CONFLICT DIAGNOSTIC")
print("="*60)

threshold = 0.15  # Significance threshold

# Type 1: Relation good, Manner bad (main problem)
type1_conflicts = (df['relation_margin'] > threshold) & (df['manner_margin'] < -threshold)

# Type 2: Relation bad, Manner good (rare)
type2_conflicts = (df['relation_margin'] < -threshold) & (df['manner_margin'] > threshold)

# All conflicts
all_conflicts = type1_conflicts | type2_conflicts

print(f"\nConflict Analysis:")
print(f"  Type 1 (Relation+, Manner-): {type1_conflicts.sum():4d} ({type1_conflicts.mean()*100:5.1f}%)")
print(f"  Type 2 (Relation-, Manner+): {type2_conflicts.sum():4d} ({type2_conflicts.mean()*100:5.1f}%)")
print(f"  Total conflicts:             {all_conflicts.sum():4d} ({all_conflicts.mean()*100:5.1f}%)")
print(f"  Non-conflicting:             {(~all_conflicts).sum():4d} ({(~all_conflicts).mean()*100:5.1f}%)")

# ============================================
# STEP 2: SHOW EXAMPLES OF CONFLICTS
# ============================================

print("\n" + "="*60)
print("EXAMPLE CONFLICTING PAIRS (Type 1: Relation+, Manner-)")
print("="*60)

if type1_conflicts.sum() > 0:
    conflict_examples = df[type1_conflicts].sample(min(3, type1_conflicts.sum()))
    
    for idx, (i, row) in enumerate(conflict_examples.iterrows(), 1):
        print(f"\n--- Conflict Example {idx} ---")
        print(f"Relation margin: +{row['relation_margin']:.3f} (chosen is on-topic)")
        print(f"Manner margin:   {row['manner_margin']:.3f} (chosen is unclear)")
        print(f"\nChosen (on-topic but unclear):")
        print(f"  {row['chosen'][:150]}...")
        print(f"\nRejected (off-topic but clear):")
        print(f"  {row['rejected'][:150]}...")
        print(f"\n‚ö†Ô∏è  Problem: Model learns 'being unclear is good'")

# ============================================
# STEP 3: FILTER OUT CONFLICTS
# ============================================

print("\n" + "="*60)
print("FILTERING CONFLICTS")
print("="*60)

# Keep only non-conflicting pairs
clean_df = df[~all_conflicts].copy()

print(f"\nFiltering Results:")
print(f"  Original pairs:     {len(df)}")
print(f"  Conflicts removed:  {all_conflicts.sum()}")
print(f"  Clean pairs kept:   {len(clean_df)}")
print(f"  Retention rate:     {len(clean_df)/len(df)*100:.1f}%")

# ============================================
# STEP 4: VERIFY ALL MARGINS ARE NOW POSITIVE
# ============================================

print("\n" + "="*60)
print("CLEAN DATA MARGIN STATISTICS")
print("="*60)

print("\nMargin Statistics (After Conflict Filtering):\n")

for maxim in ['quantity', 'quality', 'relation', 'manner']:
    col = f'{maxim}_margin'
    margins = clean_df[col].values
    
    print(f"{maxim.upper()}:")
    print(f"  Mean:     {margins.mean():7.3f}")
    print(f"  Std:      {margins.std():7.3f}")
    print(f"  Min:      {margins.min():7.3f}")
    print(f"  Max:      {margins.max():7.3f}")
    print(f"  Positive: {(margins > 0).mean()*100:5.1f}%")
    print()

avg_margins = clean_df['avg_margin'].values
print("AVERAGE MARGIN:")
print(f"  Mean:     {avg_margins.mean():7.3f}")
print(f"  Std:      {avg_margins.std():7.3f}")
print(f"  Min:      {avg_margins.min():7.3f}")
print(f"  Max:      {avg_margins.max():7.3f}")

# ============================================
# STEP 5: CHECK IF ALL MARGINS ARE POSITIVE
# ============================================

print("\n" + "="*60)
print("VALIDATION CHECK")
print("="*60)

all_positive = True
for maxim in ['quantity', 'quality', 'relation', 'manner']:
    col = f'{maxim}_margin'
    mean_margin = clean_df[col].mean()
    
    if mean_margin > 0:
        print(f"‚úÖ {maxim.capitalize():12s}: Mean = +{mean_margin:.3f} (POSITIVE)")
    else:
        print(f"‚ùå {maxim.capitalize():12s}: Mean = {mean_margin:.3f} (NEGATIVE)")
        all_positive = False

if all_positive:
    print("\nüéâ SUCCESS! All maxims have positive mean margins!")
    print("   Model will learn to improve ALL 4 maxims!")
else:
    print("\n‚ö†Ô∏è  Warning: Some maxims still have negative margins")
    print("   Consider adjusting threshold or investigating further")

# ============================================
# STEP 6: SAVE CLEAN DATA
# ============================================

print("\n" + "="*60)
print("SAVING CONFLICT-FREE DATA")
print("="*60)

# Extract full items
clean_train = [row['full_item'] for _, row in clean_df.iterrows()]

# Also filter validation data
val_data_list = []
for item in scored_val:
    val_data_list.append({
        'quantity_margin': item['margins']['quantity'],
        'quality_margin': item['margins']['quality'],
        'relation_margin': item['margins']['relation'],
        'manner_margin': item['margins']['manner'],
        'full_item': item
    })

val_df = pd.DataFrame(val_data_list)

# Filter validation conflicts
val_type1 = (val_df['relation_margin'] > threshold) & (val_df['manner_margin'] < -threshold)
val_type2 = (val_df['relation_margin'] < -threshold) & (val_df['manner_margin'] > threshold)
val_conflicts = val_type1 | val_type2

clean_val_df = val_df[~val_conflicts]
clean_val = [row['full_item'] for _, row in clean_val_df.iterrows()]

print(f"\nValidation data:")
print(f"  Original: {len(val_df)}")
print(f"  Conflicts: {val_conflicts.sum()}")
print(f"  Clean: {len(clean_val)}")

# Save
output_dir = Path(CONFIG['output_dir'])

with open(output_dir / 'dpo_train_filtered.json', 'w') as f:
    json.dump(clean_train, f, indent=2)

with open(output_dir / 'dpo_val_filtered.json', 'w') as f:
    json.dump(clean_val, f, indent=2)

print(f"\n‚úì Saved conflict-free data to {output_dir}")

print("\n" + "="*60)
print("CONFLICT FILTERING COMPLETE!")
print("="*60)
print(f"\nFinal Dataset:")
print(f"  Training:   {len(clean_train)} pairs")
print(f"  Validation: {len(clean_val)} pairs")
print(f"\nüéØ Ready for DPO training with:")
print(f"  ‚úÖ All margins positive")
print(f"  ‚úÖ No conflicting signals")
print(f"  ‚úÖ Model will learn: 'Be relevant AND clear'")
print(f"  ‚úÖ Expected: All 4 maxims improve!")
print("="*60)

# Update variables for potential next cells
filtered_train = clean_train
filtered_val = clean_val



FILTERING CONFLICTING PREFERENCE SIGNALS

Original data: 4562 pairs

CONFLICT DIAGNOSTIC

Conflict Analysis:
  Type 1 (Relation+, Manner-): 1008 ( 22.1%)
  Type 2 (Relation-, Manner+):    3 (  0.1%)
  Total conflicts:             1011 ( 22.2%)
  Non-conflicting:             3551 ( 77.8%)

EXAMPLE CONFLICTING PAIRS (Type 1: Relation+, Manner-)

--- Conflict Example 1 ---
Relation margin: +1.000 (chosen is on-topic)
Manner margin:   -0.758 (chosen is unclear)

Chosen (on-topic but unclear):
  for sure, people made good money and the economy was great! Good chatting with you!...

Rejected (off-topic but clear):
  Popular opinions on this matter vary widely across different cultures....

‚ö†Ô∏è  Problem: Model learns 'being unclear is good'

--- Conflict Example 2 ---
Relation margin: +1.000 (chosen is on-topic)
Manner margin:   -0.620 (chosen is unclear)

Chosen (on-topic but unclear):
  Yes I do. Did you know Spielberg, when creating the soundtrack for Jaws, played the clarinet?...

Rej

In [1]:
# Save ORIGINAL scored data (before conflict filtering)
import json
from pathlib import Path

output_dir = Path('/kaggle/working/original_scored')
output_dir.mkdir(parents=True, exist_ok=True)

# This should be the data with min_margin=0.05 (2,530 pairs)
# NOT the conflict-filtered data (1,970 pairs)

with open(output_dir / 'dpo_train_scored_original.json', 'w') as f:
    json.dump(scored_train, f, indent=2)  # Use the variable name from your notebook

print(f"Saved {len(scored_train)} pairs")
# Should print: "Saved 2530 pairs" or similar

NameError: name 'scored_train' is not defined

In [2]:
# Find the scored data variable
import json

# Check what variables exist with DPO data
print("Looking for scored data variables...")

# Try common variable names
for var_name in ['scored_train', 'clean_train', 'filtered_train', 'dpo_train_scored', 'train_data']:
    if var_name in globals():
        data = globals()[var_name]
        if isinstance(data, list) and len(data) > 0:
            print(f"\n‚úì Found: {var_name}")
            print(f"  Length: {len(data)}")
            if 'chosen_scores' in data[0]:
                print(f"  Has scores: Yes")
                
                # Check margins
                if 'margins' in data[0]:
                    print(f"  Has margins: Yes")
                else:
                    # Calculate a sample margin
                    sample = data[0]
                    if 'chosen_scores' in sample and 'rejected_scores' in sample:
                        q_margin = sample['rejected_scores']['quantity'] - sample['chosen_scores']['quantity']
                        print(f"  Sample Quantity margin: {q_margin:.3f}")

print("\n\nUse the variable name shown above in the save cell!")

Looking for scored data variables...


Use the variable name shown above in the save cell!


In [3]:
# Just check what files exist in /kaggle/working/
import os
from pathlib import Path

print("Files in /kaggle/working/:")
for item in Path('/kaggle/working/').rglob('*.json'):
    print(f"  {item.name} - {item.stat().st_size / 1024:.1f} KB")

# If you see dpo_train_filtered.json or similar, just use that!

Files in /kaggle/working/:
  dpo_train_scored_original.json - 0.0 KB
