# Score Synthetic Candidates

**Goal:** Validate the Gemini-generated responses using our locally trained Gricean reward models.

**Instructions:**
1. **Upload Data:** Add `synthetic_candidates.json` (downloaded from the generation step) to this notebook.
2. **Add Models:** Add your `grice-reward-models` dataset.
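3. **Run All:** This will calculate per-maxim reward margins for the new synthetic pairs and save the pairs with a positive margin on every maxim to `synthetic_clean_pairs.json`.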

In [None]:
!pip install -q transformers torch safetensors tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
from tqdm import tqdm

# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Load Candidates
# Look for synthetic_candidates.json: the /kaggle/input dataset path is tried
# first, then the current working directory.
possible_paths = [
    "/kaggle/input/synthetic-candidates/synthetic_candidates.json",
    "synthetic_candidates.json"
]
# First existing path wins; None when neither location has the file.
CANDIDATES_PATH = next((p for p in possible_paths if os.path.exists(p)), None)

if CANDIDATES_PATH is None:
    # Fail fast with a descriptive error. Falling through would call
    # open(None) and surface as an unrelated TypeError.
    raise FileNotFoundError(
        "synthetic_candidates.json not found! Searched: "
        + ", ".join(possible_paths)
    )

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(CANDIDATES_PATH, 'r', encoding="utf-8") as f:
    candidates = json.load(f)

print(f"Loaded {len(candidates)} candidates for scoring.")

In [None]:
# Load Reward Models
# One sequence-classification model and one tokenizer per maxim, each read
# from MODELS_DIR/reward_model_<maxim>.
MODELS_DIR = "/kaggle/input/grice-reward-models/"
MAXIMS = ['quantity', 'quality', 'relation', 'manner']
models = {}
tokenizers = {}

for maxim in MAXIMS:
    path = os.path.join(MODELS_DIR, f"reward_model_{maxim}")
    print(f"Loading {maxim} model from {path}...")
    try:
        reward_model = AutoModelForSequenceClassification.from_pretrained(path)
        # Move to the selected device and switch to eval mode before storing.
        models[maxim] = reward_model.to(device).eval()
        tokenizers[maxim] = AutoTokenizer.from_pretrained(path)
    except Exception as e:
        # Best effort: report the failure and continue with the next maxim.
        # A maxim that failed here is simply absent from the dicts.
        print(f"Error loading {maxim}: {e}")

def get_score(maxim, text):
    """Return the reward score of ``text`` under the model for ``maxim``.

    The text is tokenized with that maxim's tokenizer (truncated to 512
    tokens), run through that maxim's model without gradient tracking, and
    the first row of the logits is converted to a Python number.

    Raises:
        KeyError: if no tokenizer or model is registered for ``maxim``.
    """
    encoded = tokenizers[maxim](
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)
    with torch.no_grad():
        logits = models[maxim](**encoded).logits
    # Assuming scalar output from reward model: .item() needs a one-element
    # tensor, so a multi-label head would fail here.
    return logits[0].item()

In [None]:
# Score Generation Loop
# For every candidate, compare the synthetic response against the original
# failed one under each maxim's reward model and record the score margins.
scored_results = []

for item in tqdm(candidates):
    prompt = item['prompt']
    chosen = item['synthetic_chosen']  # The new cooperative response
    rejected = item['original_chosen_failed']  # The old bad response

    # Construction for scoring (Prompt + Response)
    # Note: Adjust format based on how reward models were trained (usually Input + Response)
    text_chosen = f"{prompt}\n{chosen}"
    text_rejected = f"{prompt}\n{rejected}"

    # margin = score(chosen) - score(rejected), one entry per maxim.
    margins = {
        maxim: get_score(maxim, text_chosen) - get_score(maxim, text_rejected)
        for maxim in MAXIMS
    }
    # A pair is valid unless some maxim has a margin of zero or below.
    valid = not any(m <= 0 for m in margins.values())

    # The candidate dict is annotated in place and collected.
    item['synthetic_margins'] = margins
    item['synthetic_valid'] = valid
    scored_results.append(item)

print("Scoring complete.")

In [None]:
# Filter and Save
# Keep only the pairs whose synthetic response passed on every maxim.
final_clean = [r for r in scored_results if r['synthetic_valid']]

total = len(scored_results)
if total:
    print(f"Pass Rate: {len(final_clean)}/{total} ({len(final_clean)/total:.1%})")
else:
    # Nothing was scored: skip the ratio rather than divide by zero.
    print("Pass Rate: 0/0 (n/a)")

# The output file is written even when it is empty, so the result always exists.
with open("synthetic_clean_pairs.json", 'w', encoding="utf-8") as f:
    json.dump(final_clean, f, indent=2)

print("Saved synthetic_clean_pairs.json")