# GriceBench Phase 2: MRR Evaluation & Relation Repair Assessment

This notebook evaluates the retrieval-based Relation repair system using:
1. **MRR (Mean Reciprocal Rank)** - per morechanges.md lines 775-812
2. **Relevance Scores** - semantic similarity metrics

## Required Datasets to Add:

Add these to your Kaggle notebook:

1. **gricebench-scientific-fix** (your private dataset):
   - `relation_eval_set.json` (from Phase 1)
   - `topical_corpus.json`
   - `repair_data/repair_test.json`

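2. **sentence-transformers/all-MiniLM-L6-v2** (HuggingFace model) — optional: if it is not mounted at the path below, the notebook falls back to loading the model by name from the HuggingFace Hub (this needs internet access enabled for the notebook)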

Mount paths:
- `/kaggle/input/gricebench-scientific-fix/`
- `/kaggle/input/all-minilm-l6-v2/`

In [None]:
# Cell 1: Install dependencies
!pip install -q sentence-transformers

In [None]:
# Cell 2: Configuration
import os
import json
import numpy as np
import random
from pathlib import Path
from typing import Dict, List
import re

# Paths
DATA_INPUT = Path("/kaggle/input/gricebench-scientific-fix")
OUTPUT_DIR = Path("/kaggle/working")

# Check data: fail early with an actionable message when the dataset has not
# been attached, instead of an opaque error raised from iterdir().
print("Checking datasets...")
if not DATA_INPUT.is_dir():
    raise FileNotFoundError(
        f"Dataset directory not found: {DATA_INPUT}. "
        "Attach the 'gricebench-scientific-fix' dataset to this notebook."
    )
# Sorted so the listing is stable across runs (iterdir() order is arbitrary).
for path in sorted(DATA_INPUT.iterdir()):
    print(f"  - {path.name}")

# Model path: prefer the locally mounted copy; otherwise fall back to the
# model name so that it is resolved by the sentence-transformers loader.
MODEL_PATH = "/kaggle/input/all-minilm-l6-v2"
if not Path(MODEL_PATH).exists():
    MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
    print(f"\nUsing HuggingFace model: {MODEL_PATH}")
else:
    print(f"\nUsing local model: {MODEL_PATH}")

In [None]:
# Cell 3: Load sentence encoder
from sentence_transformers import SentenceTransformer

# MODEL_PATH is either a local directory or a model name (set in Cell 2).
print("Loading sentence encoder...")
encoder = SentenceTransformer(MODEL_PATH)
# The status marker was stored as mis-decoded bytes; restored to the intended emoji.
print("✅ Encoder loaded")

In [None]:
# Cell 4: Load evaluation data
#
# Produces:
#   eval_data        - list of Relation-violation examples to evaluate
#   corpus           - raw content of topical_corpus.json
#   corpus_responses - one retrievable entry per corpus item

# Section markers used inside a serialized repair example's input_text.
CONTEXT_PATTERN = re.compile(r'\[CONTEXT\](.*?)\[', re.DOTALL)
RESPONSE_PATTERN = re.compile(r'\[RESPONSE\](.*?)$', re.DOTALL)

# Try to load relation eval set (from Phase 1)
eval_set_path = DATA_INPUT / "relation_eval_set.json"
if eval_set_path.exists():
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(eval_set_path, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    print(f"Loaded {len(eval_data)} examples from relation_eval_set.json")
else:
    # Fallback: create from repair_test.json
    print("relation_eval_set.json not found, creating from repair_test.json...")
    repair_test_path = DATA_INPUT / "repair_data" / "repair_test.json"
    if not repair_test_path.exists():
        repair_test_path = DATA_INPUT / "repair_test.json"

    with open(repair_test_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    # Keep only examples tagged as Relation violations.
    relation_examples = []
    for i, item in enumerate(test_data):
        input_text = item.get("input_text", "")
        if "[VIOLATION=RELATION]" not in input_text:
            continue

        example = {
            "id": f"relation_eval_{i}",
            "input_text": input_text,
            "target_text": item.get("target_text", ""),
        }
        # "context"/"response" are only set when the marker is present.
        context_match = CONTEXT_PATTERN.search(input_text)
        response_match = RESPONSE_PATTERN.search(input_text)
        if context_match:
            example["context"] = context_match.group(1).strip()
        if response_match:
            example["response"] = response_match.group(1).strip()
        relation_examples.append(example)

    # Fixed seed so the same (at most 200) examples are drawn on every run.
    random.seed(42)
    eval_data = random.sample(relation_examples, min(200, len(relation_examples)))
    print(f"  Created {len(eval_data)} examples")

# Load corpus
corpus_path = DATA_INPUT / "topical_corpus.json"
with open(corpus_path, 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# An empty or non-list corpus cannot be indexed or encoded; report it clearly
# instead of failing with an IndexError/KeyError on corpus[0].
if not isinstance(corpus, list) or not corpus:
    raise ValueError(f"{corpus_path} must contain a non-empty JSON list of responses")

# Extract responses: dict items contribute their 'response' field (falling
# back to the item's string form); any other item is used as-is. Deciding per
# item keeps a mixed corpus from crashing on .get().
corpus_responses = [
    item.get('response', str(item)) if isinstance(item, dict) else item
    for item in corpus
]

print(f"Corpus size: {len(corpus_responses)}")

In [None]:
# Cell 5: Encode corpus (subsample for efficiency)

# Upper bound on the number of corpus entries that get embedded.
MAX_CORPUS = 10000
if len(corpus_responses) > MAX_CORPUS:
    # Fixed seed so the same subsample is drawn on every run.
    random.seed(42)
    corpus_sample = random.sample(corpus_responses, MAX_CORPUS)
else:
    corpus_sample = corpus_responses

print(f"Encoding {len(corpus_sample)} corpus responses...")
# Embeddings are L2-normalized, so a dot product between two of them is
# their cosine similarity.
corpus_embeddings = encoder.encode(
    corpus_sample,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64
)
# The status marker was stored as mis-decoded bytes; restored to the intended emoji.
print(f"✅ Corpus encoded: {corpus_embeddings.shape}")

## MRR Evaluation

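Mean Reciprocal Rank (MRR) measures how highly the retrieval system ranks a relevant response:
- For each context, retrieve the top-10 most similar responses from the corpus
- Find the rank of the first retrieved response whose cosine similarity to the reference response exceeds 0.7 (a context with no such response scores 0)
- $\mathrm{MRR} = \frac{1}{N}\sum_{i=1}^{N}\frac{1}{\mathrm{rank}_i}$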

In [None]:
# Cell 6: MRR Evaluation
#
# For every evaluation example: retrieve the TOP_K corpus entries closest to
# the context, then take the rank of the first one that is similar enough to
# the reference response. Examples without a usable context or reference, and
# examples with no relevant candidate, contribute a reciprocal rank of 0.

TOP_K = 10                 # candidates retrieved per context
RELEVANCE_THRESHOLD = 0.7  # similarity to the reference that counts as "relevant"

print("=" * 70)
print("MRR EVALUATION")
print("=" * 70)

mrr_scores = []
top1_hits = 0
top3_hits = 0
top10_hits = 0

for i, item in enumerate(eval_data):
    if (i + 1) % 50 == 0:
        print(f"  Processed {i + 1}/{len(eval_data)}")

    # Get context (explicit field first, otherwise parse it out of input_text)
    context = item.get('context', '')
    if not context and 'input_text' in item:
        match = re.search(r'\[CONTEXT\](.*?)\[', item['input_text'], re.DOTALL)
        if match:
            context = match.group(1).strip()

    # Get true response
    true_response = item.get('target_text', item.get('response', ''))

    # Without a context there is nothing to retrieve with, and without a
    # reference there is nothing to judge relevance against; embedding an
    # empty string would only produce arbitrary matches. Count as a miss.
    if not context or not true_response:
        mrr_scores.append(0.0)
        continue

    # Encode context
    context_embedding = encoder.encode(
        [context],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]

    # Find top-K from corpus, best match first
    similarities = np.dot(corpus_embeddings, context_embedding)
    top_indices = np.argsort(similarities)[-TOP_K:][::-1]

    # Encode true response
    true_embedding = encoder.encode(
        [true_response],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]

    # Rank (1-based) of the first retrieved candidate that is relevant
    rank = None
    for position, idx in enumerate(top_indices, start=1):
        sim_to_true = np.dot(corpus_embeddings[idx], true_embedding)
        if sim_to_true > RELEVANCE_THRESHOLD:
            rank = position
            break

    if rank is None:
        mrr_scores.append(0.0)
        continue

    mrr_scores.append(1.0 / rank)
    if rank == 1:
        top1_hits += 1
    if rank <= 3:
        top3_hits += 1
    # Any rank found lies within the retrieved top-K by construction.
    top10_hits += 1

# Calculate metrics
n = len(eval_data)
# np.mean([]) is NaN (and NaN is not valid JSON when the results are saved),
# so an empty evaluation set reports 0.0 like the accuracy metrics below.
mrr = float(np.mean(mrr_scores)) if mrr_scores else 0.0

results = {
    'mrr': mrr,
    'top1_accuracy': top1_hits / n if n > 0 else 0,
    'top3_accuracy': top3_hits / n if n > 0 else 0,
    'top10_accuracy': top10_hits / n if n > 0 else 0,
    'n_examples': n
}

print("\n" + "=" * 50)
print("MRR RESULTS")
print("=" * 50)
print(f"MRR:          {results['mrr']:.4f}")
print(f"Top-1:        {results['top1_accuracy']:.4f}")
print(f"Top-3:        {results['top3_accuracy']:.4f}")
print(f"Top-10:       {results['top10_accuracy']:.4f}")
print(f"Examples:     {results['n_examples']}")

In [None]:
# Cell 7: Interpretation & Verdict
#
# Maps the MRR from the evaluation cell onto a three-level verdict and prints
# the go/no-go decision for Phase 3. The emoji markers were stored as
# mis-decoded bytes in the original strings and are restored here.

EXCELLENT_MRR = 0.7   # at or above: retrieval is considered to work well
ACCEPTABLE_MRR = 0.5  # at or above: good enough to proceed to Phase 3

print("\n" + "=" * 70)
print("VERDICT")
print("=" * 70)

if results['mrr'] >= EXCELLENT_MRR:
    verdict = "EXCELLENT"
    action = "Retrieval system is working well. Proceed to Phase 3."
    emoji = "✅"
elif results['mrr'] >= ACCEPTABLE_MRR:
    verdict = "ACCEPTABLE"
    action = "Retrieval acceptable but could improve. Consider better embeddings."
    emoji = "⚠️"
else:
    verdict = "NEEDS IMPROVEMENT"
    action = "Retrieval below threshold. Recommend: use all-mpnet-base-v2 or larger corpus."
    emoji = "❌"

print(f"\n{emoji} {verdict}")
print(f"\nRecommendation: {action}")

# Decision point per morechanges.md
print("\n" + "-" * 50)
print("Decision Point:")
if results['mrr'] >= ACCEPTABLE_MRR:
    print("✅ MRR >= 0.5: Proceed to Phase 3 (Annotation)")
else:
    print("❌ MRR < 0.5: Fix retrieval before proceeding")
    print("   Options:")
    print("   1. Use better encoder (all-mpnet-base-v2)")
    print("   2. Expand corpus with more examples")
    print("   3. Use GPT-2 fallback for Relation repair")

In [None]:
# Cell 8: Save results
#
# Writes the metrics dictionary to the working directory as JSON. The emoji
# markers were stored as mis-decoded bytes in the original strings and are
# restored here.

output_path = OUTPUT_DIR / "relation_repair_mrr.json"
# Explicit encoding so the file does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Results saved to {output_path}")
print("\n📥 Download this file for your records.")

## Sample Analysis

View some example retrievals to understand performance.

In [None]:
# Cell 9: View sample retrievals
#
# Prints the three closest corpus entries for a handful of evaluation
# contexts so retrieval quality can be inspected by eye.

print("=" * 70)
print("SAMPLE RETRIEVALS")
print("=" * 70)

# Show 5 random examples. A dedicated seeded generator makes the selection
# reproducible even when no earlier cell happened to seed the global RNG,
# and leaves the global random state untouched.
sample_rng = random.Random(42)
sample_indices = sample_rng.sample(range(len(eval_data)), min(5, len(eval_data)))

for idx in sample_indices:
    item = eval_data[idx]
    context = item.get('context', '')
    if not context and 'input_text' in item:
        match = re.search(r'\[CONTEXT\](.*?)\[', item['input_text'], re.DOTALL)
        if match:
            context = match.group(1).strip()

    print(f"\n{'='*50}")

    # Retrieving with an empty query would only list arbitrary neighbours.
    if not context:
        print("CONTEXT: (none found; skipping retrieval)")
        continue

    # Get top-3 retrievals, best match first
    context_embedding = encoder.encode(
        [context],
        convert_to_numpy=True,
        normalize_embeddings=True
    )[0]
    similarities = np.dot(corpus_embeddings, context_embedding)
    top_indices = np.argsort(similarities)[-3:][::-1]

    print(f"CONTEXT: {context[:200]}..." if len(context) > 200 else f"CONTEXT: {context}")
    print(f"\nTOP-3 RETRIEVALS:")
    for rank, corpus_idx in enumerate(top_indices, 1):
        # str() guards the slicing below against non-string corpus entries.
        retrieved = str(corpus_sample[corpus_idx])
        sim = similarities[corpus_idx]
        # Only mark truncation when text was actually cut off, matching how
        # the CONTEXT line is printed.
        preview = f"{retrieved[:100]}..." if len(retrieved) > 100 else retrieved
        print(f"  {rank}. [sim={sim:.3f}] {preview}")

In [None]:
# Cell 10: Summary
#
# Closing banner followed by the headline metrics and the follow-up steps.

banner = "=" * 70
print(f"\n{banner}")
print("PHASE 2 MRR EVALUATION COMPLETE")
print(banner)

# The leading and trailing empty entries reproduce the blank line printed
# before "Summary:" and after the last step.
summary_lines = [
    "",
    "Summary:",
    "--------",
    f"MRR Score: {results['mrr']:.4f}",
    f"Top-1 Accuracy: {results['top1_accuracy']:.2%}",
    f"Top-3 Accuracy: {results['top3_accuracy']:.2%}",
    f"Top-10 Accuracy: {results['top10_accuracy']:.2%}",
    "",
    "Next Steps:",
    "-----------",
    "1. Download relation_repair_mrr.json",
    "2. If MRR >= 0.5, proceed to human annotation (Phase 3)",
    "3. If MRR < 0.5, run improvement notebook first",
    "",
]
print("\n".join(summary_lines))