# GriceBench Part 3: Production Baseline Comparisons

## Models Tested (Industry Standard):
| Model | Params | Why Include |
|-------|--------|-------------|
| **Mistral-7B-Instruct** | 7B | Top open-source, beats many 13B models |
| **Qwen2.5-7B-Instruct** | 7B | Alibaba's latest, excellent performance |
| **Llama-3.2-3B-Instruct** | 3B | Meta's latest efficient model |
| **Phi-3-mini** | 3.8B | Microsoft's strong reasoning |
| **Gemma-2-2B-it** | 2B | Google's latest efficient model |

## GPU Optimizations:
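- 4-bit NF4 quantization (BitsAndBytes) for the 7B models
- Flash Attention 2 when available (note: the model-loading code in Cell 4 does not currently request it, so the library's default attention implementation is used)
- Automatic memory management (`device_map="auto"` placement; garbage collection and CUDA cache clearing after each model)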

**Estimated Runtime**: ~1.5-2 hours

## Setup:
1. Upload to Kaggle
2. Enable **GPU T4 x2**
3. Enable **Internet**
4. Run All

In [None]:
# Cell 1: Install Dependencies
print("Installing optimized dependencies...")
!pip install -q transformers>=4.40.0 accelerate>=0.27.0
!pip install -q bitsandbytes>=0.43.0
!pip install -q sentence-transformers
print("Done!")

In [None]:
# Cell 2: Configuration
import torch
import json
import gc
from pathlib import Path
from datetime import datetime
import numpy as np
from tqdm.auto import tqdm

# Run-wide settings: how many prompts to evaluate, the sampling parameters
# passed to generation, and where the report files are written.
CONFIG = dict(
    num_samples=150,
    max_new_tokens=200,
    temperature=0.7,
    output_dir="/kaggle/working/baseline_comparison",
)

# One row per baseline: (short name, Hugging Face repo id, load in 4-bit?).
_BASELINE_SPECS = (
    ("mistral_7b", "mistralai/Mistral-7B-Instruct-v0.3", True),
    ("qwen2.5_7b", "Qwen/Qwen2.5-7B-Instruct", True),
    ("llama3.2_3b", "meta-llama/Llama-3.2-3B-Instruct", False),
    ("phi3_mini", "microsoft/Phi-3-mini-4k-instruct", False),
    ("gemma2_2b", "google/gemma-2-2b-it", False),
)

# Mapping of short name -> {"id": repo id, "4bit": quantization flag},
# in the order the models are evaluated.
BASELINES = {
    short_name: {"id": repo_id, "4bit": quantize}
    for short_name, repo_id, quantize in _BASELINE_SPECS
}

# Make sure the output directory exists before any cell tries to write to it.
_output_root = Path(CONFIG["output_dir"])
_output_root.mkdir(parents=True, exist_ok=True)

# Report which GPU (if any) is visible and the size of the planned run.
_gpu_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'
print(f"GPU: {_gpu_label}")
print(f"Testing {len(BASELINES)} models on {CONFIG['num_samples']} prompts")

In [None]:
# Cell 3: Test Prompts
# Hand-written prompts grouped by category. Categories and the prompts inside
# each one are listed in the order they must appear in TEST_PROMPTS.
_PROMPTS_BY_TYPE = {
    "factual": [
        "What is the capital of France?",
        "How many planets are in our solar system?",
        "Who wrote Romeo and Juliet?",
        "What year did World War II end?",
        "What is the chemical symbol for gold?",
        "What is the speed of light?",
        "Who discovered penicillin?",
        "What is the largest organ in the human body?",
        "Which planet has the most moons?",
        "What is the smallest country by area?",
        "What is the Pythagorean theorem?",
        "Who painted the Mona Lisa?",
        "What is the boiling point of water?",
        "What does DNA stand for?",
        "Who invented the telephone?",
    ],
    "explanation": [
        "How does photosynthesis work?",
        "Why is the sky blue?",
        "How do vaccines work?",
        "What causes earthquakes?",
        "How does the internet work?",
        "Why do we dream?",
        "How does a computer processor work?",
        "What is machine learning?",
        "How do airplanes fly?",
        "Why do leaves change color?",
        "How does electricity work?",
        "What causes inflation?",
        "How does GPS work?",
        "How do black holes form?",
        "What is blockchain technology?",
    ],
    "advice": [
        "How can I improve my sleep quality?",
        "What's a good way to learn a new language?",
        "How do I make friends in a new city?",
        "What's the best way to save money?",
        "How can I be more productive?",
        "How should I prepare for a job interview?",
        "What's a healthy diet look like?",
        "How can I reduce stress?",
        "What's the best way to learn to code?",
        "How do I negotiate a salary raise?",
        "How can I improve public speaking?",
        "What should I consider when buying a house?",
        "How do I start investing?",
        "How can I build better habits?",
        "What's a good exercise routine for beginners?",
    ],
    "conversational": [
        "What's your favorite food?",
        "How was your day?",
        "What do you like to do for fun?",
        "Do you have any hobbies?",
        "What kind of music do you enjoy?",
        "Have you seen any good movies lately?",
        "What's your opinion on remote work?",
        "Do you prefer cats or dogs?",
        "What's your dream vacation destination?",
        "If you could have any superpower, what would it be?",
        "What's your favorite book?",
        "Do you prefer mornings or nights?",
        "What's something you're grateful for?",
        "What would you do if you won the lottery?",
        "What's your favorite season and why?",
    ],
    "technical": [
        "What is the difference between Python and JavaScript?",
        "Explain object-oriented programming.",
        "What is a REST API?",
        "How do neural networks learn?",
        "What's the difference between SQL and NoSQL?",
        "Explain recursion in programming.",
        "What is version control and why is Git popular?",
        "What is containerization and Docker?",
        "What is Big O notation?",
        "How does HTTPS encryption work?",
    ],
    "creative": [
        "Write a short poem about the ocean.",
        "Tell me a story about a robot learning emotions.",
        "Describe a perfect day.",
        "Write a haiku about autumn.",
        "Describe an imaginary planet.",
        "Write a motivational quote about perseverance.",
        "Describe the taste of your favorite food vividly.",
        "Describe what happiness looks like.",
        "Describe a futuristic city in year 3000.",
        "Write a letter to your future self.",
    ],
}

# Flatten into the list-of-dicts shape the rest of the notebook consumes:
# one {"context": ..., "type": ...} record per prompt, category by category.
TEST_PROMPTS = [
    {"context": question, "type": prompt_type}
    for prompt_type, questions in _PROMPTS_BY_TYPE.items()
    for question in questions
]

import random

# Bring TEST_PROMPTS to exactly CONFIG["num_samples"] entries.
# If there are too few prompts, pad with copies of randomly chosen existing
# prompts (fixed seed, so the padding is reproducible); if there are too many,
# truncate. The pool to sample from is snapshotted once instead of relying on
# a hard-coded prompt count, so adding or removing prompts above keeps working
# and padded duplicates are never themselves re-sampled.
random.seed(42)
_prompt_pool = list(TEST_PROMPTS)
if not _prompt_pool and CONFIG["num_samples"] > 0:
    raise ValueError("TEST_PROMPTS is empty; cannot pad to CONFIG['num_samples']")
while len(TEST_PROMPTS) < CONFIG["num_samples"]:
    # .copy() so each padded entry is an independent dict.
    TEST_PROMPTS.append(random.choice(_prompt_pool).copy())
TEST_PROMPTS = TEST_PROMPTS[:CONFIG["num_samples"]]
print(f"Test prompts: {len(TEST_PROMPTS)}")

In [None]:
# Cell 4: Optimized Generator
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_model(model_id, use_4bit=False):
    """Load a causal LM and its tokenizer from the Hugging Face hub.

    Args:
        model_id: Hub repository id passed to ``from_pretrained``.
        use_4bit: When true, load the weights with BitsAndBytes 4-bit NF4
            quantization (double quantization enabled).

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and placed
        with ``device_map="auto"``; the tokenizer is guaranteed to have a pad
        token (the EOS token is reused when none is defined).
    """
    print(f"Loading {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Pick one half-precision dtype and use it everywhere. Checking
    # is_available() first keeps this safe on a machine without CUDA.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if use_bf16 else torch.float16

    kwargs = {
        "trust_remote_code": True,
        "device_map": "auto",
        "torch_dtype": dtype,
    }

    if use_4bit:
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            # Same dtype as the non-quantized weights, so GPUs without
            # bfloat16 support compute in float16 instead.
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
    model.eval()

    # device_map="auto" may spread the model over several GPUs, so report the
    # total across all visible devices rather than only the current one.
    if has_cuda:
        mem = sum(
            torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())
        ) / 1e9
    else:
        mem = 0.0
    print(f"  Loaded! GPU Memory: {mem:.1f} GB")
    return model, tokenizer

def generate(model, tokenizer, context):
    """Sample one response to ``context`` and return it as plain text.

    The prompt is built with the tokenizer's chat template when it has one,
    otherwise with a simple ``User:/Assistant:`` layout. Sampling uses
    ``CONFIG["temperature"]`` and at most ``CONFIG["max_new_tokens"]`` new
    tokens.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        context: The user message to answer.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace and truncated to 500 characters.
    """
    uses_template = bool(getattr(tokenizer, "chat_template", None))
    if uses_template:
        msgs = [{"role": "user", "content": context}]
        prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    else:
        prompt = f"User: {context}\nAssistant:"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        # A rendered chat template already contains the special tokens it
        # needs; adding them again would duplicate e.g. the BOS token.
        add_special_tokens=not uses_template,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=CONFIG["max_new_tokens"],
            temperature=CONFIG["temperature"],
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the tokens produced after the prompt. Searching the decoded
    # text for the prompt string is unreliable: skip_special_tokens removes
    # the template markers, so the prompt no longer matches and would be
    # left inside the "response".
    resp = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    return resp[:500]

def unload(model, tokenizer):
    """Release this function's references, then collect garbage and clear the CUDA cache.

    Only the local references held by this call are dropped; any reference
    the caller still holds keeps the objects alive.
    """
    model = None
    tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Cell 5: Evaluator
from sentence_transformers import SentenceTransformer

print("Loading evaluator...")
# Sentence-embedding model used to score how related a response is to its prompt.
encoder = SentenceTransformer('all-MiniLM-L6-v2')


def evaluate(context, response):
    """Score a response with three heuristics and their weighted sum.

    Args:
        context: The prompt text.
        response: The generated text.

    Returns:
        A dict with keys ``relevance`` (dot product of the normalized
        embeddings of prompt and response), ``quantity`` (1.0 for 15-150
        whitespace-separated words, 0.5 when shorter, 0.7 when longer),
        ``clarity`` (share of distinct lower-cased words, scaled by 1.2 and
        capped at 1.0) and ``overall`` (0.4 / 0.3 / 0.3 weighted sum).
    """
    ctx_vec, resp_vec = encoder.encode([context, response], normalize_embeddings=True)

    # Length heuristic based on the whitespace-separated word count.
    n_words = len(response.split())
    if n_words < 15:
        quantity = 0.5
    elif n_words <= 150:
        quantity = 1.0
    else:
        quantity = 0.7

    # Lexical variety: distinct lower-cased words over total words
    # (denominator floored at 1 so an empty response scores 0).
    distinct_words = set(response.lower().split())
    variety = len(distinct_words) / max(1, n_words)
    clarity = min(1.0, variety * 1.2)

    relevance = float(np.dot(ctx_vec, resp_vec))
    overall = relevance * 0.4 + quantity * 0.3 + clarity * 0.3

    return {
        "relevance": relevance,
        "quantity": quantity,
        "clarity": clarity,
        "overall": overall,
    }


print("Evaluator ready!")

In [None]:
# Cell 6: Run All Baselines
import traceback

# Maps baseline short name -> list of per-prompt result dicts
# ({"context", "type", "response", "metrics"}). A model that fails to load
# gets no entry.
all_results = {}

for name, cfg in BASELINES.items():
    print(f"\n{'='*60}")
    print(f"Testing: {name}")
    print(f"{'='*60}")

    # Bound up front so the cleanup in `finally` works even if loading fails.
    model = tokenizer = None
    try:
        model, tokenizer = load_model(cfg["id"], cfg["4bit"])
        results = []
        failed = 0

        for p in tqdm(TEST_PROMPTS, desc=name):
            # Best effort per prompt: one bad generation must not abort the model.
            try:
                resp = generate(model, tokenizer, p["context"])
                metrics = evaluate(p["context"], resp)
                results.append({"context": p["context"], "type": p["type"], "response": resp, "metrics": metrics})
            except Exception as e:
                failed += 1
                print(f"Error: {e}")

        all_results[name] = results

        if failed:
            print(f"  {failed} of {len(TEST_PROMPTS)} prompts failed for {name}")

        if results:
            print(f"\nResults for {name}:")
            print(f"  Relevance: {np.mean([r['metrics']['relevance'] for r in results]):.3f}")
            print(f"  Overall:   {np.mean([r['metrics']['overall'] for r in results]):.3f}")

    except Exception as e:
        print(f"Failed to load {name}: {e}")
        traceback.print_exc()
    finally:
        # Drop this cell's own references BEFORE collecting: as long as the
        # names `model` / `tokenizer` still point at the objects, neither
        # gc.collect() nor empty_cache() can free them, and the previous
        # model would stay in GPU memory while the next one is loading.
        # Running this in `finally` also cleans up after a failed load.
        model = tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n{'='*60}")
print("ALL BASELINES COMPLETE!")
print(f"{'='*60}")

In [None]:
# Cell 7: Generate Report
# Metric columns, in the order they appear in the report table.
_METRICS = ("relevance", "quantity", "clarity", "overall")

# Per-model averages. Models with no successful generations are skipped.
# Values are cast to plain floats so json.dump writes them as numbers.
summary = {}
for name, results in all_results.items():
    if not results:
        continue
    summary[name] = {"n": len(results)}
    for m in _METRICS:
        summary[name][m] = float(np.mean([r["metrics"][m] for r in results]))

report = f"""# Baseline Comparison Report

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}

## Overall Scores (Higher is Better)

| Model | Relevance | Quantity | Clarity | Overall | N |
|-------|-----------|----------|---------|---------|---|
"""

# Table rows, best overall score first.
for name, s in sorted(summary.items(), key=lambda x: x[1]["overall"], reverse=True):
    report += f"| {name} | {s['relevance']:.3f} | {s['quantity']:.3f} | {s['clarity']:.3f} | **{s['overall']:.3f}** | {s['n']} |\n"

report += "\n## Best Model by Metric\n\n"
if summary:
    for m in _METRICS:
        best = max(summary.items(), key=lambda x: x[1][m])
        report += f"- **{m.capitalize()}**: {best[0]} ({best[1][m]:.3f})\n"
else:
    # Every model failed (or none was run): max() over an empty summary would
    # raise, so say so in the report instead of crashing before it is saved.
    report += "_No model produced any results._\n"

with open(f"{CONFIG['output_dir']}/baseline_comparison_report.md", "w", encoding="utf-8") as f:
    f.write(report)

with open(f"{CONFIG['output_dir']}/baseline_results.json", "w", encoding="utf-8") as f:
    json.dump({"summary": summary, "config": CONFIG}, f, indent=2, default=str)

print(report)

In [None]:
# Cell 8: Save All Responses
# Per-model list of every generated response with its metrics, with metric
# values coerced to plain floats for JSON.
responses = {
    model_name: [
        {
            "context": item["context"],
            "type": item["type"],
            "response": item["response"],
            "metrics": {metric: float(value) for metric, value in item["metrics"].items()},
        }
        for item in model_results
    ]
    for model_name, model_results in all_results.items()
}

with open(f"{CONFIG['output_dir']}/all_responses.json", "w") as out_file:
    json.dump(responses, out_file, indent=2)

# List everything now present in the output directory with its size in KB.
print("\nFiles saved:")
for entry in Path(CONFIG["output_dir"]).iterdir():
    size_kb = entry.stat().st_size/1024
    print(f"  {entry.name} ({size_kb:.1f} KB)")

_rule = "="*60
print("\n" + _rule)
print("PART 3 COMPLETE!")
print(_rule)