# Option E: Model Quality Comparison

This notebook provides tools to compare response quality across different model versions:
1. **Base Model** - Original pretrained model
2. **Fine-Tuned (BF16)** - After QLoRA training and merge
3. **INT4 ONNX** - Final browser-deployable version

## Objectives
- Generate responses from all model versions
- Compare quality systematically
- Verify INT4 quantization doesn't significantly degrade quality
- Document results for your technical report

## Setup

In [None]:
import json
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd

# For PyTorch models
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# For ONNX models
import onnxruntime as ort

## Configuration

In [None]:
@dataclass
class EvalConfig:
    """Settings shared by every model evaluated in this notebook.

    Adjust the three model locations to point at your own artifacts before
    running the cells below.
    """

    # --- Model locations (UPDATE THESE) ---
    # Identifier of the original (base) model.
    base_model_id: str = "google/gemma-3-1b-it"
    # Path to the merged BF16 fine-tuned model.
    finetuned_model_path: str = "./models/matcha-expert-merged"
    # Path to the INT4 ONNX export.
    onnx_model_path: str = "./models/matcha-expert-onnx-int4"

    # --- System prompt sent with every question ---
    # Written with an explicit "\n" so the exact text (including the space
    # that precedes the line break) is visible in the source.
    system_prompt: str = (
        "You are a matcha tea expert with deep knowledge of Japanese tea culture, \n"
        "preparation methods, health benefits, and culinary applications."
    )

    # --- Generation settings ---
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


config = EvalConfig()
print("Configuration loaded.")

## Test Questions

Define a set of test questions that cover different aspects of your domain.

In [None]:
# Questions grouped by difficulty. Dict insertion order is preserved, so the
# flattened list below runs basic -> intermediate -> advanced -> edge cases.
_QUESTIONS_BY_LEVEL = {
    "basic": [
        "What is the difference between ceremonial and culinary grade matcha?",
        "How do I make traditional matcha with a bamboo whisk?",
    ],
    "intermediate": [
        "What are the health benefits of matcha compared to regular green tea?",
        "How should I store matcha to keep it fresh?",
    ],
    "advanced": [
        "Can you explain the different tea cultivars used for matcha and their characteristics?",
        "What is the significance of the tea ceremony in Japanese culture?",
    ],
    "edge_cases": [
        "Is it safe to drink matcha every day? Are there any side effects?",
        "How can I tell if my matcha has gone bad?",
    ],
}

# Flat list consumed by the rest of the notebook.
TEST_QUESTIONS = [
    question
    for level_questions in _QUESTIONS_BY_LEVEL.values()
    for question in level_questions
]

print(f"Prepared {len(TEST_QUESTIONS)} test questions.")

## Model Loading Functions

In [None]:
def load_pytorch_model(model_path: str, device: str = "cuda"):
    """Load a causal language model and its tokenizer for evaluation.

    Args:
        model_path: Model identifier or directory passed to ``from_pretrained``.
        device: Value forwarded as ``device_map``. When a CUDA device is
            requested but CUDA is not available, the model is loaded on the
            CPU instead of failing.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in BF16 and in eval mode.
    """
    # Fall back gracefully on machines without a GPU instead of crashing
    # inside from_pretrained.
    if isinstance(device, str) and device.startswith("cuda") and not torch.cuda.is_available():
        print(f"CUDA is not available; loading on CPU instead of {device}.")
        device = "cpu"

    print(f"Loading model from {model_path}...")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
    model.eval()

    print(f"Model loaded on {device}")
    return model, tokenizer


def generate_pytorch_response(
    model,
    tokenizer,
    question: str,
    system_prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    """Generate one chat response from a PyTorch model.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        question: User message to answer.
        system_prompt: System message placed before the question.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 or less switches to
            greedy decoding, because sampling requires a positive temperature.
        top_p: Nucleus-sampling threshold, used only when sampling. Pass
            ``config.top_p`` to keep it in sync with the notebook settings.

    Returns:
        The decoded completion (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Render the conversation to text using the model's chat template.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered template already carries the special tokens it needs, so
    # the tokenizer must not add them a second time (e.g. a duplicate BOS).
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Greedy decoding: sampling parameters are meaningless here.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Keep only the tokens produced after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    return response.strip()

## Evaluation Data Structure

In [None]:
@dataclass
class EvalResult:
    """Everything recorded for one test question.

    Only ``question`` is required; responses start empty and scores start
    unset until they are filled in by later cells.
    """

    # The prompt shown to every model.
    question: str

    # Generated answers, one per model version ("" until generated).
    base_response: str = ""
    finetuned_response: str = ""
    onnx_response: str = ""

    # Manual 1-5 ratings, one per model version (None until scored).
    base_score: Optional[int] = None
    finetuned_score: Optional[int] = None
    onnx_score: Optional[int] = None

    # Free-form reviewer remarks.
    notes: str = ""


# Initialize results: one empty EvalResult per test question, in TEST_QUESTIONS order.
# Later cells fill in the responses and scores on these same objects.
eval_results = [EvalResult(question=q) for q in TEST_QUESTIONS]
print(f"Initialized {len(eval_results)} evaluation entries.")

## Step 1: Evaluate Base Model

In [None]:
# Load the base model and its tokenizer (uses load_pytorch_model's default device)
base_model, base_tokenizer = load_pytorch_model(config.base_model_id)

In [None]:
# Ask the base model every test question and store each answer on its record
print("Generating base model responses...\n")

total_questions = len(eval_results)
for position, entry in enumerate(eval_results, start=1):
    # Show progress with a short preview of the question.
    print(f"Question {position}/{total_questions}: {entry.question[:50]}...")

    entry.base_response = generate_pytorch_response(
        base_model,
        base_tokenizer,
        entry.question,
        config.system_prompt,
        config.max_new_tokens,
        config.temperature,
    )
    print(f"  Response length: {len(entry.base_response)} chars\n")

print("Base model evaluation complete!")

In [None]:
# Free memory before the next model is loaded
import gc

del base_model
# Collect reference cycles first: empty_cache() can only return memory that
# is no longer held by any live tensor.
gc.collect()
torch.cuda.empty_cache()
print("Base model unloaded.")

## Step 2: Evaluate Fine-Tuned Model

In [None]:
# Load the merged fine-tuned model and its tokenizer (uses load_pytorch_model's default device)
ft_model, ft_tokenizer = load_pytorch_model(config.finetuned_model_path)

In [None]:
# Ask the fine-tuned model every test question and store each answer on its record
print("Generating fine-tuned model responses...\n")

total_questions = len(eval_results)
for position, entry in enumerate(eval_results, start=1):
    # Show progress with a short preview of the question.
    print(f"Question {position}/{total_questions}: {entry.question[:50]}...")

    entry.finetuned_response = generate_pytorch_response(
        ft_model,
        ft_tokenizer,
        entry.question,
        config.system_prompt,
        config.max_new_tokens,
        config.temperature,
    )
    print(f"  Response length: {len(entry.finetuned_response)} chars\n")

print("Fine-tuned model evaluation complete!")

In [None]:
# Free memory before the ONNX model is loaded
import gc

del ft_model
# Collect reference cycles first: empty_cache() can only return memory that
# is no longer held by any live tensor.
gc.collect()
torch.cuda.empty_cache()
print("Fine-tuned model unloaded.")

## Step 3: Evaluate ONNX INT4 Model

Note: This section runs the ONNX model with onnxruntime in Python, which is a convenient approximation of the deployed model.
To confirm how the model actually behaves once deployed, also test it in a real browser.

In [None]:
# Note: Full ONNX text generation requires more setup.
# For a quick comparison, this cell uses the Hugging Face Optimum wrapper
# around ONNX Runtime.

# Define both names up front so later cells can rely on them existing
# (as None) whenever the ONNX evaluation is skipped.
onnx_model = None
onnx_tokenizer = None

# Keep the try body to the import alone: an ImportError raised while loading
# the model must not be misreported as "optimum not installed".
try:
    from optimum.onnxruntime import ORTModelForCausalLM
except ImportError:
    print("optimum not installed. Install with: pip install optimum[onnxruntime]")
    print("Skipping ONNX evaluation in Python - test in browser instead.")
else:
    print(f"Loading ONNX model from {config.onnx_model_path}...")
    onnx_model = ORTModelForCausalLM.from_pretrained(config.onnx_model_path)
    onnx_tokenizer = AutoTokenizer.from_pretrained(config.onnx_model_path)
    print("ONNX model loaded!")

In [None]:
# Generate ONNX responses with the same prompt format and sampling settings
# as the PyTorch models so the three versions are compared like for like.
if onnx_model is not None:
    print("Generating ONNX model responses...\n")

    for i, result in enumerate(eval_results):
        print(f"Question {i+1}/{len(eval_results)}: {result.question[:50]}...")

        # Format as chat
        messages = [
            {"role": "system", "content": config.system_prompt},
            {"role": "user", "content": result.question},
        ]
        prompt = onnx_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # The rendered template already carries the special tokens it needs,
        # so the tokenizer must not add them again (e.g. a duplicate BOS).
        inputs = onnx_tokenizer(
            prompt, return_tensors="pt", add_special_tokens=False
        )
        outputs = onnx_model.generate(
            **inputs,
            max_new_tokens=config.max_new_tokens,
            temperature=config.temperature,
            top_p=config.top_p,
            do_sample=True,
            pad_token_id=onnx_tokenizer.eos_token_id,
        )

        # Decode only the tokens produced after the prompt
        response = onnx_tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        result.onnx_response = response.strip()
        print(f"  Response length: {len(result.onnx_response)} chars\n")

    print("ONNX model evaluation complete!")
else:
    print("ONNX model not loaded. Fill in ONNX responses manually from browser testing.")

## View Results Side-by-Side

In [None]:
def display_comparison(result: EvalResult):
    """Print one question followed by each model's answer.

    Answers longer than 500 characters are cut to their first 500 characters
    and marked with "...". When no ONNX answer has been recorded, a
    placeholder line is printed in its place.
    """
    char_limit = 500

    def preview(text):
        # Shorten long answers so the three sections stay easy to scan.
        if len(text) > char_limit:
            return text[:char_limit] + "..."
        return text

    rule = "=" * 80
    print(rule)
    print(f"QUESTION: {result.question}")
    print(rule)

    print("\n--- BASE MODEL ---")
    print(preview(result.base_response))

    print("\n--- FINE-TUNED MODEL ---")
    print(preview(result.finetuned_response))

    print("\n--- ONNX INT4 MODEL ---")
    if not result.onnx_response:
        print("[Not evaluated - test in browser]")
    else:
        print(preview(result.onnx_response))

    print("\n")

In [None]:
# Display all comparisons: print every question with its three model answers,
# in the order the questions were evaluated
for result in eval_results:
    display_comparison(result)

## Manual Scoring

Score each response from 1-5:
- **5**: Excellent - accurate, detailed, well-structured
- **4**: Good - correct with minor omissions
- **3**: Acceptable - mostly correct, some issues
- **2**: Poor - significant errors
- **1**: Unacceptable - incorrect or nonsensical

In [None]:
# Example: Score the first question
# Update these scores based on your evaluation, using the 1-5 scale described above.
# The values below are placeholders, not real ratings.

eval_results[0].base_score = 3  # Example score
eval_results[0].finetuned_score = 5  # Example score
eval_results[0].onnx_score = 5  # Example score
eval_results[0].notes = "Fine-tuned model provides more detailed matcha-specific information."

# Continue for all questions (indices 1 .. len(eval_results) - 1), following the same pattern:
# eval_results[1].base_score = ...
# eval_results[1].finetuned_score = ...
# etc.

## Generate Summary Report

In [None]:
def generate_summary(results: list[EvalResult]) -> pd.DataFrame:
    """Build a compact, one-row-per-question summary table.

    Args:
        results: Evaluation records to summarise.

    Returns:
        A DataFrame with the columns ``Question``, ``Base Score``,
        ``Fine-Tuned Score``, ``ONNX Score`` and ``Notes``. Questions are
        shortened to 50 characters and notes to 30; "..." is appended only
        when text was actually cut. The columns are present even when
        ``results`` is empty.
    """

    def shorten(text: str, limit: int) -> str:
        # Mark truncation only when something was removed.
        return text[:limit] + "..." if len(text) > limit else text

    columns = ["Question", "Base Score", "Fine-Tuned Score", "ONNX Score", "Notes"]
    rows = [
        {
            "Question": shorten(r.question, 50),
            "Base Score": r.base_score,
            "Fine-Tuned Score": r.finetuned_score,
            "ONNX Score": r.onnx_score,
            "Notes": shorten(r.notes, 30),
        }
        for r in results
    ]

    # Passing the columns explicitly keeps the table shape stable for empty input.
    return pd.DataFrame(rows, columns=columns)


# Build the summary table from the current results and print it as plain text
summary_df = generate_summary(eval_results)
print(summary_df.to_string())

In [None]:
# Calculate averages
def _mean_score(scores):
    """Return the mean of the recorded (non-None) scores, or None if there are none."""
    recorded = [score for score in scores if score is not None]
    return sum(recorded) / len(recorded) if recorded else None


# Each average covers the questions that have a score for that model, so a
# partially scored notebook never divides by zero.
avg_base = _mean_score(r.base_score for r in eval_results)
avg_ft = _mean_score(r.finetuned_score for r in eval_results)
avg_onnx = _mean_score(r.onnx_score for r in eval_results)

if avg_base is None and avg_ft is None and avg_onnx is None:
    print("No scores recorded yet. Complete the manual scoring section above.")
else:
    print("\n" + "=" * 40)
    print("AVERAGE SCORES")
    print("=" * 40)
    labelled_averages = (
        ("Base Model:      ", avg_base),
        ("Fine-Tuned:      ", avg_ft),
        ("ONNX INT4:       ", avg_onnx),
    )
    for label, average in labelled_averages:
        # Models without any score yet are left out of the report.
        if average is not None:
            print(f"{label}{average:.2f}/5")

    # Relative change of the fine-tuned model over the base model. The "+"
    # format flag prints the correct sign for regressions as well.
    if avg_base is not None and avg_ft is not None and avg_base != 0:
        improvement_pct = (avg_ft - avg_base) / avg_base * 100
        print(f"\nImprovement:     {improvement_pct:+.1f}%")

## Save Results

In [None]:
# Save results to JSON (one object per question, same keys as EvalResult)
results_data = [
    {
        "question": r.question,
        "base_response": r.base_response,
        "finetuned_response": r.finetuned_response,
        "onnx_response": r.onnx_response,
        "base_score": r.base_score,
        "finetuned_score": r.finetuned_score,
        "onnx_score": r.onnx_score,
        "notes": r.notes,
    }
    for r in eval_results
]

output_path = Path("./evaluation_results.json")
# Write UTF-8 explicitly and keep non-ASCII characters (for example Japanese
# tea terms in the responses) readable instead of \uXXXX-escaped.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results_data, f, indent=2, ensure_ascii=False)

print(f"Results saved to {output_path}")

## Conclusions

Use this section to document your findings:

### Key Findings

1. **Fine-tuning effectiveness**: [Did fine-tuning improve domain responses?]

2. **Quantization impact**: [How much quality was lost with INT4?]

3. **Specific improvements**: [Which question types improved most?]

### Recommendations

- [Recommendation 1]
- [Recommendation 2]
- [Recommendation 3]