# Lab 3.4.4: R1 vs Standard Model Comparison - SOLUTIONS

This notebook contains complete solutions to all exercises from Lab 3.4.4.

In [None]:
import ollama
import json
import re
import time
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, field
from collections import defaultdict
from pathlib import Path

# Model configuration
def _model_tag(entry):
    """Return the model tag of one `ollama.list()` entry, or None if it has none.

    Depending on the installed ollama client version, the tag is exposed under
    'model' or under 'name'; accept either so the cell runs on both.
    """
    return entry.get('model') or entry.get('name')


models = ollama.list()
model_names = [tag for tag in map(_model_tag, models.get('models', [])) if tag]

# Find models: first tag containing 'r1' is the reasoning model, first
# llama/qwen/mistral tag without 'r1' is the standard baseline.
R1_MODEL = None
STANDARD_MODEL = None

for name in model_names:
    lowered = name.lower()
    if 'r1' in lowered:
        if not R1_MODEL:
            R1_MODEL = name
    elif any(x in lowered for x in ['llama', 'qwen', 'mistral']) and not STANDARD_MODEL:
        STANDARD_MODEL = name

if not R1_MODEL:
    R1_MODEL = model_names[0] if model_names else "deepseek-r1:7b"
if not STANDARD_MODEL:
    # Prefer any installed non-R1 model that differs from the R1 pick, so the
    # comparison is not silently "a model against itself" when the baseline is
    # from a family other than llama/qwen/mistral.
    candidates = [n for n in model_names if n != R1_MODEL and 'r1' not in n.lower()]
    if candidates:
        STANDARD_MODEL = candidates[0]
    else:
        STANDARD_MODEL = model_names[0] if model_names else "qwen3:8b"

print(f"R1 Model: {R1_MODEL}")
print(f"Standard Model: {STANDARD_MODEL}")
if R1_MODEL == STANDARD_MODEL:
    print("Warning: R1 and standard model are the same; the comparison will not be meaningful.")

## Solution: Evaluation Data Classes

In [None]:
@dataclass
class EvalResult:
    """Outcome of one model answering one problem.

    The field comments describe how ``evaluate_model`` in this notebook
    populates each value.
    """
    question: str  # problem text, without any chain-of-thought suffix added to the prompt
    expected: str  # reference answer, stringified
    predicted: str  # extracted answer, or "N/A" when extraction yielded nothing
    correct: bool  # outcome of comparing predicted against expected
    response_time: float  # seconds elapsed around the chat call
    response_tokens: int  # rough estimate: length of the response text // 4
    thinking_tokens: int  # rough estimate of the <think> content size (see count_thinking_tokens)
    category: str  # key of the problem group this question came from
    full_response: str  # unmodified message content returned by the model


@dataclass
class ModelEvaluation:
    """All per-problem results for one model, plus summary statistics derived from them."""
    model_name: str
    results: List[EvalResult] = field(default_factory=list)

    @property
    def accuracy(self) -> float:
        """Share of results flagged correct; 0.0 when nothing has been recorded."""
        records = self.results
        if records:
            hits = [rec for rec in records if rec.correct]
            return len(hits) / len(records)
        return 0.0

    @property
    def avg_time(self) -> float:
        """Mean response time over all results; 0.0 when nothing has been recorded."""
        records = self.results
        if records:
            return sum([rec.response_time for rec in records]) / len(records)
        return 0.0

    @property
    def total_tokens(self) -> int:
        """Sum of the response-token estimates of every result."""
        return sum([rec.response_tokens for rec in self.results])

    @property
    def total_thinking_tokens(self) -> int:
        """Sum of the thinking-token estimates of every result."""
        return sum([rec.thinking_tokens for rec in self.results])

    def _grouped(self, attr: str) -> Dict[str, list]:
        """Collect the given result attribute into lists keyed by category."""
        buckets = {}
        for rec in self.results:
            buckets.setdefault(rec.category, []).append(getattr(rec, attr))
        return buckets

    def accuracy_by_category(self) -> Dict[str, float]:
        """Map each category to the fraction of its results flagged correct."""
        return {
            cat: sum(flags) / len(flags)
            for cat, flags in self._grouped('correct').items()
        }

    def time_by_category(self) -> Dict[str, float]:
        """Map each category to the mean response time of its results."""
        return {
            cat: sum(durations) / len(durations)
            for cat, durations in self._grouped('response_time').items()
        }

## Solution: Answer Extraction and Comparison

In [None]:
def _format_number(num_str: str) -> Optional[str]:
    """Normalise a matched numeric string ('1,234.0' -> '1234'); None if unparseable."""
    try:
        num = float(num_str.replace(',', ''))
        return str(int(num)) if num == int(num) else str(num)
    except (ValueError, OverflowError):
        # OverflowError: a digit run so long that float() yields inf
        return None


def extract_answer(response: str, expected_type: str = "number") -> Optional[str]:
    """
    Solution: Extract answer from response based on expected type.

    Args:
        response: Raw model output, possibly containing <think> blocks.
        expected_type: "number", "yes_no" or "multiple_choice".

    Returns:
        The normalised answer string ('42', 'yes'/'no', 'A'-'D'), or None when
        nothing suitable is found or the type is unknown.
    """
    # Remove thinking tokens: completed blocks first, then a block left open
    # because generation was cut off (its digits are reasoning, not an answer).
    response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    response = re.sub(r'<think>.*\Z', '', response, flags=re.DOTALL)
    response = response.strip()

    if expected_type == "number":
        # Must start with a digit so a lone ',' is never treated as a number;
        # optional leading '-' so explicit patterns can capture negatives too.
        number = r"-?\d[\d,]*(?:\.\d+)?"
        patterns = [
            rf"[Tt]he (?:final )?answer is[:\s]+\$?({number})",
            rf"[Aa]nswer[:\s]+\$?({number})",
            rf"=\s*\$?({number})\s*(?:$|\.|\n)",
            rf"(?:result|total|sum)[:\s]+\$?({number})",
        ]
        for pattern in patterns:
            matches = re.findall(pattern, response)
            if matches:
                # The last match is taken as the final statement of the answer
                formatted = _format_number(matches[-1])
                if formatted is not None:
                    return formatted

        # Fallback: last number anywhere in the text
        numbers = re.findall(number, response)
        if numbers:
            return _format_number(numbers[-1])

    elif expected_type == "yes_no":
        response_lower = response.lower()
        # An explicit "answer is yes/no" statement wins over incidental words
        explicit = re.findall(r"answer(?:\s+is)?[\s:*]*\b(yes|no)\b", response_lower)
        if explicit:
            return explicit[-1]
        # Whole-word matching so "not", "know" or "eyes" are not misread
        if re.search(r"\byes\b", response_lower):
            return 'yes'
        if re.search(r"\bno\b", response_lower):
            return 'no'

    elif expected_type == "multiple_choice":
        # Prefer an explicit "answer is X" statement; otherwise fall back to
        # the first standalone capital A-D in the text.
        explicit = re.findall(r"(?i:answer(?:\s+is)?)[\s:*(]*\b([A-D])\b", response)
        if explicit:
            return explicit[-1]
        match = re.search(r'\b([A-D])\b', response)
        if match:
            return match.group(1)

    return None


def count_thinking_tokens(response: str) -> int:
    """Estimate the number of tokens inside <think> blocks.

    The estimate is the character count of the joined block contents divided
    by 4 (integer division). A block that is opened but never closed, e.g.
    because generation was cut off, is counted up to the end of the response.
    """
    # `\Z` lets the final block terminate at end-of-text when </think> is missing
    matches = re.findall(r'<think>(.*?)(?:</think>|\Z)', response, re.DOTALL)
    thinking_text = ' '.join(matches)
    return len(thinking_text) // 4


def compare_answers(predicted: Optional[str], expected, tolerance: float = 0.01) -> bool:
    """Compare predicted to expected answer.

    Both values are first parsed as numbers (thousands separators removed).
    If that succeeds they match when their relative difference is below
    `tolerance` (absolute difference when the expected value is 0). Otherwise
    they are compared as case-insensitive, whitespace-trimmed strings.

    Args:
        predicted: Extracted answer, or None when extraction failed.
        expected: Reference answer; any value that can be stringified.
        tolerance: Relative tolerance for numeric comparison.

    Returns:
        True when the answers are considered equal, False otherwise
        (always False for a None prediction).
    """
    if predicted is None:
        return False

    # Try numeric comparison; only a parse failure should route to text
    # comparison, so catch ValueError rather than everything.
    try:
        pred_num = float(str(predicted).replace(',', ''))
        exp_num = float(str(expected).replace(',', ''))
    except ValueError:
        # String comparison
        return str(predicted).lower().strip() == str(expected).lower().strip()

    # NOTE(review): the relative tolerance also applies to integer answers, so
    # e.g. 390 is accepted for an expected 391 at the default 1% - confirm
    # whether exact matching is wanted for integer-valued references.
    if exp_num == 0:
        return abs(pred_num) < tolerance
    return abs(pred_num - exp_num) / abs(exp_num) < tolerance

## Solution: Complete Model Evaluation Function

In [None]:
def _infer_expected_type(expected) -> str:
    """Classify a reference answer as 'yes_no', 'multiple_choice' or 'number'."""
    label = str(expected)
    if label.lower() in ['yes', 'no']:
        return 'yes_no'
    if label.upper() in ['A', 'B', 'C', 'D']:
        return 'multiple_choice'
    return 'number'


def evaluate_model(
    model: str,
    problems: Dict[str, List],
    n_per_category: int = 5,
    use_cot: bool = True,
    verbose: bool = True,
) -> ModelEvaluation:
    """
    Solution: Comprehensive model evaluation.

    Sends up to `n_per_category` problems from each category to `model`,
    extracts and grades each answer, and records timing and size estimates.

    Args:
        model: Name of the model to query through ollama.
        problems: Mapping of category name to a list of problem dicts. Each
            problem is read via its 'question' key and its 'answer' key
            (falling back to 'numerical_answer').
        n_per_category: Maximum number of problems evaluated per category.
        use_cot: Append a "think step by step" cue to every question.
        verbose: Print per-category and per-problem progress.

    Returns:
        A ModelEvaluation holding one EvalResult per evaluated problem.
    """
    evaluation = ModelEvaluation(model_name=model)

    for category, probs in problems.items():
        if category == 'code':  # Skip code (needs execution to evaluate)
            continue

        if verbose:
            print(f"\n{'='*50}")
            print(f"Category: {category.upper()}")
            print('='*50)

        for i, prob in enumerate(probs[:n_per_category]):
            question = prob.get('question', '')
            expected = prob.get('answer', prob.get('numerical_answer', ''))
            exp_type = _infer_expected_type(expected)

            if verbose:
                print(f"\nProblem {i+1}: {question[:60]}...")

            # Build prompt
            if use_cot:
                prompt = f"{question}\n\nLet's think step by step:"
            else:
                prompt = question

            # Get response. perf_counter is monotonic, so the measured latency
            # cannot be skewed by system clock adjustments during the call.
            start_time = time.perf_counter()
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": 0.0, "num_predict": 1024}
            )
            elapsed = time.perf_counter() - start_time

            response_text = response['message']['content']

            # Extract and compare
            predicted = extract_answer(response_text, exp_type)
            correct = compare_answers(predicted, expected)
            thinking_tokens = count_thinking_tokens(response_text)
            # Rough size estimate (about 4 characters per token), not a tokenizer count
            response_tokens = len(response_text) // 4

            result = EvalResult(
                question=question,
                expected=str(expected),
                predicted=predicted or "N/A",
                correct=correct,
                response_time=elapsed,
                response_tokens=response_tokens,
                thinking_tokens=thinking_tokens,
                category=category,
                full_response=response_text,
            )
            evaluation.results.append(result)

            if verbose:
                status = "CORRECT" if correct else "WRONG"
                print(f"  Expected: {expected}, Predicted: {predicted} [{status}]")
                print(f"  Time: {elapsed:.1f}s, Tokens: {response_tokens}")

    return evaluation

## Solution: Detailed Comparison Report

In [None]:
def _ratio_label(numerator: float, denominator: float) -> str:
    """Format numerator/denominator as e.g. '2.5x', or 'n/a' when the denominator is not positive."""
    if denominator > 0:
        return f"{numerator / denominator:.1f}x"
    return "n/a"


def print_detailed_comparison(
    r1_eval: ModelEvaluation,
    std_eval: ModelEvaluation
):
    """
    Solution: Generate detailed comparison report.

    Prints overall metrics, per-category accuracy and latency tables, a
    cost-benefit summary and a recommendation based on the accuracy gap.

    Args:
        r1_eval: Evaluation of the reasoning (R1) model.
        std_eval: Evaluation of the standard model.
    """
    print("\n" + "=" * 70)
    print("COMPREHENSIVE MODEL COMPARISON REPORT")
    print("=" * 70)

    # Overall metrics. Values with a unit suffix are rendered to text first and
    # padded afterwards, so the unit stays attached to its number.
    r1_avg = f"{r1_eval.avg_time:.1f}s"
    std_avg = f"{std_eval.avg_time:.1f}s"

    print(f"\n{'Metric':<30} {'R1 Model':<18} {'Standard Model':<18}")
    print("-" * 70)
    print(f"{'Model Name':<30} {r1_eval.model_name[:16]:<18} {std_eval.model_name[:16]:<18}")
    print(f"{'Overall Accuracy':<30} {r1_eval.accuracy:<18.1%} {std_eval.accuracy:<18.1%}")
    print(f"{'Avg Response Time':<30} {r1_avg:<18} {std_avg:<18}")
    print(f"{'Total Tokens':<30} {r1_eval.total_tokens:<18} {std_eval.total_tokens:<18}")
    print(f"{'Thinking Tokens':<30} {r1_eval.total_thinking_tokens:<18} {std_eval.total_thinking_tokens:<18}")

    # Accuracy by category
    print("\n" + "-" * 70)
    print("ACCURACY BY CATEGORY")
    print("-" * 70)

    r1_by_cat = r1_eval.accuracy_by_category()
    std_by_cat = std_eval.accuracy_by_category()
    all_cats = set(r1_by_cat.keys()) | set(std_by_cat.keys())

    print(f"{'Category':<20} {'R1':<15} {'Standard':<15} {'Diff':<15}")
    for cat in sorted(all_cats):
        # A category missing from one evaluation is shown as 0
        r1_acc = r1_by_cat.get(cat, 0)
        std_acc = std_by_cat.get(cat, 0)
        diff = r1_acc - std_acc
        sign = "+" if diff > 0 else ""
        diff_label = f"{sign}{diff:.1%}"
        print(f"{cat:<20} {r1_acc:<15.1%} {std_acc:<15.1%} {diff_label:<15}")

    # Latency by category
    print("\n" + "-" * 70)
    print("LATENCY BY CATEGORY")
    print("-" * 70)

    r1_time = r1_eval.time_by_category()
    std_time = std_eval.time_by_category()

    print(f"{'Category':<20} {'R1':<15} {'Standard':<15} {'Ratio':<15}")
    for cat in sorted(all_cats):
        r1_t = r1_time.get(cat, 0)
        std_t = std_time.get(cat, 0)
        r1_label = f"{r1_t:.2f}s"
        std_label = f"{std_t:.2f}s"
        print(f"{cat:<20} {r1_label:<15} {std_label:<15} {_ratio_label(r1_t, std_t):<15}")

    # Cost-benefit analysis
    print("\n" + "-" * 70)
    print("COST-BENEFIT ANALYSIS")
    print("-" * 70)

    acc_improvement = r1_eval.accuracy - std_eval.accuracy

    print(f"Accuracy improvement:   {acc_improvement:+.1%}")
    print(f"Time overhead:          {_ratio_label(r1_eval.avg_time, std_eval.avg_time)}")
    print(f"Token overhead:         {_ratio_label(r1_eval.total_tokens, std_eval.total_tokens)}")

    # Efficiency metrics
    r1_correct = sum(1 for r in r1_eval.results if r.correct)
    std_correct = sum(1 for r in std_eval.results if r.correct)
    extra_correct = r1_correct - std_correct
    extra_time = sum(r.response_time for r in r1_eval.results) - sum(r.response_time for r in std_eval.results)

    print(f"\nExtra correct answers:  {extra_correct}")
    if extra_correct > 0:
        print(f"Time cost per extra:    {extra_time / extra_correct:.1f}s")

    # Recommendation
    print("\n" + "-" * 70)
    print("RECOMMENDATION")
    print("-" * 70)

    if acc_improvement > 0.15:
        print(f"R1 shows strong improvement (+{acc_improvement:.0%}).")
        print("RECOMMENDED for: complex reasoning, math, coding tasks")
    elif acc_improvement > 0.05:
        print(f"R1 shows moderate improvement (+{acc_improvement:.0%}).")
        print("RECOMMENDED for: complex queries; use standard for simple ones")
    elif acc_improvement > 0:
        print(f"R1 shows slight improvement (+{acc_improvement:.0%}).")
        print("Consider cost vs benefit; routing may help")
    else:
        print("Standard model matches or beats R1 on this benchmark.")
        print("Review test cases; R1 may still help on harder problems")

    print("=" * 70)

## Solution: Error Analysis

In [None]:
def analyze_errors(
    r1_eval: ModelEvaluation,
    std_eval: ModelEvaluation,
    show_examples: int = 3
):
    """
    Solution: Detailed error analysis comparing models.

    Buckets every question answered by both models into "both correct",
    "both wrong", "R1 only" and "standard only", prints the bucket sizes and
    then up to `show_examples` examples from each disagreement/failure bucket.

    Args:
        r1_eval: Evaluation of the reasoning (R1) model.
        std_eval: Evaluation of the standard model.
        show_examples: Maximum number of examples printed per bucket.
    """
    print("\n" + "=" * 70)
    print("ERROR ANALYSIS")
    print("=" * 70)

    # Results are matched by question text; a repeated question keeps its last result
    r1_results = {r.question: r for r in r1_eval.results}
    std_results = {r.question: r for r in std_eval.results}

    # Categorize problems
    both_correct = []
    both_wrong = []
    r1_only = []
    std_only = []

    for q in r1_results:
        if q in std_results:
            r1_c = r1_results[q].correct
            std_c = std_results[q].correct

            if r1_c and std_c:
                both_correct.append(q)
            elif not r1_c and not std_c:
                both_wrong.append(q)
            elif r1_c:
                r1_only.append(q)
            else:
                std_only.append(q)

    # Only questions present in both evaluations are bucketed, so use that
    # count as the denominator; the percentages then add up to 100%.
    total = len(both_correct) + len(both_wrong) + len(r1_only) + len(std_only)

    print(f"\nAgreement Analysis ({total} problems):")
    if total == 0:
        # Nothing to compare; avoids dividing by zero below
        print("  No problems were answered by both models.")
        return

    print(f"  Both correct:      {len(both_correct):3} ({len(both_correct)/total:.0%})")
    print(f"  Both wrong:        {len(both_wrong):3} ({len(both_wrong)/total:.0%})")
    print(f"  R1 only correct:   {len(r1_only):3} ({len(r1_only)/total:.0%})")
    print(f"  Std only correct:  {len(std_only):3} ({len(std_only)/total:.0%})")

    # Show examples
    if r1_only:
        print("\n" + "-" * 50)
        print("R1 SUCCEEDED, Standard FAILED:")
        print("-" * 50)
        for q in r1_only[:show_examples]:
            r1_r = r1_results[q]
            std_r = std_results[q]
            print(f"\nQ: {q[:70]}...")
            print(f"  Expected: {r1_r.expected}")
            print(f"  R1: {r1_r.predicted} (CORRECT)")
            print(f"  Standard: {std_r.predicted} (WRONG)")

    if std_only:
        print("\n" + "-" * 50)
        print("Standard SUCCEEDED, R1 FAILED:")
        print("-" * 50)
        for q in std_only[:show_examples]:
            r1_r = r1_results[q]
            std_r = std_results[q]
            print(f"\nQ: {q[:70]}...")
            print(f"  Expected: {r1_r.expected}")
            print(f"  R1: {r1_r.predicted} (WRONG)")
            print(f"  Standard: {std_r.predicted} (CORRECT)")

    if both_wrong:
        print("\n" + "-" * 50)
        print("BOTH FAILED (hardest problems):")
        print("-" * 50)
        for q in both_wrong[:show_examples]:
            r1_r = r1_results[q]
            print(f"\nQ: {q[:70]}...")
            print(f"  Expected: {r1_r.expected}")
            print(f"  R1: {r1_results[q].predicted}")
            print(f"  Std: {std_results[q].predicted}")

## Solution: Run Full Comparison

In [None]:
# Sample problems for testing
test_problems = {
    "math": [
        {"question": "What is 17 * 23?", "answer": 391},
        {"question": "What is 15% of 240?", "answer": 36},
        {"question": "If 3x + 7 = 22, what is x?", "answer": 5},
    ],
    "reasoning": [
        {"question": "A bat and ball cost $1.10. The bat costs $1 more than the ball. How much does the ball cost in cents?", "answer": 5},
        {"question": "Tom is taller than Jim. Jim is taller than Mary. Is Tom taller than Mary? Answer yes or no.", "answer": "yes"},
    ]
}

# Running the comparison queries both models once per problem, so it is opt-in:
# set this flag to True to execute it. Keeping it False lets "Run All" finish
# without requiring a running model server for this cell.
RUN_EVALUATION = False

if RUN_EVALUATION:
    print("Evaluating R1 Model...")
    r1_eval = evaluate_model(R1_MODEL, test_problems, n_per_category=3)

    print("\nEvaluating Standard Model...")
    std_eval = evaluate_model(STANDARD_MODEL, test_problems, n_per_category=3)

    print_detailed_comparison(r1_eval, std_eval)
    analyze_errors(r1_eval, std_eval)
else:
    print("Set RUN_EVALUATION = True to run the comparison.")

## Key Takeaways

1. **Fair comparison**: Match model sizes (7B vs 8B, 70B vs 70B)
2. **Use CoT for both**: Give the standard model the same prompting advantage
3. **Measure multiple metrics**: Accuracy, latency, tokens
4. **Analyze by category**: R1 may excel at some types more than others
5. **Error analysis**: Understand where each model fails