# Task 11.6 Solutions: Quality Benchmark Suite

This notebook contains solutions to the exercises from Task 11.6.

---

In [None]:
# Common imports
import torch
import numpy as np
import pandas as pd
import gc
import time
import math
import os
from tqdm import tqdm
from typing import Dict, List, Optional
from dataclasses import dataclass, field

def clear_memory():
    """Run a garbage-collection pass, then empty PyTorch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Report the installed PyTorch version and whether CUDA is available.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device index 0 is queried for its name.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Exercise 1: Benchmark a Larger Model

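Run the full benchmark suite (model size, GPU memory, perplexity, and generation speed) on a 7B-parameter model such as Llama-2-7B or Mistral-7B, at FP16, INT8, and INT4/NF4 precision.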

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

@dataclass
class BenchmarkResult:
    """Metrics recorded for one model under one quantization setting."""
    model_name: str
    quantization_type: str
    model_size_mb: float
    perplexity: Optional[float] = None
    tokens_per_second: Optional[float] = None
    memory_used_gb: Optional[float] = None
    task_scores: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """
        Flatten the result into a single dict suitable for a DataFrame row.

        The fixed columns come first; every entry of ``task_scores`` is then
        merged in, overriding a fixed column if the key is the same.
        """
        return {
            'Model': self.model_name,
            'Quantization': self.quantization_type,
            'Size (MB)': self.model_size_mb,
            'Perplexity': self.perplexity,
            'Tokens/s': self.tokens_per_second,
            'Memory (GB)': self.memory_used_gb,
            **self.task_scores,
        }


def benchmark_larger_model(
    model_id: str = "meta-llama/Llama-2-7b-hf",
    eval_texts: Optional[List[str]] = None
) -> List[BenchmarkResult]:
    """
    Benchmark one model at FP16, INT8 and INT4/NF4 precision.

    For each precision the model is loaded onto the CUDA device and measured
    for parameter size, allocated GPU memory, perplexity on ``eval_texts`` and
    greedy-decoding throughput. A configuration that raises is reported and
    skipped; the remaining configurations are still benchmarked.

    Args:
        model_id: HuggingFace model ID
        eval_texts: List of texts for perplexity evaluation; a small built-in
            set is used when None

    Returns:
        List of BenchmarkResult, one per precision that completed successfully
    """
    if eval_texts is None:
        eval_texts = [
            "The history of human civilization spans thousands of years.",
            "Machine learning algorithms learn patterns from data.",
            "The solar system contains eight planets orbiting the Sun.",
            "Climate change affects ecosystems around the world.",
            "Medical advances have improved human health outcomes.",
        ]

    results = []
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def calc_perplexity(model):
        """Token-weighted perplexity of `model` over eval_texts."""
        model.eval()
        total_loss = 0.0
        total_tokens = 0
        with torch.no_grad():
            for text in tqdm(eval_texts, desc="Perplexity", leave=False):
                enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
                input_ids = enc.input_ids.to(model.device)
                # The loss is averaged over the shifted targets, i.e. one
                # fewer than the sequence length.
                n_targets = input_ids.size(1) - 1
                if n_targets < 1:
                    continue
                outputs = model(input_ids, labels=input_ids)
                total_loss += outputs.loss.item() * n_targets
                total_tokens += n_targets
        if total_tokens == 0:
            # Previously surfaced as a bare "division by zero".
            raise ValueError("eval_texts contains no text with at least two tokens")
        return math.exp(total_loss / total_tokens)

    def calc_speed(model, max_new_tokens=50):
        """Greedy-decoding throughput in generated tokens per second."""
        inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
        # Same decoding settings for warm-up and timed run so the measurement
        # is reproducible and comparable across precisions.
        gen_kwargs = dict(do_sample=False, pad_token_id=tokenizer.pad_token_id)
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=10, **gen_kwargs)  # warm-up

        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        # Generation can stop early at EOS, so count what was actually
        # produced instead of assuming max_new_tokens.
        generated = output_ids.shape[1] - inputs["input_ids"].shape[1]
        return generated / elapsed

    def get_size(model):
        """Total bytes held by the model's parameters, in MB (1e6 bytes)."""
        return sum(p.numel() * p.element_size() for p in model.parameters()) / 1e6

    # Configurations to test
    configs = [
        ('FP16', {'torch_dtype': torch.float16}),
        ('INT8', {'quantization_config': BitsAndBytesConfig(load_in_8bit=True)}),
        ('INT4/NF4', {'quantization_config': BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4"
        )}),
    ]

    for name, kwargs in configs:
        print(f"\n{'='*60}")
        print(f"Benchmarking {name}")
        print(f"{'='*60}")

        clear_memory()

        model = None
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, device_map="cuda", **kwargs
            )

            size = get_size(model)
            memory = torch.cuda.memory_allocated() / 1e9

            print("Calculating perplexity...")
            ppl = calc_perplexity(model)

            print("Benchmarking speed...")
            speed = calc_speed(model)

            result = BenchmarkResult(
                model_name=model_id.split('/')[-1],
                quantization_type=name,
                model_size_mb=size,
                perplexity=ppl,
                tokens_per_second=speed,
                memory_used_gb=memory
            )
            results.append(result)

            print(f"\nResults:")
            print(f"  Size: {size:.1f} MB")
            print(f"  Memory: {memory:.2f} GB")
            print(f"  Perplexity: {ppl:.2f}")
            print(f"  Speed: {speed:.1f} tok/s")

        except Exception as e:
            print(f"Error: {e}")
        finally:
            # Drop the reference on failure too; otherwise a model that loaded
            # but failed mid-benchmark stays on the GPU while the next
            # configuration is being loaded.
            del model
            clear_memory()

    # Print summary
    print(f"\n{'='*80}")
    print("BENCHMARK SUMMARY")
    print(f"{'='*80}")
    df = pd.DataFrame([r.to_dict() for r in results])
    print(df.to_string(index=False))

    return results


# Example usage (uncomment to run). Each call loads the model three times
# (FP16, INT8, INT4/NF4) onto the CUDA device.
# results = benchmark_larger_model("meta-llama/Llama-2-7b-hf")
# results = benchmark_larger_model("mistralai/Mistral-7B-v0.1")
print("Larger model benchmark function defined")

## Exercise 2: Task-Specific Evaluation

Use lm-eval to evaluate on specific tasks (HellaSwag, ARC, etc.).

In [None]:
from lm_eval import evaluator

def evaluate_on_tasks(
    model_path: str,
    tasks: List[str] = ["hellaswag", "arc_easy"],
    batch_size: int = 8,
    num_fewshot: int = 0,
    device: str = "cuda"
) -> Dict[str, float]:
    """
    Evaluate a model on specific benchmark tasks using lm-eval.

    Args:
        model_path: Path to model (HF model ID or local path)
        tasks: List of task names to evaluate on. The default list is shared
            between calls and is passed through unmodified.
        batch_size: Batch size for evaluation
        num_fewshot: Number of few-shot examples
        device: Device string forwarded to lm-eval (defaults to "cuda")

    Returns:
        Dictionary mapping task names to accuracy scores. Tasks for which no
        accuracy metric could be found are omitted; the dict is empty when
        lm-eval returns no results.
    """
    def extract_accuracy(task_results):
        """Return the first accuracy-like metric in a task's result dict, or None."""
        # Exact key names, in the original priority order.
        for key in ('acc', 'acc_norm', 'acc,none', 'accuracy'):
            if key in task_results:
                return task_results[key]
        # Fall back to "metric,<filter>" style keys (e.g. "acc_norm,none").
        # The trailing comma keeps "acc_stderr,..." from matching "acc".
        for metric in ('acc', 'acc_norm', 'accuracy'):
            prefix = metric + ','
            for key, value in task_results.items():
                if key.startswith(prefix):
                    return value
        return None

    print(f"Evaluating {model_path} on tasks: {tasks}")
    print("This may take a while...")

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={model_path}",
        tasks=tasks,
        batch_size=batch_size,
        num_fewshot=num_fewshot,
        device=device
    )

    # Extract scores
    scores = {}
    print(f"\n{'='*50}")
    print("Evaluation Results")
    print(f"{'='*50}")

    # NOTE(review): simple_evaluate appears to return None on non-primary
    # ranks in multi-process runs — confirm against the installed lm-eval.
    per_task = (results or {}).get('results') or {}
    if not per_task:
        print("No results returned by lm-eval")
        return scores

    for task, task_results in per_task.items():
        acc = extract_accuracy(task_results)

        if acc is not None:
            scores[task] = acc
            print(f"{task}: {acc:.3f} ({acc*100:.1f}%)")
        else:
            print(f"{task}: Unable to extract accuracy")

    return scores


def compare_quantized_on_tasks(
    model_paths: Dict[str, str],
    tasks: List[str] = ["hellaswag"]
) -> pd.DataFrame:
    """
    Evaluate several models on the same tasks and tabulate the scores.

    Args:
        model_paths: Dict mapping a display name to a model path
        tasks: Task names passed to evaluate_on_tasks for every model

    Returns:
        DataFrame with one row per model: a 'Model' column plus one column
        per score returned by evaluate_on_tasks
    """
    rows = []

    for name, location in model_paths.items():
        print(f"\n{'='*60}")
        print(f"Evaluating: {name}")
        print(f"{'='*60}")

        # Scores are merged after the label, so a score key named 'Model'
        # would replace it.
        rows.append({'Model': name, **evaluate_on_tasks(location, tasks)})

        clear_memory()

    df = pd.DataFrame(rows)

    # Side-by-side table of every model
    print(f"\n{'='*60}")
    print("COMPARISON SUMMARY")
    print(f"{'='*60}")
    print(df.to_string(index=False))

    # Highest-scoring model per requested task
    print(f"\nBest models:")
    for task in tasks:
        if task not in df.columns:
            continue
        winner = df[task].idxmax()
        best_model = df.loc[winner, 'Model']
        best_score = df.loc[winner, task]
        print(f"  {task}: {best_model} ({best_score:.3f})")

    return df


# Example usage (uncomment to run). The GPTQ and AWQ entries below point at
# local directories rather than Hub model IDs.
# scores = evaluate_on_tasks("facebook/opt-350m", ["hellaswag"])
#
# comparison = compare_quantized_on_tasks(
#     {
#         "FP16": "facebook/opt-350m",
#         "GPTQ": "./quantized_models/opt-350m-gptq-4bit-g128",
#         "AWQ": "./quantized_models/opt-350m-awq-4bit-g128"
#     },
#     tasks=["hellaswag", "arc_easy"]
# )
print("Task evaluation functions defined")

## Challenge: Automated Model Selection Pipeline

Create a function that automatically selects the best quantization method subject to constraints on model size, perplexity increase, and generation speed.

In [None]:
def select_best_model(
    model_id: str,
    max_size_mb: float = 1000,
    max_ppl_increase: float = 0.5,
    min_speed_tok_s: float = 20,
    eval_texts: Optional[List[str]] = None
) -> dict:
    """
    Automatically select the best quantization method based on constraints.

    The model is benchmarked at FP16, INT8 and INT4. The smallest
    configuration that satisfies every constraint is recommended. If none
    does, the smallest benchmarked configuration is recommended as a
    fallback. If nothing could be benchmarked, the recommendation is None.

    Args:
        model_id: HuggingFace model ID
        max_size_mb: Maximum acceptable model size
        max_ppl_increase: Maximum acceptable perplexity increase vs the
            baseline (FP16, or the first configuration that ran if FP16 failed)
        min_speed_tok_s: Minimum acceptable generation speed
        eval_texts: Texts for perplexity evaluation; a small built-in set is
            used when None

    Returns:
        Dict with keys 'recommendation' (configuration name or None),
        'results' (list of per-configuration dicts) and 'constraints'
    """
    if eval_texts is None:
        eval_texts = [
            "Machine learning is transforming industries worldwide.",
            "The neural network learns patterns from training data.",
            "Deep learning enables breakthrough applications in AI.",
            "Natural language processing understands human text.",
            "Computer vision interprets images and video content.",
        ]

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    results = []
    baseline_ppl = None

    def calc_ppl(model):
        """Token-weighted perplexity of `model` over eval_texts."""
        model.eval()
        total_loss = 0.0
        total_tokens = 0
        with torch.no_grad():
            for text in eval_texts:
                enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
                input_ids = enc.input_ids.to(model.device)
                # The loss is averaged over the shifted targets (length - 1).
                n_targets = input_ids.size(1) - 1
                if n_targets < 1:
                    continue
                outputs = model(input_ids, labels=input_ids)
                total_loss += outputs.loss.item() * n_targets
                total_tokens += n_targets
        if total_tokens == 0:
            # Previously surfaced as a bare "division by zero".
            raise ValueError("eval_texts contains no text with at least two tokens")
        return math.exp(total_loss / total_tokens)

    def calc_speed(model, max_new_tokens=50):
        """Greedy-decoding throughput in generated tokens per second."""
        inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
        # Identical decoding settings for the warm-up and the timed run.
        gen_kwargs = dict(do_sample=False, pad_token_id=tokenizer.pad_token_id)
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=10, **gen_kwargs)  # warm-up
        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        # Generation can stop early at EOS; count the tokens actually produced.
        generated = output_ids.shape[1] - inputs["input_ids"].shape[1]
        return generated / elapsed

    # Configurations (FP16 first so it serves as the perplexity baseline)
    configs = [
        ('FP16', {'torch_dtype': torch.float16}),
        ('INT8', {'quantization_config': BitsAndBytesConfig(load_in_8bit=True)}),
        ('INT4', {'quantization_config': BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
        )}),
    ]

    for name, kwargs in configs:
        print(f"\nTesting {name}...")
        clear_memory()

        model = None
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_id, device_map="cuda", **kwargs
            )

            size = sum(p.numel() * p.element_size() for p in model.parameters()) / 1e6
            ppl = calc_ppl(model)
            speed = calc_speed(model)

            if baseline_ppl is None:
                if name != 'FP16':
                    # FP16 failed, so deltas are relative to this run instead.
                    print(f"  Warning: FP16 baseline unavailable; using {name} perplexity as baseline")
                baseline_ppl = ppl

            ppl_delta = ppl - baseline_ppl
            result = {
                'name': name,
                'size_mb': size,
                'perplexity': ppl,
                'ppl_delta': ppl_delta,
                'speed': speed,
                'meets_constraints': (
                    size <= max_size_mb and
                    ppl_delta <= max_ppl_increase and
                    speed >= min_speed_tok_s
                )
            }
            results.append(result)

            print(f"  Size: {size:.1f} MB")
            # Signed format: a negative delta no longer prints as "+-0.12".
            print(f"  PPL: {ppl:.2f} (delta: {ppl_delta:+.2f})")
            print(f"  Speed: {speed:.1f} tok/s")
            print(f"  Meets constraints: {result['meets_constraints']}")

        except Exception as e:
            print(f"  Error: {e}")
        finally:
            # Release the model on failure too, so it is not still resident
            # while the next configuration loads.
            del model

    clear_memory()

    # Find best option
    valid = [r for r in results if r['meets_constraints']]

    if valid:
        # Prefer smallest that meets constraints
        recommendation = min(valid, key=lambda r: r['size_mb'])['name']
    elif results:
        print("\n⚠️  No model meets all constraints!")
        # Return the most compressed as fallback
        recommendation = min(results, key=lambda r: r['size_mb'])['name']
    else:
        # Every configuration failed; min() on an empty list would raise.
        print("\n⚠️  No configuration could be benchmarked!")
        recommendation = None

    print(f"\n{'='*60}")
    print(f"🏆 RECOMMENDATION: {recommendation}")
    print(f"{'='*60}")

    return {
        'recommendation': recommendation,
        'results': results,
        'constraints': {
            'max_size_mb': max_size_mb,
            'max_ppl_increase': max_ppl_increase,
            'min_speed_tok_s': min_speed_tok_s
        }
    }


# Example usage (uncomment to run). The returned dict has the keys
# 'recommendation', 'results' and 'constraints'.
# result = select_best_model(
#     "facebook/opt-350m",
#     max_size_mb=500,
#     max_ppl_increase=1.0,
#     min_speed_tok_s=15
# )
print("Model selection pipeline defined")

In [None]:
# Extended version with GPTQ and AWQ support

def select_best_model_extended(
    model_id: str,
    gptq_path: Optional[str] = None,
    awq_path: Optional[str] = None,
    max_size_mb: float = 1000,
    max_ppl_increase: float = 0.5,
    min_speed_tok_s: float = 20,
    eval_texts: Optional[List[str]] = None
) -> dict:
    """
    Extended model selection including GPTQ and AWQ models.

    Runs select_best_model first, then benchmarks the GPTQ and AWQ
    checkpoints whose paths were given and exist on disk. A checkpoint that
    fails to load or evaluate is reported and skipped. The smallest candidate
    that meets every constraint is recommended, falling back to the smallest
    candidate overall, or None when nothing could be benchmarked.

    Args:
        model_id: Base model HuggingFace ID
        gptq_path: Path to GPTQ quantized model (skipped if None or missing)
        awq_path: Path to AWQ quantized model (skipped if None or missing)
        max_size_mb: Maximum model size constraint
        max_ppl_increase: Maximum perplexity increase vs the baseline
        min_speed_tok_s: Minimum speed constraint
        eval_texts: Texts for perplexity evaluation; a small built-in set is
            used when None

    Returns:
        Dict with keys 'recommendation', 'results' and 'constraints'
    """
    if eval_texts is None:
        eval_texts = [
            "Machine learning is transforming industries.",
            "Neural networks learn from training data.",
            "Deep learning enables new applications.",
        ] * 5

    # First run basic benchmarks
    basic_result = select_best_model(
        model_id, max_size_mb, max_ppl_increase, min_speed_tok_s, eval_texts
    )

    results = basic_result['results'].copy()
    # The first entry is the first configuration that ran (FP16 unless it
    # failed). With no entries, the first candidate benchmarked below
    # becomes the baseline instead of raising IndexError.
    baseline_ppl = results[0]['perplexity'] if results else None

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def model_device(model):
        """Device to place inputs on; falls back to the first parameter's device."""
        # NOTE(review): the GPTQ/AWQ wrapper classes are not guaranteed to
        # expose `.device` — confirm against the installed versions.
        device = getattr(model, 'device', None)
        if device is None:
            device = next(model.parameters()).device
        return device

    def calc_ppl(model):
        """Token-weighted perplexity of `model` over eval_texts."""
        model.eval()
        device = model_device(model)
        total_loss = 0.0
        total_tokens = 0
        with torch.no_grad():
            for text in eval_texts:
                enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
                input_ids = enc.input_ids.to(device)
                # The loss is averaged over the shifted targets (length - 1).
                n_targets = input_ids.size(1) - 1
                if n_targets < 1:
                    continue
                outputs = model(input_ids, labels=input_ids)
                total_loss += outputs.loss.item() * n_targets
                total_tokens += n_targets
        if total_tokens == 0:
            raise ValueError("eval_texts contains no text with at least two tokens")
        return math.exp(total_loss / total_tokens)

    def calc_speed(model, max_new_tokens=50):
        """Greedy-decoding throughput in generated tokens per second."""
        inputs = tokenizer("Hello", return_tensors="pt").to(model_device(model))
        # Identical decoding settings for the warm-up and the timed run.
        gen_kwargs = dict(do_sample=False, pad_token_id=tokenizer.pad_token_id)
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=10, **gen_kwargs)  # warm-up
        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_kwargs)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        # Generation can stop early at EOS; count the tokens actually produced.
        generated = output_ids.shape[1] - inputs["input_ids"].shape[1]
        return generated / elapsed

    def weights_size_mb(path):
        """Combined size of the .safetensors/.bin files directly inside `path`, in MB."""
        return sum(
            os.path.getsize(os.path.join(path, f))
            for f in os.listdir(path)
            if f.endswith('.safetensors') or f.endswith('.bin')
        ) / 1e6

    def load_gptq(path):
        # Imported lazily so a missing package is reported as a GPTQ error.
        from auto_gptq import AutoGPTQForCausalLM
        return AutoGPTQForCausalLM.from_quantized(
            path, device="cuda:0", use_safetensors=True
        )

    def load_awq(path):
        # Imported lazily so a missing package is reported as an AWQ error.
        from awq import AutoAWQForCausalLM
        return AutoAWQForCausalLM.from_quantized(path, fuse_layers=True)

    def benchmark_checkpoint(name, path, load_model):
        """Benchmark one pre-quantized checkpoint and append its row to results."""
        nonlocal baseline_ppl
        print(f"\nTesting {name}...")
        model = None
        try:
            clear_memory()
            model = load_model(path)

            size = weights_size_mb(path)
            ppl = calc_ppl(model)
            speed = calc_speed(model)

            if baseline_ppl is None:
                print(f"  Warning: no baseline available; using {name} perplexity as baseline")
                baseline_ppl = ppl
            ppl_delta = ppl - baseline_ppl

            results.append({
                'name': name,
                'size_mb': size,
                'perplexity': ppl,
                'ppl_delta': ppl_delta,
                'speed': speed,
                'meets_constraints': (
                    size <= max_size_mb and
                    ppl_delta <= max_ppl_increase and
                    speed >= min_speed_tok_s
                )
            })

            print(f"  Size: {size:.1f} MB")
            print(f"  PPL: {ppl:.2f}")
            print(f"  Speed: {speed:.1f} tok/s")

        except Exception as e:
            print(f"  {name} error: {e}")
        finally:
            # Release the model on failure too, before the next candidate loads.
            del model
            clear_memory()

    # Test GPTQ if available
    if gptq_path and os.path.exists(gptq_path):
        benchmark_checkpoint('GPTQ', gptq_path, load_gptq)

    # Test AWQ if available
    if awq_path and os.path.exists(awq_path):
        benchmark_checkpoint('AWQ', awq_path, load_awq)

    # Find best
    valid = [r for r in results if r['meets_constraints']]

    if valid:
        recommendation = min(valid, key=lambda r: r['size_mb'])['name']
    elif results:
        print("\n⚠️  No model meets all constraints!")
        recommendation = min(results, key=lambda r: r['size_mb'])['name']
    else:
        # Nothing was benchmarked; min() on an empty list would raise.
        print("\n⚠️  No configuration could be benchmarked!")
        recommendation = None

    # Summary table
    print(f"\n{'='*80}")
    print("COMPLETE BENCHMARK RESULTS")
    print(f"{'='*80}")
    print(f"{'Method':<10} {'Size (MB)':>12} {'PPL':>8} {'PPL Δ':>8} {'Speed':>10} {'Valid':>8}")
    print("-"*60)
    for r in results:
        valid_str = "✓" if r['meets_constraints'] else "✗"
        print(f"{r['name']:<10} {r['size_mb']:>12.1f} {r['perplexity']:>8.2f} {r['ppl_delta']:>+8.2f} {r['speed']:>10.1f} {valid_str:>8}")

    print(f"\n🏆 RECOMMENDATION: {recommendation}")

    return {
        'recommendation': recommendation,
        'results': results,
        'constraints': {
            'max_size_mb': max_size_mb,
            'max_ppl_increase': max_ppl_increase,
            'min_speed_tok_s': min_speed_tok_s
        }
    }


# Example usage (uncomment to run). A GPTQ or AWQ candidate is skipped when
# its path is not given or does not exist.
# result = select_best_model_extended(
#     "facebook/opt-350m",
#     gptq_path="./quantized_models/opt-350m-gptq-4bit-g128",
#     awq_path="./quantized_models/opt-350m-awq-4bit-g128",
#     max_size_mb=500
# )
print("Extended model selection pipeline defined")

---

## Key Takeaways

1. **Comprehensive benchmarking** - Test size, speed, quality, and task accuracy
2. **lm-eval is the standard** - Use it for reproducible benchmarks
3. **Constraints-based selection** - Define your priorities upfront
4. **Data-driven decisions** - Let the numbers guide your deployment choice