# Lab 2.5.1 Solution: Model Comparison Tool

This notebook provides the complete solution for the Challenge in Lab 2.5.1.

**Task**: Create a model comparison tool that:
1. Takes a task type as input
2. Finds the top 5 models for that task
3. Tests each model on the same inputs
4. Compares accuracy, speed, and memory usage
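5. Generates a recommendation

**Note**: This solution measures load time, inference latency, and GPU memory, and uses Hub download counts as a popularity signal. It stores each model's raw predictions for manual inspection rather than computing an accuracy score, because the sample inputs are unlabeled.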

---

In [None]:
import torch
from huggingface_hub import HfApi
from transformers import pipeline
import time
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import gc

# Pipeline device index used by the rest of the notebook:
# 0 selects the first CUDA GPU, -1 falls back to CPU.
DEVICE = 0 if torch.cuda.is_available() else -1

print("Model Comparison Tool Solution")
print("=" * 60)
print(f"CUDA available: {DEVICE >= 0}")
if DEVICE >= 0:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Complete Model Comparison Tool

In [None]:
@dataclass
class ModelBenchmark:
    """Outcome of benchmarking one model.

    Attributes:
        model_id: Hub identifier of the benchmarked model.
        downloads: Download count reported for the model.
        load_time_seconds: Wall-clock time spent building the pipeline.
        inference_time_ms: Mean wall-clock time per test input, in milliseconds.
        memory_gb: Growth in allocated CUDA memory after loading, in GB
            (stays 0 when running on CPU).
        predictions: Raw pipeline output for each test input, in input order.
        success: True only when loading and every inference call completed.
        error: Message of the exception that aborted the run, if any.
    """

    # Identity and popularity
    model_id: str
    downloads: int

    # Measurements
    load_time_seconds: float
    inference_time_ms: float
    memory_gb: float

    # Outputs and status
    predictions: List[Dict]
    success: bool
    error: Optional[str] = None


class ModelComparisonTool:
    """
    Tool for comparing multiple models on the same task.

    Supports:
    - text-classification
    - ner
    - question-answering

    Models are looked up on the Hugging Face Hub, loaded one at a time
    through a ``transformers`` pipeline, timed on a shared set of inputs,
    and summarised in a plain-text report.
    """

    # Pipeline task aliases that are not Hub task tags, mapped to the tag
    # the Hub indexes models under. Only used for the Hub search; the
    # pipeline itself still receives the task name the caller passed.
    _HUB_TASK_ALIASES = {
        "ner": "token-classification",
        "sentiment-analysis": "text-classification",
    }

    def __init__(self, device: int = -1):
        """
        Create the tool.

        Args:
            device: Pipeline device index. Negative values run on CPU;
                0 or greater enables the CUDA measurements.
        """
        self.device = device
        self.api = HfApi()

        # Default test inputs for each task
        self.default_inputs = {
            "text-classification": [
                "This product is absolutely amazing! Best purchase ever!",
                "Terrible quality. Complete waste of money.",
                "It's okay, does the job but nothing special."
            ],
            "ner": [
                "Apple CEO Tim Cook announced the new iPhone in Cupertino.",
                "Microsoft and Google compete in the cloud computing market."
            ],
            "question-answering": [
                {
                    "question": "What is the capital of France?",
                    "context": "Paris is the capital and largest city of France."
                }
            ]
        }

    @staticmethod
    def _looks_pytorch_loadable(model) -> bool:
        """Return True when Hub metadata suggests the model has PyTorch-loadable weights."""
        library = (getattr(model, "library_name", None) or "").lower()
        if not library:
            # No library metadata: keep the model and let the benchmark decide.
            return True
        if "pytorch" in library:
            return True
        # Hub metadata usually records the framework in `tags`, while
        # `library_name` is the loading library (e.g. "transformers").
        tags = {str(tag).lower() for tag in (getattr(model, "tags", None) or [])}
        return bool(tags & {"pytorch", "safetensors"})

    @staticmethod
    def _run_pipeline(pipe, task: str, inp):
        """Call the pipeline with one input, unpacking question/context dicts for QA."""
        if task == "question-answering":
            return pipe(**inp)
        return pipe(inp)

    def find_top_models(self, task: str, limit: int = 5) -> List[Dict]:
        """
        Find top models for a given task.

        Args:
            task: Pipeline task type (aliases such as "ner" are translated
                to the corresponding Hub task tag for the search)
            limit: Maximum number of models

        Returns:
            List of model info dictionaries with "id", "downloads" and
            "likes" keys, most-downloaded first
        """
        print(f"\nSearching for top {limit} models for '{task}'...")

        models = self.api.list_models(
            filter=self._HUB_TASK_ALIASES.get(task, task),
            sort="downloads",
            direction=-1,
            limit=limit * 2  # Get more to filter
        )

        results = []
        for m in models:
            if not self._looks_pytorch_loadable(m):
                continue
            results.append({
                "id": m.id,
                # The Hub may omit counters; normalise so later max() and
                # thousands-separator formatting cannot hit None.
                "downloads": m.downloads or 0,
                "likes": m.likes or 0
            })
            if len(results) >= limit:
                break

        print(f"Found {len(results)} models")
        return results

    def benchmark_model(
        self,
        model_id: str,
        task: str,
        test_inputs: List,
        downloads: int = 0,
        warmup_runs: int = 2
    ) -> "ModelBenchmark":
        """
        Benchmark a single model.

        Failures while loading or running the model are recorded on the
        returned result (``success=False`` and ``error`` set) instead of
        being raised, so one broken model does not abort a comparison.

        Args:
            model_id: HuggingFace model ID
            task: Pipeline task type
            test_inputs: List of test inputs (must be non-empty)
            downloads: Model download count
            warmup_runs: Number of warmup inferences

        Returns:
            ModelBenchmark with results
        """
        use_gpu = self.device >= 0

        result = ModelBenchmark(
            model_id=model_id,
            downloads=downloads,
            load_time_seconds=0,
            inference_time_ms=0,
            memory_gb=0,
            predictions=[],
            success=False
        )

        if not test_inputs:
            # Fail early with a clear message rather than an IndexError
            # from the warmup or a ZeroDivisionError from the averaging.
            result.error = "No test inputs provided"
            print(f"  Skipping {model_id}: {result.error}")
            return result

        # Clear memory so the baseline is not inflated by earlier models
        gc.collect()
        initial_memory = 0.0
        if use_gpu:
            torch.cuda.empty_cache()
            initial_memory = torch.cuda.memory_allocated(self.device) / 1e9

        pipe = None
        try:
            # Load model
            print(f"  Loading {model_id}...", end=" ", flush=True)
            start = time.perf_counter()

            pipe = pipeline(
                task,
                model=model_id,
                device=self.device,
                torch_dtype=torch.bfloat16 if use_gpu else torch.float32
            )

            result.load_time_seconds = time.perf_counter() - start
            if use_gpu:
                result.memory_gb = (
                    torch.cuda.memory_allocated(self.device) / 1e9 - initial_memory
                )
            print(f"loaded in {result.load_time_seconds:.1f}s")

            # Warmup
            for _ in range(warmup_runs):
                self._run_pipeline(pipe, task, test_inputs[0])

            # Benchmark: synchronise so queued GPU work is not mis-attributed
            if use_gpu:
                torch.cuda.synchronize(self.device)
            start = time.perf_counter()

            for inp in test_inputs:
                result.predictions.append(self._run_pipeline(pipe, task, inp))

            if use_gpu:
                torch.cuda.synchronize(self.device)
            elapsed = time.perf_counter() - start
            result.inference_time_ms = elapsed * 1000 / len(test_inputs)
            result.success = True

        except Exception as e:
            # Deliberately broad: any load or inference error marks this
            # model as failed and the comparison moves on.
            print(f"FAILED: {e}")
            result.error = str(e)

        finally:
            # Cleanup on success and failure alike, so a model that loaded
            # but failed does not stay resident for the next benchmark.
            del pipe
            gc.collect()
            if use_gpu:
                torch.cuda.empty_cache()

        return result

    def compare_models(
        self,
        task: str,
        num_models: int = 5,
        test_inputs: Optional[List] = None
    ) -> List["ModelBenchmark"]:
        """
        Compare top models for a task.

        Args:
            task: Pipeline task type
            num_models: Number of models to compare
            test_inputs: Custom test inputs (or use defaults)

        Returns:
            List of ModelBenchmark results, one per model found, including
            failed ones
        """
        # Get test inputs
        if test_inputs is None:
            test_inputs = self.default_inputs.get(task, ["Test input"])

        # Find models
        models = self.find_top_models(task, num_models)

        # Benchmark each
        print(f"\nBenchmarking {len(models)} models...")
        print("-" * 60)

        return [
            self.benchmark_model(
                model["id"],
                task,
                test_inputs,
                downloads=model["downloads"]
            )
            for model in models
        ]

    def generate_report(self, results: List["ModelBenchmark"]) -> str:
        """
        Generate a comparison report with recommendation.

        Only successful benchmarks are reported. The overall pick maximises
        ``downloads / max_downloads - 0.5 * time / max_time``.

        Args:
            results: List of benchmark results

        Returns:
            Formatted report string
        """
        successful = [r for r in results if r.success]

        if not successful:
            return "No models completed successfully."

        def downloads_of(r) -> int:
            # Treat a missing download count as zero.
            return r.downloads or 0

        rule = "=" * 80
        report = ["\n" + rule, "MODEL COMPARISON REPORT", rule]

        # Table header
        report.append(f"\n{'Model':<45} {'Load(s)':<10} {'Infer(ms)':<12} {'Memory(GB)':<12}")
        report.append("-" * 80)

        # Rows sorted by inference time, fastest first
        for r in sorted(successful, key=lambda x: x.inference_time_ms):
            report.append(
                f"{r.model_id[:44]:<45} {r.load_time_seconds:<10.2f} "
                f"{r.inference_time_ms:<12.2f} {r.memory_gb:<12.2f}"
            )

        # Recommendations
        report.append("\n" + rule)
        report.append("RECOMMENDATIONS")
        report.append(rule)

        # Fastest
        fastest = min(successful, key=lambda x: x.inference_time_ms)
        report.append(f"\n FASTEST: {fastest.model_id}")
        report.append(f"   Inference: {fastest.inference_time_ms:.2f}ms")

        # Smallest memory: only rank models whose memory was measured
        # (memory_gb stays 0 on CPU runs).
        measured = [r for r in successful if r.memory_gb > 0]
        if measured:
            smallest = min(measured, key=lambda x: x.memory_gb)
            report.append(f"\n SMALLEST: {smallest.model_id}")
            report.append(f"   Memory: {smallest.memory_gb:.2f} GB")
        else:
            report.append("\n SMALLEST: n/a (no GPU memory measurements)")

        # Most popular (by downloads)
        most_popular = max(successful, key=downloads_of)
        report.append(f"\n MOST POPULAR: {most_popular.model_id}")
        report.append(f"   Downloads: {downloads_of(most_popular):,}")

        # Overall recommendation (balance of speed and popularity).
        # Compute the maxima once; `or 1` avoids dividing by zero when
        # every value is 0.
        max_time = max(r.inference_time_ms for r in successful) or 1
        max_downloads = max(downloads_of(r) for r in successful) or 1

        def score(r):
            # Lower is better for time, higher is better for downloads
            time_score = r.inference_time_ms / max_time
            pop_score = downloads_of(r) / max_downloads
            return pop_score - time_score * 0.5  # Weight popularity more

        best_overall = max(successful, key=score)
        report.append(f"\n OVERALL RECOMMENDATION: {best_overall.model_id}")
        report.append(f"   Good balance of speed ({best_overall.inference_time_ms:.1f}ms) ")
        report.append(f"   and community trust ({downloads_of(best_overall):,} downloads)")

        report.append("\n" + rule)

        return "\n".join(report)

## Using the Tool

In [None]:
# Initialize the comparison tool on the device chosen during setup
# (DEVICE is forwarded as the tool's `device` argument).
tool = ModelComparisonTool(device=DEVICE)

In [None]:
# Benchmark text-classification models using the tool's built-in sample inputs
print("\n" + "=" * 60, "Comparing TEXT CLASSIFICATION models", "=" * 60, sep="\n")

results = tool.compare_models("text-classification", num_models=5)

print(tool.generate_report(results))

In [None]:
# Benchmark NER models; only three are compared to keep the demo quick
print("\n" + "=" * 60, "Comparing NER models", "=" * 60, sep="\n")

ner_results = tool.compare_models("ner", num_models=3)

print(tool.generate_report(ner_results))

In [None]:
# Caller-supplied sentences that replace the tool's default test inputs
custom_inputs = [
    "I absolutely love this new feature!",
    "This update broke everything. Very disappointed.",
    "Works as expected, no complaints.",
    "Best software I've ever used!",
    "Buggy and slow. Needs improvement.",
]

print("\n" + "=" * 60, "Comparing with CUSTOM test inputs", "=" * 60, sep="\n")

custom_results = tool.compare_models(
    "text-classification",
    test_inputs=custom_inputs,
    num_models=3,
)

print(tool.generate_report(custom_results))

## Cleanup

In [None]:
# Collect Python garbage first, then ask PyTorch to empty its CUDA cache
# and report how much GPU memory is still allocated.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    print(f"GPU memory: {allocated_gb:.2f} GB")

print("\nSolution complete!")