# Task 15.1 Solution: LLM Benchmark Suite

This notebook provides solutions to the exercises in the benchmark suite notebook.

---

## Exercise 1 Solution: Compare Two Models

**Task:** Run a full benchmark comparison on two models of your choice.

In [None]:
import os
import gc
import json
import glob
import time
import subprocess
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Setup: benchmark outputs live in ../data/benchmark_results, resolved against
# the kernel's current working directory and created on first run.
NOTEBOOK_DIR = Path.cwd()
RESULTS_DIR = str(NOTEBOOK_DIR.joinpath("../data/benchmark_results").resolve())
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Results directory: {RESULTS_DIR}")

In [None]:
def clear_memory() -> None:
    """Run the garbage collector and, if CUDA is available, empty the GPU cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Hand cached allocator blocks back first, then wait for queued GPU work.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

def _load_latest_results(output_path: str):
    """Return the newest lm-eval results JSON found under output_path, or None."""
    # Older lm-eval releases write <output_path>/results.json, newer ones write
    # <output_path>/<sanitized model name>/results_<timestamp>.json. A recursive
    # "**" matches zero or more directories, so one pattern covers both layouts.
    pattern = os.path.join(glob.escape(output_path), "**", "results*.json")
    candidates = glob.glob(pattern, recursive=True)
    if not candidates:
        return None

    # Re-running with the same output_name leaves older files behind; the most
    # recently modified file belongs to the run that just finished.
    newest = max(candidates, key=os.path.getmtime)
    with open(newest, 'r', encoding='utf-8') as f:
        return json.load(f)


def run_benchmark(
    model_name: str,
    tasks: list,
    output_name: str,
    batch_size: int = 8,
    limit: int = None,
    dtype: str = "bfloat16"
) -> dict:
    """
    Run lm-eval benchmark on a model.

    Invokes the ``lm_eval`` command-line tool in a subprocess, writing its
    output under ``RESULTS_DIR/<output_name>``, then loads the results JSON
    produced by that run.

    Args:
        model_name: HuggingFace model path
        tasks: List of benchmark tasks
        output_name: Name for output directory
        batch_size: Batch size for evaluation
        limit: Optional limit on number of samples (None or 0 means no limit)
        dtype: Data type for model

    Returns:
        Dictionary of results, or None if the ``lm_eval`` executable could not
        be started, the process exited with a non-zero status, or no results
        file was found.
    """
    clear_memory()

    output_path = f"{RESULTS_DIR}/{output_name}"
    tasks_str = ",".join(tasks)

    cmd = [
        "lm_eval",
        "--model", "hf",
        "--model_args", f"pretrained={model_name},dtype={dtype}",
        "--tasks", tasks_str,
        "--batch_size", str(batch_size),
        "--output_path", output_path
    ]

    if limit:
        cmd.extend(["--limit", str(limit)])

    print(f"\nüöÄ Starting evaluation of {model_name}")
    print(f"   Tasks: {tasks_str}")
    print(f"   Limit: {limit if limit else 'Full evaluation'}")

    start_time = time.time()
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # The lm_eval executable is not on PATH; report it like any other
        # failure instead of aborting the whole comparison loop.
        print(f"\n‚ùå Error: {exc}")
        return None
    elapsed = time.time() - start_time

    print(f"\n‚è±Ô∏è  Completed in {elapsed/60:.1f} minutes")

    if result.returncode != 0:
        print(f"\n‚ùå Error: {result.stderr}")
        return None

    # Load results
    return _load_latest_results(output_path)

In [None]:
# Step 1: Define models to compare
# Two compact models from different families and of different sizes
my_models = [
    dict(
        name="microsoft/phi-2",
        size="2.7B",
        description="Microsoft Phi-2 - compact but powerful",
    ),
    dict(
        name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        size="1.1B",
        description="TinyLlama - efficient small model",
    ),
]

# Step 2: Define benchmarks (at least 3)
my_benchmarks = ["hellaswag", "arc_easy", "winogrande"]

print("üìã Models to compare:")
for entry in my_models:
    # Each entry carries exactly the three keys the template refers to.
    print("  ‚Ä¢ {name} ({size}) - {description}".format(**entry))

print(f"\nüìä Benchmarks: {my_benchmarks}")

In [None]:
# Step 3: Run evaluations
# Evaluate each model in turn and keep only the runs that returned results.
all_results = {}
banner = "=" * 60

for spec in my_models:
    hf_id = spec['name']
    # Turn the model id into a directory-friendly stem ('/' and '-' -> '_').
    dir_stem = hf_id.translate(str.maketrans('/-', '__'))

    print("\n" + banner)
    print(f"Evaluating: {hf_id} ({spec['size']})")
    print(banner)

    outcome = run_benchmark(
        model_name=hf_id,
        tasks=my_benchmarks,
        output_name=f"{dir_stem}_comparison",
        batch_size=8,
        limit=100,  # Remove for full evaluation
    )

    if outcome:
        all_results[hf_id] = outcome

    # Free memory before the next model is loaded.
    clear_memory()

In [None]:
# Step 4: Create comparison visualization
def create_comparison_table(results_dict: dict, benchmarks: list) -> pd.DataFrame:
    """
    Create a comparison DataFrame of per-task accuracy (in percent).

    Args:
        results_dict: Maps model name to the parsed lm-eval results JSON; the
            per-task metrics are read from its 'results' entry.
        benchmarks: Task names to place first, in this order, among the
            columns. Tasks present in the results but not listed here are
            kept and follow after them.

    Returns:
        DataFrame indexed by short model name (the text after the last '/'),
        one column per task plus a trailing 'Average' column, rounded to two
        decimals. Tasks without an accuracy metric are omitted rather than
        counted as 0. An empty results_dict yields an empty DataFrame.
    """
    # lm-eval 0.4+ stores metrics as "<metric>,<filter>" (e.g. "acc_norm,none"),
    # older releases used the bare metric name. Normalized accuracy wins.
    metric_keys = ('acc_norm,none', 'acc_norm', 'acc,none', 'acc')

    data = []

    for model_name, results in results_dict.items():
        row = {'Model': model_name.split('/')[-1]}
        task_results = results.get('results', {})

        for task_name, metrics in task_results.items():
            score = next((metrics[key] for key in metric_keys if key in metrics), None)
            # bool is a subclass of int; exclude it so flags are never scored.
            if isinstance(score, (int, float)) and not isinstance(score, bool):
                row[task_name] = score * 100

        scores = [v for k, v in row.items() if k != 'Model']
        row['Average'] = sum(scores) / len(scores) if scores else 0
        data.append(row)

    wanted = list(dict.fromkeys(benchmarks or []))

    if not data:
        # Nothing was evaluated: return an empty, well-formed table instead of
        # failing in set_index on a frame that has no 'Model' column.
        empty = pd.DataFrame(columns=wanted + ['Average'])
        empty.index.name = 'Model'
        return empty

    df = pd.DataFrame(data).set_index('Model')

    # Requested benchmarks first, then any other tasks, 'Average' always last.
    leading = [name for name in wanted if name in df.columns]
    trailing = [col for col in df.columns if col not in leading and col != 'Average']
    df = df[leading + trailing + ['Average']]
    return df.round(2)

# Always define comparison_df so later cells can test it against None even
# when every evaluation failed and all_results is empty.
comparison_df = None
if all_results:
    comparison_df = create_comparison_table(all_results, my_benchmarks)
    print("\nüìä Model Comparison Table:")
    print(comparison_df.to_string())

In [None]:
# Step 5: Visualize results
# Left panel: grouped bars per benchmark; right panel: average score per model.
if all_results and len(all_results) > 1:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    benchmark_cols = [col for col in comparison_df.columns if col != 'Average']
    n_models = len(comparison_df)

    # Bar chart comparison
    x = np.arange(len(benchmark_cols))
    # 0.35 is the width used for two models. With more models, shrink the bars
    # so a whole group stays within 0.8 x-units and neighbouring benchmark
    # groups do not overlap.
    width = min(0.35, 0.8 / n_models)
    colors = plt.cm.Set2(np.linspace(0, 1, n_models))

    for i, (model, row) in enumerate(comparison_df.iterrows()):
        values = [row[col] for col in benchmark_cols]
        # Centre the group of bars on each tick.
        offset = width * (i - n_models/2 + 0.5)
        axes[0].bar(x + offset, values, width, label=model, color=colors[i])

    axes[0].set_ylabel('Score (%)')
    axes[0].set_title('Benchmark Comparison')
    axes[0].set_xticks(x)
    axes[0].set_xticklabels(benchmark_cols, rotation=45, ha='right')
    axes[0].legend()
    axes[0].set_ylim(0, 100)
    axes[0].grid(axis='y', alpha=0.3)

    # Average scores
    models = comparison_df.index.tolist()
    averages = comparison_df['Average'].values
    axes[1].barh(models, averages, color=colors[:len(models)])
    axes[1].set_xlabel('Average Score (%)')
    axes[1].set_title('Overall Average')
    axes[1].set_xlim(0, 100)
    axes[1].grid(axis='x', alpha=0.3)

    # Print each average just past the end of its bar.
    for i, v in enumerate(averages):
        axes[1].text(v + 1, i, f'{v:.1f}%', va='center')

    plt.tight_layout()
    plt.savefig(f"{RESULTS_DIR}/model_comparison.png", dpi=150)
    plt.show()

    print(f"\nüìÅ Chart saved to {RESULTS_DIR}/model_comparison.png")

In [None]:
# Step 6: Analysis
print("\nüìù Analysis:")
print("=" * 60)

# comparison_df is only assigned in Step 4 when at least one evaluation
# succeeded. Look it up defensively so this cell reports the situation
# instead of raising NameError.
comparison_df = globals().get('comparison_df')

if comparison_df is not None and len(comparison_df) >= 2:
    best_model = comparison_df['Average'].idxmax()
    best_score = comparison_df['Average'].max()

    print(f"\nüèÜ Best Model: {best_model}")
    print(f"   Average Score: {best_score:.2f}%")

    # Per-benchmark winner
    print("\nüìä Per-Benchmark Winners:")
    for col in comparison_df.columns:
        if col != 'Average':
            winner = comparison_df[col].idxmax()
            score = comparison_df[col].max()
            print(f"   {col}: {winner} ({score:.1f}%)")

    print("\nüìã Conclusion:")
    print(f"   {best_model} performs best overall, likely due to its larger")
    print("   parameter count and training data quality.")
else:
    print("\nNot enough successful evaluations to compare models.")

---

## Key Takeaways

1. **Fair Comparison**: Both models are evaluated with identical settings (0-shot, same benchmarks, same sample limit)
2. **Multiple Metrics**: Using 3+ benchmarks gives a more complete picture
3. **Visualization**: Charts make differences immediately clear
4. **Trade-offs**: Larger models often perform better but require more memory/time

In [None]:
# Cleanup: call clear_memory() one last time now that all steps have run.
clear_memory()
print("‚úÖ Solution complete!")