# Lab 3.2.3 Solutions: AWQ Quantization

This notebook contains solutions to the exercises from Lab 3.2.3.

---

In [None]:
# Common imports
import torch
import numpy as np
import gc
import time
import os

def clear_memory():
    """Run Python garbage collection, then empty PyTorch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        # Nothing to release on a CPU-only setup.
        return
    torch.cuda.empty_cache()

# Report the installed PyTorch version and whether CUDA is available in this kernel.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

## Exercise 1: Different Group Sizes

Compare AWQ quantization with different group sizes (32, 64, 128).

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

def compare_awq_group_sizes(
    model_id: str,
    group_sizes: list = [32, 64, 128]
) -> dict:
    """
    Quantize a model with AWQ once per group size and compare the results.

    For every group size the model is loaded again from ``model_id``,
    quantized with ``w_bit=4`` and the given ``q_group_size``, and saved
    together with the tokenizer to ``./awq_g<group size>``. The time spent in
    ``model.quantize`` and the total size of the saved weight files are
    recorded, and a summary table is printed at the end.

    Args:
        model_id: HuggingFace model ID
        group_sizes: Group sizes to compare (any iterable of ints)

    Returns:
        Dictionary keyed by group size. Each value is a dict with
        'size_mb' (saved ``.safetensors``/``.bin`` files, in units of 1e6 bytes),
        'quant_time' (seconds spent in ``model.quantize``) and
        'save_path' (directory the quantized model was written to).

    Raises:
        Any exception raised while loading, quantizing or saving propagates
        to the caller; the model reference is dropped and clear_memory() is
        still called first (best-effort cleanup).
    """
    # Snapshot the argument so generators work and the shared default list
    # is never aliased.
    group_sizes = list(group_sizes)
    results = {}

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Calibration data: four sentences repeated 32 times (128 samples)
    calibration_texts = [
        "Machine learning is a field of artificial intelligence.",
        "Deep learning uses neural networks with many layers.",
        "Natural language processing enables text understanding.",
        "Computer vision allows machines to interpret images.",
    ] * 32

    weight_suffixes = ('.safetensors', '.bin')

    for gs in group_sizes:
        print(f"\nQuantizing with group_size={gs}...")

        clear_memory()

        awq_config = {
            "zero_point": True,
            "q_group_size": gs,
            "w_bit": 4,
            "version": "GEMM"
        }
        save_path = f"./awq_g{gs}"

        model = None
        try:
            # Load a fresh copy of the model for this group size
            model = AutoAWQForCausalLM.from_pretrained(
                model_id,
                device_map="cuda",
                safetensors=True
            )

            # Quantize; perf_counter is monotonic, so the elapsed time cannot
            # be distorted by system clock adjustments.
            start = time.perf_counter()
            model.quantize(
                tokenizer,
                quant_config=awq_config,
                calib_data=calibration_texts
            )
            quant_time = time.perf_counter() - start

            # Save model and tokenizer side by side
            os.makedirs(save_path, exist_ok=True)
            model.save_quantized(save_path)
            tokenizer.save_pretrained(save_path)
        finally:
            # Drop the model even when a step above failed, so a failed run
            # does not keep its reference alive for the following cells.
            del model
            clear_memory()

        # Measure size: only regular files with a weight-file suffix count
        weight_files = [
            os.path.join(save_path, f)
            for f in os.listdir(save_path)
            if f.endswith(weight_suffixes)
        ]
        model_size = sum(
            os.path.getsize(path)
            for path in weight_files
            if os.path.isfile(path)
        ) / 1e6

        results[gs] = {
            'size_mb': model_size,
            'quant_time': quant_time,
            'save_path': save_path
        }

        print(f"  Size: {model_size:.1f} MB")
        print(f"  Time: {quant_time:.1f}s")

    # Summary
    print("\n" + "="*50)
    print("AWQ Group Size Comparison")
    print("="*50)
    print(f"{'Group Size':>12} {'Size (MB)':>12} {'Time (s)':>12}")
    print("-"*50)
    for gs, data in results.items():
        print(f"{gs:>12} {data['size_mb']:>12.1f} {data['quant_time']:>12.1f}")

    return results

# Example usage (left commented out: every group size reloads and re-quantizes the model on the GPU
# and writes a ./awq_g<size> directory):
# results = compare_awq_group_sizes("facebook/opt-350m", [32, 64, 128])
print("AWQ group size comparison function defined")

## Exercise 2: Task-Specific Evaluation

Use lm-eval to evaluate on specific benchmark tasks.

In [None]:
from lm_eval import evaluator

def evaluate_on_tasks(
    model_path: str,
    tasks: list = ["hellaswag", "arc_easy"],
    batch_size: int = 8,
    device: str = "cuda"
) -> dict:
    """
    Evaluate a quantized model on specific benchmark tasks.

    Runs ``evaluator.simple_evaluate`` with the ``hf`` model type and prints
    one accuracy line per entry of the returned ``results['results']``.

    Args:
        model_path: Path to the quantized model
        tasks: Task names to evaluate on; a single task name given as a
            string is treated as a one-item list
        batch_size: Batch size for evaluation
        device: Device string forwarded to ``simple_evaluate``

    Returns:
        The unmodified dictionary returned by ``simple_evaluate``
    """
    # A bare string would otherwise be split into characters; copying also
    # keeps the shared default list from being handed to the evaluator.
    tasks = [tasks] if isinstance(tasks, str) else list(tasks)

    print(f"Evaluating {model_path} on tasks: {tasks}")
    print("This may take a while...")

    results = evaluator.simple_evaluate(
        model="hf",
        model_args=f"pretrained={model_path}",
        tasks=tasks,
        batch_size=batch_size,
        device=device
    )

    # Print results
    print("\n" + "="*50)
    print("Evaluation Results")
    print("="*50)

    # Metric names differ between result formats: plain ('acc') and
    # filter-suffixed ('acc,none'). The first key present wins.
    accuracy_keys = ('acc', 'acc_norm', 'acc,none', 'acc_norm,none')

    for task, scores in results['results'].items():
        acc = next((scores[key] for key in accuracy_keys if key in scores), 'N/A')
        if isinstance(acc, (int, float)):
            print(f"{task}: {acc:.3f} ({acc*100:.1f}%)")
        else:
            print(f"{task}: {acc}")

    return results

# Example usage (left commented out: it runs a full lm-eval pass on the GPU;
# "./awq_g128" is the directory compare_awq_group_sizes writes for group size 128):
# results = evaluate_on_tasks("./awq_g128", ["hellaswag", "arc_easy"])
print("Task evaluation function defined")

In [None]:
# Complete comparison: AWQ vs GPTQ on benchmark tasks

def compare_quantization_quality(
    awq_path: str,
    gptq_path: str,
    tasks: list = ["hellaswag"],
    batch_size: int = 8,
    device: str = "cuda"
) -> dict:
    """
    Compare AWQ and GPTQ on benchmark tasks.

    Each model is evaluated with ``evaluator.simple_evaluate`` (``hf`` model
    type) and a per-task table of accuracies, their difference (AWQ minus
    GPTQ) and the winner is printed.

    Args:
        awq_path: Path to AWQ quantized model
        gptq_path: Path to GPTQ quantized model
        tasks: Task names to evaluate on; a single task name given as a
            string is treated as a one-item list
        batch_size: Batch size for evaluation
        device: Device string forwarded to ``simple_evaluate``

    Returns:
        ``{'AWQ': ..., 'GPTQ': ...}`` where each value is the ``'results'``
        entry of the corresponding ``simple_evaluate`` output
    """
    # A bare string would otherwise be iterated character by character below.
    tasks = [tasks] if isinstance(tasks, str) else list(tasks)
    results = {}

    for name, path in [('AWQ', awq_path), ('GPTQ', gptq_path)]:
        print(f"\nEvaluating {name}...")

        eval_results = evaluator.simple_evaluate(
            model="hf",
            model_args=f"pretrained={path}",
            tasks=tasks,
            batch_size=batch_size,
            device=device
        )

        results[name] = eval_results['results']

    # Metric names differ between result formats: plain ('acc') and
    # filter-suffixed ('acc,none'). Reading only 'acc' would report 0 for
    # every task in the suffixed format.
    accuracy_keys = ('acc', 'acc_norm', 'acc,none', 'acc_norm,none')

    def _accuracy(scores):
        """Return the first numeric accuracy found in `scores`, or None."""
        for key in accuracy_keys:
            value = scores.get(key)
            if isinstance(value, (int, float)):
                return value
        return None

    # Print comparison
    print("\n" + "="*60)
    print("AWQ vs GPTQ Comparison")
    print("="*60)
    print(f"{'Task':<20} {'AWQ':>15} {'GPTQ':>15} {'Difference':>15}")
    print("-"*60)

    for task in tasks:
        awq_acc = _accuracy(results['AWQ'].get(task, {}))
        gptq_acc = _accuracy(results['GPTQ'].get(task, {}))

        awq_str = "N/A" if awq_acc is None else f"{awq_acc:.3f}"
        gptq_str = "N/A" if gptq_acc is None else f"{gptq_acc:.3f}"

        if awq_acc is None or gptq_acc is None:
            # A missing score is shown as missing rather than as a fake 0.
            diff_str = "N/A"
            winner = "N/A"
        else:
            diff = awq_acc - gptq_acc
            diff_str = f"+{diff:.3f}" if diff > 0 else f"{diff:.3f}"
            winner = "AWQ" if diff > 0 else "GPTQ" if diff < 0 else "Tie"

        print(f"{task:<20} {awq_str:>15} {gptq_str:>15} {diff_str:>15} ({winner})")

    return results

# Example usage (left commented out: it benchmarks both models on the GPU,
# and both model directories must already exist):
# results = compare_quantization_quality("./awq_g128", "./gptq_g128", ["hellaswag"])
print("Comparison function defined")

---

## Key Takeaways

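1. **AWQ protects salient weights** - It uses activation statistics to find the weight channels that matter most and rescales them before quantizing, which gives better quality than GPTQ in many cases
2. **Group size affects model size** - Smaller groups = slightly larger model, because every group stores its own scale and zero-point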
3. **lm-eval is the standard** - Use it for reproducible benchmarks
4. **Task-specific evaluation matters** - General benchmarks may not reflect your use case