# Task 11.1 Solutions: Quantization Overview

This notebook contains solutions to the exercises from Task 11.1.

---

In [None]:
# Common imports
import torch
import numpy as np
import gc
import time
import math
from tqdm import tqdm

def clear_memory():
    """Run Python garbage collection, then empty the CUDA cache if CUDA is available."""
    gc.collect()
    # Nothing GPU-side to release on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Report the installed PyTorch version and whether a CUDA device is usable.
print(
    f"PyTorch: {torch.__version__}",
    f"CUDA: {torch.cuda.is_available()}",
    sep="\n",
)

## Exercise 1: Asymmetric Quantization

Implement asymmetric quantization with zero-point for tensors with non-symmetric distributions.

In [None]:
def asymmetric_quantize(tensor: torch.Tensor, bits: int = 8) -> tuple:
    """
    Asymmetric (affine) quantization with a zero-point.

    Maps real values onto the unsigned integer grid [0, 2**bits - 1] via
    ``q = round(x / scale + zero_point)``. This is useful for tensors with
    non-symmetric distributions, like activations after ReLU.

    The quantization range is always widened to contain 0.0, so the
    zero-point lands inside [0, 2**bits - 1] without clamping. Without this,
    a tensor whose values all lie on one side of zero (e.g. [5, 6]) would
    saturate every code and dequantize to garbage.

    Args:
        tensor: Input tensor to quantize
        bits: Number of bits for quantization (must be >= 1)

    Returns:
        Tuple of (quantized tensor, scale, zero_point) where ``scale`` is a
        Python float and ``zero_point`` is a Python int, matching the
        parameter types of ``asymmetric_dequantize``. The quantized tensor is
        ``torch.uint8`` for bits <= 8 and ``torch.int32`` otherwise, so codes
        above 255 are not wrapped.

    Raises:
        ValueError: If ``bits`` is smaller than 1.
    """
    if bits < 1:
        raise ValueError(f"bits must be >= 1, got {bits}")

    qmin = 0
    qmax = 2 ** bits - 1  # 255 for 8-bit
    # uint8 can only hold codes up to 255; wider grids need a wider dtype.
    storage_dtype = torch.uint8 if bits <= 8 else torch.int32

    # min()/max() are undefined on an empty tensor; return an empty result.
    if tensor.numel() == 0:
        empty = torch.empty(tensor.shape, dtype=storage_dtype, device=tensor.device)
        return empty, 1.0, qmin

    # Work with Python scalars so scale/zero_point are a real float/int.
    # Including 0.0 in the range keeps the zero-point representable.
    min_val = min(float(tensor.min()), 0.0)
    max_val = max(float(tensor.max()), 0.0)

    # Compute scale
    scale = (max_val - min_val) / (qmax - qmin)
    scale = max(scale, 1e-10)  # Avoid division by zero

    # Compute zero point; the clamp only absorbs floating-point rounding now.
    zero_point = int(round(-min_val / scale))
    zero_point = max(qmin, min(qmax, zero_point))

    # Quantize
    quantized = torch.round(tensor / scale + zero_point).clamp(qmin, qmax)

    return quantized.to(storage_dtype), scale, zero_point


def asymmetric_dequantize(quantized: torch.Tensor, scale: float, zero_point: int) -> torch.Tensor:
    """
    Map asymmetric-quantized codes back to floating point.

    Casts the codes to float, removes the zero-point offset, then multiplies
    by the scale: ``(q - zero_point) * scale``.
    """
    shifted = quantized.float() - zero_point
    return shifted * scale


# Round-trip check on one-sided data (e.g., a ReLU output):
# quantize, dequantize, then compare against the input.
test_tensor = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0])

# Run the forward and inverse transforms first; reporting follows below.
q, s, zp = asymmetric_quantize(test_tensor)
reconstructed = asymmetric_dequantize(q, s, zp)
error = torch.mean(torch.abs(test_tensor - reconstructed))

print("Original:", test_tensor.numpy())
print(f"Quantized: {q.numpy()}, Scale: {s:.4f}, Zero point: {zp}")
print(f"Reconstructed: {reconstructed.numpy()}")
print(f"Mean error: {error:.6f}")

In [None]:
# Compare symmetric vs asymmetric on biased data
# A dedicated, seeded generator keeps the printed comparison reproducible
# across kernel restarts without altering the global RNG state.
biased_tensor = torch.rand(1000, generator=torch.Generator().manual_seed(42)) * 2  # All positive, in [0, 2)

# Symmetric quantization
def symmetric_quantize(tensor: torch.Tensor, bits: int = 8) -> tuple:
    """
    Symmetric quantization around zero.

    Scales the tensor so its largest magnitude maps to
    ``qmax = 2**(bits - 1) - 1`` and rounds onto the signed grid
    [-qmax - 1, qmax]. There is no zero-point: 0.0 always maps to code 0.

    Args:
        tensor: Input tensor to quantize
        bits: Number of bits for quantization (must be >= 2 so qmax >= 1)

    Returns:
        Tuple of (quantized tensor, scale). ``scale`` is a 0-dim tensor. The
        quantized tensor is ``torch.int8`` for bits <= 8 and ``torch.int32``
        otherwise, so wider codes are not wrapped.

    Raises:
        ValueError: If ``bits`` is smaller than 2.
    """
    if bits < 2:
        raise ValueError(f"bits must be >= 2 for symmetric quantization, got {bits}")

    qmax = 2 ** (bits - 1) - 1
    # int8 only covers [-128, 127]; wider grids need a wider dtype.
    storage_dtype = torch.int8 if bits <= 8 else torch.int32

    # Same floor as asymmetric_quantize: an all-zero tensor would otherwise
    # give scale == 0 and turn every element into 0/0 = NaN.
    scale = torch.clamp(tensor.abs().max() / qmax, min=1e-10)

    quantized = torch.round(tensor / scale).clamp(-qmax - 1, qmax)
    return quantized.to(storage_dtype), scale

def _mean_abs_error(reference: torch.Tensor, approximation: torch.Tensor) -> torch.Tensor:
    """Mean absolute element-wise difference between two tensors."""
    return torch.mean(torch.abs(reference - approximation))


# Symmetric round trip: signed codes, no zero-point
sym_q, sym_s = symmetric_quantize(biased_tensor)
sym_recon = sym_q.float() * sym_s
sym_error = _mean_abs_error(biased_tensor, sym_recon)

# Asymmetric round trip: unsigned codes shifted by a zero-point
asym_q, asym_s, asym_zp = asymmetric_quantize(biased_tensor)
asym_recon = asymmetric_dequantize(asym_q, asym_s, asym_zp)
asym_error = _mean_abs_error(biased_tensor, asym_recon)

# Ratio of the two reconstruction errors, reported in the last line
improvement = sym_error / asym_error

print(f"Symmetric error:  {sym_error:.6f}")
print(f"Asymmetric error: {asym_error:.6f}")
print(f"\nAsymmetric is {improvement:.1f}x better for biased data!")

## Exercise 2: Larger Model Comparison

Compare the GPU memory footprint of FP16, INT8, and INT4 versions of a larger model (Llama-2-7B or OPT-1.3B).

In [None]:
# Solution for larger model (Llama-2-7B or OPT-1.3B)
# Note: Llama-2 requires HuggingFace login and model access

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Choose your model: the uncommented assignment below sets the model_id that
# compare_precisions(model_id) will load. Swap which line is commented out to
# switch models.
# model_id = "meta-llama/Llama-2-7b-hf"  # Requires HF login
model_id = "facebook/opt-1.3b"  # No login required

def _measure_model_memory_gb(model_id: str, load_kwargs: dict) -> float:
    """Load `model_id` onto CUDA, return the GPU memory the load added (in GB), then free it."""
    clear_memory()
    # Subtract what was already allocated so tensors left over from earlier
    # cells are not attributed to the model.
    baseline_bytes = torch.cuda.memory_allocated()
    model = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map="cuda", **load_kwargs
        )
        return (torch.cuda.memory_allocated() - baseline_bytes) / 1e9
    finally:
        # Drop the weights even when loading or measuring fails part-way,
        # so the next precision starts from a clean GPU.
        del model
        clear_memory()


def compare_precisions(model_id: str):
    """
    Compare GPU memory usage of one model across different precisions.

    Loads the model three times onto the CUDA device (FP16, 8-bit and 4-bit
    via BitsAndBytesConfig), records how much allocated GPU memory each load
    adds, and prints a summary with the compression ratio relative to FP16.

    Args:
        model_id: Model identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        Dict mapping 'FP16', 'INT8' and 'INT4' to the measured memory in GB
        (bytes / 1e9).

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("compare_precisions() requires a CUDA device")

    # Each entry builds its from_pretrained kwargs lazily, so a quantization
    # config object is only constructed right before its own load.
    precision_kwargs = {
        'FP16': lambda: {"torch_dtype": torch.float16},
        'INT8': lambda: {
            "quantization_config": BitsAndBytesConfig(load_in_8bit=True)
        },
        'INT4': lambda: {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16
            )
        },
    }

    results = {}
    for name, make_kwargs in precision_kwargs.items():
        print(f"Loading {name}...")
        results[name] = _measure_model_memory_gb(model_id, make_kwargs())
        print(f"  {name}: {results[name]:.2f} GB")

    # Summary
    print("\n" + "="*50)
    print("Memory Comparison Summary")
    print("="*50)
    for name, mem in results.items():
        # Guard the ratio in case a measurement comes back as zero.
        compression = results['FP16'] / mem if mem > 0 else float('inf')
        print(f"{name}: {mem:.2f} GB ({compression:.1f}x compression)")

    return results

# Left commented out because compare_precisions() loads the model three times
# onto a CUDA device. Uncomment to run:
# results = compare_precisions(model_id)
print("Run compare_precisions() to test with your chosen model")

---

## Key Takeaways

1. **Asymmetric quantization** is better for non-symmetric distributions (like ReLU outputs)
2. **Symmetric quantization** is simpler and faster, good for weights
3. **Memory savings** scale roughly linearly with bit width: halving the bits per weight roughly halves the memory needed for the weights
4. **bitsandbytes** makes INT8/INT4 quantization easy on HuggingFace models