# Lab 3.2.2 Solutions: GPTQ Quantization

This notebook contains solutions to the exercises from Lab 3.2.2.

---

In [None]:
# Common imports
import torch
import numpy as np
import gc
import time
import os

def clear_memory():
    """Run Python garbage collection, then empty the CUDA cache if CUDA is available."""
    gc.collect()
    # Nothing GPU-related to release on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Report the installed PyTorch version and whether torch can see a CUDA device.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

## Exercise 1: Quantize a Larger Model

Quantize OPT-1.3B or Llama-2-7B with different group sizes.

In [None]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

def quantize_model_gptq(
    model_id: str,
    group_size: int = 128,
    bits: int = 4,
    calibration_samples: int = 128
) -> str:
    """
    Quantize a causal language model with GPTQ and save the result to disk.

    Works with: OPT, Llama, Mistral, etc.

    The calibration set is built by repeating eight short, generic ML-themed
    sentences until `calibration_samples` examples are available.

    Args:
        model_id: HuggingFace model ID
        group_size: GPTQ group size (32, 64, 128)
        bits: Quantization bits (typically 4)
        calibration_samples: Number of calibration samples (must be >= 1)

    Returns:
        Path to saved quantized model

    Raises:
        ValueError: If calibration_samples is less than 1.
    """
    # Fail fast: an empty calibration set would only surface as an obscure
    # error deep inside the quantizer, after the model has been loaded.
    if calibration_samples < 1:
        raise ValueError(
            f"calibration_samples must be at least 1, got {calibration_samples}"
        )

    print(f"Quantizing {model_id} with GPTQ (bits={bits}, group_size={group_size})...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Some tokenizers ship without a pad token; reuse EOS so batching works.
        tokenizer.pad_token = tokenizer.eos_token

    # Create calibration data: tile the 8 base sentences past the requested
    # count, then trim to exactly `calibration_samples`.
    calibration_texts = [
        "Machine learning is transforming technology and industry.",
        "The neural network learns patterns from training data.",
        "Deep learning requires large datasets and computational power.",
        "Artificial intelligence systems can perform complex tasks.",
        "Natural language processing enables computers to understand text.",
        "Computer vision allows machines to interpret visual information.",
        "Reinforcement learning agents learn through trial and error.",
        "Transfer learning leverages pre-trained model knowledge.",
    ] * ((calibration_samples // 8) + 1)

    # AutoGPTQ indexes every example by "input_ids" and "attention_mask", so
    # each example must be a mapping rather than a bare list of token ids
    # (which is what tokenizer.encode() returns).
    calibration_data = []
    for text in calibration_texts[:calibration_samples]:
        encoded = tokenizer(text, truncation=True, max_length=512)
        calibration_data.append({
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        })

    # GPTQ config
    config = BaseQuantizeConfig(
        bits=bits,
        group_size=group_size,
        desc_act=True,
        sym=False,
        damp_percent=0.1
    )

    # Load and quantize
    clear_memory()
    start_time = time.time()

    model = AutoGPTQForCausalLM.from_pretrained(model_id, config)
    model.quantize(calibration_data, batch_size=4)

    quant_time = time.time() - start_time
    print(f"Quantization completed in {quant_time:.1f}s")

    # Save
    model_name = model_id.split('/')[-1]
    save_path = f"./quantized_{model_name}_gptq_{bits}bit_g{group_size}"
    os.makedirs(save_path, exist_ok=True)

    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)

    # Report size: sum of the weight files written directly into save_path, in MB.
    total_size = sum(
        os.path.getsize(os.path.join(save_path, f))
        for f in os.listdir(save_path)
        if f.endswith('.safetensors') or f.endswith('.bin')
    ) / 1e6

    print(f"Saved to {save_path}")
    print(f"Model size: {total_size:.1f} MB")

    # Drop the local reference before clearing caches so the memory can be reclaimed.
    del model
    clear_memory()

    return save_path

# Example usage (left commented out so that running this cell only defines the
# function; uncomment one of the calls to actually run quantization):
# save_path = quantize_model_gptq("facebook/opt-1.3b", group_size=128)
# save_path = quantize_model_gptq("meta-llama/Llama-2-7b-hf", group_size=128)
print("GPTQ quantization function defined")

## Exercise 2: Custom Calibration Data

Create domain-specific calibration data for different use cases.

In [None]:
# Domain-specific calibration data examples

# For a coding assistant
# Five short Python snippets used as seed texts for the 'code' domain;
# create_domain_calibration() repeats them to reach the requested sample count.
CODE_CALIBRATION = [
    """def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)""",
    
    """class DataProcessor:
    def __init__(self, data):
        self.data = data
    
    def process(self):
        return [x * 2 for x in self.data]""",
    
    """import numpy as np
import pandas as pd

def load_and_preprocess(filepath):
    df = pd.read_csv(filepath)
    return df.dropna().reset_index(drop=True)""",
    
    """async def fetch_data(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.json()""",
    
    """@dataclass
class Config:
    learning_rate: float = 0.001
    batch_size: int = 32
    epochs: int = 100""",
]

# For a scientific assistant
# Five one-sentence science statements used as seed texts for the 'science'
# domain; create_domain_calibration() repeats them to reach the requested count.
SCIENCE_CALIBRATION = [
    "The mitochondria is the powerhouse of the cell, responsible for ATP production through oxidative phosphorylation.",
    "Quantum entanglement occurs when particles become correlated such that the quantum state of each particle cannot be described independently.",
    "Neural networks approximate functions through compositions of linear transformations and nonlinear activations.",
    "The Standard Model describes fundamental particles: quarks, leptons, and gauge bosons mediating the electromagnetic, weak, and strong forces.",
    "CRISPR-Cas9 enables precise genome editing by using guide RNA to direct the Cas9 nuclease to specific DNA sequences.",
]

# For a chatbot
# Five single-turn "User: ... / Assistant: ..." exchanges used as seed texts for
# the 'chat' domain; create_domain_calibration() repeats them to reach the
# requested count.
CHAT_CALIBRATION = [
    "User: How are you today?\nAssistant: I'm doing well, thank you for asking! How can I help you today?",
    "User: What's the weather like?\nAssistant: I don't have access to real-time weather data, but I can help you find local weather services or forecast websites.",
    "User: Tell me a joke.\nAssistant: Why did the programmer quit? Because they didn't get arrays! Would you like to hear another one?",
    "User: Can you help me with my homework?\nAssistant: Of course! I'd be happy to help. What subject are you working on?",
    "User: Thank you for your help!\nAssistant: You're welcome! Feel free to ask if you have any other questions.",
]


def create_domain_calibration(domain: str, num_samples: int = 128) -> list:
    """
    Build a calibration set for one domain by cycling through its seed texts.

    Args:
        domain: Which seed list to draw from: 'code', 'science', or 'chat'
        num_samples: Length of the returned list

    Returns:
        A new list of `num_samples` texts; the seed texts repeat in order
        once the seed list is exhausted

    Raises:
        ValueError: If `domain` is not one of the supported names
    """
    domain_data = {
        'code': CODE_CALIBRATION,
        'science': SCIENCE_CALIBRATION,
        'chat': CHAT_CALIBRATION,
    }

    if domain in domain_data:
        seed_texts = domain_data[domain]
        # Tile one copy more than the integer quotient so the slice below
        # always has at least `num_samples` items to take from.
        whole_copies, _ = divmod(num_samples, len(seed_texts))
        tiled = seed_texts * (whole_copies + 1)
        return tiled[:num_samples]

    raise ValueError(f"Unknown domain: {domain}. Choose from: {list(domain_data.keys())}")


# Test
# Smoke check: request 10 samples per domain, then print how many came back
# and the first 80 characters of the first sample.
print("Domain calibration examples:")
for domain in ['code', 'science', 'chat']:
    samples = create_domain_calibration(domain, 10)
    print(f"  {domain}: {len(samples)} samples")
    print(f"    Sample: {samples[0][:80]}...")

In [None]:
# Complete example: Quantize with domain-specific calibration

def quantize_for_domain(
    model_id: str,
    domain: str,
    group_size: int = 128,
    calibration_samples: int = 128
) -> str:
    """
    Quantize a model to 4-bit GPTQ using domain-specific calibration data.

    Args:
        model_id: HuggingFace model ID
        domain: 'code', 'science', or 'chat'
        group_size: GPTQ group size
        calibration_samples: Number of calibration samples (must be >= 1)

    Returns:
        Path to saved model

    Raises:
        ValueError: If calibration_samples is less than 1, or if `domain` is
            not recognised by create_domain_calibration().
    """
    # Validate the cheap inputs first so a typo fails before any heavy import,
    # tokenizer load, or model load.
    if calibration_samples < 1:
        raise ValueError(
            f"calibration_samples must be at least 1, got {calibration_samples}"
        )

    # Get domain-specific calibration texts (raises ValueError for an unknown domain)
    calibration_texts = create_domain_calibration(domain, num_samples=calibration_samples)

    # Imported here so this cell still works if the Exercise 1 cell was skipped.
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    from transformers import AutoTokenizer

    print(f"Quantizing {model_id} for {domain} domain...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Some tokenizers ship without a pad token; reuse EOS so batching works.
        tokenizer.pad_token = tokenizer.eos_token

    # AutoGPTQ indexes every example by "input_ids" and "attention_mask", so
    # each example must be a mapping rather than a bare list of token ids
    # (which is what tokenizer.encode() returns).
    calibration_data = []
    for text in calibration_texts:
        encoded = tokenizer(text, truncation=True, max_length=512)
        calibration_data.append({
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        })

    # GPTQ config
    config = BaseQuantizeConfig(
        bits=4,
        group_size=group_size,
        desc_act=True
    )

    # Quantize (free cached memory first, as quantize_model_gptq does)
    clear_memory()
    model = AutoGPTQForCausalLM.from_pretrained(model_id, config)
    model.quantize(calibration_data, batch_size=4)

    # Save
    model_name = model_id.split('/')[-1]
    save_path = f"./quantized_{model_name}_gptq_{domain}"
    os.makedirs(save_path, exist_ok=True)
    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Saved to {save_path}")

    # Drop the local reference before clearing caches so the memory can be reclaimed.
    del model
    clear_memory()

    return save_path

# Example usage (left commented out so that running this cell only defines the
# function; uncomment one of the calls to actually run quantization):
# code_model = quantize_for_domain("facebook/opt-350m", "code")
# chat_model = quantize_for_domain("facebook/opt-350m", "chat")
print("Domain-specific quantization function defined")

---

## Key Takeaways

1. **GPTQ works across causal LM families** - OPT, Llama, Mistral, and other architectures supported by AutoGPTQ
2. **Calibration data matters** - Use domain-specific data for best results
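3. **Group size tradeoff** - Smaller groups = better quality but more per-group scale/zero-point overhead; larger groups = smaller, faster model with slightly lower quality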
4. **desc_act=True** - Usually improves quantization accuracy, at the cost of slower inference