# Task 11.5 Solutions: FP4 Deep Dive

This notebook contains solutions to the exercises from Task 11.5.

---

In [None]:
# Common imports
import torch
import numpy as np
import gc
import time
import math
import subprocess
from tqdm import tqdm

def clear_memory():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory.

    The CUDA cache is only touched when a CUDA device is available, so the
    helper is safe to call on CPU-only machines.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def clear_buffer_cache():
    """Clear system buffer cache for optimal memory availability (best effort).

    Runs ``sync`` and then writes ``3`` to ``/proc/sys/vm/drop_caches`` through
    ``sudo``. The command is issued with ``sudo -n`` (non-interactive) so a
    notebook kernel can never block waiting for a password prompt: when
    credentials would be required, sudo exits with an error instead.

    Failures are reported with a printed note and are never raised, so callers
    can use this as an optional optimisation step.
    """
    try:
        subprocess.run(
            # -n: fail immediately instead of prompting for a password.
            "sudo -n sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'",
            shell=True, check=True, capture_output=True
        )
        print("Buffer cache cleared")
    except subprocess.CalledProcessError:
        print("Note: Could not clear buffer cache (may need sudo)")

# Report the runtime environment: PyTorch version, whether CUDA is usable,
# and (only when it is) the name of GPU 0.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Exercise 1: Quantize Llama 2 7B with NVFP4

Apply FP4 quantization to a larger model and measure quality/speed tradeoffs.

In [None]:
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset

def quantize_model_fp4(
    model_id: str,
    calibration_samples: int = 128,
    use_nvfp4: bool = True
) -> tuple:
    """
    Quantize a model to FP4 using TensorRT Model Optimizer.

    The model is loaded in FP16 on the GPU, calibrated on a small built-in set
    of sentences and quantized with ``mtq.quantize``. If FP4 quantization
    raises for any reason, the function falls back to the FP8 default config.

    Args:
        model_id: HuggingFace model ID
        calibration_samples: Number of calibration samples (must be >= 1).
            Every requested sample is used for calibration.
        use_nvfp4: Use NVFP4 (True) or MXFP4 (False)

    Returns:
        Tuple of (quantized_model, tokenizer)

    Raises:
        ValueError: If ``calibration_samples`` is less than 1.
    """
    # Fail fast, before any expensive model loading happens.
    if calibration_samples < 1:
        raise ValueError(
            f"calibration_samples must be >= 1, got {calibration_samples}"
        )

    print(f"Loading {model_id}...")

    # Clear memory before loading large model
    clear_buffer_cache()
    clear_memory()

    # Load tokenizer; reuse EOS as the pad token when none is defined so that
    # batched, padded tokenization works.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="cuda"
    )

    fp16_memory = torch.cuda.memory_allocated() / 1e9
    print(f"FP16 memory: {fp16_memory:.2f} GB")

    # Create calibration data: cycle through the base sentences until the
    # requested number of samples is reached.
    base_texts = [
        "Machine learning is transforming the world of technology.",
        "Neural networks learn complex patterns from data.",
        "Deep learning enables breakthrough applications in AI.",
        "The transformer architecture revolutionized NLP.",
        "Quantization reduces model precision for efficiency.",
        "GPU acceleration enables fast neural network training.",
        "Transfer learning leverages pre-trained knowledge.",
        "Attention mechanisms help focus on relevant information.",
    ]
    repeats = (calibration_samples // len(base_texts)) + 1
    calibration_texts = (base_texts * repeats)[:calibration_samples]

    # Tokenize (every sample is padded/truncated to 512 tokens; the attention
    # mask is forwarded during calibration).
    encodings = tokenizer(
        calibration_texts,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    )

    dataset = TensorDataset(encodings.input_ids, encodings.attention_mask)
    calib_loader = DataLoader(dataset, batch_size=8)

    # Calibration forward function. The loader already holds exactly
    # `calibration_samples` samples, so every batch is consumed (previously a
    # hard-coded 16-batch cap silently limited calibration to 128 samples).
    def calib_forward(model):
        model.eval()
        with torch.no_grad():
            for input_ids, attention_mask in calib_loader:
                _ = model(
                    input_ids=input_ids.cuda(),
                    attention_mask=attention_mask.cuda()
                )

    # Apply quantization
    applied_format = 'NVFP4' if use_nvfp4 else 'MXFP4'
    print(f"Applying {applied_format} quantization...")
    start_time = time.time()

    try:
        if use_nvfp4:
            config = mtq.NVFP4_DEFAULT_CFG
        else:
            config = mtq.MXFP4_DEFAULT_CFG

        model_fp4 = mtq.quantize(model, config, forward_loop=calib_forward)

    except Exception as e:
        # Deliberate best-effort: any failure of the FP4 path triggers FP8.
        print(f"FP4 not available: {e}")
        print("Falling back to FP8...")

        applied_format = 'FP8'
        start_time = time.time()  # time only the attempt that succeeded
        config = mtq.FP8_DEFAULT_CFG
        model_fp4 = mtq.quantize(model, config, forward_loop=calib_forward)

    quant_time = time.time() - start_time
    print(f"Quantization complete in {quant_time:.1f}s")

    # Measure memory after quantization, labelled with the format that was
    # actually applied.
    # NOTE(review): mtq.quantize may only simulate quantization, in which case
    # allocated memory would not shrink until the model is exported/compressed
    # - confirm against the Model Optimizer documentation.
    clear_memory()
    fp4_memory = torch.cuda.memory_allocated() / 1e9
    print(f"{applied_format} memory: {fp4_memory:.2f} GB")
    if fp4_memory > 0:
        print(f"Memory reduction: {fp16_memory/fp4_memory:.2f}x")

    return model_fp4, tokenizer


# Example usage (uncomment to run on Blackwell). The call loads the whole
# model in FP16 onto the GPU before quantizing it:
# model_fp4, tokenizer = quantize_model_fp4("meta-llama/Llama-2-7b-hf")
print("FP4 quantization function defined")

In [None]:
# Evaluate FP4 model quality

def evaluate_fp4_quality(
    model,
    tokenizer,
    eval_texts: list = None
) -> dict:
    """
    Evaluate FP4 model quality with perplexity and generation tests.

    Perplexity is the exponential of the token-weighted mean loss over
    ``eval_texts`` (texts that tokenize to fewer than 2 tokens are skipped).
    Generation speed is measured with one greedy 50-token generation after a
    short warm-up run.

    Args:
        model: Quantized model
        tokenizer: Tokenizer
        eval_texts: List of texts for perplexity evaluation. Defaults to a
            small built-in set when None.

    Returns:
        Dictionary with evaluation results: 'perplexity',
        'tokens_per_second' and 'generated_text'.

    Raises:
        ValueError: If no text contributes any tokens to the perplexity
            computation (empty list, or every text shorter than 2 tokens).
    """
    if eval_texts is None:
        eval_texts = [
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning enables computers to learn from data.",
            "The ancient civilization built impressive structures.",
            "Modern medicine has extended human lifespan.",
            "Climate models predict significant changes.",
        ]

    def _sync():
        # Wait for queued GPU work so timings are accurate; skipped on hosts
        # without CUDA, where synchronize() would raise.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    model.eval()

    # Calculate perplexity
    print("Calculating perplexity...")
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for text in tqdm(eval_texts, desc="Perplexity"):
            encodings = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=256
            )
            input_ids = encodings.input_ids.to(model.device)

            # A next-token loss needs at least one (context, target) pair.
            if input_ids.size(1) < 2:
                continue

            loss = model(input_ids, labels=input_ids).loss.item()
            # The returned loss is a mean; weight it by the number of
            # predicted positions so long and short texts combine correctly.
            num_tokens = input_ids.size(1) - 1

            total_loss += loss * num_tokens
            total_tokens += num_tokens

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: eval_texts must contain at least one "
            "text that tokenizes to 2 or more tokens"
        )

    perplexity = math.exp(total_loss / total_tokens)

    # Test generation
    print("Testing generation...")
    prompt = "The future of artificial intelligence"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Warmup (same settings as the timed run, just fewer tokens)
    with torch.no_grad():
        _ = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    # Benchmark
    _sync()
    start = time.perf_counter()

    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    _sync()
    gen_time = time.perf_counter() - start

    tokens_generated = generated.shape[1] - inputs['input_ids'].shape[1]
    tokens_per_second = (
        tokens_generated / gen_time if gen_time > 0 else float('inf')
    )

    generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)

    results = {
        'perplexity': perplexity,
        'tokens_per_second': tokens_per_second,
        'generated_text': generated_text
    }

    print(f"\nResults:")
    print(f"  Perplexity: {perplexity:.2f}")
    print(f"  Speed: {tokens_per_second:.1f} tok/s")
    print(f"  Generated: {generated_text[:100]}...")

    return results


# Example usage (needs a model and tokenizer, e.g. the pair returned by
# quantize_model_fp4):
# results = evaluate_fp4_quality(model_fp4, tokenizer)
print("FP4 evaluation function defined")

## Exercise 2: Compare Calibration Data Quality

Try quantizing with different calibration datasets and compare quality.

In [None]:
# Calibration corpora for Exercise 2. Each corpus is a short list of distinct
# texts repeated to reach roughly 128 samples.

# Generic filler text: 4 sentences x 32 repeats = 128 samples.
RANDOM_CALIBRATION = 32 * [
    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
    "The quick brown fox jumps over the lazy dog repeatedly.",
    "Random words strung together in sentences for testing.",
    "Numbers and letters 123 ABC mixed together here.",
]

# Machine-learning prose: 8 sentences x 16 repeats = 128 samples.
DOMAIN_CALIBRATION = 16 * [
    "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
    "Deep learning uses neural networks with multiple layers to extract complex patterns.",
    "Natural language processing allows computers to understand and generate human language.",
    "Computer vision enables machines to interpret and analyze visual information from the world.",
    "Reinforcement learning trains agents through rewards and penalties in an environment.",
    "Transfer learning leverages knowledge from one task to improve performance on another.",
    "The transformer architecture revolutionized natural language processing in 2017.",
    "Attention mechanisms help models focus on relevant parts of the input.",
]

# Python source snippets: 6 snippets x 22 repeats = 132 samples.
CODE_CALIBRATION = 22 * [
    "def fibonacci(n):\n    if n <= 1:\n        return n\n    return fibonacci(n-1) + fibonacci(n-2)",
    "class DataProcessor:\n    def __init__(self, data):\n        self.data = data\n    def process(self):\n        return [x * 2 for x in self.data]",
    "import numpy as np\ndef matrix_multiply(a, b):\n    return np.dot(a, b)",
    "async def fetch_data(url):\n    async with aiohttp.ClientSession() as session:\n        async with session.get(url) as response:\n            return await response.json()",
    "for i in range(10):\n    print(f'Iteration {i}')",
    "try:\n    result = process_data(input)\nexcept Exception as e:\n    logger.error(f'Error: {e}')",
]

# Show the size of each corpus.
print(f"Random calibration samples: {len(RANDOM_CALIBRATION)}")
print(f"Domain calibration samples: {len(DOMAIN_CALIBRATION)}")
print(f"Code calibration samples: {len(CODE_CALIBRATION)}")

In [None]:
def compare_calibration_quality(
    model_id: str,
    calibration_datasets: dict,
    calibration_samples: int = 128
) -> dict:
    """
    Compare FP4 quantization quality with different calibration datasets.

    For every dataset a fresh FP16 copy of the model is loaded, quantized
    (NVFP4, falling back to FP8 if that raises) using that dataset for
    calibration, and scored with ``evaluate_fp4_quality`` on a fixed set of
    evaluation sentences. A summary table and the lowest-perplexity dataset
    are printed at the end.

    Args:
        model_id: HuggingFace model ID
        calibration_datasets: Dict mapping dataset name to list of texts
        calibration_samples: Maximum number of texts taken from each dataset
            for calibration (must be >= 1). Defaults to 128.

    Returns:
        Dict with results for each calibration dataset. Datasets with no
        texts are skipped; the dict is empty when nothing could be compared.

    Raises:
        ValueError: If ``calibration_samples`` is less than 1.
    """
    if calibration_samples < 1:
        raise ValueError(
            f"calibration_samples must be >= 1, got {calibration_samples}"
        )

    results = {}

    # Nothing to do: return before loading any tokenizer or model.
    if not calibration_datasets:
        print("No calibration datasets provided; nothing to compare")
        return results

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Evaluation texts (held out from calibration)
    eval_texts = [
        "The history of computing spans several decades of innovation.",
        "Scientific research requires careful methodology and analysis.",
        "Economic factors influence market behavior significantly.",
        "Space exploration continues to push boundaries of knowledge.",
        "Medical advances have improved human health outcomes.",
    ]

    for name, calib_texts in calibration_datasets.items():
        print(f"\n{'='*60}")
        print(f"Testing calibration: {name}")
        print(f"{'='*60}")

        # An empty corpus cannot calibrate anything; skip it rather than
        # failing inside the tokenizer after a model load.
        if not calib_texts:
            print(f"Skipping {name}: no calibration texts")
            continue

        clear_buffer_cache()
        clear_memory()

        # Load fresh model so earlier quantization runs cannot leak into
        # this one.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="cuda"
        )

        # Create calibration dataloader
        encodings = tokenizer(
            calib_texts[:calibration_samples],
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt'
        )

        dataset = TensorDataset(encodings.input_ids, encodings.attention_mask)
        calib_loader = DataLoader(dataset, batch_size=8)

        # The loader is already limited to `calibration_samples` texts, so
        # the loop simply consumes every batch.
        def calib_forward(model):
            model.eval()
            with torch.no_grad():
                for input_ids, attention_mask in calib_loader:
                    _ = model(
                        input_ids=input_ids.cuda(),
                        attention_mask=attention_mask.cuda()
                    )

        # Quantize
        try:
            config = mtq.NVFP4_DEFAULT_CFG
            model_fp4 = mtq.quantize(model, config, forward_loop=calib_forward)
        except Exception as e:
            # Deliberate best-effort fallback for environments without FP4.
            print(f"FP4 not available, using FP8: {e}")
            config = mtq.FP8_DEFAULT_CFG
            model_fp4 = mtq.quantize(model, config, forward_loop=calib_forward)

        # Evaluate
        eval_result = evaluate_fp4_quality(model_fp4, tokenizer, eval_texts)
        results[name] = eval_result

        # Drop the references before the next iteration loads another copy.
        del model, model_fp4
        clear_memory()

    if not results:
        print("No calibration datasets could be evaluated")
        return results

    # Summary
    print(f"\n{'='*60}")
    print("Calibration Comparison Summary")
    print(f"{'='*60}")
    print(f"{'Dataset':<20} {'Perplexity':>12} {'Speed (tok/s)':>15}")
    print("-"*50)

    for name, result in results.items():
        print(f"{name:<20} {result['perplexity']:>12.2f} {result['tokens_per_second']:>15.1f}")

    # Find best (lowest perplexity)
    best = min(results.items(), key=lambda x: x[1]['perplexity'])
    print(f"\nBest calibration: {best[0]} (PPL: {best[1]['perplexity']:.2f})")

    return results


# Example usage (uncomment to run). Each entry triggers a fresh model load,
# a quantization pass and an evaluation, so runtime grows with the number of
# datasets:
# results = compare_calibration_quality(
#     "facebook/opt-350m",
#     {
#         "Random": RANDOM_CALIBRATION,
#         "Domain-specific": DOMAIN_CALIBRATION,
#         "Code": CODE_CALIBRATION
#     }
# )
print("Calibration comparison function defined")

---

## Key Takeaways

1. **FP4 is Blackwell-exclusive** - Native tensor core support for maximum performance
2. **Calibration data matters** - Domain-specific data typically yields better quality
3. **3.5× memory reduction** - Enables running larger models on DGX Spark
4. **<1% quality loss** - With proper calibration and dual-level scaling