# Lab 3.2.2: NVFP4 Quantization - Solutions

This notebook contains solutions for all exercises in Lab 3.2.2.

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt

import sys
sys.path.append('..')
from scripts import quantize_to_fp4, dequantize_from_fp4

## Exercise 1 Solution: Micro-Block Size Comparison

Experiment with different micro-block sizes and analyze their effect on quality.

In [None]:
def compare_microblock_sizes(weights: np.ndarray, block_sizes: list) -> dict:
    """
    Sweep NVFP4 micro-block sizes and report quality and storage cost for each.

    For every block size the weights go through one quantize/dequantize
    round trip (``quantize_to_fp4`` then ``dequantize_from_fp4``). The
    reconstruction is compared with the input and a short report is printed.

    Args:
        weights: Weight tensor to quantize
        block_sizes: List of block sizes to try

    Returns:
        Dictionary keyed by block size. Each entry holds 'mse', 'max_error',
        'snr_db', 'effective_bits' and 'compression'.
    """
    summary = {}

    for block_size in block_sizes:
        # Round trip through FP4 with the current micro-block size
        codes, block_scales, _ = quantize_to_fp4(weights, block_size=block_size)
        restored = dequantize_from_fp4(codes, block_scales, block_size=block_size)

        # Reconstruction quality; the 1e-10 keeps the SNR ratio finite when
        # the round trip is lossless (mse == 0)
        residual = weights - restored
        mse = np.mean(residual ** 2)
        snr = 10 * np.log10(np.mean(weights ** 2) / (mse + 1e-10))

        # Storage cost: 4 bits per weight plus one 8-bit scale per block
        # (a trailing partial block still needs its own scale, hence ceil)
        storage_bits = np.ceil(weights.size / block_size) * 8 + weights.size * 4
        effective_bits = storage_bits / weights.size

        entry = {
            'mse': mse,
            'max_error': np.max(np.abs(residual)),
            'snr_db': snr,
            'effective_bits': effective_bits,
            # Compression is measured against 32 bits per weight
            'compression': 32 / effective_bits,
        }
        summary[block_size] = entry

        print(f"\nBlock Size {block_size}:")
        print(f"  MSE: {entry['mse']:.8f}")
        print(f"  SNR: {entry['snr_db']:.2f} dB")
        print(f"  Effective bits: {entry['effective_bits']:.2f}")
        print(f"  Compression: {entry['compression']:.1f}x")

    return summary


# Test with realistic weight distribution
# Seeded so the run is reproducible; weights are Gaussian with std 0.02.
np.random.seed(42)
weights = np.random.randn(4096, 4096).astype(np.float32) * 0.02

print("NVFP4 Micro-Block Size Comparison")
print("="*50)
block_sizes = [16, 32, 64, 128, 256]
# The matrix is flattened to 1-D before being handed to the comparison.
results = compare_microblock_sizes(weights.flatten(), block_sizes)

# Visualization: two side-by-side panels sharing the same x values
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# SNR vs Block Size (left panel)
ax1 = axes[0]
snrs = [results[bs]['snr_db'] for bs in block_sizes]
ax1.plot(block_sizes, snrs, 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('Micro-Block Size')
ax1.set_ylabel('SNR (dB)')
ax1.set_title('Quality vs Block Size')
# Block sizes are powers of two, so a log2 axis spaces them evenly.
ax1.set_xscale('log', base=2)
ax1.grid(True)

# Compression vs Block Size (right panel)
ax2 = axes[1]
compressions = [results[bs]['compression'] for bs in block_sizes]
ax2.plot(block_sizes, compressions, 'ro-', linewidth=2, markersize=8)
ax2.set_xlabel('Micro-Block Size')
ax2.set_ylabel('Compression Ratio')
ax2.set_title('Compression vs Block Size')
ax2.set_xscale('log', base=2)
ax2.grid(True)

plt.tight_layout()
plt.show()

# NOTE: the recommendation below is a fixed string; it is not derived from
# the `results` computed above.
print("\n" + "="*50)
print("Recommendation: Block size 32 or 64 offers best quality/compression trade-off")

## Exercise 2 Solution: Layer-wise Analysis

Analyze how sensitive each layer is to FP4 quantization, to identify layers that may need to stay in higher precision.

In [None]:
def analyze_layer_sensitivity(model_weights: dict, block_size: int = 32) -> dict:
    """
    Measure how much error an FP4 round trip introduces in each layer.

    Every layer is flattened, quantized with ``quantize_to_fp4``, restored
    with ``dequantize_from_fp4`` and compared with the original values.

    Args:
        model_weights: Dictionary of layer_name -> weights
        block_size: Micro-block size for FP4

    Returns:
        Dictionary of layer_name -> metrics with keys 'mse',
        'relative_error', 'weight_range', 'weight_std', 'outlier_ratio' and
        'sensitivity' (the mean relative error expressed in percent)
    """

    def _layer_report(tensor):
        """Return the metric dictionary for a single layer's weights."""
        flat = tensor.flatten()

        # FP4 round trip
        codes, block_scales, _ = quantize_to_fp4(flat, block_size=block_size)
        restored = dequantize_from_fp4(codes, block_scales, block_size=block_size)

        # Error metrics; the 1e-10 guards the division for zero-valued weights
        residual = flat - restored
        rel_err = np.mean(np.abs(residual) / (np.abs(flat) + 1e-10))

        # Weight statistics; outliers are entries whose magnitude exceeds 3 std
        spread = np.std(flat)

        return {
            'mse': np.mean(residual ** 2),
            'relative_error': rel_err,
            'weight_range': flat.max() - flat.min(),
            'weight_std': spread,
            'outlier_ratio': np.mean(np.abs(flat) > 3 * spread),
            'sensitivity': rel_err * 100,  # Sensitivity score
        }

    return {
        layer_name: _layer_report(layer_weights)
        for layer_name, layer_weights in model_weights.items()
    }


# Simulate layers with different characteristics
# All layers are seeded Gaussian noise; the trailing multiplier sets each
# layer's standard deviation. Values are drawn in dict-literal order, so
# reordering the entries changes every layer's contents.
# NOTE: these arrays are large (e.g. 32000 x 4096 float32 is ~0.5 GB each).
np.random.seed(42)
simulated_layers = {
    'embed_tokens': np.random.randn(32000, 4096).astype(np.float32) * 0.01,
    'layer_0.attn.q_proj': np.random.randn(4096, 4096).astype(np.float32) * 0.02,
    'layer_0.attn.k_proj': np.random.randn(1024, 4096).astype(np.float32) * 0.02,
    'layer_0.attn.v_proj': np.random.randn(1024, 4096).astype(np.float32) * 0.02,
    'layer_0.attn.o_proj': np.random.randn(4096, 4096).astype(np.float32) * 0.02,
    'layer_0.mlp.gate_proj': np.random.randn(14336, 4096).astype(np.float32) * 0.015,
    'layer_0.mlp.up_proj': np.random.randn(14336, 4096).astype(np.float32) * 0.015,
    'layer_0.mlp.down_proj': np.random.randn(4096, 14336).astype(np.float32) * 0.015,
    'lm_head': np.random.randn(32000, 4096).astype(np.float32) * 0.01,
}

print("Layer-wise FP4 Quantization Sensitivity Analysis")
print("="*60)

# Uses the function's default block_size.
layer_results = analyze_layer_sensitivity(simulated_layers)

# Sort by sensitivity (most sensitive layer first)
sorted_layers = sorted(layer_results.items(), key=lambda x: x[1]['sensitivity'], reverse=True)

print(f"\n{'Layer':<30} {'Sensitivity':<12} {'Outliers':<12}")
print("-"*54)
for name, metrics in sorted_layers:
    # 'sensitivity' is already a percentage; 'outlier_ratio' is a fraction, hence *100.
    print(f"{name:<30} {metrics['sensitivity']:>8.2f}%    {metrics['outlier_ratio']*100:>8.2f}%")

# NOTE: the recommendation below is a fixed string; it is not derived from
# `layer_results`.
print("\n" + "="*60)
print("Recommendation: Keep embedding and lm_head in higher precision if quality drops")

## Exercise 3 Solution: Custom FP4 Format

Implement a custom FP4 format with adjustable exponent/mantissa split.

In [None]:
def custom_fp4_quantize(values: np.ndarray, e_bits: int = 2, m_bits: int = 1, bias: int = 1) -> tuple:
    """
    Custom FP4 quantization with configurable format.

    4 bits total: 1 sign + e_bits exponent + m_bits mantissa

    The set of representable values is enumerated explicitly, and each input
    is mapped to the nearest one. When an input is exactly halfway between
    two representable values, the lower (more negative) one is chosen.

    Args:
        values: Values to quantize (any shape; array-likes are converted
            with ``np.asarray``)
        e_bits: Number of exponent bits (typically 1-2)
        m_bits: Number of mantissa bits (typically 1-2)
        bias: Exponent bias

    Returns:
        Tuple of (quantized, dequantized, format_info):
            quantized: int8 array shaped like ``values``; each entry is an
                index into the sorted ``format_info['representable']`` list
                (a table index, not a packed sign/exponent/mantissa code)
            dequantized: array shaped and typed like ``values`` holding the
                nearest representable value for each input
            format_info: dict with 'e_bits', 'm_bits', 'bias',
                'representable', 'num_values', 'max_value', 'min_positive'

    Raises:
        AssertionError: If e_bits + m_bits != 3
    """
    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O`; the exception type is unchanged for existing callers.
    if e_bits + m_bits != 3:
        raise AssertionError("Must have 3 bits for exponent + mantissa")

    values = np.asarray(values)

    # Enumerate the positive magnitudes once, then mirror them for the sign bit.
    mant_steps = 2 ** m_bits
    magnitudes = set()
    for exp in range(2 ** e_bits):
        for mant in range(mant_steps):
            if exp == 0:
                # Subnormal: no implicit leading 1, fixed exponent of 1 - bias
                magnitude = (2 ** (1 - bias)) * (mant / mant_steps)
            else:
                # Normal: 2^(exp - bias) * (1 + mant / 2^m_bits)
                magnitude = (2 ** (exp - bias)) * (1 + mant / mant_steps)
            if magnitude != 0:
                magnitudes.add(magnitude)

    # +0 and -0 collapse into a single entry.
    representable = sorted({0.0} | magnitudes | {-m for m in magnitudes})
    table = np.array(representable)

    # Nearest-value search, vectorised. Work proceeds in fixed-size chunks so
    # the temporary (chunk x table) distance matrix stays small no matter how
    # large `values` is. argmin returns the first minimum, which gives the
    # lower-value tie-break described in the docstring.
    flat = values.reshape(-1)
    codes = np.empty(flat.shape, dtype=np.int8)
    chunk = 1 << 16
    for start in range(0, flat.size, chunk):
        piece = flat[start:start + chunk]
        distances = np.abs(table[None, :] - piece[:, None])
        codes[start:start + chunk] = np.argmin(distances, axis=1)

    quantized = codes.reshape(values.shape)
    dequantized = np.zeros_like(values)
    dequantized[...] = table[codes].reshape(values.shape)

    format_info = {
        'e_bits': e_bits,
        'm_bits': m_bits,
        'bias': bias,
        'representable': representable,
        'num_values': len(representable),
        'max_value': max(representable),
        'min_positive': min(v for v in representable if v > 0)
    }

    return quantized, dequantized, format_info


# Compare E2M1 vs E1M2 formats
# Seeded Gaussian test values with std 0.5.
np.random.seed(42)
test_values = np.random.randn(1000).astype(np.float32) * 0.5

print("Custom FP4 Format Comparison")
print("="*50)

# Each entry is (e_bits, m_bits, bias, display name).
formats = [
    (2, 1, 1, "E2M1 (NVFP4-like)"),
    (1, 2, 0, "E1M2 (higher precision)"),
]

for e_bits, m_bits, bias, name in formats:
    # The index codes are not needed here; only the reconstruction and format info.
    _, dequant, info = custom_fp4_quantize(test_values, e_bits, m_bits, bias)
    mse = np.mean((test_values - dequant) ** 2)
    
    print(f"\n{name}:")
    print(f"  Exponent bits: {e_bits}")
    print(f"  Mantissa bits: {m_bits}")
    print(f"  Unique values: {info['num_values']}")
    # The value table is built for both signs, so the range is symmetric around zero.
    print(f"  Value range: [{-info['max_value']:.4f}, {info['max_value']:.4f}]")
    print(f"  Min positive: {info['min_positive']:.6f}")
    print(f"  MSE: {mse:.6f}")

## Summary

Key findings:

1. **Micro-block size 32-64** offers the best quality/compression trade-off for NVFP4
2. **Embedding and lm_head layers** are most sensitive to quantization
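3. **E2M1 format** (NVFP4) provides a wider dynamic range than E1M2 (largest/smallest positive value of 6/0.5 versus 3.5/0.5 in this lab's construction), while E1M2 spaces its values uniformly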
4. **Blackwell's native FP4 support** eliminates software overhead