In [3]:
import sys
sys.path.append('./python')

import numpy as np
import needle as ndl
from needle import backend_ndarray as nd
import time
import matplotlib.pyplot as plt

# Header banner so the notebook output starts with a recognisable title and rule.
banner_title = "Needle DL Framework - Quantization Demo"
banner_rule = "=" * 50
print(banner_title, banner_rule, sep="\n")

Using needle backend


ImportError: cannot import name 'ndarray_backend_cpu' from partially initialized module 'needle.backend_ndarray' (most likely due to a circular import) (/Users/tyler/local-docs/CMU/DL-Systems-Project/python/needle/backend_ndarray/__init__.py)

## 1. Basic NDArray Quantization

Let's start with basic quantization of NDArrays.

In [None]:
# Build a small float32 matrix from a fixed seed and wrap it in an NDArray
# created with device=ndl.cuda().
np.random.seed(42)
data_fp32 = np.random.randn(5, 4).astype(np.float32)
arr_fp32 = nd.NDArray(data_fp32, device=ndl.cuda())

# Show the values and the metadata that the following cells refer back to.
print("Original Float32 Array:", arr_fp32.numpy(), sep="\n")
print(f"\nDtype: {arr_fp32.dtype}")
print(f"Shape: {arr_fp32.shape}")

In [None]:
# Convert the float32 NDArray with quantize_uint8() and inspect the result.
arr_uint8 = arr_fp32.quantize_uint8()

print("Quantized UInt8 Array:", arr_uint8.numpy(), sep="\n")
print(f"\nDtype: {arr_uint8.dtype}")

# The quantized array exposes its scale and zero point via quant_params.
qparams = arr_uint8.quant_params
print(f"Quantization Scale: {qparams.scale:.6f}")
print(f"Quantization Zero Point: {qparams.zero_point}")

In [None]:
# Round trip: turn the uint8 array back into float32 with dequantize().
arr_dequant = arr_uint8.dequantize()

print("Dequantized Array (back to Float32):", arr_dequant.numpy(), sep="\n")

# Element-wise absolute difference between the original and round-tripped values.
error = np.abs(arr_fp32.numpy() - arr_dequant.numpy())
max_err = error.max()
mean_err = error.mean()
rms_err = np.sqrt((error**2).mean())

print("\nQuantization Error:")
print(f"  Max Error: {max_err:.6f}")
print(f"  Mean Error: {mean_err:.6f}")
print(f"  RMS Error: {rms_err:.6f}")

## 2. Quantized Matrix Multiplication

The key operation for neural networks is matrix multiplication. Let's compare float32 vs uint8 matmul.

In [None]:
# Draw the two float32 matmul operands from a fixed seed (A first, then B),
# each multiplied by 2.0, and wrap them in NDArrays created with device=ndl.cuda().
np.random.seed(123)
A_np = 2.0 * np.random.randn(100, 50).astype(np.float32)
B_np = 2.0 * np.random.randn(50, 80).astype(np.float32)

A_fp32, B_fp32 = (nd.NDArray(host, device=ndl.cuda()) for host in (A_np, B_np))

for label, operand in (("A", A_fp32), ("B", B_fp32)):
    print(f"Matrix {label} shape: {operand.shape}")

In [None]:
# Float32 matmul: the baseline that the quantized matmul is compared against.
# time.perf_counter() replaces time.time(): it is monotonic and uses the
# highest-resolution clock available, which matters for sub-millisecond spans.
# NOTE(review): this is a single, un-warmed run; if the CUDA backend launches
# kernels asynchronously the interval may not cover the full kernel execution
# - confirm against the backend before quoting the number.
start = time.perf_counter()
C_fp32 = A_fp32 @ B_fp32
time_fp32 = time.perf_counter() - start

print("Float32 Matmul:")
print(f"  Result shape: {C_fp32.shape}")
print(f"  Time: {time_fp32*1000:.4f} ms")
print(f"  Sample output:\n{C_fp32.numpy()[:3, :3]}")

In [None]:
# Quantize both matmul operands to uint8.
A_uint8 = A_fp32.quantize_uint8()
B_uint8 = B_fp32.quantize_uint8()

# Report the dtype and quantization scale of each operand.
print("Quantized matrices:")
for label, quantized in (("A", A_uint8), ("B", B_uint8)):
    print(f"  {label} dtype: {quantized.dtype}, scale: {quantized.quant_params.scale:.6f}")

In [None]:
# Quantized matmul (automatically uses uint8 kernel)
# time.perf_counter() replaces time.time(): it is monotonic and high-resolution,
# so a very fast call no longer risks a measured duration of exactly 0.
start = time.perf_counter()
C_uint8 = A_uint8 @ B_uint8
time_uint8 = time.perf_counter() - start

print("UInt8 Quantized Matmul:")
print(f"  Result shape: {C_uint8.shape}")
print(f"  Result dtype: {C_uint8.dtype} (automatically converted to float32)")
print(f"  Time: {time_uint8*1000:.4f} ms")
print(f"  Sample output:\n{C_uint8.numpy()[:3, :3]}")

# Compare against the float32 result; copy it to NumPy once and reuse it.
reference = C_fp32.numpy()
# Guard the ratio so a zero-length measurement cannot raise ZeroDivisionError.
speedup = time_fp32 / time_uint8 if time_uint8 > 0 else float("inf")
error = np.abs(reference - C_uint8.numpy())

print("\nComparison:")
print(f"  Speedup: {speedup:.2f}x")
print(f"  Max Error: {error.max():.6f}")
print(f"  Mean Error: {error.mean():.6f}")
# Relative error = mean absolute error as a percentage of the mean magnitude
# of the float32 reference.
print(f"  Relative Error: {(error.mean() / np.abs(reference).mean()) * 100:.2f}%")

## 3. Tensor-Level Quantization

Now let's test quantization at the Tensor level (higher-level API).

In [None]:
# Build a float32 Tensor from seeded random data, created with device=ndl.cuda().
np.random.seed(456)
x_np = np.random.randn(4, 3).astype(np.float32)
x = ndl.Tensor(x_np, dtype="float32", device=ndl.cuda())

print("Original Tensor:", x.numpy(), sep="\n")
print(f"Dtype: {x.dtype}")

In [None]:
# Tensor-level quantization: same round trip as the NDArray cells, via the Tensor API.
x_quant = x.quantize_uint8()

print("Quantized Tensor:", x_quant.numpy(), sep="\n")
print(f"Dtype: {x_quant.dtype}")

# Convert back and show the recovered values.
x_dequant = x_quant.dequantize()

print("\nDequantized Tensor:", x_dequant.numpy(), sep="\n")

# Largest absolute difference introduced by the round trip.
error = np.abs(x.numpy() - x_dequant.numpy())
worst = error.max()
print(f"\nMax error: {worst:.6f}")

## 4. Quantized Neural Network Layers

The main application: quantized Linear layers for neural networks!

In [None]:
# Create a simple 2-layer network on CUDA
class SimpleNet(ndl.nn.Module):
    """Linear(10, 20) -> ReLU -> Linear(20, 5), both layers built with device=ndl.cuda()."""

    def __init__(self):
        super().__init__()
        # Both layers are constructed with bias=True on the CUDA device.
        self.fc1 = ndl.nn.Linear(10, 20, bias=True, device=ndl.cuda())
        self.fc2 = ndl.nn.Linear(20, 5, bias=True, device=ndl.cuda())

    def forward(self, x):
        """Apply fc1, then ReLU, then fc2 to ``x`` and return the result."""
        hidden = ndl.ops.relu(self.fc1(x))
        return self.fc2(hidden)


net = SimpleNet()
print("Network created with 2 linear layers")

In [None]:
# Create input. The layers of `net` are built with device=ndl.cuda(), so the
# input Tensor is created on that device too (as in the Tensor-level section)
# instead of being left on the default device.
np.random.seed(789)
x_np = np.random.randn(8, 10).astype(np.float32)
x = ndl.Tensor(x_np, dtype="float32", device=ndl.cuda())

# Forward pass in training mode (float32).
# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short durations; time.time() is too coarse for a call this fast.
net.train()
start = time.perf_counter()
y_train = net(x)
time_train = time.perf_counter() - start

print("Training Mode (Float32):")
print(f"  Output shape: {y_train.shape}")
print(f"  Time: {time_train*1000:.4f} ms")
print(f"  Sample output:\n{y_train.numpy()[:2]}")

In [None]:
# Switch to eval mode, then quantize each Linear layer's weights (fc1 first, then fc2).
net.eval()
for layer in (net.fc1, net.fc2):
    layer.quantize_weights()

print("Weights quantized!")
for tag, layer in (("FC1", net.fc1), ("FC2", net.fc2)):
    print(f"  {tag} quantized: {layer.quantized}")

In [None]:
# Forward pass with quantized weights.
# time.perf_counter() replaces time.time(): monotonic and high-resolution, so a
# very fast call no longer risks a measured duration of exactly 0.
start = time.perf_counter()
y_quant = net(x)
time_quant = time.perf_counter() - start

print("Inference Mode (Quantized UInt8):")
print(f"  Output shape: {y_quant.shape}")
print(f"  Time: {time_quant*1000:.4f} ms")
print(f"  Sample output:\n{y_quant.numpy()[:2]}")

# Compare against the float32 forward pass; copy the reference to NumPy once.
reference = y_train.numpy()
# Guard the ratio so a zero-length measurement cannot raise ZeroDivisionError.
speedup = time_train / time_quant if time_quant > 0 else float("inf")
error = np.abs(reference - y_quant.numpy())

print("\nComparison:")
print(f"  Speedup: {speedup:.2f}x")
print(f"  Max Error: {error.max():.6f}")
print(f"  Mean Error: {error.mean():.6f}")
# Relative error uses the same definition as the matmul comparison (mean absolute
# error over mean reference magnitude). The previous per-element ratio with a
# 1e-8 epsilon is dominated by outputs that happen to be close to zero.
print(f"  Relative Error: {(error.mean() / np.abs(reference).mean()) * 100:.2f}%")

## 5. Memory Usage Comparison

One of the key benefits of quantization is reduced memory usage.

In [None]:
# Create larger matrices to see the memory difference.
# Note: large_uint8 is random uint8 data of the same shape, used only to measure
# storage size; it is not the quantized form of large_fp32.
size = 1000
large_fp32 = np.random.randn(size, size).astype(np.float32)
large_uint8 = np.random.randint(0, 256, (size, size), dtype=np.uint8)

BYTES_PER_MB = 1024 * 1024
mem_fp32 = large_fp32.nbytes / BYTES_PER_MB  # MB
mem_uint8 = large_uint8.nbytes / BYTES_PER_MB  # MB
reduction = mem_fp32 / mem_uint8

print(f"Matrix size: {size}x{size}")
print("\nMemory Usage:")
print(f"  Float32:  {mem_fp32:.2f} MB")
print(f"  UInt8:    {mem_uint8:.2f} MB")
print(f"  Reduction: {reduction:.2f}x")

# Visualize
fig, ax = plt.subplots(figsize=(8, 5))
types = ['Float32', 'UInt8']
memory = [mem_fp32, mem_uint8]
colors = ['#ff6b6b', '#4ecdc4']

bars = ax.bar(types, memory, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Memory Usage (MB)', fontsize=12)
ax.set_title(f'Memory Usage Comparison\n({size}x{size} Matrix)', fontsize=14, fontweight='bold')
# Leave 20% headroom above the tallest bar so its value label fits.
ax.set_ylim(0, max(memory) * 1.2)

# Add value labels on bars
for bar, val in zip(bars, memory):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{val:.2f} MB',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()

# Report the measured ratio rather than a hard-coded "4x", so the message stays
# correct if the dtypes above are changed.
print(f"\n✓ {reduction:.0f}x memory reduction achieved!")

## 6. Error Analysis Across Different Scales

Let's analyze how quantization error varies with input magnitude.

In [None]:
# Test quantization error at different scales.
# Seed here so that re-running this cell on its own reproduces the same curves.
np.random.seed(1011)
scales = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
max_errors = []
mean_errors = []

for scale in scales:
    data = np.random.randn(100, 100).astype(np.float32) * scale
    # Cell-specific names: the loop must not overwrite arr_fp32 / arr_uint8 /
    # arr_dequant from Section 1, which earlier cells still read if re-run.
    scaled_fp32 = nd.NDArray(data)
    scaled_roundtrip = scaled_fp32.quantize_uint8().dequantize()

    abs_err = np.abs(scaled_fp32.numpy() - scaled_roundtrip.numpy())
    max_errors.append(abs_err.max())
    mean_errors.append(abs_err.mean())

# Plot: one panel per statistic, sharing all styling except series, label and color.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (ax1, max_errors, "Max", "#e74c3c"),
    (ax2, mean_errors, "Mean", "#3498db"),
)
for panel_ax, series, label, color in panels:
    panel_ax.plot(scales, series, 'o-', linewidth=2, markersize=8, color=color)
    panel_ax.set_xlabel('Input Scale', fontsize=12)
    panel_ax.set_ylabel(f'{label} Quantization Error', fontsize=12)
    panel_ax.set_title(f'{label} Error vs Input Scale', fontsize=13, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)
    panel_ax.set_xscale('log')

plt.tight_layout()
plt.show()

print("\nObservation: Quantization error scales proportionally with input magnitude.")

## 7. Summary

### Key Benefits of UInt8 Quantization:

1. **Memory Efficiency**: 4x reduction (32 bits → 8 bits)
2. **Performance**: Potential speedup from integer arithmetic
3. **Energy Efficiency**: Lower power consumption
4. **Deployment**: Better for edge devices and mobile

### Tradeoffs:

1. **Accuracy Loss**: Small quantization error (typically < 1%)
2. **Limited Range**: 256 discrete values (vs continuous float32)
3. **Implementation**: Requires careful scaling

### When to Use:

- ✓ Inference on edge devices
- ✓ Large model deployment
- ✓ Memory-constrained environments
- ✓ Production inference servers

### When NOT to Use:

- ✗ Training (needs high-precision gradients)
- ✗ Tasks requiring high numerical precision
- ✗ Small models where memory isn't an issue

## 8. Next Steps

This implementation provides:
- ✓ NDArray-level quantization (quantize_uint8, dequantize, astype)
- ✓ Tensor-level quantization API
- ✓ Quantized matmul for uint8 inputs
- ✓ Quantized Linear layers
- ✓ Automatic dequantization for unsupported ops
- ✓ NumPy backend support

### Future Enhancements:
1. C++/CUDA backend implementations for better performance
2. Per-channel quantization for better accuracy
3. Quantization-aware training (QAT)
4. Dynamic quantization
5. Support for other quantized operations (conv, etc.)
6. Mixed precision (some layers quantized, others not)

---

**Thank you for exploring the UInt8 quantization feature!**