# Week 2 Day 4 – MobileNetV2 INT8 Quantization (Instructor Version)

This notebook benchmarks **FP32 vs INT8 MobileNetV2** on CPU (Raspberry Pi 5 or similar).
- Uses TorchVision pretrained + quantized models.
- Reports model size, latency, and process memory (RSS).
- Designed to be clean and easy to adapt.


In [None]:
import os
import time

import psutil
import torch
import torch.nn as nn
import torchvision
from torchvision import models

# Report library versions so benchmark numbers can be tied to an environment.
print("PyTorch:", torch.__version__)
# The version string lives on the top-level `torchvision` package; the
# `torchvision.models` subpackage has no `__version__`, so looking it up there
# always fell through to the 'N/A' default.
print("Torchvision:", getattr(torchvision, "__version__", "N/A"))
print("CUDA available:", torch.cuda.is_available())

In [None]:
# Every benchmark in this notebook runs on the CPU; `device` is read as a
# module-level global by the helper functions and cells below.
device = torch.device("cpu")
print(f"Using device: {device}")

def measure_latency(
    model: nn.Module,
    example_input: torch.Tensor,
    n_warmup: int = 10,
    n_iters: int = 50,
) -> float:
    """Return the mean forward-pass latency of ``model`` in milliseconds.

    The model is switched to eval mode and moved (in place) to the
    module-level ``device``; ``example_input`` is moved to the same device.
    ``n_warmup`` untimed forward passes are run first, then ``n_iters`` timed
    passes, all with autograd disabled.

    Args:
        model: Module to benchmark. Its mode and device are modified.
        example_input: Tensor passed unchanged to ``model(...)`` on every call.
        n_warmup: Number of untimed warm-up passes; values <= 0 skip warm-up.
        n_iters: Number of timed passes to average over; must be >= 1.

    Returns:
        Average wall-clock time per forward pass, in milliseconds.

    Raises:
        ValueError: If ``n_iters`` is not positive (the average would be
            undefined for 0 and meaningless for negative counts).
    """
    # Validate before doing any work so a bad argument fails fast instead of
    # surfacing as ZeroDivisionError (or a negative latency) after warm-up.
    if n_iters <= 0:
        raise ValueError(f"n_iters must be a positive integer, got {n_iters}")

    model.eval()
    model.to(device)
    example_input = example_input.to(device)

    with torch.no_grad():
        # Warm-up passes are not timed.
        for _ in range(n_warmup):
            model(example_input)

        start = time.perf_counter()
        for _ in range(n_iters):
            model(example_input)
        end = time.perf_counter()

    return (end - start) * 1000.0 / n_iters  # ms


def get_model_size_mb(model: nn.Module, filename: str) -> float:
    """Save the model's ``state_dict`` to ``filename`` and return its size.

    The checkpoint is written with ``torch.save`` and is left on disk; the
    returned value is the size of that file in units of 1024 * 1024 bytes.
    """
    weights = model.state_dict()
    torch.save(weights, filename)
    bytes_on_disk = os.stat(filename).st_size
    return bytes_on_disk / (1 << 20)

In [None]:
# FP32 MobileNetV2
# Prefer the weights-enum API. Fall back to the legacy `pretrained=True`
# keyword only when the enum is missing (older torchvision); any other
# failure, e.g. a download error, is surfaced directly instead of being
# swallowed and retried through the legacy path.
try:
    fp32_weights = models.MobileNet_V2_Weights.IMAGENET1K_V1
except AttributeError:
    fp32_model = models.mobilenet_v2(pretrained=True)
else:
    fp32_model = models.mobilenet_v2(weights=fp32_weights)

fp32_model.eval()
fp32_model.to(device)

# Single random image-shaped batch (1 x 3 x 224 x 224); values are irrelevant
# for size/latency measurements.
example_input = torch.randn(1, 3, 224, 224)

# Sanity check: one forward pass to confirm the model runs on this input.
with torch.no_grad():
    _ = fp32_model(example_input.to(device))

process = psutil.Process(os.getpid())

fp32_size = get_model_size_mb(fp32_model, "mobilenet_v2_fp32.pth")
fp32_latency = measure_latency(fp32_model, example_input, n_warmup=5, n_iters=20)
# Resident set size of the whole Python process (bytes -> MB), sampled after
# the benchmark; it is not the footprint of the model alone.
fp32_rss = process.memory_info().rss / (1024 * 1024)

print(f"FP32 size:    {fp32_size:.2f} MB")
print(f"FP32 latency: {fp32_latency:.2f} ms")
print(f"FP32 RSS:     {fp32_rss:.2f} MB")

In [None]:
# INT8 MobileNetV2 (quantized)
# The whole benchmark sits in one broad try/except (presumably deliberate):
# if the quantized model cannot be loaded or run on this machine, the error
# is printed and `int8_model` is set to None rather than aborting the notebook.
try:
    from torchvision.models.quantization import mobilenet_v2 as q_mobilenet_v2

    # Use the weights-enum API, matching how the FP32 model is loaded; fall
    # back to the legacy `pretrained=True` keyword on older torchvision
    # releases that predate the enum.
    try:
        from torchvision.models.quantization import MobileNet_V2_QuantizedWeights
    except ImportError:
        int8_model = q_mobilenet_v2(pretrained=True, quantize=True)
    else:
        int8_model = q_mobilenet_v2(
            weights=MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1,
            quantize=True,
        )
    int8_model.eval()
    int8_model.to(device)

    # Sanity check: one forward pass before measuring anything.
    with torch.no_grad():
        _ = int8_model(example_input.to(device))

    process = psutil.Process(os.getpid())

    int8_size = get_model_size_mb(int8_model, "mobilenet_v2_int8.pth")
    int8_latency = measure_latency(int8_model, example_input, n_warmup=5, n_iters=20)
    # NOTE(review): RSS is process-wide, so if the FP32 model from the
    # previous cell is still alive it is included in this number too; treat
    # it as cumulative rather than as the INT8 model's own footprint.
    int8_rss = process.memory_info().rss / (1024 * 1024)

    print(f"INT8 size:    {int8_size:.2f} MB")
    print(f"INT8 latency: {int8_latency:.2f} ms")
    print(f"INT8 RSS:     {int8_rss:.2f} MB")

    # Guard against a zero timer reading before dividing.
    speedup = fp32_latency / int8_latency if int8_latency > 0 else float('inf')
    print(f"Speed-up vs FP32: {speedup:.2f}x")
except Exception as e:
    print("Failed to load quantized MobileNetV2:", e)
    int8_model = None

## Summary (for README)

Use the printed values above to populate a small table like:

| Model        | Precision | Size (MB) | Latency (ms) | Speed-up vs FP32 |
|--------------|-----------|-----------|--------------|------------------|
| MobileNetV2  | FP32      | …         | …            | 1.0x             |
| MobileNetV2  | INT8      | …         | …            | …                |

This matches the **Week 2 Day 4** goal: demonstrate that quantization reduces model size and can improve latency on CPU-only edge devices.
