# PyTorch Quantization for ResNet

- Train a floating point model or load a pre-trained floating point model.
- Move the model to CPU and switch model to training mode.
- Apply layer fusion.
- Switch model to evaluation mode, check if the layer fusion results in correct model, and switch back to training mode.
- Apply torch.quantization.QuantStub() and torch.quantization.DeQuantStub() to the inputs and outputs, respectively.
- Specify quantization configurations, such as symmetric quantization or asymmetric quantization, etc.
- Prepare quantization model for quantization aware training.
- Move the model to CUDA and run quantization aware training using CUDA.
- Move the model to CPU and convert the quantization aware trained floating point model to quantized integer model.
- [Optional] Verify accuracies and inference performance gain.
- Save the quantized integer model.

In [1]:
import os
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms

import time
import copy
import numpy as np
from fedlern.train_utils import *
from fedlern.quant_utils import *
from fedlern.models.resnet_v2 import *


# Quantization Aware Training

In [2]:

# Number of target classes used throughout the notebook.
num_classes = 10

# Two 3-tuples of per-channel statistics that are handed to the data-loader
# helper as `stats` (presumably mean and std for input normalisation — confirm
# against prepare_dataloader_cifar).
stats = (
    (0.49139968, 0.48215841, 0.44653091),
    (0.24703223, 0.24348513, 0.26158784),
)

# Device handles; creating a torch.device does not touch the hardware.
cuda_device, cpu_device = (torch.device(spec) for spec in ("cuda:0", "cpu:0"))



In [3]:
# Directory that holds the checkpoints, and the file names of the
# floating point model and of its quantized counterpart.
model_dir = "saved_models"
model_filename, quantized_model_filename = (
    'resnet18v2_cifar10.pt',
    "resnet18_quantized_cifar10pytorch.pt",
)

# Full checkpoint paths, built from the directory and file names above.
model_filepath, quantized_model_filepath = (
    os.path.join(model_dir, checkpoint_name)
    for checkpoint_name in (model_filename, quantized_model_filename)
)

In [13]:
# Build the train / test data loaders with the project helper.
train_loader, test_loader = prepare_dataloader_cifar(
    stats=stats,
    train_batch_size=128,
    eval_batch_size=256,
    num_workers=8,
)

# Wrap a freshly constructed ResNet18 in the project's QuantizedResNet
# container and put it in evaluation mode. Nothing is quantized yet: the
# observers / fake-quant modules are only inserted later by the prepare step.
model_fp32 = QuantizedResNet(model_fp32=ResNet18())
model_fp32.eval()

# A second, independent ResNet18 instance (not used in the cells shown here).
model3 = ResNet18()


Files already downloaded and verified


In [21]:
# Attach a global QAT qconfig to the wrapper. It decides which observers and
# fake-quantize modules get inserted. Use 'x86' for server inference and
# 'qnnpack' for mobile inference. Other quantization configurations such as
# symmetric or asymmetric quantization and MinMax or L2Norm calibration
# techniques can be specified here.
# Note: the old 'fbgemm' is still available but 'x86' is the recommended
# default for server inference.
model_fp32.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

# Work on a deep copy of the WHOLE wrapper so that `model_fp32` stays unfused
# and the qconfig set above travels with the copy. Fusing and preparing only
# the inner `model_fp32.model_fp32` drops the qconfig (it lives on the
# wrapper), so prepare_qat would insert no fake-quant modules and convert
# would leave the network in floating point.
# NOTE(review): QuantizedResNet presumably also owns the QuantStub /
# DeQuantStub pair, which is a second reason to prepare the wrapper — confirm.
model_fp32_fused = copy.deepcopy(model_fp32)

# fuse_modules folds BatchNorm into the preceding convolution and is the
# eval-mode fusion, so make the mode explicit instead of relying on an
# earlier cell. To keep BatchNorm trainable during QAT, the alternative is
# torch.ao.quantization.fuse_modules_qat on a model in training mode.
model_fp32_fused.eval()
backbone = model_fp32_fused.model_fp32

# Eager-mode fusion has to be spelled out manually per architecture.
# Fuse the stem Conv2d, BatchNorm2d, and ReLU layers in place.
torch.ao.quantization.fuse_modules(backbone, [["conv1", "bn1", "relu"]], inplace=True)

# Fuse Conv2d + BatchNorm2d (+ ReLU) inside every residual block. The blocks
# are collected first so the module tree is not modified while it is walked.
# NOTE(review): only conv1/bn1/relu and conv2/bn2 are fused; a Bottleneck's
# third conv/bn pair and any shortcut conv/bn are left unfused. If a block
# reuses the same `relu` module after the residual add, fusing it here
# replaces that shared module as well — verify against the block definition.
residual_blocks = [
    block for block in backbone.modules()
    if isinstance(block, (BasicBlock, Bottleneck))
]
for block in residual_blocks:
    torch.ao.quantization.fuse_modules(
        block, [["conv1", "bn1", "relu"], ["conv2", "bn2"]], inplace=True
    )

# Prepare the model for QAT. The model must be in training mode for the QAT
# logic to work; this inserts the observers and fake_quants that track
# weight and activation tensors during training.
model_fp32_prepared = torch.ao.quantization.prepare_qat(model_fp32_fused.train())






In [23]:
# Quantization aware training: run the project's training loop on the
# prepared model for 20 epochs, using the CPU device.
train_model(
    model=model_fp32_prepared,
    device=cpu_device,
    num_epochs=20,
    train_loader=train_loader,
    test_loader=test_loader,
)

# Leave training mode, then convert the observed model into a quantized one.
# Conversion quantizes the weights, stores the quantization parameters
# gathered for each activation tensor, and replaces key operators with
# their quantized implementations.
model_fp32_prepared.eval()
model_int8 = torch.ao.quantization.convert(model_fp32_prepared)




In [24]:
# Evaluate the converted model on the test set. No criterion is supplied;
# only the second element of the returned pair (the accuracy) is kept.
_, int8_eval_accuracy = evaluate_model(
    criterion=None,
    device=cpu_device,
    test_loader=test_loader,
    model=model_int8,
)

print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

# Latency benchmarks are currently disabled.
# NOTE(review): they refer to `quantized_model`, which is not defined in the
# cells shown here (the converted model is `model_int8`) — fix before enabling.
# int8_cpu_inference_latency = measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
# int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)


# print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
# print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))


AttributeError: 'tuple' object has no attribute 'eval'