In [None]:
import time
import torch
import numpy as np
import os
import random

def measure_inference_latency(model,
                              device,
                              input_size=(1, 3, 32, 32),
                              num_samples=100,
                              num_warmups=10):
    """Return the average wall-clock time, in seconds, of one forward pass.

    The model is moved to ``device`` and switched to eval mode (both in
    place), then fed a single random tensor of shape ``input_size``.

    Args:
        model: A ``torch.nn.Module``; it is moved to ``device`` and put in
            eval mode as a side effect.
        device: Target device (anything ``torch.device`` accepts, e.g.
            ``"cpu"`` or ``"cuda"``).
        input_size: Shape of the random input tensor, batch dimension included.
        num_samples: Number of timed forward passes.
        num_warmups: Number of untimed forward passes run before timing.

    Returns:
        Total elapsed time of the timed passes divided by ``num_samples``.

    Raises:
        ZeroDivisionError: If ``num_samples`` is 0.
    """
    device = torch.device(device)
    model.to(device)
    model.eval()

    x = torch.rand(size=input_size).to(device)

    def _sync():
        # CUDA kernels are launched asynchronously, so the timer must wait for
        # them. On any other device there is nothing to wait for, and calling
        # torch.cuda.synchronize() would fail on a machine without CUDA.
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    with torch.no_grad():
        for _ in range(num_warmups):
            _ = model(x)
        _sync()

        # perf_counter is monotonic and has higher resolution than time.time().
        start_time = time.perf_counter()
        for _ in range(num_samples):
            _ = model(x)
            _sync()
        end_time = time.perf_counter()

    elapsed_time = end_time - start_time
    elapsed_time_ave = elapsed_time / num_samples

    return elapsed_time_ave

def print_size_of_model(model):
    """Print the on-disk size, in MB, of the model's serialized ``state_dict``.

    The state dict is written to ``temp.p`` in the current working directory,
    measured with ``os.path.getsize``, and the file is removed again.

    Args:
        model: Any object exposing ``state_dict()`` whose result can be
            passed to ``torch.save``.
    """
    tmp_path = "temp.p"
    # Build the state dict before touching the filesystem so that a failure
    # here cannot trigger the cleanup below.
    state = model.state_dict()
    try:
        torch.save(state, tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)
    finally:
        # Remove the scratch file even if saving or sizing raised, so a
        # partially written file is not left behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def model_equivalence(model_1, model_2, device, rtol=1e-05, atol=1e-08, num_tests=100, input_size=(1,3,32,32)):
    """Check that two models produce numerically close outputs on random inputs.

    Both models are moved to ``device`` (in place). For each of ``num_tests``
    iterations a fresh ``torch.rand`` tensor of shape ``input_size`` is fed to
    both models and the outputs are compared with ``np.allclose``. Progress
    and the maximum absolute difference are printed for every iteration.

    This function does not change the train/eval mode of either model; the
    caller is responsible for setting it.

    Args:
        model_1: First model.
        model_2: Second model.
        device: Device both models and the inputs are moved to.
        rtol: Relative tolerance passed to ``np.allclose``.
        atol: Absolute tolerance passed to ``np.allclose``.
        num_tests: Number of random inputs to try.
        input_size: Shape of each random input, batch dimension included.

    Returns:
        ``False`` as soon as one comparison fails (both outputs are printed),
        otherwise ``True``.
    """
    model_1.to(device)
    model_2.to(device)

    # Outputs are only compared, never back-propagated, so skip autograd
    # bookkeeping for the forward passes.
    with torch.no_grad():
        for i in range(num_tests):
            print(f"Running test {i+1}/{num_tests}")
            x = torch.rand(size=input_size).to(device)
            y1 = model_1(x).detach().cpu().numpy()
            y2 = model_2(x).detach().cpu().numpy()
            print("Difference: ", np.max(np.abs(y1-y2)))
            if not np.allclose(a=y1, b=y2, rtol=rtol, atol=atol, equal_nan=False):
                print("Model equivalence test sample failed: ")
                print(y1)
                print(y2)
                return False

    return True

def phinet_fuse_modules(model):
    """Fuse groups of layers, in place, inside ``model.model``.

    Only children of ``model.model`` whose name contains ``"lat_features"``
    or ``"end_features"`` are touched:

    * ``lat_features``: the pairs ``1.1/1.2``, ``2.1/2.2`` and ``2.4/2.5``
      are fused first, then every child from index 3 onwards is fused
      according to its length.
    * ``end_features``: every child is fused according to its length.

    Sub-blocks of length 10 fuse indices (0,1), (5,6), (8,9); sub-blocks of
    length 9 fuse (0,1), (4,5), (7,8); any other length is left untouched.
    NOTE(review): the index layout presumably matches the PhiNet block
    definitions, which are not visible in this notebook — confirm there.

    Args:
        model: Wrapper object exposing the network as ``model.model``.

    Returns:
        The same ``model`` object, modified in place.
    """
    def _fuse_by_length(blocks):
        # Pick the fusion plan from the number of layers in each sub-block.
        for block in blocks:
            n_layers = len(block)
            if n_layers == 10:
                plan = [["0", "1"], ["5", "6"], ["8", "9"]]
            elif n_layers == 9:
                plan = [["0", "1"], ["4", "5"], ["7", "8"]]
            else:
                continue
            torch.quantization.fuse_modules(block, plan, inplace=True)

    for name, child in model.model.named_children():
        if "lat_features" in name:
            stem_plan = [["1.1", "1.2"], ["2.1", "2.2"], ["2.4", "2.5"]]
            torch.quantization.fuse_modules(child, stem_plan, inplace=True)
            _fuse_by_length(list(child.children())[3:])
        if "end_features" in name:
            _fuse_by_length(list(child.children()))

    return model

def set_random_seeds(random_seed=0):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Also sets ``torch.backends.cudnn.deterministic`` to ``True`` and
    ``torch.backends.cudnn.benchmark`` to ``False``.

    Args:
        random_seed: Integer seed applied to all three generators.
    """
    # Seed every RNG source this notebook imports.
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    # cuDNN flags: deterministic on, benchmark off.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [None]:
from mobilenetv2 import MobilenetV2

# Build the float network from the local `mobilenetv2` module and print its repr.
# The quantization cell below reads this `model` variable, so this cell must run first.
# NOTE(review): the meaning of latent_layer_num=6 is defined in mobilenetv2.py — confirm there.
model = MobilenetV2(latent_layer_num=6)
print(model)


In [None]:
import torch
import copy
from mobilenetv2 import MobilenetV2
from pytorchcv.models.mobilenet import mobilenet_w1

class QuantizedModel(torch.nn.Module):
    """Wrap a float model between a ``QuantStub`` and a ``DeQuantStub``.

    Attributes are registered in the order ``quant``, ``model``, ``dequant``.
    """

    def __init__(self, model):
        """Store ``model`` and create the quant/dequant stubs around it.

        Args:
            model: The ``torch.nn.Module`` to wrap; kept by reference.
        """
        super().__init__()
        self.quant = torch.quantization.QuantStub()
        self.model = model
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Run ``x`` through quant stub -> wrapped model -> dequant stub."""
        quantized_in = self.quant(x)
        model_out = self.model(quantized_in)
        return self.dequant(model_out)
    
# Per-sample input shape; the batch dimension is prepended where needed.
input_shape = (3, 224, 224)

model.eval()

# Wrap the float model with quant/dequant stubs. Eval mode is required for
# the static quantization logic to work.
model_fp32 = QuantizedModel(model)
model_fp32.eval()

# Attach the default 'x86' qconfig to the wrapper.
model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('x86')

# Fuse on a deep copy so the unfused wrapper stays available for comparison.
model_fp32_fused = copy.deepcopy(model_fp32)

# Fuse each "conv"/"bn" pair in place, skipping average-pooling children.
# NOTE(review): assumes every non-AvgPool2d grandchild exposes submodules
# named "conv" and "bn" — confirm against mobilenetv2.py.
for stage in model_fp32_fused.model.children():
    for unit in stage.children():
        if isinstance(unit, torch.nn.AvgPool2d):
            continue
        torch.quantization.fuse_modules(unit, [["conv", "bn"]], inplace=True)

#model_fp32_fused = phinet_fuse_modules(model_fp32)
print(model_fp32_fused)

# Fusion must not change the float outputs beyond the given tolerances.
assert model_equivalence(
    model_1=model_fp32,
    model_2=model_fp32_fused,
    device="cpu",
    rtol=1e-03,
    atol=1e-06,
    num_tests=10,
    input_size=(1, *input_shape),
), "Fused model is not equivalent to the original model!"

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
model_fp32_prepared = torch.ao.quantization.prepare(model_fp32_fused)

# Calibrate the prepared model to determine quantization parameters for
# activations. In a real-world setting, calibration would be done with a
# representative dataset instead of random noise.
input_fp32 = torch.randn(5, *input_shape)
# The calibration pass only needs the forward activations, so run it without
# autograd, consistent with the inference helpers defined earlier.
with torch.no_grad():
    model_fp32_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
model_int8 = torch.ao.quantization.convert(model_fp32_prepared)

# Run the model; relevant calculations will happen in int8.
with torch.no_grad():
    res = model_int8(input_fp32)

In [None]:
# Compare the serialized state_dict size of the float wrapper with that of
# the converted int8 model produced by the quantization cell.
print_size_of_model(model_fp32)
print_size_of_model(model_int8)

In [None]:
# NOTE(review): num_samples=1 times a single forward pass per configuration,
# so the numbers below are noisy; raise it for a more stable average.

# The CUDA benchmark is optional: skip it on machines without a GPU so that
# "Restart & Run All" still completes.
if torch.cuda.is_available():
    t = measure_inference_latency(model_fp32, device = "cuda", input_size=(1,3,224,224), num_samples=1, num_warmups=10)
    print(f"FP32 CUDA Inference Latency: {t*1000} ms")
else:
    print("FP32 CUDA Inference Latency: skipped (CUDA not available)")

# This call also moves model_fp32 back to the CPU if the CUDA run moved it.
t = measure_inference_latency(model_fp32, device = "cpu", input_size=(1,3,224,224), num_samples=1, num_warmups=10)
print(f"FP32 CPU Inference Latency: {t*1000} ms")

# The converted int8 model is only benchmarked on CPU.
t = measure_inference_latency(model_int8, device = "cpu", input_size=(1,3,224,224), num_samples=1, num_warmups=10)
print(f"INT8 Inference Latency: {t*1000} ms")