In [5]:
# Post-training quantization (PTQ) with PyTorch
import os
import sys
import time
import numpy as np

import torch
from torch.ao.quantization import get_default_qconfig, QConfigMapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision
from torchvision import datasets
from torchvision.models.resnet import resnet18
import torchvision.transforms as transforms
from torch.quantization import MinMaxObserver
# Set up warnings
import warnings
# Silence DeprecationWarning from every module (the regex '.*' matches any module name).
warnings.filterwarnings(
    action='ignore',
    category=DeprecationWarning,
    module=r'.*'
)
# Re-enable the 'default' action for warnings raised from modules matching
# torch.ao.quantization. filterwarnings() inserts new entries at the front of
# the filter list, so this later call takes precedence over the blanket
# ignore registered just before it; the order of these two calls matters.
warnings.filterwarnings(
    action='default',
    module=r'torch.ao.quantization'
)

# Specify random seed for repeatable results
_ = torch.manual_seed(191009)


class AverageMeter(object):
    """Tracks the latest value of a metric together with its weighted running mean.

    Attributes:
        name: label printed in front of the numbers.
        fmt: format-spec suffix (e.g. ':6.2f') applied to both printed numbers.
        val: most recent value passed to update().
        sum: weighted sum of all values seen since the last reset().
        count: total weight (number of observations) since the last reset().
        avg: sum / count.
    """

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero every accumulated statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the newest value, counted as `n` observations."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:6.2f} ({avg:6.2f})' and fills it from the instance attributes.
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))


def accuracy(output, target, topk=(1,)):
    """Return top-k accuracy percentages for each k in `topk`.

    Args:
        output: score tensor; the k best entries are taken along dim 1 and the
            result is transposed with .t(), so it must be at most 2-D.
        target: tensor of class indices; its first dimension is the batch size.
        topk: iterable of k values to report.

    Returns:
        A list with one single-element float tensor per k, holding the
        percentage (0-100) of samples whose target is among the k best scores.
    """
    with torch.no_grad():
        deepest_k = max(topk)
        n_samples = target.size(0)

        # Indices of the best `deepest_k` scores per sample, best first, then
        # transposed so that row r holds every sample's rank-r prediction.
        ranked = output.topk(deepest_k, dim=1, largest=True, sorted=True).indices.t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / n_samples)
            for k in topk
        ]


def evaluate(model, criterion, data_loader,device):
    """Run `model` over `data_loader` in eval mode and accumulate accuracy.

    Args:
        model: module to evaluate; it is switched to eval mode and moved to `device`.
        criterion: loss callable, invoked as criterion(output, target).
        data_loader: iterable yielding (image, target) batches.
        device: device that the model and every batch are moved to.

    Returns:
        (top1, top5): two AverageMeter objects weighted by batch size.
        NOTE(review): the second meter is labelled 'Acc@5' but is fed top-2
        accuracy (topk=(1, 2)) - confirm whether k=2 is intentional.
    """
    model.eval()
    model.to(device)
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    with torch.no_grad():
        for cnt, (image, target) in enumerate(data_loader, start=1):
            image = image.to(device)
            # The labels must live on the same device as the model output;
            # otherwise the loss and the accuracy comparison fail with a
            # device-mismatch error whenever `device` is not the CPU.
            target = target.to(device)
            output = model(image)
            # The loss is evaluated but its value is not reported anywhere.
            loss = criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 2))
            top1.update(acc1[0], image.size(0))
            top5.update(acc5[0], image.size(0))
            print('Val[',cnt,"]  [top1]:",acc1,"[top5]:",acc5)

    return top1, top5

def load_model(model_file):
    """Build a ResNet-18, load its weights from `model_file`, and return it on CPU.

    Args:
        model_file: path (or file-like object) accepted by torch.load that
            holds a state dict compatible with torchvision's resnet18.

    Returns:
        The resnet18 module with the loaded weights, placed on the CPU.
    """
    # Called without arguments, resnet18 builds an untrained network; this
    # avoids the `pretrained=` keyword, which newer torchvision deprecates.
    model = resnet18()
    # map_location="cpu" lets a checkpoint that was saved from a GPU be loaded
    # on a machine without CUDA; the model is placed on the CPU anyway.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(model_file, map_location="cpu")
    model.load_state_dict(state_dict)
    model.to("cpu")
    return model

def print_size_of_model(model):
    """Print the size, in MB (1e6 bytes), of `model` serialized as TorchScript.

    Args:
        model: a module; it is scripted with torch.jit.script first unless it
            already is a torch.jit.RecursiveScriptModule.

    The archive is written inside a private temporary directory, so an
    existing "temp.p" in the working directory is never overwritten or
    deleted, and the scratch file is removed even if saving or measuring fails.
    """
    import tempfile

    if isinstance(model, torch.jit.RecursiveScriptModule):
        scripted = model
    else:
        scripted = torch.jit.script(model)

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before, just in a private directory, so the
        # reported numbers stay comparable with earlier runs.
        archive_path = os.path.join(scratch_dir, "temp.p")
        torch.jit.save(scripted, archive_path)
        size_mb = os.path.getsize(archive_path) / 1e6
    print("Size (MB):", size_mb)

def prepare_data_loaders(data_path):
    """Create the training and validation DataLoaders for an ImageFolder tree.

    Images are read from `data_path + '/train'` and `data_path + '/val'`.
    Batch sizes are taken from the module-level names `train_batch_size` and
    `eval_batch_size`, which must be defined before this function is called.

    Args:
        data_path: root directory containing the `train` and `val` folders.

    Returns:
        (data_loader, data_loader_test): the randomly sampled training loader
        and the sequentially sampled validation loader.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Training pipeline: random crop and horizontal flip before normalisation.
    train_pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    train_set = torchvision.datasets.ImageFolder(
        data_path+'/train', transform=train_pipeline)

    # Validation pipeline: fixed resize and centre crop.
    eval_pipeline = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    eval_set = torchvision.datasets.ImageFolder(
        data_path+'/val', transform=eval_pipeline)

    train_order = torch.utils.data.RandomSampler(train_set)
    eval_order = torch.utils.data.SequentialSampler(eval_set)

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=train_batch_size, sampler=train_order)
    eval_loader = torch.utils.data.DataLoader(
        eval_set, batch_size=eval_batch_size, sampler=eval_order)

    return train_loader, eval_loader

# NOTE(review): absolute, machine-specific dataset location - adjust per environment.
data_path = 'E:/Transformer/DataSets/imagenet/Mini_Train'
saved_model_dir = 'Export/Ptq'
float_model_file = 'pretrained_float.pth'

# Read as module-level names inside prepare_data_loaders(), so they must be
# defined before it is called.
train_batch_size = 8
eval_batch_size = 1

data_loader, data_loader_test = prepare_data_loaders(data_path)
# prepare_fx documents `example_inputs` as a tuple of positional arguments.
# The trailing comma makes this a 1-tuple; plain parentheses around the
# expression would leave a bare tensor.
example_inputs = (next(iter(data_loader))[0],)
criterion = nn.CrossEntropyLoss()

class Submodule(torch.nn.Module):
    """A single square fully connected layer of width 768*3 (2304)."""

    def __init__(self):
        super().__init__()
        width = 768*3
        self.linear = torch.nn.Linear(in_features=width, out_features=width)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        return self.linear(x)

class M(torch.nn.Module):
    """Small test network: strided conv, linear layer, then a residual Submodule.

    Layers (created in this order):
        proj:   Conv2d(3 -> 768, kernel_size=16, stride=16)
        linear: Linear(768 -> 2304)
        sub:    Submodule, whose output is added back to its own input.
    """

    def __init__(self):
        super().__init__()
        embed_dim = 768
        patch = 16
        self.proj = nn.Conv2d(in_channels=3, out_channels=embed_dim,
                              kernel_size=patch, stride=patch)
        self.linear = torch.nn.Linear(embed_dim, embed_dim*3)
        self.sub = Submodule()

    def forward(self, x):
        """Project `x`, flatten from dim 2, swap dims 1 and 2, then apply the linear layers."""
        projected = self.proj(x)
        # Collapse everything from dim 2 onward, then move the channel dim last
        # so that the Linear layers act on it.
        tokens = torch.flatten(projected, 2).transpose(1, 2)
        hidden = self.linear(tokens)
        return self.sub(hidden) + hidden
float_model=M()
float_model.eval()
# Use the GPU when one is available and fall back to the CPU otherwise, so
# that this cell also runs on machines without CUDA.
float_model.to('cuda' if torch.cuda.is_available() else 'cpu')
# deepcopy the model since we need to keep the original model around
import copy
model_to_quantize = copy.deepcopy(float_model)
# Last expression of the cell: switches the copy to eval mode and displays it.
model_to_quantize.eval()



M(
  (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (linear): Linear(in_features=768, out_features=2304, bias=True)
  (sub): Submodule(
    (linear): Linear(in_features=2304, out_features=2304, bias=True)
  )
)

In [6]:
# qconfig_mapping = QConfigMapping.set_global(default_qconfig)
# qconfig_opt=None
# qconfig_mapping = (QConfigMapping()
#     .set_global(qconfig_opt)  # qconfig_opt is an optional qconfig, either a valid qconfig or None
#     .set_object_type(torch.nn.Conv2d, qconfig_opt) # can be a callable...
#     .set_object_type("torch.nn.functional.add", qconfig_opt) # ...or a string of the class name
#     .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig_opt) # matched in order, first match takes precedence
#     .set_module_name("foo.bar", qconfig_opt)
#     .set_module_name_object_type_order()
# )
#     # priority (in increasing order): global, object_type, module_name_regex, module_name
#     # qconfig == None means fusion and quantization should be skipped for anything
#     # matching the rule (unless a higher priority match is found)
from torch.ao.quantization.backend_config import DTypeConfig,BackendPatternConfig,ObservationType,BackendConfig
# Experimental custom backend description for nn.Linear. It is built here but
# only takes effect if `backend_config=linear_backend_config` is passed to
# prepare_fx, which the call at the end of this cell leaves disabled.
weighted_int8_dtype_config = DTypeConfig(
  input_dtype=torch.quint8,
  output_dtype=torch.quint8,
  weight_dtype=torch.qint8,
  bias_dtype=torch.float)

# Parenthesised chain instead of trailing backslashes: the previous form ended
# with a dangling line continuation in front of a blank line.
linear_pattern_config = (
    BackendPatternConfig(torch.nn.Linear)
    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)
    .add_dtype_config(weighted_int8_dtype_config)
)

linear_backend_config = BackendConfig().set_backend_pattern_config(linear_pattern_config)
qconfig = get_default_qconfig("fbgemm")
# Alternative per-layer qconfig for nn.Linear; currently unused (see the
# disabled set_object_type call on the qconfig_mapping line).
qlinear_cfg=torch.quantization.QConfig(
   activation=MinMaxObserver.with_args(dtype=torch.qint8),
   weight=MinMaxObserver.with_args(dtype=torch.qint8))
qconfig_mapping = QConfigMapping().set_global(qconfig)#.set_object_type(torch.nn.Linear, qlinear_cfg)
# Prepare exactly once: the second, identical prepare_fx call was redundant.
prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)#,backend_config=linear_backend_config)
print(prepared_model.graph)


graph():
    %x : [#users=1] = placeholder[target=x]
    %activation_post_process_0 : [#users=1] = call_module[target=activation_post_process_0](args = (%x,), kwargs = {})
    %proj : [#users=1] = call_module[target=proj](args = (%activation_post_process_0,), kwargs = {})
    %activation_post_process_1 : [#users=1] = call_module[target=activation_post_process_1](args = (%proj,), kwargs = {})
    %flatten : [#users=1] = call_function[target=torch.flatten](args = (%activation_post_process_1, 2), kwargs = {})
    %activation_post_process_2 : [#users=1] = call_module[target=activation_post_process_2](args = (%flatten,), kwargs = {})
    %transpose : [#users=1] = call_method[target=transpose](args = (%activation_post_process_2, 1, 2), kwargs = {})
    %activation_post_process_3 : [#users=1] = call_module[target=activation_post_process_3](args = (%transpose,), kwargs = {})
    %linear : [#users=1] = call_module[target=linear](args = (%activation_post_process_3,), kwargs = {})
    %activation



In [7]:
def calibrate(model, data_loader,device):
    """Run every batch of `data_loader` through `model` with gradients disabled.

    Args:
        model: module to run; it is switched to eval mode and moved to `device`.
        data_loader: sized iterable yielding (image, target) pairs.
        device: device that the model and each batch are moved to.

    Prints the number of batches in `data_loader` when finished; returns None.
    """
    model.eval()
    model.to(device)
    with torch.no_grad():
        for batch, labels in data_loader:
            batch = batch.to(device)
            # The labels are moved as well, although the forward pass does not use them.
            labels = labels.to(device)
            model(batch)
    print("calibrate times end",len(data_loader))
# Calibrate on the GPU when one is available; otherwise fall back to the CPU
# so that the cell still runs on machines without CUDA.
calibration_device = 'cuda' if torch.cuda.is_available() else 'cpu'
calibrate(prepared_model, data_loader_test, calibration_device)  # run calibration on sample data
# Author's note: if the model is left on the GPU after calibration, the
# convert step below does not get through, so it is moved to the CPU first.
print("===========calibrate end===========")
prepared_model.to('cpu')  # must be on the CPU before convert_fx (see note above)
quantized_model = convert_fx(prepared_model)
print(quantized_model)

print("Size of model before quantization")
print_size_of_model(float_model)
print("Size of model after quantization")
print_size_of_model(quantized_model)
# print(quantized_model.parameters)

# test_input=torch.rand(1,3,224,224).to('cuda')
# quantized_model.to('cuda')
# out=quantized_model(test_input)



calibrate times end 150
GraphModule(
  (proj): QuantizedConv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), scale=0.06182935833930969, zero_point=63)
  (linear): QuantizedLinear(in_features=768, out_features=2304, scale=0.04096301272511482, zero_point=70, qscheme=torch.per_channel_affine)
  (sub): Module(
    (linear): QuantizedLinear(in_features=2304, out_features=2304, scale=0.025224722921848297, zero_point=61, qscheme=torch.per_channel_affine)
  )
)



def forward(self, x):
    proj_input_scale_0 = self.proj_input_scale_0
    proj_input_zero_point_0 = self.proj_input_zero_point_0
    quantize_per_tensor = torch.quantize_per_tensor(x, proj_input_scale_0, proj_input_zero_point_0, torch.quint8);  x = proj_input_scale_0 = proj_input_zero_point_0 = None
    proj = self.proj(quantize_per_tensor);  quantize_per_tensor = None
    flatten = torch.flatten(proj, 2);  proj = None
    transpose = flatten.transpose(1, 2);  flatten = None
    linear = self.linear(transpose);  transpose = None
  

In [16]:
# Inspect the weight of the converted top-level `linear` layer; on the
# quantized module `weight` is called as a method to obtain the tensor.
print(quantized_model.linear.weight())


tensor([[-0.0314,  0.0223,  0.0167,  ..., -0.0237,  0.0266,  0.0059],
        [ 0.0282, -0.0056,  0.0240,  ..., -0.0350, -0.0062, -0.0209],
        [-0.0181,  0.0280,  0.0057,  ...,  0.0062, -0.0119,  0.0144],
        ...,
        [ 0.0175, -0.0195,  0.0201,  ...,  0.0243,  0.0000, -0.0122],
        [-0.0189, -0.0359, -0.0020,  ..., -0.0116,  0.0212, -0.0345],
        [-0.0172, -0.0147,  0.0113,  ..., -0.0223, -0.0212, -0.0280]],
       size=(2304, 768), dtype=torch.qint8,
       quantization_scheme=torch.per_channel_affine,
       scale=tensor([0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
       dtype=torch.float64),
       zero_point=tensor([0, 0, 0,  ..., 0, 0, 0]), axis=0)
