This is an end-to-end example of how to search for a subnet in the ofa/resnet50 design space under a constraint on the number of operations (FLOPs), and then quantize and compile it for the ZCU102 board as the target device.

## build ofa resnet50 design space

In [1]:
# Build the OFA ResNet50 design-space network with pretrained weights.
# Later cells read its search-space lists and extract the searched subnet from it.
from ofa.model_zoo import ofa_net
ofa_network = ofa_net('ofa_resnet50', pretrained=True)

## build accuracy predictor

In [2]:
import torch
from ofa.nas.accuracy_predictor import AccuracyPredictor, ResNetArchEncoder
from ofa.utils import download_url

# Input resolutions covered by the architecture encoder.
image_size_list = [128, 144, 160, 176, 192, 224, 240, 256]

# The encoder is configured from the search-space lists exposed by the OFA
# network built in the first cell.
arch_encoder = ResNetArchEncoder(
    image_size_list=image_size_list,
    depth_list=ofa_network.depth_list,
    expand_list=ofa_network.expand_ratio_list,
    width_mult_list=ofa_network.width_mult_list,
    base_depth_list=ofa_network.BASE_DEPTH_LIST,
)

# download_url lives in ofa/utils/common_tools.py
acc_predictor_checkpoint_path = download_url(
    'https://hanlab.mit.edu/files/OnceForAll/tutorial/ofa_resnet50_acc_predictor.pth',
    model_dir='~/.ofa/',
)

# Run the predictor on the first GPU when one is available, otherwise on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# NOTE(review): 400 and 3 look like the hidden width and the number of hidden
# layers (the printed module shows three 400-unit Linear+ReLU stages) — confirm
# against the AccuracyPredictor signature.
acc_predictor = AccuracyPredictor(
    arch_encoder,
    400,
    3,
    checkpoint_path=acc_predictor_checkpoint_path,
    device=device,
)

print('The accuracy predictor is ready!')
print(acc_predictor)

Loaded checkpoint from /home/vitis-ai-user/.ofa/ofa_resnet50_acc_predictor.pth
The accuracy predictor is ready!
AccuracyPredictor(
  (layers): Sequential(
    (0): Sequential(
      (0): Linear(in_features=82, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (3): Linear(in_features=400, out_features=1, bias=False)
  )
)


## build efficiency predictor

In [3]:
# Build the FLOPs model for the OFA network; it is handed to the evolution
# finder below as the efficiency predictor.
from ofa.nas.efficiency_predictor import ResNet50FLOPsModel

efficiency_predictor = ResNet50FLOPsModel(ofa_network)

# The class defines no custom repr, so this only prints the default
# "<... object at 0x...>" text; it merely confirms the object was created.
print(efficiency_predictor)

<ofa.nas.efficiency_predictor.ResNet50FLOPsModel object at 0x7fa50867ccc0>


## build evolution finder

In [4]:
import argparse

from ofa.nas.search_algorithm import EvolutionFinder

# Hyper-parameters of the evolutionary search.
# They are kept in a Namespace so they remain reachable as `args.<name>`, but it
# is built directly instead of parsing an empty command line and then attaching
# attributes one by one.
args = argparse.Namespace(
    arch_mutate_prob=0.1,
    resolution_mutate_prob=0.5,
    population_size=100,
    max_time_budget=50,  # the search progress bar runs for this many steps
    parent_ratio=0.25,
    mutation_ratio=0.5,
)

# vars() is the documented way to view a Namespace as a dict of keyword arguments.
evolution_finder = EvolutionFinder(efficiency_predictor, acc_predictor, **vars(args))

## search best subnet

In [11]:
# Search for the best subnet whose cost stays within the constraint.
# The constraint is expressed in mega-FLOPs.
# The first return value is not needed here; it is bound to a throwaway name
# rather than `_`, which IPython uses for the last cell output.
_unused, best_info = evolution_finder.run_evolution_search(constraint=2000, verbose=True)
print(best_info)

# best_info unpacks into the predicted accuracy, the architecture dict and the
# efficiency value of the selected subnet.
predicted_acc, arch_dict, efficiency = best_info

Searching with constraint (2000):   0%|          | 0/50 [00:00<?, ?it/s, acc=0.816]

Generate random population...
Start Evolution...


Searching with constraint (2000): 100%|██████████| 50/50 [00:08<00:00,  5.79it/s, acc=0.822]

(0.8222154974937439, {'d': [2, 1, 2, 1, 2], 'e': [0.25, 0.25, 0.25, 0.2, 0.25, 0.25, 0.25, 0.2, 0.2, 0.2, 0.25, 0.35, 0.2, 0.35, 0.25, 0.35, 0.2, 0.2], 'w': [0, 2, 0, 0, 1, 2], 'image_size': 192}, 1998.431744)





## save net_config for the subnet

In [6]:
import os
import pickle

# Destination of the pickled architecture dict (the load cell reads this same path).
net_config_path = './models/net_config_resnet50_fp2000.pickle'

# open(..., 'wb') does not create missing parent directories, so make sure
# ./models exists; exist_ok keeps the cell safe to re-run.
os.makedirs(os.path.dirname(net_config_path), exist_ok=True)

with open(net_config_path, 'wb') as handle:
    pickle.dump(arch_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

## load the saved subnet config (only needed when you want to reuse a previously saved subnet)

In [5]:

import pickle

# Restore the architecture dict written by the save cell.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('./models/net_config_resnet50_fp2000.pickle', 'rb') as config_file:
    arch_dict = pickle.load(config_file)

## extract the subnet's weights and evaluate it

In [11]:

from ofa.imagenet_classification.run_manager import ImagenetRunConfig, RunManager

# Select the searched architecture inside the OFA network and take a copy of it
# that keeps the pretrained weights.
ofa_network.set_active_subnet(**arch_dict)
subnet = ofa_network.get_active_subnet(preserve_weight=True)

run_config = ImagenetRunConfig(
    test_batch_size=200,
    n_worker=4,
    image_size=image_size_list,
    valid_size=1000,
)
run_manager = RunManager('.tmp/eval_subnet', subnet, run_config, init=False)

# Evaluate at the resolution picked by the search, after resetting the subnet's
# running statistics on a 1000-image subset.
run_manager.run_config.data_provider.assign_active_img_size(arch_dict['image_size'])
run_manager.reset_running_statistics(subnet, subset_size=1000, subset_batch_size=250)

# Evaluate the subnet; only the top-1 accuracy is reported.
# NOTE(review): the discarded values are presumably the loss and the top-5
# accuracy (both appear in the validation progress bar) — confirm.
_loss, (top1, _top5) = run_manager.validate(is_test=True)
print('Test acc: %.3f' % (top1))

Color jitter: tf, resize_scale: 0.08, img_size: [128, 144, 160, 176, 192, 224, 240, 256]
Use MyRandomResizedCrop: [128, 144, 160, 176, 192, 224, 240, 256], 	 None sync=True, continuous=False
ResNets(
  (input_stem): ModuleList(
    (0): ConvLayer(
      (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ConvLayer(
      (conv): Conv2d(24, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
  )
  (max_pooling): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (blocks): ModuleList(
    (0): ResNetBottleneckBlock(
      (conv1): Sequential(
        (conv): Conv2d(48, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(40, ep

Validate Epoch #1 : 100%|██████████| 15/15 [00:06<00:00,  2.32it/s, loss=1.05, top1=75.3, top5=93, img_size=160]  

Test acc: 75.267





## save the subnet's weights

In [12]:
# Show the extracted architecture, then write its weights to ./models.
print(subnet)

# NOTE(review): the legacy (non-zipfile) format is requested explicitly,
# presumably so that the PyTorch used by the quantization tooling can read the
# file — confirm before removing the flag.
torch.save(
    subnet.state_dict(),
    './models/resnet50_fp2000.pth',
    _use_new_zipfile_serialization=False,
)

ResNets(
  (input_stem): ModuleList(
    (0): ConvLayer(
      (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ConvLayer(
      (conv): Conv2d(24, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
  )
  (max_pooling): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (blocks): ModuleList(
    (0): ResNetBottleneckBlock(
      (conv1): Sequential(
        (conv): Conv2d(48, 40, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (conv): Conv2d(40, 40, kernel_size=(3, 3), stride=(1

## quantize the model (subnet)

In [6]:
# Look up the input image size that the search selected for this subnet.
# The quantization commands below must be given this same value as --image_size.
print(arch_dict['image_size'])

192


In [7]:
# quantization with finetune 
!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode calib --fast_finetune

# general quantization
#!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode calib --subset_len 200


[0;32m[NNDCT_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[NNDCT_NOTE]: Quantization calibration process start up...[0m

[0;32m[NNDCT_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[NNDCT_NOTE]: =>Parsing ResNets...[0m

[0;32m[NNDCT_NOTE]: =>Doing weights equalization...[0m

[0;32m[NNDCT_NOTE]: =>Quantizable module is generated.(quantize_result/ResNets.py)[0m

[0;32m[NNDCT_NOTE]: =>Get module with quantization.[0m

[0;32m[NNDCT_NOTE]: =>Preparing data for fast finetuning module parameters ...[0m
100%|███████████████████████████████████████████| 32/32 [00:09<00:00,  3.37it/s]
100%|███████████████████████████████████████████| 32/32 [00:15<00:00,  2.08it/s]

[0;32m[NNDCT_NOTE]: =>Fast finetuning module parameters for better quantization accuracy...[0m
100%|███████████████████████████████████████████| 58/58 [04:38<00:00,  4.80s/it]

[0;32m[NNDCT_NOTE]: =>Export fast finetuned parameters ...[0m

[0;32m[NNDCT_NOTE]: =>Exporting quant mod

## evaluate the quantized model

In [8]:
# general evaluation
#!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test

# evaluation with finetune
!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test --fast_finetune


[0;32m[NNDCT_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[NNDCT_NOTE]: Quantization test process start up...[0m

[0;32m[NNDCT_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[NNDCT_NOTE]: =>Parsing ResNets...[0m

[0;32m[NNDCT_NOTE]: =>Doing weights equalization...[0m

[0;32m[NNDCT_NOTE]: =>Quantizable module is generated.(quantize_result/ResNets.py)[0m

[0;32m[NNDCT_NOTE]: =>Get module with quantization.[0m

[0;32m[NNDCT_NOTE]: =>Loading quant model parameters.(quantize_result/param.pth)[0m
100%|███████████████████████████████████████████| 94/94 [00:26<00:00,  3.56it/s]
loss: 0.0364622
top-1 / top-5 accuracy: 77.1667 / 94.3667
-------- End of resnet50_fp2000 test 


## Export the quantized model as xmodel

In [9]:

!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test --subset_len 1 --batch_size=1 --deploy



[0;32m[NNDCT_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[NNDCT_NOTE]: Quantization test process start up...[0m

[0;32m[NNDCT_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[NNDCT_NOTE]: =>Parsing ResNets...[0m

[0;32m[NNDCT_NOTE]: =>Doing weights equalization...[0m

[0;32m[NNDCT_NOTE]: =>Quantizable module is generated.(quantize_result/ResNets.py)[0m

[0;32m[NNDCT_NOTE]: =>Get module with quantization.[0m
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  6.69it/s]
loss: 0.367108
top-1 / top-5 accuracy: 100 / 100

[0;32m[NNDCT_NOTE]: =>Converting to xmodel ...[0m

[0;32m[NNDCT_NOTE]: =>Successfully convert 'ResNets' to xmodel.(quantize_result/ResNets_int.xmodel)[0m
-------- End of resnet50_fp2000 test 


## Compile the model

In [18]:
# Compile the exported xmodel for the target hardware: ZCU102.
#   -x  quantized xmodel written by the export step
#   -a  architecture description (arch.json) of the ZCU102 DPU
#   -o  output directory for the compiled artifacts
#   -n  name of the compiled model (the result is saved as <name>.xmodel)
!vai_c_xir -x quantize_result/ResNets_int.xmodel -a /opt/vitis_ai/compiler/arch/DPUCZDX8G/ZCU102/arch.json -o compiled -n resnet50_fp2000


**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[UNILOG][INFO] The compiler log will be dumped at "/tmp/vitis-ai-user/log/xcompiler-20210401-175142-10774"
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA0_B4096_MAX_BG2
[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: function
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA0_B4096_MAX_BG2
[UNILOG][INFO] Graph name: ResNets, with op num: 453
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/automl/compiled/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/automl/compiled/resnet50_fp2000.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is 8b91249aad96c211390ca06c9e37a5df, and been saved to "/workspace/automl/compiled/md5sum.txt"
