This is an end-to-end example of how to search for a subnet in the ofa/resnet50 design space under an Ops (FLOPs) constraint, and then quantize and compile it for the ZCU102 board as the target device.

## build ofa resnet50 design space

In [1]:
# Build the OFA-ResNet50 supernet with pretrained weights. The recorded output
# below shows the checkpoint being downloaded from hanlab.mit.edu.
# `ofa_network` is reused by later cells (arch encoder, efficiency predictor,
# subnet extraction).
from ofa.model_zoo import ofa_net
ofa_network = ofa_net('ofa_resnet50', pretrained=True)

Downloading: "https://hanlab.mit.edu/files/OnceForAll/ofa_nets/ofa_resnet50_d=0+1+2_e=0.2+0.25+0.35_w=0.65+0.8+1.0" to .torch/ofa_nets/ofa_resnet50_d=0+1+2_e=0.2+0.25+0.35_w=0.65+0.8+1.0


##  build accuracy predictor

In [2]:
import torch
from ofa.nas.accuracy_predictor import AccuracyPredictor, ResNetArchEncoder
from ofa.utils import download_url

# Input resolutions handed to the architecture encoder (also reused later when
# the evaluation run config is built).
image_size_list = [128, 144, 160, 176, 192, 224, 240, 256]

# The encoder is parameterised with the design-space lists exposed by the
# OFA supernet created in the previous cell.
arch_encoder = ResNetArchEncoder(
    image_size_list=image_size_list,
    depth_list=ofa_network.depth_list,
    expand_list=ofa_network.expand_ratio_list,
    width_mult_list=ofa_network.width_mult_list,
    base_depth_list=ofa_network.BASE_DEPTH_LIST,
)

# Fetch the pretrained predictor checkpoint into ~/.ofa/
# (download_url is implemented in ofa/utils/common_tools.py).
acc_predictor_checkpoint_path = download_url(
    'https://hanlab.mit.edu/files/OnceForAll/tutorial/ofa_resnet50_acc_predictor.pth',
    model_dir='~/.ofa/',
)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# The positional 400 and 3 presumably are the hidden width and the number of
# hidden layers (this matches the module printed below) -- confirm against
# the AccuracyPredictor signature.
acc_predictor = AccuracyPredictor(
    arch_encoder,
    400,
    3,
    checkpoint_path=acc_predictor_checkpoint_path,
    device=device,
)

print('The accuracy predictor is ready!')
print(acc_predictor)

Downloading: "https://hanlab.mit.edu/files/OnceForAll/tutorial/ofa_resnet50_acc_predictor.pth" to /home/vitis-ai-user/.ofa/ofa_resnet50_acc_predictor.pth


Loaded checkpoint from /home/vitis-ai-user/.ofa/ofa_resnet50_acc_predictor.pth
The accuracy predictor is ready!
AccuracyPredictor(
  (layers): Sequential(
    (0): Sequential(
      (0): Linear(in_features=82, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Linear(in_features=400, out_features=400, bias=True)
      (1): ReLU(inplace=True)
    )
    (3): Linear(in_features=400, out_features=1, bias=False)
  )
)


## build efficiency predictor

In [3]:
# Efficiency predictor used by the evolutionary search. It is a FLOPs model
# built from the OFA supernet; the search cell below treats its output as
# mega-FLOPs when applying the constraint.
from ofa.nas.efficiency_predictor import ResNet50FLOPsModel

efficiency_predictor = ResNet50FLOPsModel(ofa_network)

# The class defines no custom repr, so this only prints the default object repr.
print(efficiency_predictor)

<ofa.nas.efficiency_predictor.ResNet50FLOPsModel object at 0x7f07dbc7dd50>


## build evolution finder

In [4]:
import argparse

from ofa.nas.search_algorithm import EvolutionFinder

# An empty argparse namespace is used as a plain container for the search
# hyper-parameters; every entry is forwarded to EvolutionFinder as a keyword.
parser = argparse.ArgumentParser()
args = parser.parse_args(args=[])
vars(args).update(
    arch_mutate_prob=0.1,
    resolution_mutate_prob=0.5,
    population_size=100,
    max_time_budget=50,  # matches the 50 steps shown by the search progress bar
    parent_ratio=0.25,
    mutation_ratio=0.5,
)

evolution_finder = EvolutionFinder(efficiency_predictor, acc_predictor, **vars(args))

## search best subnet

In [5]:
# Search for the best subnet whose efficiency stays within the constraint.
# The constraint is expressed in mega-FLOPs (2000 => 2 GFLOPs).
# best_info unpacks into (predicted accuracy, architecture dict, efficiency).
# NOTE(review): the search starts from a random population (see the log below)
# and no random seed is set in this notebook, so the selected subnet
# (including its image_size) presumably differs between runs -- confirm.
_, best_info = evolution_finder.run_evolution_search(constraint=2000, verbose=True)
print(best_info)
predicted_acc, arch_dict, efficiency = best_info

Generate random population...
Start Evolution...


Searching with constraint (2000): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.26it/s, acc=0.821]

(0.8209745287895203, {'d': [2, 2, 1, 2, 2], 'e': [0.25, 0.35, 0.35, 0.25, 0.25, 0.25, 0.25, 0.35, 0.25, 0.35, 0.35, 0.35, 0.35, 0.25, 0.35, 0.35, 0.25, 0.25], 'w': [0, 2, 0, 1, 1, 2], 'image_size': 160}, 1984.9216)





## save net_config for the subnet

In [6]:
import os
import pickle

# Persist the architecture description of the searched subnet (depth, expand
# ratio, width and image size) so it can be restored without re-running the search.
# Create the output directory first: open(..., 'wb') raises FileNotFoundError
# when ./models does not exist yet.
os.makedirs('./models', exist_ok=True)
with open('./models/net_config_resnet50_fp2000.pickle', 'wb') as handle:
    pickle.dump(arch_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

## load the saved net_config of the subnet (when you need to restore a previously saved search result)

In [7]:

# Restore the architecture dict written by the "save net_config" cell.
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook or come from a trusted source.
import pickle

with open('./models/net_config_resnet50_fp2000.pickle', 'rb') as handle:
    arch_dict = pickle.load(handle)

## extract subnet's weight

In [8]:

from ofa.imagenet_classification.run_manager import ImagenetRunConfig, RunManager

# Select the searched architecture inside the supernet, then pull it out as a
# stand-alone network that keeps the supernet's weights.
ofa_network.set_active_subnet(**arch_dict)
subnet = ofa_network.get_active_subnet(preserve_weight=True)

# ImageNet evaluation setup for the extracted subnet.
run_config = ImagenetRunConfig(
    test_batch_size=200,
    n_worker=4,
    image_size=image_size_list,
    valid_size=1000,
)
run_manager = RunManager('.tmp/eval_subnet', subnet, run_config, init=False)

# Evaluate at the resolution chosen by the search, and reset the running
# statistics of the subnet on a 1000-image subset before validating.
run_manager.run_config.data_provider.assign_active_img_size(arch_dict['image_size'])
run_manager.reset_running_statistics(subnet, subset_size=1000, subset_batch_size=250)

# Validate with is_test=True; only top-1 accuracy is reported.
_, (top1, _) = run_manager.validate(is_test=True)
print('Test acc: %.3f' % (top1))

Color jitter: tf, resize_scale: 0.08, img_size: [128, 144, 160, 176, 192, 224, 240, 256]
Use MyRandomResizedCrop: [128, 144, 160, 176, 192, 224, 240, 256], 	 None sync=True, continuous=False
ResNets(
  (input_stem): ModuleList(
    (0): ConvLayer(
      (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ResidualBlock(
      (conv): ConvLayer(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU(inplace=True)
      )
      (shortcut): IdentityLayer()
    )
    (2): ConvLayer(
      (conv): Conv2d(24, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (a

Validate Epoch #1 : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:09<00:00,  1.61it/s, loss=0.979, top1=77, top5=93.9, img_size=160]

Test acc: 77.000





## save the subnet's weight 

In [9]:
# Print the extracted subnet's structure, then store its weights under ./models/.
# _use_new_zipfile_serialization=False makes torch.save write the legacy
# (non-zip) checkpoint format -- presumably so the quantization script's
# environment can load it; confirm.
print(subnet)
torch.save(subnet.state_dict(), './models/resnet50_fp2000.pth', _use_new_zipfile_serialization=False)        

ResNets(
  (input_stem): ModuleList(
    (0): ConvLayer(
      (conv): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
    (1): ResidualBlock(
      (conv): ConvLayer(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU(inplace=True)
      )
      (shortcut): IdentityLayer()
    )
    (2): ConvLayer(
      (conv): Conv2d(24, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act): ReLU(inplace=True)
    )
  )
  (max_pooling): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (blocks): ModuleList(
    (0): ResNetBottleneckBlock(
      (co

## quantize the model (subnet)

In [10]:
# Show the input image size selected by the search for this subnet.
# NOTE(review): presumably this is the value to pass as --image_size to
# ofa_quant.py in the following cells -- confirm.
print(arch_dict['image_size'])

160


In [12]:
# Quantization calibration (--quant_mode calib) with fast finetune enabled.
# NOTE(review): the image size printed above for the searched subnet is 160,
# but --image_size 192 is passed here -- confirm which value is intended.
!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode calib --fast_finetune

# Alternative: general calibration without fast finetune, on a 200-sample subset.
#!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode calib --subset_len 200


[0;32m[VAIQ_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- xsjfislx20
              release --- 4.15.0-166-generic
              version --- #174-Ubuntu SMP Wed Dec 8 19:07:44 UTC 2021
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 9.4.0
               python --- 3.7.12
              pytorch --- 1.12.1
        vai_q_pytorch --- 3.0.0+fec926f+torch1.12.1[0m

[0;32m[VAIQ_NOTE]: GPU information:
          device name --- Tesla V100-PCIE-16GB
     device available --- True
         device count --- 3
       current device --- 0[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization calibration process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[VAIQ_NOTE]: =>

## evaluate the quantized model

In [13]:
# Evaluate the quantized model (--quant_mode test) using the fast-finetune result.
# Use the same flags (image size, fast finetune) as in the calibration cell.
# NOTE(review): the image size printed above for the searched subnet is 160,
# but --image_size 192 is passed here -- confirm which value is intended.
!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test --fast_finetune

# Alternative: general evaluation (pairs with the general calibration command).
#!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test



[0;32m[VAIQ_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- xsjfislx20
              release --- 4.15.0-166-generic
              version --- #174-Ubuntu SMP Wed Dec 8 19:07:44 UTC 2021
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 9.4.0
               python --- 3.7.12
              pytorch --- 1.12.1
        vai_q_pytorch --- 3.0.0+fec926f+torch1.12.1[0m

[0;32m[VAIQ_NOTE]: GPU information:
          device name --- Tesla V100-PCIE-16GB
     device available --- True
         device count --- 3
       current device --- 0[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization test process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing

## Export the quantized model as xmodel

In [14]:

# Export the quantized model: run test mode on a single sample with batch
# size 1 and --deploy. The compile cell below reads
# quantize_result/ResNets_int.xmodel.
# NOTE(review): the image size printed above for the searched subnet is 160,
# but --image_size 192 is passed here -- confirm which value is intended.
!python ofa_quant.py --model_name 'resnet50_fp2000' --image_size 192 --quant_mode test --subset_len 1 --batch_size=1 --deploy



[0;32m[VAIQ_NOTE]: Loading NNDCT kernels...[0m
-------- Start resnet50_fp2000 test 

[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- xsjfislx20
              release --- 4.15.0-166-generic
              version --- #174-Ubuntu SMP Wed Dec 8 19:07:44 UTC 2021
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 9.4.0
               python --- 3.7.12
              pytorch --- 1.12.1
        vai_q_pytorch --- 3.0.0+fec926f+torch1.12.1[0m

[0;32m[VAIQ_NOTE]: GPU information:
          device name --- Tesla V100-PCIE-16GB
     device available --- True
         device count --- 3
       current device --- 0[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization test process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cuda'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing

## Compile the model

In [15]:
# Compile the quantized xmodel for the target hardware: ZCU102 (DPUCZDX8G).
#   -x  quantized xmodel to compile
#   -a  architecture description (arch.json) of the target DPU
#   -o  output directory (the log below shows results written to compiled/)
#   -n  name of the compiled model (produces resnet50_fp2000.xmodel)
!vai_c_xir -x quantize_result/ResNets_int.xmodel -a /opt/vitis_ai/compiler/arch/DPUCZDX8G/ZCU102/arch.json -o compiled -n resnet50_fp2000


**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: null
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
[UNILOG][INFO] Graph name: ResNets, with op num: 472
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/examples/ofa/ofa_resnet50/compiled/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/examples/ofa/ofa_resnet50/compiled/resnet50_fp2000.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is 27f6954d29fa3333d5b6d0b39a8efc0b, and has been saved to "/workspace/examples/ofa/ofa_resnet50/compiled/md5sum.txt"
