In [1]:
import hls4ml

2023-10-01 17:47:22.433113: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.






In [2]:
import tensorflow as tf
from tensorflow.keras.layers import MaxPooling2D, Input, concatenate, Conv2D, Activation, ZeroPadding2D, UpSampling2D, add
from tensorflow.keras.layers import Flatten
from tensorflow.keras.models import *
from qkeras import *

import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.image import load_img

import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import shuffle
from tqdm import tqdm

In [3]:
import tensorflow_model_optimization as tfmot

In [4]:
# import load_model
import datetime
import os
from enum import Enum
import cv2
import numpy as np
from PIL import Image
from keras import backend as K
from keras.models import load_model
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from keras.utils import to_categorical

In [5]:
# Root prefix for data paths used by this notebook (an empty string here).
BASE_PATH = ""
# Data directory derived from BASE_PATH; nothing is appended yet, so it is
# currently identical to BASE_PATH.
data_dir = f"{BASE_PATH}"

## Model Loading

In [9]:
from qkeras.utils import _add_supported_quantized_objects
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper

# Directory and file name of the saved Keras checkpoint to load.
# NOTE(review): this is an absolute, user-specific path; the cell only runs
# on a machine where that location exists.
model_dir = "/nfs_scratch/hsharma/MachineLearning/ClusterFinder/notebooks/wandb/run-20230711_144156-d0m662qq/files/"
model_name = 'model-best.h5'

# Load the checkpoint into `model`. No custom_objects are passed, so the
# saved file must only contain layers that load_model can resolve by itself.
model = load_model(os.path.join(model_dir, model_name))

In [21]:
class UNETLiteV4Model:
    '''
    Build a small UNET-style model for a segmentation task.

    The network consists of one double-convolution block followed by max
    pooling, one double-convolution block whose output is upsampled and
    concatenated with the first block's output (skip connection), a final
    double-convolution block, and a softmax convolution that produces the
    per-pixel class scores.
    '''
    def prepare_model(self, OUTPUT_CHANNEL, input_size=(30,30,1)):
        '''
        Assemble and return the Keras ``Model``.

        Args:
            OUTPUT_CHANNEL: number of filters of the final softmax
                convolution (one per output class).
            input_size: shape passed to ``Input`` (batch dimension excluded).

        Returns:
            A ``Model`` mapping the input tensor to the softmax output.
        '''
        inputs = Input(input_size)

        # Encoder: two 3x3 convolutions with 4 filters, then 2x2 max pooling.
        conv1, pool1 = self.__ConvBlock(4, (3,3), (2,2), 'relu', 'same', inputs)
        # Bottleneck + decoder step: only the upsampled/concatenated tensor is
        # used further on, so the intermediate convolution output is discarded.
        _, up3 = self.__UpConvBlock(8, 4, (3,3), (2,2), (2,2), 'relu', 'same', pool1, conv1)

        # Final double convolution; pool_layer=False so no pooling is applied.
        conv4 = self.__ConvBlock(4, (3,3), (2,2), 'relu', 'same', up3, False)

        outputs = Conv2D(OUTPUT_CHANNEL, (3, 3), activation='softmax', padding='same')(conv4)

        return Model(inputs=[inputs], outputs=[outputs])

    def __ConvBlock(self, filters, kernel_size, pool_size, activation, padding, connecting_layer, pool_layer=True):
        '''
        Apply two identical ``Conv2D`` layers, optionally followed by pooling.

        Args:
            filters, kernel_size, activation, padding: forwarded to both
                ``Conv2D`` layers.
            pool_size: forwarded to ``MaxPooling2D``; only used when
                ``pool_layer`` is true.
            connecting_layer: tensor the block is applied to.
            pool_layer: whether to append a ``MaxPooling2D`` layer.

        Returns:
            ``(conv, pool)`` when ``pool_layer`` is true, otherwise just
            ``conv``. Note that the return arity depends on ``pool_layer``.
        '''
        conv = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(connecting_layer)
        conv = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv)
        if not pool_layer:
            return conv
        pool = MaxPooling2D(pool_size)(conv)
        return conv, pool

    def __UpConvBlock(self, filters, up_filters, kernel_size, up_kernel, up_stride, activation, padding, connecting_layer, shared_layer):
        '''
        Apply two ``Conv2D`` layers, upsample, and concatenate a skip tensor.

        Args:
            filters, kernel_size, activation, padding: forwarded to the two
                leading ``Conv2D`` layers.
            up_filters: number of filters of the 1x1 convolution applied
                after upsampling.
            up_kernel: accepted for interface compatibility but NOT used; the
                post-upsampling convolution always uses a (1, 1) kernel.
                Wiring it in would change the layer's weight shape.
            up_stride: upsampling factor passed to ``UpSampling2D``.
            connecting_layer: tensor the block is applied to.
            shared_layer: skip-connection tensor concatenated on axis 3; its
                spatial size must match the upsampled tensor.

        Returns:
            ``(conv, up)``: the output of the second convolution and the
            upsampled tensor concatenated with ``shared_layer``.
        '''
        conv = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(connecting_layer)
        conv = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv)
        # Honour the caller-supplied factor instead of a hard-coded (2, 2);
        # the only caller in this class passes (2, 2), so its result is unchanged.
        up = UpSampling2D(up_stride)(conv)
        up = Conv2D(up_filters, (1, 1), activation='relu', kernel_initializer='he_normal', padding='same')(up)
        up = concatenate([up, shared_layer], axis=3)

        return conv, up

In [22]:
# Three output channels because the mask has three classes.
# NOTE: this rebinds `model`, replacing whatever an earlier cell assigned to
# that name with a freshly constructed network.
unet_builder = UNETLiteV4Model()
model = unet_builder.prepare_model(OUTPUT_CHANNEL=3, input_size=(30, 30, 1))
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 30, 30, 1)]  0           []                               
                                                                                                  
 conv2d_7 (Conv2D)              (None, 30, 30, 4)    40          ['input_2[0][0]']                
                                                                                                  
 conv2d_8 (Conv2D)              (None, 30, 30, 4)    148         ['conv2d_7[0][0]']               
                                                                                                  
 max_pooling2d_1 (MaxPooling2D)  (None, 15, 15, 4)   0           ['conv2d_8[0][0]']               
                                                                                            

In [11]:
# Rebuild `model` so that it ends at the output of its second-to-last layer,
# i.e. the final layer is dropped.
# NOTE: the cell derives `model` from `model`, so it is not idempotent —
# every re-run removes one more layer.
all_layers_except_last = model.layers[:-1]
penultimate_output = all_layers_except_last[-1].output
model = Model(inputs=model.input, outputs=penultimate_output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10, 10, 1)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 10, 10, 4)    40          ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 10, 10, 4)    148         ['conv2d[0][0]']                 
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 5, 5, 4)      0           ['conv2d_1[0][0]']               
                                                                                              

In [24]:
# Baseline conversion: start from the per-layer ('name' granularity)
# configuration that hls4ml derives from the Keras model.
hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Model-wide defaults for numeric precision and reuse factor.
hls_config['Model'].update({
    'Precision': 'ap_fixed<16,6>',
    'ReuseFactor': 1,
})

# Per-layer settings. This is the place to switch individual layers to the
# 'resource' strategy or to raise the reuse factor of large layers. The model
# was designed to be small enough for a fully parallel implementation, so
# every layer uses the 'latency' strategy with a reuse factor of 1.
for layer_cfg in hls_config['LayerName'].values():
    layer_cfg['Strategy'] = 'latency'
    layer_cfg['ReuseFactor'] = 1

# For the best numerical performance on high-accuracy models, the last
# configured layer is switched to the 'stable' strategy; the default
# 'latency' strategy is faster but numerically less stable.
# NOTE(review): `sm_layer` is simply the last key of the layer config and is
# presumably meant to be the softmax layer — confirm that the model still
# ends in a softmax when this cell runs.
sm_layer = list(hls_config["LayerName"])[-1]
hls_config['LayerName'][sm_layer]['Strategy'] = 'stable'
#plotting.print_dict(hls_config)

# Project-level conversion settings for the Vivado backend.
cfg = hls4ml.converters.create_config(backend='Vivado')
cfg.update({
    'IOType': 'io_stream',  # Must set this if using CNNs!
    'HLSConfig': hls_config,
    'KerasModel': model,
    'OutputDir': 'fpga_1800_30x30_xcvu13_nclock/',
    'Part': 'xcvu13p-flga2577-2-e',
    'ClockPeriod': 2.777778,
    'ClockUncertainty': "30.0%",
})
# Alternative target parts that were tried:
#cfg['XilinxPart'] = 'xcu250-figd2104-2L-e'
#cfg['XilinxPart'] = "xczu9eg-2ffvb1156"

hls_model = hls4ml.converters.keras_to_hls(cfg)

Interpreting Model
Topology:
Layer name: input_2, layer type: InputLayer, input shapes: [[None, 30, 30, 1]], output shape: [None, 30, 30, 1]
Layer name: conv2d_7, layer type: Conv2D, input shapes: [[None, 30, 30, 1]], output shape: [None, 30, 30, 4]
Layer name: conv2d_8, layer type: Conv2D, input shapes: [[None, 30, 30, 4]], output shape: [None, 30, 30, 4]
Layer name: max_pooling2d_1, layer type: MaxPooling2D, input shapes: [[None, 30, 30, 4]], output shape: [None, 15, 15, 4]
Layer name: conv2d_9, layer type: Conv2D, input shapes: [[None, 15, 15, 4]], output shape: [None, 15, 15, 8]
Layer name: conv2d_10, layer type: Conv2D, input shapes: [[None, 15, 15, 8]], output shape: [None, 15, 15, 8]
Layer name: up_sampling2d, layer type: UpSampling2D, input shapes: [[None, 15, 15, 8]], output shape: [None, 30, 30, 8]
Layer name: conv2d_11, layer type: Conv2D, input shapes: [[None, 30, 30, 8]], output shape: [None, 30, 30, 4]
Layer name: concatenate_1, layer type: Concatenate, input shapes: [[No

In [19]:
# Configure the 'output_rounding_saturation_mode' optimizer pass so that the
# outputs of the listed layers use AP_RND rounding and AP_SAT saturation.
# NOTE(review): 'relu1'/'relu2' do not appear among the layer names visible in
# this cell's recorded topology — confirm the list matches the model in use.
hls4ml.model.optimizer.get_optimizer(
    'output_rounding_saturation_mode'
).configure(
    layers=['relu1', 'relu2'],
    rounding_mode='AP_RND',
    saturation_mode='AP_SAT',
    # Fix: saturation_bits is a bit count (or None), not a mode name; the
    # string 'AP_SAT' used to be passed here by mistake.
    saturation_bits=None
)
# NOTE(review): the returned optimizer is not used or configured, so this
# lookup on its own does not change the conversion.
hls4ml.model.optimizer.get_optimizer(
    'eliminate_linear_activation'
)

# Per-layer ('name' granularity) configuration derived from the Keras model.
hls_config = hls4ml.utils.config_from_keras_model(
    model,
    granularity='name'
)

# Model-wide defaults.
hls_config['Model']['ReuseFactor'] = 1
hls_config['Model']['Strategy'] = 'Resource'
hls_config['Model']['ClockPeriod']  = 6.25
hls_config['Model']['Precision'] = 'ap_fixed<16, 6>'
# hls_config['Model']['Trace']  = True

# The input layer carries 10-bit unsigned values.
# NOTE(review): the layer name 'input_1' is hard-coded; this raises KeyError
# if the model's input layer is named differently.
hls_config['LayerName']['input_1']['Precision']['accum'] = 'ap_uint<10>'
hls_config['LayerName']['input_1']['Precision']['result'] = 'ap_uint<10>'

for layer_name, layer_cfg in hls_config['LayerName'].items():
    # Every layer: 'Resource' strategy with a reuse factor of 4 (this
    # overrides the model-wide reuse factor of 1 set above).
    layer_cfg['Strategy'] = 'Resource'
    layer_cfg['ReuseFactor'] = 4
    # layer_cfg['Trace'] = True

    if "conv2d" in layer_name:
        layer_cfg['ParallelizationFactor'] = 3
        # Fix: precision overrides belong under the layer's 'Precision'
        # entry (as done for 'input_1' above). They used to be written
        # directly on the layer config, where they had no effect.
        precision_cfg = layer_cfg.setdefault('Precision', {})
        precision_cfg['accum'] = 'ap_fixed<20, 8>'
        precision_cfg['result'] = 'ap_fixed<15, 8>'

# Project-level conversion settings.
cfg = hls4ml.converters.create_config(part="xc7vx690tffg1927-2")

cfg['IOType'] = 'io_parallel'
cfg['HLSConfig'] = hls_config
cfg['KerasModel'] = model
cfg['ClockPeriod']  = 6.25
cfg['OutputDir']  = 'fpga_model_700_cnn_wosm/'
cfg['Part'] = 'xc7vx690tffg1927-2'

hls_model = hls4ml.converters.keras_to_hls(cfg)

Interpreting Model
Topology:
Layer name: input_1, layer type: InputLayer, input shapes: [[None, 10, 10, 1]], output shape: [None, 10, 10, 1]
Layer name: conv2d, layer type: Conv2D, input shapes: [[None, 10, 10, 1]], output shape: [None, 10, 10, 3]
Layer name: conv2d_1, layer type: Conv2D, input shapes: [[None, 10, 10, 3]], output shape: [None, 10, 10, 3]
Layer name: max_pooling2d, layer type: MaxPooling2D, input shapes: [[None, 10, 10, 3]], output shape: [None, 5, 5, 3]
Layer name: conv2d_2, layer type: Conv2D, input shapes: [[None, 5, 5, 3]], output shape: [None, 5, 5, 4]
Layer name: conv2d_3, layer type: Conv2D, input shapes: [[None, 5, 5, 4]], output shape: [None, 5, 5, 4]
Layer name: up_sampling2d, layer type: UpSampling2D, input shapes: [[None, 5, 5, 4]], output shape: [None, 10, 10, 4]
Layer name: conv2d_4, layer type: Conv2D, input shapes: [[None, 10, 10, 4]], output shape: [None, 10, 10, 4]
Layer name: concatenate, layer type: Concatenate, input shapes: [[None, 10, 10, 4], [Non

In [None]:
# Compile the converted model. The recorded output of this cell shows that
# this step writes the HLS project to disk.
hls_model.compile()

Writing HLS project
Done


In [None]:
# Draw the converted model with shapes and precisions shown, saving the
# diagram to UNL_12distill.png.
hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file="UNL_12distill.png")

In [None]:
from hls4ml.model import profiling

In [None]:
# Run hls4ml's numerical profiling on the Keras model together with its
# converted counterpart.
profiling.numerical(model=model, hls_model=hls_model)

In [None]:
from qkeras.qtools import run_qtools
from qkeras.qtools import settings as qtools_settings
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper
from qkeras import quantized_bits
from qkeras import QDense, QActivation
import pprint

In [None]:
# Energy profiling of `model` (student network without quantization) with
# QKeras QTools, using the "horowitz" process and fp16 Keras
# quantizer/accumulator settings.
q = run_qtools.QTools(
    model,
    process="horowitz",
    source_quantizers=[quantized_bits(16, 5, 1)],
    is_inference=True,
    weights_path=None,
    keras_quantizer="fp16",
    keras_accumulator="fp16",
    for_reference=False,
)
q.qtools_stats_print()

# Per-layer energy estimate.
energy_dict = q.pe(
    weights_on_memory="fixed",
    activations_on_memory="fixed",
    min_sram_size=8*16*1024*1024,
    rd_wr_on_io=False,
)

# The rule in qtools_settings.cfg.include_energy selects which energy
# components are counted for each layer.
include_energy = qtools_settings.cfg.include_energy

# Energy distribution per layer, and the sum over all layers under that rule.
energy_profile = q.extract_energy_profile(include_energy, energy_dict)
total_energy = q.extract_energy_sum(include_energy, energy_dict)

pprint.pprint(energy_profile)
print()

# The total is scaled down by 1e6 and reported as uJ.
print("Total energy: {:.6f} uJ".format(total_energy / 1000000.0))