Small Test Net - Alon Tchelet MSc Thesis

# Brevitas

## Imports

In [1]:
# general use libraries
import numpy as np

# Brevitaas ad PyTorch libraries
import torch
from torch.nn import Module, ModuleList, Sequential, Conv2d, Linear, ReLU, MaxPool2d, Flatten
from brevitas.nn import QuantIdentity, QuantConv2d, QuantLinear, QuantReLU, QuantMaxPool2d
from brevitas.inject.defaults import *
from brevitas.inject import *
from brevitas.core.quant import QuantType
from brevitas.core.bit_width import BitWidthImplType
from brevitas.core.scaling import ScalingImplType
from brevitas.core.restrict_val import FloatToIntImplType
from brevitas.quant.solver import WeightQuantSolver, ActQuantSolver
from brevitas.core.restrict_val import RestrictValueType
from brevitas.core.zero_point import ZeroZeroPoint

## Set up network

In [2]:
class MyQuant(ExtendedInjector):
    bit_width_impl_type = BitWidthImplType.CONST
    scaling_impl_type = ScalingImplType.CONST
    float_to_int_impl_type = FloatToIntImplType.ROUND
    restrict_scaling_type = RestrictValueType.FP
    zero_point_impl = ZeroZeroPoint
    narrow_range = True
    quant_delay_steps = 50
    
    @value
    def quant_type(bit_width):
        if bit_width == 1:
            return QuantType.BINARY
        else:
            return QuantType.INT   

class MyActQuant(MyQuant, ActQuantSolver):
    min_val = 0.0
    max_val = 6.0
    signed = False 
    
class MyWeightQuant(MyQuant, WeightQuantSolver):
    scaling_const = 0.1
    signed = True 

In [3]:
class QLeNet(Module):

    # init for CIFAR-10
    def __init__(self, weight_bit_width=8, act_bit_width=8):
        super(QLeNet, self).__init__()
        self.weight_bit_width = int(np.clip(weight_bit_width, 1, 8))
        self.act_bit_width = int(np.clip(act_bit_width, 1, 8))

        self.conv1 = Sequential(
            QuantIdentity(
            act_quant=Int8ActPerTensorFloatMinMaxInit,
            min_val = -1.0,
            max_val = 1.0 - 2.0 ** (-7),
            signed = True,
            restrict_scaling_type=RestrictValueType.POWER_OF_TWO),
            QuantConv2d(3, 6, 5, bias=False, 
                        weight_quant=MyWeightQuant, weight_bit_width=self.weight_bit_width),
            QuantReLU(act_quant=MyActQuant, bit_width=self.act_bit_width),
            QuantMaxPool2d(2, 2))
        self.conv2 = Sequential(
            QuantConv2d(6, 16, 5, bias=False, 
                        weight_quant=MyWeightQuant, weight_bit_width=self.weight_bit_width),
            QuantReLU(act_quant=MyActQuant, bit_width=self.act_bit_width),
            QuantMaxPool2d(2, 2))
        self.flat = Flatten()
        self.fc1 = Sequential(
            QuantLinear(400, 120, bias=True,
                        weight_quant=MyWeightQuant, weight_bit_width=self.weight_bit_width),
            QuantReLU(act_quant=MyActQuant, bit_width=self.act_bit_width))
        self.fc2 = Sequential(
            QuantLinear(120, 84, bias=True,
                        weight_quant=MyWeightQuant, weight_bit_width=self.weight_bit_width),
            QuantReLU(act_quant=MyActQuant, bit_width=self.act_bit_width))
        self.fc3 = QuantLinear(84, 10, bias=False,
                               weight_quant=MyWeightQuant, weight_bit_width=self.weight_bit_width)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.flat(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [4]:
class LeNet(Module):
        # init for CIFAR-10
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = Sequential(
            Conv2d(3, 6, 5),
            ReLU(),
            MaxPool2d(2, 2))
        self.conv2 = Sequential(
            Conv2d(6, 16, 5),
            ReLU(),
            MaxPool2d(2, 2))
        self.flat = Flatten()
        self.fc1 = Sequential(
            Linear(400, 120),
            ReLU())
        self.fc2 = Sequential(
            Linear(120, 84),
            ReLU())
        self.fc3 = Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.flat(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [5]:
lenets = []
lenets_names = []

# for i in range(2, 9):
#     lenets.append(QLeNet(weight_bit_width=i, act_bit_width=i))
#     lenets_names.append(f"lenet_w{i}a{i}")
    
# for i in range(2, 8):
#     lenets.append(QLeNet(weight_bit_width=i+1, act_bit_width=i))
#     lenets_names.append(f"lenet_w{i+1}a{i}")

# for i in range(1, 8):
#     lenets.append(QLeNet(weight_bit_width=i, act_bit_width=i+1))
#     lenets_names.append(f"lenet_w{i}a{i+1}")
    
# for i in range(2, 7):
#     lenets.append(QLeNet(weight_bit_width=i+2, act_bit_width=i))
#     lenets_names.append(f"lenet_w{i+2}a{i}")

# for i in range(1, 7):
#     lenets.append(QLeNet(weight_bit_width=i, act_bit_width=i+2))
#     lenets_names.append(f"lenet_w{i}a{i+2}")

lenet_w3a4 = QLeNet(weight_bit_width=3, act_bit_width=4)
lenets.append(lenet_w3a4)
lenets_names.append("lenet_w3a4")
lenet_w4a3 = QLeNet(weight_bit_width=4, act_bit_width=3)
lenets.append(lenet_w4a3)
lenets_names.append("lenet_w4a3")
lenet_w2a4 = QLeNet(weight_bit_width=2, act_bit_width=4)
lenets.append(lenet_w2a4)
lenets_names.append("lenet_w2a4")
lenet_w2a3 = QLeNet(weight_bit_width=2, act_bit_width=3)
lenets.append(lenet_w2a3)
lenets_names.append("lenet_w2a3")

lenets.append(LeNet())
lenets_names.append("lenet_base")

## Training

In [6]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import SubsetRandomSampler as Sampler

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 32
split = .9


trainset = torchvision.datasets.CIFAR10(root="./data", train=True,
                                        download=True, transform=transform)
validset = torchvision.datasets.CIFAR10(root="./data", train=True,
                                        download=True, transform=transform)

train_len = len(trainset)
split = int(split*train_len)
idx = list(range(train_len))
train_idx, valid_idx = idx[split:], idx[:split]
train_samples = Sampler(train_idx)
valid_samples = Sampler(valid_idx)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                           sampler=train_samples, num_workers=2)
validloader = torch.utils.data.DataLoader(validset, batch_size=batch_size,
                                          sampler=valid_samples, num_workers=2)

testset = torchvision.datasets.CIFAR10(root="./data", train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 
           'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [7]:
import torch.optim as optim

criterion = torch.nn.CrossEntropyLoss()
optimizers = []
lr = 0.001
epochs = 13

for net in lenets:
    optimizers.append(optim.Adam(net.parameters(), lr=lr, weight_decay=0.))

In [8]:
for n, net in enumerate(lenets):
    print(f"Started training for net: {lenets_names[n]}")
    for epoch in range(epochs):  # loop over the dataset multiple times

        train_loss = valid_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizers[n].zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizers[n].step()

            # print statistics
            train_loss += loss.item()
        train_loss /= i
        with torch.no_grad():
            for i, data in enumerate(validloader, 0):
                images, labels = data
                outputs = net(images)
                vloss = criterion(outputs, labels)
                valid_loss += vloss.item()
            valid_loss /= i
        print(f"[Epoch: {epoch+1}] training loss: {train_loss:.6f} - valid loss: {valid_loss:.6f}")
        train_loss = valid_loss = 0.0
        
#         correct = 0
#         top3 = 0
#         top5 = 0
#         total = 0
#         with torch.no_grad():
#             for data in testloader:
#                 images, labels = data
#                 outputs = net(images)
#                 _, predicted = torch.max(outputs.data, 1)
#                 top3_outputs = torch.argsort(outputs.data, 1)[:, -3:]
#                 top5_outputs = torch.argsort(outputs.data, 1)[:, -5:]
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()
#                 top3 += sum([(out==labels[i]).sum() for i, out in enumerate(top3_outputs)])
#                 top5 += sum([(out==labels[i]).sum() for i, out in enumerate(top5_outputs)])
#         print(f"[Epoch: {epoch+1}] Network Accuracy:\tTop-1: {100 * correct / total : .2f},\tTop-3: {100 * top3 / total : .2f},\tTop-5: {100 * top5 / total : .2f}")
            
    print(f"Finished training for net: {lenets_names[n]}\n")

print("Finished Training")

Started training for net: lenet_w3a4


  Variable._execution_engine.run_backward(


[Epoch: 1] training loss: 2.082113 - valid loss: 1.933648
[Epoch: 2] training loss: 1.847574 - valid loss: 1.736572
[Epoch: 3] training loss: 1.704948 - valid loss: 1.663259
[Epoch: 4] training loss: 1.618376 - valid loss: 1.616328
[Epoch: 5] training loss: 1.550714 - valid loss: 1.607521
[Epoch: 6] training loss: 1.498649 - valid loss: 1.561098
[Epoch: 7] training loss: 1.451308 - valid loss: 1.545549
[Epoch: 8] training loss: 1.398344 - valid loss: 1.564969
[Epoch: 9] training loss: 1.362514 - valid loss: 1.519654
[Epoch: 10] training loss: 1.313594 - valid loss: 1.550168
[Epoch: 11] training loss: 1.292945 - valid loss: 1.510521
[Epoch: 12] training loss: 1.259286 - valid loss: 1.516885
[Epoch: 13] training loss: 1.230766 - valid loss: 1.494909
Finished training for net: lenet_w3a4

Started training for net: lenet_w4a3
[Epoch: 1] training loss: 2.077851 - valid loss: 1.864883
[Epoch: 2] training loss: 1.809396 - valid loss: 1.804215
[Epoch: 3] training loss: 1.692304 - valid loss: 1

In [9]:
# save network
for n, net in enumerate(lenets):
    path = f"./{lenets_names[n]}.pth"
    torch.save(net.state_dict(), path)

## Testing

In [10]:
dataiter = iter(testloader)
images, labels = dataiter.next()

# if you need to load the networks
#
# lenet_og = QLeNet()
# lenet_w8 = QLeNet(weight_bit_width=8)
# lenet_w4 = QLeNet(weight_bit_width=4)
# lenet_w2 = QLeNet(weight_bit_width=2)
# lenet_w8a8 = QLeNet(weight_bit_width=8, act_bit_width=8)
# lenet_w4a4 = QLeNet(weight_bit_width=4, act_bit_width=4)
# lenet_w2a2 = QLeNet(weight_bit_width=2, act_bit_width=2)
# lenets = [lenet_w8a8, lenet_w4a4, lenet_w2a2, lenet_w8, lenet_w4, lenet_w2, lenet_og]
# lenets_names = ["lenet_w8a8", "lenet_w4a4", "lenet_w2a2", "lenet_w8", "lenet_w4", "lenet_w2", "lenet_og"]
# for n, net in enumerate(lenets):
#     path = f'./{lenets_names[n]}.pth'
#     net.load_state_dict(torch.load(path))
#

print(f"Learning rate: {lr},\tBatch size: {batch_size},\tIterations: {epochs}")
for n, net in enumerate(lenets):    
    correct = 0
    top3 = 0
    top5 = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            top3_outputs = torch.argsort(outputs.data, 1)[:, -3:]
            top5_outputs = torch.argsort(outputs.data, 1)[:, -5:]
#             print(f'Predicted: {predicted} - Labels: {labels}\n')
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            top3 += sum([(out==labels[i]).sum() for i, out in enumerate(top3_outputs)])
            top5 += sum([(out==labels[i]).sum() for i, out in enumerate(top5_outputs)])
    print(f"{lenets_names[n]} Network Accuracy:\tTop-1: {100 * correct / total : .2f},\tTop-3: {100 * top3 / total : .2f},\tTop-5: {100 * top5 / total : .2f}")

Learning rate: 0.001,	Batch size: 32,	Iterations: 13
lenet_w3a4 Network Accuracy:	Top-1:  46.44,	Top-3:  79.86,	Top-5:  92.15
lenet_w4a3 Network Accuracy:	Top-1:  45.50,	Top-3:  78.32,	Top-5:  91.18
lenet_w2a4 Network Accuracy:	Top-1:  45.17,	Top-3:  78.42,	Top-5:  91.04
lenet_w2a3 Network Accuracy:	Top-1:  43.29,	Top-3:  78.17,	Top-5:  91.16
lenet_base Network Accuracy:	Top-1:  49.42,	Top-3:  80.04,	Top-5:  92.13


In [5]:
lenets = []
lenets_names = []

lenet_w3a4 = QLeNet(weight_bit_width=3, act_bit_width=4)
lenets.append(lenet_w3a4)
lenets_names.append("lenet_w3a4")
lenet_w4a3 = QLeNet(weight_bit_width=4, act_bit_width=3)
lenets.append(lenet_w4a3)
lenets_names.append("lenet_w4a3")
lenet_w2a4 = QLeNet(weight_bit_width=2, act_bit_width=4)
lenets.append(lenet_w2a4)
lenets_names.append("lenet_w2a4")
lenet_w2a3 = QLeNet(weight_bit_width=2, act_bit_width=3)
lenets.append(lenet_w2a3)
lenets_names.append("lenet_w2a3")

lenets.append(LeNet())
lenets_names.append("lenet_base")

for n, net in enumerate(lenets):
    path = f'./{lenets_names[n]}.pth'
    net.load_state_dict(torch.load(path))

# FINN

## Imports

In [6]:
# FINN-Brevitas imports
import brevitas.onnx as bo

# ONNX libraries
import onnx
import onnx.numpy_helper as nph
import onnxruntime as rt

# Network display methods - Netron
from finn.util.visualization import showInNetron

# FINN Network Preperation imports
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.infer_datatypes import InferDataTypes
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.fold_constants import FoldConstants
from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from finn.util.pytorch import ToTensor
from finn.transformation.merge_onnx_models import MergeONNXModels
from finn.core.datatype import DataType
from finn.transformation.insert_topk import InsertTopK
from finn.transformation.streamline import Streamline
from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from finn.transformation.infer_data_layouts import InferDataLayouts
from finn.transformation.general import RemoveUnusedTensors
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition
from finn.custom_op.registry import getCustomOp

## Brevitas Export

In [7]:
for n, net in enumerate(lenets):
    print(f"exporting {lenets_names[n]}")
    onnx_export_path = f"./onnx/{lenets_names[n]}.onnx"
    bo.export_finn_onnx(net, (1, 3, 32, 32), onnx_export_path)

exporting lenet_w3a4
exporting lenet_w4a3
exporting lenet_w2a4
exporting lenet_w2a3
exporting lenet_base


## Network Preperation

In [8]:
# select which network to work with
net_n = 0

In [9]:
toDisplay = True
# display net through Netron
if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}.onnx")

Serving './onnx/lenet_w3a4.onnx' at http://0.0.0.0:8081


### Tidy ONNX Model

In [10]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}.onnx")
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())

model.save(f"./onnx/{lenets_names[net_n]}_tidy.onnx")
if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_tidy.onnx")

Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_tidy.onnx' at http://0.0.0.0:8081


### Add Pre/Post-Processing

In [11]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_tidy.onnx")

# pre-processing
in_name = model.graph.input[0].name
in_shape = model.get_tensor_shape(in_name)
totensor = ToTensor()
bo.export_finn_onnx(totensor, in_shape, f"./onnx/{lenets_names[net_n]}_pre.onnx")
pre_model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_pre.onnx")
model = model.transform(MergeONNXModels(pre_model))
in_name = model.graph.input[0].name
model.set_tensor_datatype(in_name, DataType.UINT8)

# post-processing
model = model.transform(InsertTopK(k=1))
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())

model.save(f"./onnx/{lenets_names[net_n]}_pre_post.onnx")
if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_pre_post.onnx")



Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_pre_post.onnx' at http://0.0.0.0:8081


### Streamline

In [12]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_pre_post.onnx")
model = model.transform(MoveScalarLinearPastInvariants())
model = model.transform(Streamline())
model = model.transform(LowerConvsToMatMul())
model = model.transform(MakeMaxPoolNHWC())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(MakeMaxPoolNHWC())
# model = model.transform(absorb.AbsorbTransposeIntoFlatten())
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(Streamline())
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())

model.save(f"./onnx/{lenets_names[net_n]}_streamline.onnx")
if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_streamline.onnx")

Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_streamline.onnx' at http://0.0.0.0:8081


### convert to HLS

In [13]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_streamline.onnx")
model = model.transform(to_hls.InferQuantizedStreamingFCLayer())
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(absorb.AbsorbConsecutiveTransposes())
model = model.transform(to_hls.InferConvInpGen())
model = model.transform(to_hls.InferStreamingMaxPool())
model = model.transform(RemoveCNVtoFCFlatten())
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(InferDataLayouts())

model.save(f"./onnx/{lenets_names[net_n]}_hls.onnx")
if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_hls.onnx")



Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_hls.onnx' at http://0.0.0.0:8081


### Create Dataflow Partition

In [14]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_hls.onnx")
parent_model = model.transform(CreateDataflowPartition())
parent_model.save(f"./onnx/{lenets_names[net_n]}_dataflow_parent.onnx")

if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_dataflow.onnx")

Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_dataflow.onnx' at http://0.0.0.0:8081


In [15]:
parent_model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_dataflow_parent.onnx")
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save(f"./onnx/{lenets_names[net_n]}_dataflow_model.onnx")

if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_dataflow_model.onnx")

Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_dataflow_model.onnx' at http://0.0.0.0:8081


### Folding

In [16]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_dataflow_model.onnx")
layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
# fc0w = getCustomOp(fc0)
# print("CustomOp wrapper is of class " + fc0w.__class__.__name__)
for i, layer in enumerate(layers):
    temp_op = getCustomOp(layer)
    print(f"CustomOp wrapper is of class StreamingFCLayer_Batch #{i+1}")
    for item in temp_op.get_nodeattr_types():
        print(f"{item}: {temp_op.get_nodeattr_types()[item]} = {temp_op.get_nodeattr(item)}")
    print()

CustomOp wrapper is of class StreamingFCLayer_Batch #1
PE: ('i', True, 0) = 1
SIMD: ('i', True, 0) = 1
MW: ('i', True, 0) = 75
MH: ('i', True, 0) = 6
resType: ('s', False, 'lut', {'auto', 'dsp', 'lut'}) = lut
ActVal: ('i', False, 0) = 0
inputDataType: ('s', True, '') = INT8
weightDataType: ('s', True, '') = INT3
outputDataType: ('s', True, '') = UINT4
accDataType: ('s', False, 'INT32') = INT16
binaryXnorMode: ('i', False, 0, {0, 1}) = 0
noActivation: ('i', False, 0, {0, 1}) = 0
numInputVectors: ('ints', False, [1]) = [1, 28, 28]
mem_mode: ('s', False, 'const', {'external', 'const', 'decoupled'}) = const
ram_style: ('s', False, 'auto', {'ultra', 'auto', 'distributed', 'block'}) = auto
runtime_writeable_weights: ('i', False, 0, {0, 1}) = 0
backend: ('s', True, 'fpgadataflow') = fpgadataflow
code_gen_dir_cppsim: ('s', False, '') = 
code_gen_dir_ipgen: ('s', False, '') = 
executable_path: ('s', False, '') = 
ipgen_path: ('s', False, '') = 
ip_path: ('s', False, '') = 
ip_vlnv: ('s', False,

In [17]:
model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_dataflow_model.onnx")

# set convolution-input (sliding window) layers folding factors
sw_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
sw_folding = [3, 
              2]
for layer, simd in zip(sw_layers, sw_folding):
    fcl_inst = getCustomOp(layer)
    fcl_inst.set_nodeattr("SIMD", simd)
    
# set fully-connected layers folding factors
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
fc_folding = [
    (1, 3, 16),
    (2, 2, 16),
    (1, 4, 16),
    (1, 1, 16),
    (1, 1, 16)
]
for layer, (pe, simd, ififo) in zip(fc_layers, fc_folding):
    fcl_inst = getCustomOp(layer)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififo)
    
model = model.transform(GiveUniqueNodeNames())
model.save(f"./onnx/{lenets_names[net_n]}_folded.onnx")

if toDisplay:
    showInNetron(f"./onnx/{lenets_names[net_n]}_folded.onnx")

Stopping http://0.0.0.0:8081
Serving './onnx/lenet_w3a4_folded.onnx' at http://0.0.0.0:8081


## Hardware Build and Deployment

### Hardware Build

In [None]:
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild

model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_folded.onnx")
model = model.transform(ZynqBuild(platform = "ZCU102", period_ns = 10))
model.save(f"./onnx/{lenets_names[net_n]}_hw.onnx")



### Hardware Deployment

In [None]:
import os
from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ

ip = os.getenv("PYNQ_IP", "128.131.80.208")
username = os.getenv("PYNQ_USERNAME", "xilinx")
password = os.getenv("PYNQ_PASSWORD", "xilinx")
port = os.getenv("PYNQ_PORT", 22)
target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/zcu102")
options = "-o PreferredAuthentications=publickey -o PasswordAuthentication=no"

model = ModelWrapper(f"./onnx/{lenets_names[net_n]}_hw.onnx")
model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
model.save(f"./onnx/{lenets_names[net_n]}_pynq.onnx")