In [9]:
from finn.util.visualization import showSrc, showInNetron
from finn.util.basic import make_build_dir

build_dir = "/workspace/finn/notebooks/fpga/mlp"

In [10]:
import onnx
import brevitas.onnx as bo
from finn.core.modelwrapper import ModelWrapper

model_name = build_dir + "/mlp.onnx"

model = ModelWrapper(model_name)
#showInNetron(model_name)

In [11]:
from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.infer_datatypes import InferDataTypes
from finn.transformation.fold_constants import FoldConstants


model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())

model.save(build_dir+"/mlp_tidy.onnx")

In [12]:
#showInNetron(build_dir+"/mlp_tidy.onnx")

In [13]:
# Pre and post processing to convert values to floating point

from finn.util.pytorch import ToTensor
from finn.transformation.merge_onnx_models import MergeONNXModels
from finn.core.datatype import DataType

model = ModelWrapper(build_dir+"/mlp_tidy.onnx")
global_inp_name = model.graph.input[0].name
ishape = model.get_tensor_shape(global_inp_name)

# preprocessing: torchvision's ToTensor divides uint8 inputs by 255
totensor_pyt = ToTensor()
chkpt_preproc_name = build_dir+"/mlp_preproc.onnx"
bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)

# join preprocessing and core model
pre_model = ModelWrapper(chkpt_preproc_name)
model = model.transform(MergeONNXModels(pre_model))
# add input quantization annotation: UINT8 for all BNN-PYNQ models
global_inp_name = model.graph.input[0].name
model.set_tensor_datatype(global_inp_name, DataType.UINT8)

model.save(build_dir+"/mlp_with_preproc.onnx")
#showInNetron(build_dir+"/mlp_with_preproc.onnx")



You can observe two changes in the graph above: a `Div` node has appeared in the beginning to perform the input preprocessing, and the `global_in` tensor now has a quantization annotation to mark it as an unsigned 8-bit value.

For the postprocessing we'll insert a TopK node for k=1 at the end of our graph. This will extract the index (class number) for the largest-valued output.

In [14]:
from finn.transformation.insert_topk import InsertTopK

# postprocessing: insert Top-1 node at the end
model = model.transform(InsertTopK(k=1))
chkpt_name = build_dir+"/mlp_pre_post.onnx"
# tidy-up again
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())
model.save(chkpt_name)

#showInNetron(build_dir+"/mlp_pre_post.onnx")

In [15]:
# Streamlining to eliminate floating point operations

from finn.transformation.streamline import Streamline
from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
import finn.transformation.streamline.absorb as absorb

#showSrc(Streamline)

model = ModelWrapper(build_dir+"/mlp_pre_post.onnx")
model = model.transform(MoveScalarLinearPastInvariants())
model = model.transform(Streamline())
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())

model.save(build_dir+"/mlp_streamlined.onnx")
#showInNetron(build_dir+"/mlp_streamlined.onnx")

NameError: name 'InferDataLayouts' is not defined

In [None]:
#from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
from finn.transformation.infer_data_layouts import InferDataLayouts
from finn.transformation.general import RemoveUnusedTensors
import finn.transformation.streamline.reorder as reorder
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants

model = model.transform(reorder.MoveScalarLinearPastInvariants())
model = model.transform(reorder.MoveScalarAddPastMatMul())
model = model.transform(reorder.MoveScalarMulPastMatMul())
#model = model.transform(ConvertBipolarMatMulToXnorPopcount()) this is for binary
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())

# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
#model = model.transform(RoundAndClipThresholds())

# bit of tidy-up
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())

model.save(build_dir+"/mlp_ready_for_hls_conversion.onnx")
#showInNetron(build_dir+"/mlp_ready_for_hls_conversion.onnx")

In [None]:
# Convert to HLS (quantized instead of binary)

import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
model = ModelWrapper(build_dir+"/mlp_ready_for_hls_conversion.onnx")
model = model.transform(to_hls.InferAddStreamsLayer())
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferQuantizedStreamingFCLayer("const")) #is quantized and not binary
model = model.transform(to_hls.InferChannelwiseLinearLayer())

# TopK to LabelSelect
#model = model.transform(to_hls.InferLabelSelectLayer())
model.save(build_dir+"/mlp_hls_layers.onnx")
#showInNetron(build_dir+"/mlp_hls_layers.onnx")

In [16]:
# Dataflow partition to separate HLS layers into another model and replace with a placeholder layer (streamingdataflowpartition)

from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition

model = ModelWrapper(build_dir+"/mlp_hls_layers.onnx")
parent_model = model.transform(CreateDataflowPartition())
parent_model.save(build_dir+"/mlp_dataflow_parent.onnx")
#showInNetron(build_dir+"/mlp_dataflow_parent.onnx")

In [17]:
from finn.custom_op.registry import getCustomOp
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
#showInNetron(dataflow_model_filename)

In [18]:
model = ModelWrapper(dataflow_model_filename)

In [19]:
fc0 = model.graph.node[0]
fc0w = getCustomOp(fc0)

#print("CustomOp wrapper is of class " + fc0w.__class__.__name__)

#fc0w.get_nodeattr_types()

In [20]:
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")

# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer

config = [
    (16, 8, 168, 256, "auto"),    
#    (16, 8, 256, 128, "auto"),
#    (16, 8, 128, 64, "auto"),
    (16, 8, 64, 16, "auto"),
]

for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififo)
    fcl_inst.set_nodeattr("outFIFODepth", ofifo)
    fcl_inst.set_nodeattr("ram_style", ramstyle)

In [21]:
model.save(build_dir+"/mlp_folding_factors.onnx")
#showInNetron(build_dir+"/mlp_folding_factors.onnx")

In [22]:
# Hardware build

from finn.util.basic import pynq_part_map
pynq_board = "Ultra96"
fpga_part = pynq_part_map[pynq_board]
target_clk_ns = 10

In [23]:
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
model = ModelWrapper(build_dir+"/mlp_folding_factors.onnx")
model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))

AssertionError: Fold depth must be integer

In [28]:
model.save(build_dir + "/mlp_post_synthesis.onnx")

### Examining the generated outputs <a id='gen_outputs'></a>

Let's start by viewing the post-synthesis model in Netron:

In [None]:
showInNetron(build_dir + "/mlp_post_synthesis.onnx")

We can see that our sequence of HLS layers has been replaced with `StreamingDataflowPartition`s, each of which point to a different ONNX file. You can open a Netron session for each of them to view their contents. Here, the first and last partitions contain only an `IODMA` node, which was inserted automatically to move data between DRAM and the accelerator. Let's take a closer look at the middle partition, which contains all our layers:

In [None]:
model = ModelWrapper(build_dir + "/tfc_w1_a1_post_synthesis.onnx")
sdp_node_middle = getCustomOp(model.graph.node[1])
postsynth_layers = sdp_node_middle.get_nodeattr("model")

showInNetron(postsynth_layers)

We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:

In [None]:
model = ModelWrapper(postsynth_layers)
model.model.metadata_props

Here we see that a Vivado project was built to create what we call the `stitched IP`, where all the IP blocks implementing various layers will be stitched together. You can view this stitched block design in Vivado, or [here](StreamingDataflowPartition_1.pdf) as an exported PDF.

Moving back to the top-level model, recall that `ZynqBuild` will create a Vivado project and synthesize it, so it will be creating metadata entries related to the paths and files that were created:

In [None]:
model = ModelWrapper(build_dir + "/tfc_w1_a1_post_synthesis.onnx")
model.model.metadata_props

Here, we can see the directories that were created for the PYNQ driver (`pynq_driver_dir`) and the Vivado synthesis project (`vivado_pynq_proj`), as well as the locations of the bitfile, hardware handoff file and synthesis report.

In [None]:
! ls {model.get_metadata_prop("vivado_pynq_proj")}

Feel free to examine the generated Vivado project to get a feel for how the system-level integration is performed for the  FINN-generated "stitched IP", which appears as `StreamingDataflowPartition_1` in the top-level block design -- you can see it as a block diagram exported to PDF [here](top.pdf).


## 4.  PYNQ deployment <a id='hw_test'></a>

* [Deployment and Remote Execution](#deploy)
* [Validation on PYNQ Board](#validation)
* [Throughput Test on PYNQ Board](#throughput)


We are almost done preparing our hardware design. We'll now put it in a form suitable for use as a PYNQ overlay, synthesize and deploy it.

### Deployment and Remote Execution <a id='deploy'></a>

We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy that to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ below.

In [None]:
from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
ip = "192.168.2.99"
port = "22"
username = "xilinx"
password = "xilinx"
target_dir = "/home/xilinx/finn_tfc_end2end_example"
model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
model.save(build_dir + "/tfc_w1_a1_pynq_deploy.onnx")

Let's verify that the remote access credentials is saved in the model metadata, and that the deployment folder has been successfully copied to the board:

In [None]:
model.model.metadata_props

In [None]:
target_dir_pynq = target_dir + "/" + model.get_metadata_prop("pynq_deployment_dir").split("/")[-1]
target_dir_pynq

In [None]:
! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'

We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the MNIST dataset. Let's load up some test data that comes bundled with FINN.

In [None]:
from pkgutil import get_data
import onnx.numpy_helper as nph
import matplotlib.pyplot as plt

raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
x = nph.to_array(onnx.load_tensor_from_string(raw_i))
plt.imshow(x.reshape(28,28), cmap='gray')

In [None]:
model = ModelWrapper(build_dir + "/tfc_w1_a1_pynq_deploy.onnx")
iname = model.graph.input[0].name
oname = parent_model.graph.output[0].name
ishape = model.get_tensor_shape(iname)
print("Expected network input shape is " + str(ishape))

Finally, we can call `execute_onnx` on the graph, which will internally call remote execution with the bitfile, grab the results and return a numpy array. You may recall that one "reshape" node was left out of the StreamingDataflowPartition. We'll do that manually with a numpy function call when passing in the input, but everything else in the network ended up inside the StreamingDataflowPartition so that's all we need to do.

In [None]:
import numpy as np
from finn.core.onnx_exec import execute_onnx

input_dict = {iname: x.reshape(ishape)}
ret = execute_onnx(model, input_dict)

In [None]:
ret[oname]

We see that the network correctly predicts this as a digit 2.

### Validating the Accuracy on a PYNQ Board <a id='validation'></a>

All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`sshpass` and `echo password | sudo -S command`) to get that working from this notebook running on the host computer.

**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**

To validate the accuracy, we first need to install the [`dataset-loading`](https://github.com/fbcotter/dataset_loading) Python package to the PYNQ board. This will give us a convenient way of downloading and accessing the MNIST dataset.


Command to execute on PYNQ:

```pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```

In [None]:
! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'

We can now use the `validate.py` script that was generated together with the driver to measure top-1 accuracy on the MNIST dataset.

Command to execute on PYNQ:

`python3.6 validate.py --dataset mnist --batchsize 1000`

In [None]:
! sshpass -p {password} ssh -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'

We see that the final top-1 accuracy is 92.96%, which is very close to the 93.17% reported on the [BNN-PYNQ accuracy table in Brevitas](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). 

### Throughput Test on PYNQ Board <a id='throughput'></a>
In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done using the core function `throughput_test`. In the next section we import the function and execute it.
First we extract the `remote_exec_model` again and pass it to the function. The function returns the metrics of the network as dictionary. 

In [None]:
from finn.core.throughput_test import throughput_test_remote

model = ModelWrapper(build_dir + "/tfc_w1_a1_pynq_deploy.onnx")
res = throughput_test_remote(model, 10000)
print("Network metrics:")
for key in res:
    print(str(key) + ": " + str(res[key]))

Together with the values for folding we can evaluate the performance of our accelerator. Each layer has a total folding factor of 64 and because the network is fully pipelined, it follows: `II = 64`. II is the initiation interval and indicates how many cycles are needed for one input to be processed. 

In [None]:
II = 64
# frequency in MHz
f_MHz = 100
# expected throughput in MFPS
expected_throughput = f_MHz / II
# measured throughput (FPS) from throughput test, converted to MFPS
measured_throughput = res["throughput[images/s]"] * 0.000001
# peformance
print("We reach approximately " + str(round((measured_throughput / expected_throughput)*100)) + "% of the ideal performance.")

The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases.