# Lab4 B: Generate your QNN to hardware (optional)
Note: This part can only be executed inside the Docker container described in Lab4 A, so it is optional.


In Lab4 A, we trained a quantised network, and in this lab we will compile it to a hardware format to deploy it on our FPGAs.

In [22]:
from torch.utils.data import DataLoader, Dataset
import torch
from torch.utils.data import DataLoader, Dataset
from copy import deepcopy
import numpy as np
import os
#
from brevitas.export import export_qonnx
from qonnx.util.cleanup import cleanup as qonnx_cleanup
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.core.datatype import DataType
from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
import shutil
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import onnx
# Show which onnx installation the kernel picked up: module path first, then version.
print(onnx.__file__)
print(onnx.__version__)
#
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Target device: {device}")

/usr/local/lib/python3.10/dist-packages/onnx/__init__.py
1.17.0
Target device: cpu


In [None]:
# Exported QONNX model used by the estimate-only build
# (the "w3a3" in the name presumably means 3-bit weights / 3-bit activations — confirm against Lab4 A).
model_file = "lab_new/exports/kws_mlp_w3a3_qonnx.onnx"
# Output folder for the estimate-only build.
estimates_output_dir = "lab_new/output/google_speech/output_estimates_only"
# Output folder for a complete build.
# NOTE(review): build_accelerator() derives its own per-run folder, so this value is not read in the cells shown here.
final_output_dir = "lab_new/output/google_speech/output_final"

In [24]:
from finn.builder.build_dataflow_config import DataflowBuildConfig
from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
from qonnx.transformation.insert_topk import InsertTopK
# Inject the preprocessing step into FINN to enable json serialization later on
def step_preprocess(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Custom build step that applies ``InsertTopK(k=1)`` to the model.

    Args:
        model: Model handed to the step by the builder.
        cfg: Build configuration; part of the step signature but not used here.

    Returns:
        The result of ``model.transform(InsertTopK(k=1))``.
    """
    return model.transform(InsertTopK(k=1))


# Register the custom step under a string key so the step lists can refer to it by name.
build_dataflow_step_lookup["step_preprocess_InsertTopK"] = step_preprocess

# Run the custom step first, followed by FINN's stock step sequences.
estimate_steps = ["step_preprocess_InsertTopK", *build_cfg.estimate_only_dataflow_steps]
build_steps = ["step_preprocess_InsertTopK", *build_cfg.default_build_dataflow_steps]

In [25]:
# 1000 ,100000, 160000
cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 160000,
    synth_clk_period_ns = 10.0,
    fpga_part           = "xc7z020clg400-1",
    steps               = estimate_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/output_estimates_only
Build log is at output/google_speech/output_estimates_only/build_dataflow.log
Running step: step_preprocess [1/11]
Running step: step_qonnx_to_finn [2/11]
Running step: step_tidy_up [3/11]
Running step: step_streamline [4/11]
Running step: step_convert_to_hw [5/11]
Running step: step_create_dataflow_partition [6/11]
Running step: step_specialize_layers [7/11]
Running step: step_target_fps_parallelization [8/11]
Running step: step_apply_folding_config [9/11]
Running step: step_minimize_bit_width [10/11]
Running step: step_generate_estimate_reports [11/11]
Completed successfully


0

In [None]:
import time
import datetime
def build_accelerator(bw, target_fps):
    """Run the full FINN build for one bit width / throughput target.

    Args:
        bw: Bit width used in the model file name for both weights and
            activations; selects ``lab_new/exports/kws_mlp_w{bw}a{bw}_qonnx.onnx``.
        target_fps: Throughput target passed to the build configuration; it is
            also part of the output folder name.

    Returns:
        None. Build products go to
        ``lab_new/output/google_speech/b{bw}_f{target_fps}``; start time, finish
        time and elapsed seconds are printed.
    """
    start = time.time()
    print(f"{bw} bit, target fps {target_fps}")
    print(f"starting time: {datetime.datetime.fromtimestamp(start)}")
    model_file = f"lab_new/exports/kws_mlp_w{bw}a{bw}_qonnx.onnx"
    final_output_dir = f"lab_new/output/google_speech/b{bw}_f{target_fps}"
    cfg_deployment = build.DataflowBuildConfig(
        output_dir          = final_output_dir,
        mvau_wwidth_max     = 80,
        target_fps          = target_fps,
        synth_clk_period_ns = 10.0,
        board               = "Pynq-Z2",
        steps               = build_steps,
        shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
        generate_outputs=[
            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
            build_cfg.DataflowOutputType.STITCHED_IP,
            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
            build_cfg.DataflowOutputType.OOC_SYNTH,
            build_cfg.DataflowOutputType.BITFILE,
            build_cfg.DataflowOutputType.PYNQ_DRIVER,
            build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
        ]
    )
    exit_code = build.build_dataflow_cfg(model_file, cfg_deployment)
    # Sample the clock once so the finish time and the elapsed time agree.
    end = time.time()
    # The estimate-only run in this notebook shows 0 after "Completed successfully";
    # anything else is treated as a failed build (see the FINN builder docs).
    if exit_code != 0:
        print(f"build did NOT complete successfully (return code {exit_code})")
    # Print the finish time in the same human-readable form as the start time.
    print(f"finished time: {datetime.datetime.fromtimestamp(end)}, total time: {end - start} seconds")

In [27]:
# Full build of the w3a3 model with a target of 1000 FPS.
build_accelerator(3, 1000)

3 bit, target fps 1000
starting time: 2025-11-07 21:51:36.762570
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f1000
Build log is at output/google_speech/b3_f1000/build_dataflow.log
Running step: step_preprocess [1/20]
Running step: step_qonnx_to_finn [2/20]
Running step: step_tidy_up [3/20]
Running step: step_streamline [4/20]
Running step: step_convert_to_hw [5/20]
Running step: step_create_dataflow_partition [6/20]
Running step: step_specialize_layers [7/20]
Running step: step_target_fps_parallelization [8/20]
Running step: step_apply_folding_config [9/20]
Running step: step_minimize_bit_width [10/20]
Running step: step_generate_estimate_reports [11/20]
Running step: step_hw_codegen [12/20]
Running step: step_hw_ipgen [13/20]
Running step: step_set_fifo_depths [14/20]
Running step: step_create_stitched_ip [15/20]
Running step: step

In [28]:
# Full build of the w3a3 model with a target of 100000 FPS.
build_accelerator(3, 100000)

3 bit, target fps 100000
starting time: 2025-11-07 22:10:27.550869
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f100000
Build log is at output/google_speech/b3_f100000/build_dataflow.log
Running step: step_preprocess [1/20]
Running step: step_qonnx_to_finn [2/20]
Running step: step_tidy_up [3/20]
Running step: step_streamline [4/20]
Running step: step_convert_to_hw [5/20]
Running step: step_create_dataflow_partition [6/20]
Running step: step_specialize_layers [7/20]
Running step: step_target_fps_parallelization [8/20]
Running step: step_apply_folding_config [9/20]
Running step: step_minimize_bit_width [10/20]
Running step: step_generate_estimate_reports [11/20]
Running step: step_hw_codegen [12/20]
Running step: step_hw_ipgen [13/20]
Running step: step_set_fifo_depths [14/20]
Running step: step_create_stitched_ip [15/20]
Running step

In [29]:
# Full build of the w3a3 model with a target of 160000 FPS.
build_accelerator(3, 160000)

3 bit, target fps 160000
starting time: 2025-11-07 22:31:58.637643
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f160000
Build log is at output/google_speech/b3_f160000/build_dataflow.log
Running step: step_preprocess [1/20]
Running step: step_qonnx_to_finn [2/20]
Running step: step_tidy_up [3/20]
Running step: step_streamline [4/20]
Running step: step_convert_to_hw [5/20]
Running step: step_create_dataflow_partition [6/20]
Running step: step_specialize_layers [7/20]
Running step: step_target_fps_parallelization [8/20]
Running step: step_apply_folding_config [9/20]
Running step: step_minimize_bit_width [10/20]
Running step: step_generate_estimate_reports [11/20]
Running step: step_hw_codegen [12/20]
Running step: step_hw_ipgen [13/20]
Running step: step_set_fifo_depths [14/20]
Running step: step_create_stitched_ip [15/20]
Running step

In [30]:
# Full build of the w4a4 model with a target of 100000 FPS.
build_accelerator(4, 100000)

4 bit, target fps 100000
starting time: 2025-11-07 22:52:40.835731
Building dataflow accelerator from lab_new/exports/kws_mlp_w4a4_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b4_f100000
Build log is at output/google_speech/b4_f100000/build_dataflow.log
Running step: step_preprocess [1/20]
Running step: step_qonnx_to_finn [2/20]
Running step: step_tidy_up [3/20]
Running step: step_streamline [4/20]
Running step: step_convert_to_hw [5/20]
Running step: step_create_dataflow_partition [6/20]
Running step: step_specialize_layers [7/20]
Running step: step_target_fps_parallelization [8/20]
Running step: step_apply_folding_config [9/20]
Running step: step_minimize_bit_width [10/20]
Running step: step_generate_estimate_reports [11/20]
Running step: step_hw_codegen [12/20]
Running step: step_hw_ipgen [13/20]
Running step: step_set_fifo_depths [14/20]
Running step: step_create_stitched_ip [15/20]
Running step

## Try other parameters and answer (optional):
- Q1: What performance can you obtain from the estimation report and final generation report?
- Q2: Explain what happened to each intermediate model / step.
- Q3: Try different settings with the estimation code. What happens if you change `target_fps`?
- Q4: What is the performance bottleneck and pipeline balance in a dataflow model?
- Q5: Explain SIMD and PE.
- Q6: What is the best performance you can reach with the PYNQ-Z2 board?

In [31]:
# Estimate-only configuration to experiment with (e.g. vary target_fps for Q3).
cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 1000,
    synth_clk_period_ns = 10.0,
    fpga_part           = "xc7z020clg400-1",
    # Use the same step list as the first estimate run (custom InsertTopK step
    # included) so that the reports from both runs are comparable.
    steps               = estimate_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)
# Uncomment to run; this writes into the same estimates_output_dir as the first estimate run.
# build.build_dataflow_cfg(model_file, cfg_estimates)