# Lab5 B: Compile your QNN for hardware (optional)
Note: This part can only be executed inside the Docker container described in Lab5 A, so it is optional.


In Lab5 A, we trained a quantised network, and in this lab we will compile it to a hardware format to deploy it on our FPGAs.

In [2]:
import datetime
import os
import shutil
import time
from copy import deepcopy

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

from brevitas.export import export_qonnx
from qonnx.util.cleanup import cleanup as qonnx_cleanup
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.core.datatype import DataType
from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
#
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Target device: {device}")

Target device: cpu


In [3]:
# QONNX model file handed to the FINN dataflow builder in the cells below.
model_file = "lab_new/exports/kws_mlp_w3a3_qonnx.onnx"
# Output directory used by the estimate-only build configuration.
estimates_output_dir = "output/google_speech/output_estimates_only"
# Output directory used by the deployment build configuration.
final_output_dir = "output/google_speech/output_final"

In [4]:
# Estimate-only flow: runs just the estimate steps and emits estimate reports
# for the xc7z020clg400-1 part at a 10 ns synthesis clock period.
# target_fps values to experiment with: 1000, 100000, 160000
cfg_estimates = build.DataflowBuildConfig(
    output_dir=estimates_output_dir,
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
    fpga_part="xc7z020clg400-1",
    synth_clk_period_ns=10.0,
    target_fps=160000,
    mvau_wwidth_max=80,
)
# Left as the last expression so the notebook displays the build's return code.
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/output_estimates_only
Build log is at output/google_speech/output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/10]
Running step: step_tidy_up [2/10]
Running step: step_streamline [3/10]
Running step: step_convert_to_hw [4/10]
Running step: step_create_dataflow_partition [5/10]
Running step: step_specialize_layers [6/10]
Running step: step_target_fps_parallelization [7/10]
Running step: step_apply_folding_config [8/10]
Running step: step_minimize_bit_width [9/10]
Running step: step_generate_estimate_reports [10/10]
Completed successfully


0

In [5]:
# Deployment configuration for the Pynq-Z2 board (Vivado Zynq shell flow):
# requests the bitfile, the PYNQ driver and the deployment package.
cfg_deployment = build.DataflowBuildConfig(
    board="Pynq-Z2",
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    synth_clk_period_ns=10.0,
    target_fps=1000,
    mvau_wwidth_max=80,
    output_dir=final_output_dir,
    generate_outputs=[
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ],
)
# The build itself is disabled in this cell; uncomment the next line to run it.
# build.build_dataflow_cfg(model_file, cfg_deployment)

In [6]:
# Full hardware build of the 3-bit (w3a3) model at target_fps = 1000.
# target_fps values explored in this notebook: 1000, 100000, 160000
c_time = time.time()
print("3 bit, target fps 1000")
print(f"starting time: {datetime.datetime.fromtimestamp(c_time)}")
model_file = "lab_new/exports/kws_mlp_w3a3_qonnx.onnx"
final_output_dir = "output/google_speech/b3_f1k"
target_fps = 1000
cfg_deployment = build.DataflowBuildConfig(
    output_dir          = final_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = target_fps,
    synth_clk_period_ns = 10.0,  # 10 ns period, i.e. a 100 MHz clock
    board               = "Pynq-Z2",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)
# Keep the builder's status code instead of discarding it, so a failed build
# is visible in the summary line (0 on success; presumably non-zero on
# failure -- confirm against the FINN documentation).
build_status = build.build_dataflow_cfg(model_file, cfg_deployment)
# Sample the clock once so the finish timestamp and the elapsed time agree,
# and print the finish time in the same human-readable form as the start time.
finish_time = time.time()
print(
    f"finished time: {datetime.datetime.fromtimestamp(finish_time)}, "
    f"total time: {finish_time - c_time:.1f} seconds, "
    f"build return code: {build_status}"
)

3 bit, target fps 1000
starting time: 2025-11-07 12:47:16.537099
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f1k
Build log is at output/google_speech/b3_f1k/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Running step: step_create_stitched_ip [14/19]
Running step: step_measure_rtlsim_performance [15/19]
Runnin

In [7]:
# Full hardware build of the 3-bit (w3a3) model at target_fps = 100000.
c_time = time.time()
print("3 bit, target fps 100000")
print(f"starting time: {datetime.datetime.fromtimestamp(c_time)}")
model_file = "lab_new/exports/kws_mlp_w3a3_qonnx.onnx"
final_output_dir = "output/google_speech/b3_f100k"
target_fps = 100000
cfg_deployment = build.DataflowBuildConfig(
    output_dir          = final_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = target_fps,
    synth_clk_period_ns = 10.0,  # 10 ns period, i.e. a 100 MHz clock
    board               = "Pynq-Z2",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)
# Keep the builder's status code instead of discarding it, so a failed build
# is visible in the summary line (0 on success; presumably non-zero on
# failure -- confirm against the FINN documentation).
build_status = build.build_dataflow_cfg(model_file, cfg_deployment)
# Sample the clock once so the finish timestamp and the elapsed time agree,
# and print the finish time as a readable datetime rather than raw epoch seconds.
finish_time = time.time()
print(
    f"finished time: {datetime.datetime.fromtimestamp(finish_time)}, "
    f"total time: {finish_time - c_time:.1f} seconds, "
    f"build return code: {build_status}"
)


3 bit, target fps 100000
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f100k
Build log is at output/google_speech/b3_f100k/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Running step: step_create_stitched_ip [14/19]
Running step: step_measure_rtlsim_performance [15/19]
Running step: step_out_of_context_synthesi

In [8]:
# Full hardware build of the 3-bit (w3a3) model at target_fps = 160000.
c_time = time.time()
print("3 bit, target fps 160000")
print(f"starting time: {datetime.datetime.fromtimestamp(c_time)}")
model_file = "lab_new/exports/kws_mlp_w3a3_qonnx.onnx"
final_output_dir = "output/google_speech/b3_f160k"
target_fps = 160000
cfg_deployment = build.DataflowBuildConfig(
    output_dir          = final_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = target_fps,
    synth_clk_period_ns = 10.0,  # 10 ns period, i.e. a 100 MHz clock
    board               = "Pynq-Z2",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)
# Keep the builder's status code instead of discarding it, so a failed build
# is visible in the summary line (0 on success; presumably non-zero on
# failure -- confirm against the FINN documentation).
build_status = build.build_dataflow_cfg(model_file, cfg_deployment)
# Sample the clock once so the finish timestamp and the elapsed time agree,
# and print the finish time as a readable datetime rather than raw epoch seconds.
finish_time = time.time()
print(
    f"finished time: {datetime.datetime.fromtimestamp(finish_time)}, "
    f"total time: {finish_time - c_time:.1f} seconds, "
    f"build return code: {build_status}"
)

3 bit, target fps 160000
Building dataflow accelerator from lab_new/exports/kws_mlp_w3a3_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b3_f160k
Build log is at output/google_speech/b3_f160k/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Running step: step_create_stitched_ip [14/19]
Running step: step_measure_rtlsim_performance [15/19]
Running step: step_out_of_context_synthesi

In [9]:
# Full hardware build of the 4-bit (w4a4) model at target_fps = 100000.
c_time = time.time()
print("4 bit, target fps 100000")
print(f"starting time: {datetime.datetime.fromtimestamp(c_time)}")
# NOTE: this rebinds the notebook-wide `model_file` to the w4a4 export, so any
# cell executed afterwards that reads `model_file` will use the 4-bit model.
model_file = "lab_new/exports/kws_mlp_w4a4_qonnx.onnx"
final_output_dir = "output/google_speech/b4_f100k"
target_fps = 100000
cfg_deployment = build.DataflowBuildConfig(
    output_dir          = final_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = target_fps,
    synth_clk_period_ns = 10.0,  # 10 ns period, i.e. a 100 MHz clock
    board               = "Pynq-Z2",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)
# Keep the builder's status code instead of discarding it, so a failed build
# is visible in the summary line (0 on success; presumably non-zero on
# failure -- confirm against the FINN documentation).
build_status = build.build_dataflow_cfg(model_file, cfg_deployment)
# Sample the clock once so the finish timestamp and the elapsed time agree,
# and print the finish time as a readable datetime rather than raw epoch seconds.
finish_time = time.time()
print(
    f"finished time: {datetime.datetime.fromtimestamp(finish_time)}, "
    f"total time: {finish_time - c_time:.1f} seconds, "
    f"build return code: {build_status}"
)

4 bit, target fps 100000
Building dataflow accelerator from lab_new/exports/kws_mlp_w4a4_qonnx.onnx
Intermediate outputs will be generated in /tmp/finn_dev_changhong
Final outputs will be generated in output/google_speech/b4_f100k
Build log is at output/google_speech/b4_f100k/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Running step: step_create_stitched_ip [14/19]
Running step: step_measure_rtlsim_performance [15/19]
Running step: step_out_of_context_synthesi

## Try other parameters and answer (optional):
- Q1: What performance can you obtain from the estimation report and final generation report?
- Q2: Explain what happened to each intermediate model / step.
- Q3: Try different setups with the estimation code: what happens if you change target_fps?
- Q4: What are the performance bottleneck and pipeline balance in a dataflow model?
- Q5: Explain SIMD and PE.
- Q6: What is the best performance you could reach with the PYNQ-Z2 board?

In [10]:
# Estimate-only configuration for experimenting with other parameters
# (here target_fps = 1000). It writes to the same estimates_output_dir
# as any other estimate run that uses that variable.
cfg_estimates = build.DataflowBuildConfig(
    output_dir=estimates_output_dir,
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
    fpga_part="xc7z020clg400-1",
    synth_clk_period_ns=10.0,
    target_fps=1000,
    mvau_wwidth_max=80,
)
# The build is disabled in this cell; uncomment the next line to run it.
# Note that model_file is whatever the most recently executed cell assigned.
# build.build_dataflow_cfg(model_file, cfg_estimates)