In [28]:
from pynq import Overlay
from pynq import allocate
import numpy as np
# Open the 'mnist.bit' overlay; the resulting handle is used to reach the
# MultilayerPerceptron_0 IP.
# NOTE(review): per the PYNQ docs, constructing an Overlay also downloads the
# bitstream to the programmable logic by default — confirm for this PYNQ release.
overlay = Overlay('mnist.bit')

In [30]:
from pynq import ps

# Retarget PL clock 0 and echo the clocks around the change. The value read
# back is whatever the clock driver reports after the write, which need not
# match the requested figure exactly.
FCLK0_TARGET_MHZ = 375

print(ps.Clocks.fclk0_mhz)      # PL clock 0 before the change
ps.Clocks.fclk0_mhz = FCLK0_TARGET_MHZ
print(ps.Clocks.fclk0_mhz)      # PL clock 0 after the change
print(ps.Clocks.cpu_mhz)        # CPU clock, printed for reference

299.997
374.99625
1333.32




In [3]:
# Handles to the accelerator IP exposed by the overlay.
ip = overlay.MultilayerPerceptron_0
# `mmio` is kept for direct register access; the cells shown here go through
# `register_map` instead.
mmio = ip.mmio
register_map = ip.register_map
# NOTE(review): `_register_classes` is a private attribute of the register map.
# It is read only to list the registers and may not be stable across PYNQ releases.
registers = register_map._register_classes

In [4]:
# Print every register name followed by its descriptor entry.
for entry in registers.items():
    print(*entry)

CTRL (<class 'pynq.registers.RegisterCTRL'>, 0, 32, None, None, 'read-write')
GIER (<class 'pynq.registers.RegisterGIER'>, 4, 32, None, None, 'read-write')
IP_IER (<class 'pynq.registers.RegisterIP_IER'>, 8, 32, None, None, 'read-write')
IP_ISR (<class 'pynq.registers.RegisterIP_ISR'>, 12, 32, None, None, 'read-write')
im_1 (<class 'pynq.registers.Registerim_1'>, 16, 32, None, None, 'write-only')
im_2 (<class 'pynq.registers.Registerim_2'>, 20, 32, None, None, 'write-only')
out_r_1 (<class 'pynq.registers.Registerout_r_1'>, 28, 32, None, None, 'write-only')
out_r_2 (<class 'pynq.registers.Registerout_r_2'>, 32, 32, None, None, 'write-only')


In [5]:
# Buffers shared with the accelerator (m_axi), both holding int8 elements.
output_buffer_size = 10000                      # elements in the result buffer
input_buffer_size = 784 * output_buffer_size    # 7840000 input elements

buffer_dtype = np.int8
input_buffer = allocate(shape=(input_buffer_size,), dtype=buffer_dtype)
output_buffer = allocate(shape=(output_buffer_size,), dtype=buffer_dtype)

# Hand the buffer addresses to the IP. Only the *_1 registers are written.
# NOTE(review): im_2 / out_r_2 are left untouched; presumably they hold the
# upper address word, so this assumes both addresses fit in 32 bits — confirm.
register_map.im_1.im = input_buffer.device_address
register_map.out_r_1.out_r = output_buffer.device_address


In [6]:
import glob

# Network description used by the software reference model.
layers = [784,128,256,10]   # layer widths, input first
scales = [512, 256, 256]    # per-layer divisor applied after each dense layer

# Test images are integer-divided by 32 before the cast to int8.
x_test = (np.load('x_test.npy')//32).astype(np.int8)
y_test = np.load('y_test.npy')

# Parameter dumps (quant_dense_* files), one per dense layer, taken in sorted
# filename order.
# Alternative (unused here): the same patterns with a `_float.txt` suffix.
bias = [
    np.loadtxt(path)
    for path in sorted(glob.glob("dump_results/dump_results_weights/quant_dense_*_bias.txt"))
]
weight = [
    np.loadtxt(path)
    for path in sorted(glob.glob("dump_results/dump_results_weights/quant_dense_*_kernel.txt"))
]

# Give each kernel its (fan_in, fan_out) shape from consecutive layer widths.
for idx, (fan_in, fan_out) in enumerate(zip(layers[:-1], layers[1:])):
    weight[idx] = weight[idx].reshape(fan_in, fan_out)

In [7]:
# Hardware accelerated function
def mnist(im):
    """Run the accelerator on `im` and return the shared output buffer.

    `im` is copied into the start of the global `input_buffer`, the IP is
    started through ``CTRL.AP_START`` and the call spins until
    ``CTRL.AP_DONE`` reads non-zero.

    Returns the global `output_buffer` object itself, not a copy.
    """
    n_values = len(im)
    input_buffer[0:n_values] = im

    # Kick off the IP.
    register_map.CTRL.AP_START = 1

    # Busy-wait on the done flag; there is no timeout.
    while not register_map.CTRL.AP_DONE:
        continue

    return output_buffer

In [8]:
# Smoke test: push the whole flattened test set through the accelerator in a
# single call and display the returned buffer.
mnist(x_test.flatten())

PynqBuffer([7, 2, 1, ..., 4, 5, 6], dtype=int8)

In [9]:
def run(im):
    """Software reference model: classify every sample in `im`.

    Each sample is passed through the dense layers held in the global
    `weight`, `bias` and `scales` lists as ``(x @ W + b) // scale``, with a
    ReLU after every layer except the last one.

    Parameters
    ----------
    im : iterable of 1-D arrays
        One entry per sample; each entry must be compatible with ``weight[0]``.

    Returns
    -------
    list
        ``np.argmax`` of the final-layer output for each sample, in input
        order. Empty when `im` is empty.
    """
    # Derive the layer count from the model instead of hard-coding 3 / 2.
    n_layers = len(scales)
    last_layer = n_layers - 1

    result = []
    # Iterate over the samples actually supplied rather than assuming 10000,
    # which raised IndexError on shorter inputs and truncated longer ones.
    for data in im:
        for j in range(n_layers):
            data = (data @ weight[j] + bias[j]) // scales[j]
            if j != last_layer:
                data = data * (data > 0)    # ReLU
        result.append(np.argmax(data))
    return result

In [31]:
# Classify the test set on both paths, then count label mismatches.
res_hls = mnist(x_test.flatten())
res_py = run(x_test)

err_hls = 0
err_py = 0
for i in range(10000):
    # A mismatch contributes 1, a match contributes 0.
    err_hls += int(res_hls[i] != y_test[i])
    err_py += int(res_py[i] != y_test[i])

# Accuracy is one minus the error rate over the 10000 compared samples.
print("acc hls {}".format(1 - err_hls / 10000))
print("acc py {}".format(1 - err_py / 10000))

acc hls 0.9764
acc py 0.9764


In [33]:
# Time both paths: 10 runs of a single call each; -o returns the timing result
# so its average can be used below.
# Note: the hardware measurement includes the x_test.flatten() call, since it
# is part of the timed expression.
hw_time = %timeit -n 1 -r 10 -o mnist(x_test.flatten())
sw_time = %timeit -n 1 -r 10 -o run(x_test)

# Ratio of the mean software time to the mean hardware time.
print('Performance gain:', sw_time.average / hw_time.average)

43.4 ms ± 193 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)
7.29 s ± 65.3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
Performance gain: 168.18007642462447


In [None]:
# Throughput in images per second, from the mean time each path needs to
# process the 10000-image test set.
print('FPS HW:', 10000/hw_time.average)
# Fixed: the software figure must be derived from sw_time. It previously
# reused hw_time, so both lines reported the hardware throughput.
print('FPS SW:', 10000/sw_time.average)