In [1]:
from pynq_dpu import DpuOverlay
overlay = DpuOverlay("dpu.bit")

In [2]:
import os
import time
import numpy as np
import cv2
import matplotlib.pyplot as plt
import threading
%matplotlib inline

In [3]:
import pynq

rails = pynq.get_rails()
recorder = pynq.DataRecorder(rails['12V'].power)

In [4]:
overlay.load_model("NERONE4FPGA.xmodel")

In [None]:
image_folder = 'images'
image_format = "jpeg" # change the format of your image here
original_images = [i for i in os.listdir(image_folder) if i.endswith(image_format)]
total_images = len(original_images)
print(total_images)

In [6]:
dpu = overlay.runner

inputTensors = dpu.get_input_tensors()
outputTensors = dpu.get_output_tensors()

shapeIn = tuple(inputTensors[0].dims)
shapeOut = tuple(outputTensors[0].dims)
outputSize = int(outputTensors[0].get_data_size() / shapeIn[0])

In [7]:
output_data = [np.empty(shapeOut, dtype=np.float32, order="C")]
input_data = [np.empty(shapeIn, dtype=np.float32, order="C")]
image = input_data[0]

In [8]:
def run(image_index):
    # preprocessing made during training on gpu. Must be repeated here. Remove or modify as needed
    preprocessed = cv2.cvtColor(cv2.imread("images/" + original_images[image_index]), cv2.COLOR_BGR2RGB)
    preprocessed = preprocessed * (1/255.0)
    preprocessed = cv2.resize(preprocessed, (shapeIn[1], shapeIn[2]))
    image[0,...] = preprocessed.reshape(shapeIn[1:])
    
    job_id = dpu.execute_async(input_data, output_data)
    dpu.wait(job_id)
    temp = [j.reshape(1, outputSize) for j in output_data] # do not remove

In [None]:
index = 0
run(index)
print(original_images[index])

In [10]:
prediction = output_data[0]
prediction

(1, 4)

In [11]:
label = np.argmax(prediction)
label

array([[-2.375,  1.75 ,  3.5  , -2.625]], dtype=float32)

In [13]:
def single_thread(start, end):
    [run(i) for i in range(start, end)]

In [14]:
def measure_performances(threads=2):
    threads = 2

    '''run threads '''
    print('Building',threads,'threads...')
    threadAll = []
    start=0
    for i in range(threads):
        if (i==threads-1):
            end = len(original_images)
        else:
            end = start+(len(original_images)//threads)
        t1 = threading.Thread(target=single_thread, args=(start, end))
        threadAll.append(t1)
        #  print(start, end)
        start=end

    print('Starting',threads,'threads...')
    with recorder.record(0.5):
        time1 = time.time()
        for x in threadAll:
            x.start()
        for x in threadAll:
            x.join()
        time2 = time.time()
        timetotal = time2 - time1


    fps = float(len(original_images) / timetotal)
    print("Throughput=%.2f fps, total frames = %.0f, time=%.4f seconds" %(fps, len(original_images), timetotal))
    return fps, len(original_images), timetotal, np.mean(recorder.frame['12V_power'])

In [15]:
fpss = []
powers = []
# original_images *= 10 # uncomment to evaluate 780 images instead of 78, just for precision purposes
for i in range(5):
    fps, _, _, power = measure_performances(threads=2)
    fpss.append(fps)
    powers.append(power)

print(fpss)
print(powers)
print("Throughput: %.2f +- %.2f with power consumption of %.2f +- %.2f" %(np.mean(fpss), np.std(fpss), np.mean(powers), np.std(powers)))

Building 2 threads...
Starting 2 threads...
Throughput=225.20 fps, total frames = 1198, time=5.3197 seconds
Building 2 threads...
Starting 2 threads...
Throughput=225.40 fps, total frames = 1198, time=5.3151 seconds
Building 2 threads...
Starting 2 threads...
Throughput=224.55 fps, total frames = 1198, time=5.3351 seconds
Building 2 threads...
Starting 2 threads...
Throughput=225.72 fps, total frames = 1198, time=5.3074 seconds
Building 2 threads...
Starting 2 threads...
Throughput=226.57 fps, total frames = 1198, time=5.2876 seconds
[225.20165822134038, 225.3971705092727, 224.55094143767894, 225.72045731159767, 226.56715327943846]
[17.372454545454545, 17.34459090909091, 17.34557575757576, 17.338636363636365, 17.344727272727273]
Throughput: 225.49 +- 0.66 with power consumption of 17.35 +- 0.01


In [None]:
del overlay
del dpu