In [None]:
from pynq_dpu import DpuOverlay
# Create the DPU overlay from the "dpu.bit" bitstream file. Later cells use
# this object to load the model and to obtain the inference runner.
overlay = DpuOverlay("dpu.bit")

In [2]:
import os
import time
import numpy as np
import cv2
import matplotlib.pyplot as plt
import threading
%matplotlib inline

In [3]:
import pynq

# Look up the board's power rails and attach a data recorder to the power
# sensor of the '12V' rail. The recorder is used by measure_performances()
# to sample power while the benchmark threads are running.
rails = pynq.get_rails()
recorder = pynq.DataRecorder(rails['12V'].power)

In [4]:
def prepare_prediction(pred):
    """Collapse the last axis of ``pred`` into a label mask.

    For every position, the index of the largest value along the last axis
    is taken, and the result is returned as an unsigned 8-bit array with the
    last axis removed.
    """
    return np.argmax(pred, axis=-1).astype(np.uint8)

In [5]:
# Load the model file into the overlay.
# NOTE(review): presumably this must run before `overlay.runner` is read in a
# later cell - confirm against the pynq_dpu documentation.
overlay.load_model("NERONE4FPGA.xmodel")

In [None]:
# Collect the names of the input files that will be fed to the DPU.
image_folder = 'images'
image_format = "npy" # change the format of your image here
# os.listdir() returns entries in arbitrary order, so sort them: a given index
# (e.g. the one used for the visual check below) then always refers to the
# same file, regardless of filesystem or machine.
original_images = sorted(i for i in os.listdir(image_folder) if i.endswith(image_format))
total_images = len(original_images)
print(total_images)

In [7]:
# Runner object exposed by the overlay; used to query the tensors below and
# to execute inference in run().
dpu = overlay.runner

# Tensor descriptors reported by the runner.
inputTensors = dpu.get_input_tensors()
outputTensors = dpu.get_output_tensors()

# Dimensions of the first input / output tensor, as tuples.
# NOTE(review): run() writes one image at index 0 of the first axis and
# reshapes it to shapeIn[1:], so axis 0 is presumably the batch axis - confirm.
shapeIn = tuple(inputTensors[0].dims)
shapeOut = tuple(outputTensors[0].dims)
# Data size of the first output tensor divided by the first input dimension.
# NOTE(review): presumably the number of output values per batch item - confirm
# what get_data_size() counts.
outputSize = int(outputTensors[0].get_data_size() / shapeIn[0])

In [8]:
# Preallocate the host buffers passed to the runner: C-ordered float32 arrays
# with the input and output tensor shapes, each wrapped in a one-element list.
# `image` is the very same array as input_data[0], so writing into `image`
# fills the input buffer in place.
image = np.empty(shapeIn, dtype=np.float32, order="C")
input_data = [image]
output_data = [np.empty(shapeOut, dtype=np.float32, order="C")]

In [9]:
def run(image_index, save = False):
    """Run one image through the DPU and optionally save the predicted mask.

    The ``.npy`` file named ``original_images[image_index]`` is loaded from
    ``image_folder``, copied into the shared input buffer, and executed on
    the DPU. The call blocks until the job has finished; the raw result is
    left in ``output_data[0]``.

    Parameters
    ----------
    image_index : int
        Position of the image in ``original_images``.
    save : bool, optional
        When true, the output is converted with ``prepare_prediction`` and
        written to ``predictions/`` under the same file name as the input.
        The directory is created if it does not exist.

    Notes
    -----
    ``image`` / ``input_data`` and ``output_data`` are module-level buffers
    shared by every call. When several threads call ``run`` concurrently
    (as the benchmark does) they overwrite each other's data, so the buffer
    contents are only meaningful for single-threaded use.
    """
    file_name = original_images[image_index]
    # The preprocessing applied during training (on GPU) must be repeated here.
    # Remove or modify as needed.
    preprocessed = np.load(os.path.join(image_folder, file_name))
    image[0,...] = preprocessed.reshape(shapeIn[1:])

    job_id = dpu.execute_async(input_data, output_data)
    dpu.wait(job_id)
    temp = [j.reshape(1, outputSize) for j in output_data] # do not remove
    if save:
        # np.save does not create missing directories.
        os.makedirs('predictions', exist_ok=True)
        pred = prepare_prediction(output_data[0])
        np.save(os.path.join('predictions', file_name), pred)

In [None]:
# Visual sanity check: run a single image through the DPU and show the input,
# the predicted mask and the ground-truth label side by side.
index = 50
run(index)
f, axarr = plt.subplots(1, 3, figsize=(16,12))

# Read the input from the same folder the file list was built from; the label
# is expected under labels/ with the same file name.
sample_name = original_images[index]
input_slice = np.load(os.path.join(image_folder, sample_name))
lbl_slice = np.load(os.path.join("labels", sample_name))
prediction = prepare_prediction(output_data[0])

axarr[0].imshow(input_slice, cmap = 'gray')
axarr[1].imshow(prediction[0,:,:], cmap = 'gray')
axarr[2].imshow(lbl_slice, cmap = 'gray')

# Label the panels so the figure can be read on its own.
for ax, title in zip(axarr, ("Input", "Prediction", "Label")):
    ax.set_title(title)
plt.show()

In [None]:
def single_thread(start, end):
    """Run inference sequentially on every image index in ``[start, end)``.

    Used as the thread target by the benchmark. ``run`` is called with its
    default ``save=False``, so no predictions are written to disk.
    """
    # Plain loop: run() is called only for its side effects, so there is no
    # reason to collect its return values in a throwaway list.
    for i in range(start, end):
        run(i)

In [None]:
def measure_performances(threads=2):
    """Benchmark DPU throughput and 12V-rail power over the whole image set.

    The indices of ``original_images`` are split into ``threads`` contiguous
    ranges (the last range absorbs any remainder) and each range is processed
    by its own thread via ``single_thread``. While the threads run,
    ``recorder`` samples the power sensor every 0.5 seconds.

    Parameters
    ----------
    threads : int, optional
        Number of worker threads to start; must be at least 1.

    Returns
    -------
    tuple
        ``(fps, total_frames, elapsed_seconds, mean_power)`` where
        ``mean_power`` is the mean of the ``'12V_power'`` column recorded
        during this call.

    Raises
    ------
    ValueError
        If ``threads`` is smaller than 1.
    """
    if threads < 1:
        raise ValueError("threads must be >= 1, got %r" % (threads,))

    total = len(original_images)
    chunk = total // threads

    print('Building',threads,'threads...')
    threadAll = []
    start = 0
    for i in range(threads):
        # The last thread also takes the images left over by the integer division.
        end = total if i == threads - 1 else start + chunk
        threadAll.append(threading.Thread(target=single_thread, args=(start, end)))
        start = end

    print('Starting',threads,'threads...')
    # Discard samples from earlier calls; otherwise the mean below would mix
    # this run with every previous one recorded by the same recorder.
    recorder.reset()
    with recorder.record(0.5):
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        time1 = time.perf_counter()
        for x in threadAll:
            x.start()
        for x in threadAll:
            x.join()
        time2 = time.perf_counter()
        timetotal = time2 - time1

    fps = float(total / timetotal)
    print("Throughput=%.2f fps, total frames = %.0f, time=%.4f seconds" %(fps, total, timetotal))
    return fps, total, timetotal, np.mean(recorder.frame['12V_power'])

In [None]:
# Repeat the 4-thread benchmark five times, then report the mean and standard
# deviation of throughput and of the mean power returned by each run.
runs = [measure_performances(threads=4) for _ in range(5)]
fpss = [result[0] for result in runs]
powers = [result[3] for result in runs]

print(fpss)
print(powers)
summary = "Throughput: %.2f +- %.2f with power consumption of %.2f +- %.2f"
print(summary % (np.mean(fpss), np.std(fpss), np.mean(powers), np.std(powers)))

In [None]:
# Drop the notebook's references to the overlay and its runner.
# NOTE(review): presumably this lets the DPU resources be released once no
# other references remain - confirm.
del overlay
del dpu