# ResNet-50 Inference with FINN on Alveo

This notebook demonstrates the functionality of a FINN-based, full dataflow ResNet-50 implemented on an Alveo U250. The characteristics of the network are the following:
 - residual blocks at 1-bit weights, 2/4-bit activations
 - first convolution and last (fully connected) layer use 8-bit weights
 - all parameters stored on-chip in BRAM/LUTRAM/URAM
 - single DDR controller (DDR0) utilized for input and output

We validate the network against ImageNet. We use the PYNQ APIs for retrieving and recording power information which is then displayed in real-time.

## Set up Accelerator with PYNQ
We load the Alveo accelerator and print its memory-mapped registers:

In [None]:
import pynq

# Load the resnet50.xclbin overlay and keep a handle to its resnet50_1 kernel.
ol = pynq.Overlay("resnet50.xclbin")
accelerator = ol.resnet50_1
# Show the kernel's memory-mapped registers.
print(accelerator.register_map)

Next we create a data buffer in the Alveo PLRAM memory to hold the weights of the Fully Connected Layer:

In [None]:
import numpy as np

# Allocate a 1000x2048 int8 buffer in the overlay's PLRAM0 bank; it holds the
# weights of the fully connected layer and is passed to every accelerator call.
fcbuf = pynq.allocate((1000,2048), dtype=np.int8, target=ol.PLRAM0)

Load the weights from a CSV file and push them to the accelerator buffer:

In [None]:
# Read the fully connected layer weights from the CSV file as int8 values.
fcweights = np.genfromtxt("fcweights.csv", dtype=np.int8, delimiter=',')
# The CSV reader yields one spurious element at the end; drop it, then give
# the remaining values their (1000, 2048) shape.
fcweights = np.reshape(fcweights[:-1], (1000, 2048))
fcbuf[...] = fcweights

# Push the host-side contents of the buffer to the accelerator card.
fcbuf.sync_to_device()

## Single Image Inference
In this example we perform inference on each of the images in a `pictures` folder and display the top predicted class overlaid onto the image. The code assumes the existence of this `pictures` folder, where you should put the images you want to classify. There is no restriction on the images that you can use.

In [None]:
import shutil
import wget
import os
import glob
from itertools import chain
import cv2
import matplotlib.pyplot as plt

# Collect every picture in the `pictures` folder, one extension at a time in
# the order listed below.
image_list = list(chain.from_iterable(
    glob.glob('pictures/*.%s' % ext) for ext in ["jpg", "gif", "png", "tga"]
))

#get imagenet classes from file
import pickle

# NOTE(review): unpickling can execute arbitrary code; only load a labels.pkl
# that comes from a trusted source.
# The context manager closes the file handle as soon as the labels are read
# (the previous bare open() left it open).
with open("labels.pkl", 'rb') as labels_file:
    classes = pickle.load(labels_file)

def infer_once(filename):
    """Run the accelerator on a single image file.

    The image is read with OpenCV, resized to 224x224, copied into a device
    buffer and handed to the accelerator together with the FC weight buffer.

    Args:
        filename: Path of the image file to classify.

    Returns:
        A NumPy array holding a copy of the five ``uint32`` values the
        accelerator wrote to the output buffer (the caller uses them as
        indices into ``classes``).

    Raises:
        ValueError: If OpenCV cannot read or decode ``filename``.
    """
    #preprocess image
    # cv2.imread reports failure by returning None rather than raising, so
    # check before resizing and before any device buffer is allocated.
    img = cv2.imread(filename)
    if img is None:
        raise ValueError("could not read image file: %s" % filename)
    img = cv2.resize(img, (224,224))

    inbuf = pynq.allocate((224,224,3), dtype=np.int8, target=ol.bank0)
    outbuf = pynq.allocate((5,), dtype=np.uint32, target=ol.bank0)

    #transfer to accelerator
    inbuf[:] = img
    inbuf.sync_to_device()

    #do inference; the final argument is the number of images in this call
    accelerator.call(inbuf, outbuf, fcbuf, 1)

    #get results
    outbuf.sync_from_device()
    # Return a plain copy so the result does not alias the device buffer.
    return np.copy(outbuf)

# Classify every picture and translate the values returned by the accelerator
# into their entries of the ImageNet class table.
inf_results = [
    [classes[class_idx] for class_idx in infer_once(picture)]
    for picture in image_list
]

# Show each picture in a 3-column grid with its top predicted class drawn on it.
plt.figure(figsize=(20,10))
columns = 3
# plt.subplot needs an integer row count; true division (/) would pass a float
# under Python 3, so use floor division and compute it once.
rows = len(image_list) // columns + 1
for i, image in enumerate(image_list):
    plt.subplot(rows, columns, i + 1)
    # Keep only the text before the first comma of the top-ranked label.
    top_class = inf_results[i][0].split(',', 1)[0]
    # OpenCV loads images as BGR; convert to RGB for matplotlib.
    display_image = cv2.cvtColor(cv2.resize(cv2.imread(image),(224,224)), cv2.COLOR_BGR2RGB)
    plt.imshow(cv2.putText(display_image, top_class, (10,20), cv2.FONT_HERSHEY_TRIPLEX, 0.7, (255,255,255)))

## Plot Accelerator Board Power with PYNQ
We first set up data acquisition using PYNQ's PMBus API:

In [None]:
import plotly
import plotly.graph_objs as go
import pandas as pd
from pynq import pmbus
import time

# Look up the power rails reported for the currently active device.
rails = pmbus.get_xrt_sysfs_rails(pynq.pl_server.Device.active_device)

# Record the three rails that have power measurement on Alveo:
#  - the auxiliary and PCI Express 12V rails, whose sum is the total board
#    power (the PCIe 5V rail draws a negligible amount and is not recorded);
#  - VCC_INT, the primary supply to the FPGA.
recorder = pmbus.DataRecorder(
    *(rails[rail_name].power for rail_name in ("12v_aux", "12v_pex", "vccint"))
)

f = recorder.frame

# Derived view of the recording: total board power and FPGA core power.
powers = pd.DataFrame(index=f.index)
powers['board_power'] = f['12v_aux_power'] + f['12v_pex_power']
powers['fpga_power'] = f['vccint_power']

# Graph layout: a simple line/scatter plot that autoranges on both axes, with
# the Y axis anchored at 0.
layout = dict(
    xaxis=dict(title='Time (s)'),
    yaxis=dict(title='Power (W)', rangemode='tozero', autorange=True),
)

# Plotly expects its data as a list of plotting objects; this helper rebuilds
# that list from the recording and pushes it into an existing plot.
def update_data(frame, start, end, plot):
    """Refresh ``plot`` with the board power recorded between ``start`` and ``end``.

    Two traces are produced: the raw board power (sum of the two 12V rails)
    and its 5-second moving average.

    Args:
        frame: Recording as a DataFrame with a time-based index and the
            ``12v_aux_power`` / ``12v_pex_power`` columns.
        start: Start of the time range to display.
        end: End of the time range to display.
        plot: Object whose ``update(data=...)`` method receives the traces.
    """
    five_seconds = pd.tseries.offsets.Second(5)

    def board_power(samples):
        # Total board power is the sum of the two 12V rails.
        return samples['12v_aux_power'] + samples['12v_pex_power']

    ranged = frame[start:end]
    # Reach back five extra seconds so the moving average at the start of the
    # displayed range already has a full window of history.
    lead_in = frame[start - five_seconds:end]
    smoothed = board_power(lead_in).rolling(five_seconds).mean()[ranged.index]

    powers = pd.DataFrame(index=ranged.index)
    powers['board_power'] = board_power(ranged)
    powers['rolling'] = smoothed

    plot.update(data=[
        go.Scatter(x=powers.index, y=powers['board_power'], name="Board Power"),
        go.Scatter(x=powers.index, y=powers['rolling'], name="5 Second Avg"),
    ])
    
#Next we create and show the plot object. Initially there is no data to display; the plot is updated once the recording has started.
#Once the plot is running it is possible to right click on it to pop out the graph into a separate window.
plot = go.FigureWidget(layout=layout)
# The bare expression must stay last in the cell so the notebook renders the widget.
plot

Next we create a dynamically-updating power graph:

In [None]:
# Start recording; 0.1 is the interval handed to the recorder
# (presumably seconds between samples -- confirm against the PYNQ docs).
recorder.record(0.1)

#In order to keep updating the graph we need a thread running in the background.
#The thread defined below calls our update function twice a second to display the most recently collected minute of data.
# Flag polled by that thread; setting it to False ends the thread's loop.
do_update = True

def thread_func():
    """Redraw the power plot every half second until ``do_update`` is cleared.

    Each pass shows the most recent 60 seconds of ``recorder.frame``.
    """
    one_minute = pd.tseries.offsets.Second(60)
    while do_update:
        now = pd.Timestamp.fromtimestamp(time.time())
        update_data(recorder.frame, now - one_minute, now, plot)
        time.sleep(0.5)

from threading import Thread
# Run the plot refresher in a background thread so the notebook stays
# interactive; it keeps running until do_update is set to False.
t = Thread(target=thread_func)
t.start()

To manually stop the power graph:

In [None]:
# Clear the flag polled by the plot-update thread so its loop exits, then stop
# the recorder.
do_update = False
recorder.stop()

## Synthetic Throughput Test
We execute inference of a configurable-size batch of images, without data movement. We measure the latency and throughput. 

In [None]:
import ipywidgets as widgets
from IPython.display import clear_output

# Slider selecting how many images are passed to each accelerator call.
bs = widgets.IntSlider(
    description='Batch Size:',
    value=128, min=1, max=1000, step=1,
    orientation='horizontal',
    readout=True, readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Progress bars visualising the measured throughput and latency.
# NOTE(review): the latency bar is labelled in ms, but its 0-0.1 range matches
# the value in seconds that the benchmark loop writes into it -- confirm intent.
fps = widgets.IntProgress(description='FPS: ', min=0, max=2500)
latency = widgets.FloatProgress(description='Latency (ms): ', min=0, max=0.1)

# Stop button and the flag its click handler raises.
button = widgets.Button(description='Stop')
stop_running = False

def on_button_clicked(_):
    """Ask the benchmark loop to stop; the click-event argument is ignored."""
    # stop_running is a module-level flag that benchmark_synthetic polls.
    global stop_running
    stop_running = True
            
# Register on_button_clicked as the click handler of the Stop button.
button.on_click(on_button_clicked)

# Text boxes showing the numeric FPS and latency values next to the bars.
out_fps = widgets.Text()
out_latency = widgets.Text()

# Layout: controls (button + batch-size slider) on top, measurements below.
ui_top = widgets.HBox([button, bs])
ui_bottom = widgets.HBox([fps, out_fps, latency, out_latency])
ui = widgets.VBox([ui_top, ui_bottom])
display(ui)

import time
import threading

def benchmark_synthetic():
    """Repeatedly time batched accelerator calls and publish FPS and latency.

    The input and output buffers are allocated once and never written or
    synchronised, so only the accelerator call itself is measured. The loop
    runs until ``stop_running`` is set, then prints "Stopping" and returns.

    Each iteration updates the ``fps`` / ``latency`` progress bars and the
    ``out_fps`` / ``out_latency`` text boxes (latency text is in milliseconds).
    """
    import pynq
    # Sized for the largest batch the slider allows (1000 images).
    ibuf = pynq.allocate((1000,3,224,224), dtype=np.int8, target=ol.bank0)
    obuf = pynq.allocate((1000,5), dtype=np.uint32, target=ol.bank0)

    while not stop_running:
        # Read the slider once per iteration: it can change while the call is
        # running, and the FPS must use the batch size that was actually run.
        batch_size = bs.value
        start = time.monotonic()
        accelerator.call(ibuf, obuf, fcbuf, batch_size)
        duration = time.monotonic() - start

        measured_fps = int(batch_size / duration)
        fps.value = measured_fps
        latency.value = duration
        # Show the measured number, not the progress bar's value: the bar is
        # bounded by its max of 2500, so reading it back would cap the text.
        out_fps.value = str(measured_fps)
        out_latency.value = '%.2f' % (duration * 1000)

    print("Stopping")
        

# Run the benchmark in a background thread so the widgets stay responsive;
# it ends once the Stop button sets stop_running.
t = threading.Thread(target=benchmark_synthetic)
t.start()
