# BNN Implementation

## Setup

In [None]:
from pynq import Overlay, allocate
import numpy as np
import struct
from time import perf_counter

time_tracker = {
    'stream_hw': 0,
    'burst_hw': 0,
    'software': 0,
}

In [44]:
IMG_SIZE = 28
IMG_SIZE_SQ = IMG_SIZE**2

with open('train-images.idx3-ubyte', 'rb') as f:
    magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
    images = np.frombuffer(f.read(), dtype=np.uint8).reshape(num, rows, cols)

with open('train-labels.idx1-ubyte', 'rb') as f:
    magic, num = struct.unpack(">II", f.read(8))
    labels = np.frombuffer(f.read(), dtype=np.uint8)

print(images.shape)
print(labels.shape)

image_data = np.reshape(images, (60000, IMG_SIZE_SQ))
# signed = np.where(flattened > 0, 1, -1)
# image_data = np.where(signed == 1, 0, 1).astype(np.int32)

print(image_data.shape)

(60000, 28, 28)
(60000,)
(60000, 784)


## Software Implementation

In [45]:
import numpy as np

sign = np.vectorize(lambda x: 1 if x>0 else -1)
quantize = np.vectorize(lambda x: 0 if x==1 else 1)
adj = np.vectorize(lambda x: x*2-255)

model = np.load("model.npy", allow_pickle=True).item()

w1 = sign(np.array(model['fc1w']))
w2 = sign(np.array(model['fc2w']))
w3 = sign(np.array(model['fc3w']))

w1 = quantize(w1)
w2 = quantize(w2)
w3 = quantize(w3)

def XNOR(a, b):
	return a == b

def matmul_xnor(A, B):
	a, b = B.shape

	res = np.zeros(a)

	A1 = A.astype(int)
	B1 = B.astype(int)

	for x in range(a):
		cnt = 0
		for y in range(b):
			cnt = cnt + XNOR(A[y], B[x][y])
		res[x] = cnt
	return res

def feed_forward(image):
	image = image / 255.0
	# layer 1
	X0_input = quantize(sign(2 * image - 0.5))
	layer1_output = matmul_xnor(X0_input, w1)
	layer1_activations = (layer1_output * 2 - 784)
    
	# layer 2
	layer2_input = quantize(sign(layer1_activations))
	layer2_output = matmul_xnor(layer2_input, w2)
	layer2_activations = (layer2_output * 2 - 128)

	# layer 3
	layer3_input = quantize(sign(layer2_activations))
	layer3_output = matmul_xnor(layer3_input, w3)

	final_output = (layer3_output * 2 - 64)
	A = np.array([final_output], np.int32)

	return np.argmax(final_output)

mnist = np.load("mnist_test_data.npy", allow_pickle=True)
X = mnist.item().get("data")
y = mnist.item().get("label")

X = np.reshape(X, (10000, 784))

# print(X.shape)
# print(y.shape)

# print(X[0])

N = 10 # Number of images to test
correct = 0
start_time = perf_counter()
for i in range(N):
	prediction = feed_forward(X[i])
	label = y[i]
	print("Predicted: " + str(prediction) + " | Actual: " + str(label))
	correct += prediction == label

print("Accuracy: " + str(correct/N))
time_tracker['software'] = perf_counter() - start_time


Predicted: 7 | Actual: 7
Predicted: 2 | Actual: 2
Predicted: 1 | Actual: 1
Predicted: 0 | Actual: 0
Predicted: 4 | Actual: 4
Predicted: 1 | Actual: 1
Predicted: 4 | Actual: 4
Predicted: 9 | Actual: 9
Predicted: 6 | Actual: 5
Predicted: 9 | Actual: 9
Accuracy: 0.9


## Hardware Implementation - Stream

In [None]:
overlay = Overlay('v3.bit')
print("Loaded overlay!")
bnn_ip = overlay.feedforward_0

Loaded overlay!


### Setup DMA Channels & Buffers

In [3]:
dma = overlay.axi_dma_0
dma_in = dma.sendchannel
dma_out = dma.recvchannel

In [4]:
input_buffer = allocate(shape=(IMG_SIZE_SQ,), dtype=np.int32)
output_buffer = allocate(shape=(10,), dtype=np.int32)

### Run Model

In [None]:
start_time = perf_counter()
for i in range(10):
    actual_label = labels[i]
    np.copyto(input_buffer, image_data[i])
    
    dma_out.transfer(output_buffer)
    dma_in.transfer(input_buffer)
    
    bnn_ip.write(0x00, 0x81)

    dma_in.wait()
    dma_out.wait()

    result = np.array(output_buffer)
    print(result)
    predicted_label = np.argmax(result)
    print(f"The value predicted for this image is: {predicted_label} | expected: {actual_label} | {'success' if predicted_label == actual_label else 'failed'}")
time_tracker['stream_hw'] = perf_counter() - start_time

print(f"Time taken for 10 images: {end_time - start_time:.2f} seconds")

[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 5 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 0 | success
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 4 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 1 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 9 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 2 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 1 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 3 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 1 | failed
[6 6 6 6 6 6 6 6 6 6]
The value predicted for this image is: 0 | expected: 4 | failed


## Hardware Implementation - Burst

In [None]:
overlay = Overlay('v3.bit') # Change the name of the bitstream file as needed
print("Loaded overlay!")
bnn_ip = overlay.feedforward_burst_0 # Change the name of the IP as needed

np.copyto(input_buffer, image_data[0])

### Setup Buffers

In [None]:
input_buffer = allocate(shape=(IMG_SIZE_SQ,), dtype=np.int32)
output_buffer = allocate(shape=(10,), dtype=np.int32)

### Run Model

In [None]:
start_time = perf_counter()

# The addresses are probably wrong
bnn_ip.write(0x10, input_buffer.physical_address)
bnn_ip.write(0x1C, output_buffer.physical_address)

# Start the kernel
bnn_ip.write(0x00, 0x01)

# Wait for completion
while (bnn_ip.read(0x00) & 0x2) == 0:
    pass

# Invalidate result buffer to get fresh values
output_buffer.invalidate()

time_tracker['burst_hw'] = perf_counter() - start_time

print("Predicted Value")
print(output_buffer)
print("Expected Value")
print(y)

## Performanace Tracker Output

In [46]:
print(time_tracker)

{'stream_hw': 0, 'burst_hw': 0, 'software': 0.33010550000471994}
