# Initialize the accelerator

In [1]:
from finn_examples import models
# Show every attribute of finn_examples.models whose name mentions the UNSW-NB15 dataset.
print([name for name in dir(models) if "unsw_nb15" in name])

['_unsw_nb15_mlp_io_shape_dict', 'mlp_w2a2_unsw_nb15']


Specify a batch size & create the FINN overlay. Note that the batch size must divide 82000.

In [2]:
# Start with a single record per call. Note that this variable is not passed to the
# constructor below; the overlay's batch size is set explicitly later via accel.batch_size.
batch_size = 1
# Create the accelerator object for the UNSW-NB15 MLP listed by the previous cell.
accel = models.mlp_w2a2_unsw_nb15()

In [3]:
# Report the tensor shape and datatype the accelerator expects on each side.
# %s already applies str() to its argument, so no explicit conversion is needed.
print("Expected input shape and datatype: %s %s" % (accel.ishape_normal(), accel.idt()))
print("Expected output shape and datatype: %s %s" % (accel.oshape_normal(), accel.odt()))

Expected input shape and datatype: (1, 600) BIPOLAR
Expected output shape and datatype: (1, 1) BIPOLAR


# Load the binarized UNSW-NB15 test dataset

In [12]:
! wget -nc -O unsw_nb15_binarized.npz https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1

File ‘unsw_nb15_binarized.npz’ already there; not retrieving.


Note that the generated design expects inputs of length 600. As explained in the [end-to-end notebook](https://github.com/Xilinx/finn/blob/main/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb) in the FINN repository, padding the input data from length 593 to 600 enables SIMD parallelization for the first layer.
Thus, we'll have to pad our dataset before feeding it to the accelerator.

In [13]:
import numpy as np

def make_unsw_nb15_test_batches(bsize, npz_path="unsw_nb15_binarized.npz", n_records=82000, n_pad=7):
    """Load the binarized UNSW-NB15 test split and cut it into equally sized batches.

    Parameters
    ----------
    bsize : int
        Number of records per batch. Must be positive and divide the number of
        records actually used.
    npz_path : str
        Path of the .npz archive; it must contain a "test" array whose last
        column is the label and whose other columns are the input features.
    n_records : int
        Only the first ``n_records`` rows of the test array are used.
    n_pad : int
        Number of zero-valued features appended to every record (the default 7
        takes the 593 dataset features to the 600 inputs the accelerator expects).

    Returns
    -------
    (test_imgs, test_labels) : tuple of numpy.ndarray
        ``test_imgs`` has shape (n_batches, bsize, n_features + n_pad) and
        ``test_labels`` has shape (n_batches, bsize).

    Raises
    ------
    ValueError
        If ``bsize`` is not positive or does not divide the number of records.
    """
    if bsize <= 0:
        raise ValueError("bsize must be a positive integer, got %r" % (bsize,))
    # np.load on an .npz returns an archive object that keeps the file open;
    # the context manager closes it once the array has been read into memory.
    with np.load(npz_path) as archive:
        unsw_nb15_data = archive["test"][:n_records]
    n_rows = unsw_nb15_data.shape[0]
    # Fail early with a readable message instead of an opaque reshape error.
    if n_rows % bsize != 0:
        raise ValueError(
            "bsize=%r does not evenly divide the %d test records" % (bsize, n_rows)
        )
    # Every column except the last one is an input feature; pad only the feature axis.
    test_imgs = np.pad(unsw_nb15_data[:, :-1], [(0, 0), (0, n_pad)], mode="constant")
    test_labels = unsw_nb15_data[:, -1]
    n_batches = n_rows // bsize
    test_imgs = test_imgs.reshape(n_batches, bsize, -1)
    test_labels = test_labels.reshape(n_batches, bsize)
    return (test_imgs, test_labels)

# Classify a single attack

In [14]:
# With bsize=1 every batch holds exactly one record, so indexing the first axis
# of either array selects a single sample.
(test_imgs, test_labels) = make_unsw_nb15_test_batches(bsize=1)

In [39]:
# Pick the last record of the test set and its label (each is a batch of size one).
test_single = test_imgs[-1]
test_single_label = test_labels[-1].astype(np.float32)

# The label is a size-1 array; extract the scalar explicitly with .item(), because
# formatting an array with %d relies on an implicit array-to-scalar conversion that
# NumPy has deprecated.
expected_label = test_single_label.item()
print("Expected label is: %d (%s data)" % (expected_label, "normal" if expected_label == 0 else "abnormal"))

Expected label is 0 (normal data)


In [40]:
# Note: the accelerator expects binary input data presented in bipolar form (i.e. {-1, 1})
accel_in = 2 * test_single - 1
accel_out = accel.execute(accel_in)
# To convert back to the original label (i.e. {0, 1}), we'll have to map the bipolar output to binary
accel_out_binary = (accel_out + 1) / 2

In [42]:
# The accelerator output is a size-1 array; extract the scalar explicitly with .item()
# rather than relying on the deprecated implicit array-to-scalar conversion of %d.
returned_label = accel_out_binary.item()
print("Returned label is: %d (%s data)" % (returned_label, "normal" if returned_label == 0 else "abnormal"))

Returned label is 0 (normal data)


# Validate accuracy on 82000 (out of 82332) records from UNSW-NB15 test set

To increase the throughput, let's increase the batch size. Note that the FINN accelerator itself still processes one sample at a time (a batch size of 1), but to keep the compute pipeline full, we'll copy a larger chunk of the test set to the device buffer for each call.

In [66]:
# Use 1000 records per call, both for the notebook variable and for the overlay,
# then rebuild the test batches with the matching size.
accel.batch_size = batch_size = 1000
test_imgs, test_labels = make_unsw_nb15_test_batches(batch_size)

In [67]:
# Running tallies of correctly (ok) and incorrectly (nok) classified records.
ok = nok = 0
# The first axis of test_imgs indexes the batches.
n_batches = test_imgs.shape[0]
# Number of records that will be evaluated in total.
total = n_batches * batch_size

In [68]:
# Reset the tallies here so that re-running this cell on its own does not add
# to counts left over from a previous run.
ok = 0
nok = 0
for i in range(n_batches):
    inp = test_imgs[i].astype(np.float32)
    exp = test_labels[i].astype(np.float32)
    # Map inputs and expected labels from binary {0, 1} to bipolar {-1, 1},
    # so the labels are directly comparable with the accelerator output.
    inp = 2 * inp - 1
    exp = 2 * exp - 1
    out = accel.execute(inp)
    # Flatten both sides so the element-wise comparison ignores shape differences.
    matches = np.count_nonzero(out.flatten() == exp.flatten())
    nok += batch_size - matches
    ok += matches
    print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))

batch 1 / 82 : total OK 866 NOK 134
batch 2 / 82 : total OK 1706 NOK 294
batch 3 / 82 : total OK 2607 NOK 393
batch 4 / 82 : total OK 3490 NOK 510
batch 5 / 82 : total OK 4438 NOK 562
batch 6 / 82 : total OK 5380 NOK 620
batch 7 / 82 : total OK 6290 NOK 710
batch 8 / 82 : total OK 7261 NOK 739
batch 9 / 82 : total OK 8174 NOK 826
batch 10 / 82 : total OK 9109 NOK 891
batch 11 / 82 : total OK 10026 NOK 974
batch 12 / 82 : total OK 10963 NOK 1037
batch 13 / 82 : total OK 11955 NOK 1045
batch 14 / 82 : total OK 12950 NOK 1050
batch 15 / 82 : total OK 13948 NOK 1052
batch 16 / 82 : total OK 14948 NOK 1052
batch 17 / 82 : total OK 15947 NOK 1053
batch 18 / 82 : total OK 16947 NOK 1053
batch 19 / 82 : total OK 17947 NOK 1053
batch 20 / 82 : total OK 18946 NOK 1054
batch 21 / 82 : total OK 19946 NOK 1054
batch 22 / 82 : total OK 20945 NOK 1055
batch 23 / 82 : total OK 21942 NOK 1058
batch 24 / 82 : total OK 22939 NOK 1061
batch 25 / 82 : total OK 23938 NOK 1062
batch 26 / 82 : total OK 24938 

In [69]:
# Percentage of evaluated records that were classified correctly.
acc = 100.0 * ok / total
print("Final accuracy: {:.2f}%".format(acc))

Final accuracy: 91.898780


In [71]:
def run_validation():
    """Send every test batch through the accelerator once, discarding the outputs.

    Intended purely for timing: each batch is reshaped to the accelerator's
    expected input shape and executed, but nothing is returned or compared.
    Reads the notebook-level ``accel``, ``test_imgs`` and ``n_batches``.
    """
    for batch_idx in range(n_batches):
        accel.execute(test_imgs[batch_idx].reshape(accel.ishape_normal()))

In [72]:
# Time a full pass over the test set. -n 1 executes the function once per timing
# loop, and -o returns the timing result object so it can be inspected afterwards.
full_validation_time = %timeit -n 1 -o run_validation()

33.3 s ± 698 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
# Throughput is derived from the fastest of the timed runs (the .best attribute).
print("%f images per second including data movement" % (total / full_validation_time.best))

2542.157784 images per second including data movement


# More benchmarking

In [74]:
# Run the accelerator's built-in benchmark; the dictionary of metrics it returns
# is displayed as this cell's output.
accel.throughput_test()

{'runtime[ms]': 1.0788440704345703,
 'throughput[images/s]': 926918.0110497237,
 'DRAM_in_bandwidth[MB/s]': 69.51885082872928,
 'DRAM_out_bandwidth[MB/s]': 0.9269180110497238,
 'fclk[mhz]': 100.0,
 'batch_size': 1000,
 'fold_input[ms]': 0.09775161743164062,
 'pack_input[ms]': 71.11644744873047,
 'copy_input_data_to_device[ms]': 2.642393112182617,
 'copy_output_data_from_device[ms]': 0.2548694610595703,
 'unpack_output[ms]': 355.4694652557373,
 'unfold_output[ms]': 0.05626678466796875}

The measured `throughput` of the accelerator, excluding any software and data movement overhead, is influenced by the batch size. The more we fill the compute pipeline, the higher the throughput.
Note that the total runtime consists of the overhead of packing/unpacking the inputs/outputs to convert from numpy arrays to the bit-contiguous data representation our accelerator expects (`pack_input`/`unpack_output`), the cost of moving data between the CPU and accelerator memories (`copy_input_data_to_device`/`copy_output_data_from_device`), as well as the accelerator's execution time.