# MMIO example: Variable-size matrix multiplication

This notebook illustrates the use of MMIO to communicate with the PL part of the PYNQ-Z2 board. This can be achieved at different levels of abstraction and speedup: (i) using PYNQ, (ii) using MMIO in Python and (iii) using MMIO in C++ with pybind11. The block design consists of a DMA component that interfaces with a HLS matrix multiplication IP that operates on streams.

First, we load the bitstream, then allocate and fill the source buffers ``A`` and ``B``. The destination buffer is allocated in the next section, in transposed form, as ``CT``:

In [1]:
from pynq import Overlay, allocate
import numpy as np

# Program the PL with the matrix-multiplication bitstream.
overlay = Overlay("/home/xilinx/overlays/mmul_v2_64.bit")

# Problem size: A is L x M, B is M x N.
L, M, N = 8, 9, 10
shape_A, shape_B = (L, M), (M, N)

# Source operands, 32-bit unsigned elements.
A = allocate(shape=shape_A, dtype="u4")
B = allocate(shape=shape_B, dtype="u4")

# Fill both operands with 1, 2, 3, ... in row-major order.
A[:] = np.arange(1, L * M + 1).reshape(shape_A)
B[:] = np.arange(1, M * N + 1).reshape(shape_B)

# (c) C++ (``pybind11``, ``mmap``)

## 1.i Memory allocation

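To minimize the streaming latency, the `mmul_v2_0` component consumes its second operand in transposed form and emits the transposed product: it is fed `A` and `B.T` and returns `C.T = (A@B).T`. Therefore we have to re-allocate `B` as `BT` and treat the result buffer `CT` as being transposed.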

In [2]:
# BT is a separate buffer holding B transposed, i.e. with B's dimensions swapped.
b_rows, b_cols = B.shape
BT = allocate(shape=(b_cols, b_rows), dtype=B.dtype)
BT[:] = B.transpose()

# CT receives the transposed product: (columns of B) x (rows of A).
CT = allocate(shape=(b_cols, A.shape[0]), dtype="u4")

Let's compare the memory layout of `B` and `BT`:

In [3]:
import subprocess

# Show B as an array, then the raw bytes found at its physical address
# (first lines of a hexdump of /dev/mem starting at that offset).
cmd = f"hexdump -C -s {B.physical_address} /dev/mem | head"
print(B)
dump = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)
print(dump.stdout.decode("utf-8"))

[[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]
 [21 22 23 24 25 26 27 28 29 30]
 [31 32 33 34 35 36 37 38 39 40]
 [41 42 43 44 45 46 47 48 49 50]
 [51 52 53 54 55 56 57 58 59 60]
 [61 62 63 64 65 66 67 68 69 70]
 [71 72 73 74 75 76 77 78 79 80]
 [81 82 83 84 85 86 87 88 89 90]]
16865000  01 00 00 00 02 00 00 00  03 00 00 00 04 00 00 00  |................|
16865010  05 00 00 00 06 00 00 00  07 00 00 00 08 00 00 00  |................|
16865020  09 00 00 00 0a 00 00 00  0b 00 00 00 0c 00 00 00  |................|
16865030  0d 00 00 00 0e 00 00 00  0f 00 00 00 10 00 00 00  |................|
16865040  11 00 00 00 12 00 00 00  13 00 00 00 14 00 00 00  |................|
16865050  15 00 00 00 16 00 00 00  17 00 00 00 18 00 00 00  |................|
16865060  19 00 00 00 1a 00 00 00  1b 00 00 00 1c 00 00 00  |................|
16865070  1d 00 00 00 1e 00 00 00  1f 00 00 00 20 00 00 00  |............ ...|
16865080  21 00 00 00 22 00 00 00  23 00 00 00 24 00 00 00  |!..."...#

In [4]:
import subprocess

# Same inspection for BT: array view first, then the raw bytes at its
# physical address as seen through /dev/mem.
cmd = f"hexdump -C -s {BT.physical_address} /dev/mem | head"
print(BT)
dump = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)
print(dump.stdout.decode("utf-8"))

[[ 1 11 21 31 41 51 61 71 81]
 [ 2 12 22 32 42 52 62 72 82]
 [ 3 13 23 33 43 53 63 73 83]
 [ 4 14 24 34 44 54 64 74 84]
 [ 5 15 25 35 45 55 65 75 85]
 [ 6 16 26 36 46 56 66 76 86]
 [ 7 17 27 37 47 57 67 77 87]
 [ 8 18 28 38 48 58 68 78 88]
 [ 9 19 29 39 49 59 69 79 89]
 [10 20 30 40 50 60 70 80 90]]
16866000  01 00 00 00 0b 00 00 00  15 00 00 00 1f 00 00 00  |................|
16866010  29 00 00 00 33 00 00 00  3d 00 00 00 47 00 00 00  |)...3...=...G...|
16866020  51 00 00 00 02 00 00 00  0c 00 00 00 16 00 00 00  |Q...............|
16866030  20 00 00 00 2a 00 00 00  34 00 00 00 3e 00 00 00  | ...*...4...>...|
16866040  48 00 00 00 52 00 00 00  03 00 00 00 0d 00 00 00  |H...R...........|
16866050  17 00 00 00 21 00 00 00  2b 00 00 00 35 00 00 00  |....!...+...5...|
16866060  3f 00 00 00 49 00 00 00  53 00 00 00 04 00 00 00  |?...I...S.......|
16866070  0e 00 00 00 18 00 00 00  22 00 00 00 2c 00 00 00  |........"...,...|
16866080  36 00 00 00 40 00 00 00  4a 00 00 00 54 00 00 00  |6...@.

As we can see, the memory layout of `BT` is suitable for streaming it with DMA: each column of `B` is stored contiguously as a row of `BT`. Otherwise, we would have to send each element of `B` as a separate DMA transfer, which would be very slow. Of course, we have to take into account the cost of transposing the array.

## 1.ii Hardware execution

In [5]:
%%pybind11 mmul

#include <fcntl.h>
#include <sys/mman.h>
#include <termios.h>
#include <unistd.h>

#include <stdexcept>

// AXI DMA register map: byte offsets from the DMA base address, plus the bit
// masks used below. The MM2S_* registers drive the channel that sends A and BT,
// the S2MM_* registers drive the channel that receives CT.
// NOTE(review): names and values match the Xilinx AXI DMA direct register
// mode layout -- confirm against the IP version used in the block design.
#define MM2S_DMACR 0x00
#define MM2S_DMACR_RS 0x00000001
#define MM2S_DMACR_Reset 0x00000004
#define MM2S_DMASR 0x04
#define MM2S_DMASR_Halted 0x00000001
#define MM2S_DMASR_Idle 0x00000002
#define MM2S_SA 0x18
#define MM2S_SA_MSB 0x1c
#define MM2S_LENGTH 0x28

#define S2MM_DMACR 0x30
#define S2MM_DMACR_RS 0x00000001
#define S2MM_DMACR_Reset 0x00000004
#define S2MM_DMASR 0x34
#define S2MM_DMASR_Halted 0x00000001
#define S2MM_DMASR_Idle 0x00000002
#define S2MM_DA 0x48
#define S2MM_DA_MSB 0x4c
#define S2MM_LENGTH 0x58

// Route printf-style debugging through Python's print().
#define printf py::print

// Register accessors. They expect a local `volatile unsigned int *DMA_VADDR`
// pointing at the mapped DMA registers. Byte offsets are converted to word
// indices with `>> 2`. Arguments and the whole expansion are parenthesized so
// that compound expressions (e.g. `cond ? A : B`) expand as intended.
#define dma_get(x) (DMA_VADDR[(x) >> 2])
#define dma_set(x, y) (DMA_VADDR[(x) >> 2] = (y))

// Control registers of the matrix-multiplication IP. They expect a local
// `volatile unsigned int *MMUL_VADDR` pointing at the mapped IP registers.
// NOTE(review): offsets 0x10/0x18/0x20 (L/M/N) and 0x00 (start) presumably come
// from the HLS-generated register map -- verify against the IP's driver header.
#define mmul_set_L(x) (MMUL_VADDR[0x10 >> 2] = (x))
#define mmul_set_M(x) (MMUL_VADDR[0x18 >> 2] = (x))
#define mmul_set_N(x) (MMUL_VADDR[0x20 >> 2] = (x))
#define mmul_start() (MMUL_VADDR[0x00 >> 2] = 0x00000001)

void mmul(unsigned int DMA_addr, unsigned int MMUL_addr,
          unsigned int L, unsigned int M, unsigned int N,
          unsigned int A_addr, unsigned int BT_addr, unsigned int CT_addr)
{
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    volatile unsigned int *DMA_VADDR = (volatile unsigned int *) mmap(NULL, 65535, PROT_READ | PROT_WRITE, MAP_SHARED, fd, DMA_addr);
    volatile unsigned int *MMUL_VADDR = (volatile unsigned int *) mmap(NULL, 65535, PROT_READ | PROT_WRITE, MAP_SHARED, fd, MMUL_addr);
    
    mmul_set_L(L);
    mmul_set_M(M);
    mmul_set_N(N);
    mmul_start();
    
    dma_set(S2MM_DMACR, S2MM_DMACR_Reset);
    dma_set(MM2S_DMACR, MM2S_DMACR_Reset);
    
    dma_set(S2MM_DMACR, S2MM_DMACR_RS);
    dma_set(MM2S_DMACR, MM2S_DMACR_RS);
    
    dma_set(MM2S_SA, A_addr);
    dma_set(MM2S_LENGTH, L*M*sizeof(unsigned int));
    
    while(!(dma_get(MM2S_DMASR) & MM2S_DMASR_Idle));
    
    dma_set(MM2S_SA, BT_addr);
    dma_set(MM2S_LENGTH, N*M*sizeof(unsigned int));
    
    dma_set(S2MM_DA, CT_addr);
    dma_set(S2MM_LENGTH, N*L*sizeof(unsigned int));
    
    while(!(dma_get(MM2S_DMASR) & MM2S_DMASR_Idle));
    while(!(dma_get(S2MM_DMASR) & S2MM_DMASR_Idle));
    
    close(fd);
}

We now clear the ``CT`` buffer and run the actual C++ extension that starts the DMA transfers. We see that the result has been written into ``CT`` once the DMA transfers are done:

In [6]:
# Zero the destination buffer, then show both inputs and the (still empty) output.
CT.fill(0)
for buf in (A, BT, CT):
    print(buf)

[[ 1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18]
 [19 20 21 22 23 24 25 26 27]
 [28 29 30 31 32 33 34 35 36]
 [37 38 39 40 41 42 43 44 45]
 [46 47 48 49 50 51 52 53 54]
 [55 56 57 58 59 60 61 62 63]
 [64 65 66 67 68 69 70 71 72]]
[[ 1 11 21 31 41 51 61 71 81]
 [ 2 12 22 32 42 52 62 72 82]
 [ 3 13 23 33 43 53 63 73 83]
 [ 4 14 24 34 44 54 64 74 84]
 [ 5 15 25 35 45 55 65 75 85]
 [ 6 16 26 36 46 56 66 76 86]
 [ 7 17 27 37 47 57 67 77 87]
 [ 8 18 28 38 48 58 68 78 88]
 [ 9 19 29 39 49 59 69 79 89]
 [10 20 30 40 50 60 70 80 90]]
[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [7]:
import mmul

# Physical base addresses of the two memory-mapped IP blocks, both taken
# from the public `mmio` attribute of the respective driver object.
dma_base = overlay.axi_dma_0.mmio.base_addr
mmul_base = overlay.mmul_v2_0.mmio.base_addr

# Hand the register windows, the matrix dimensions and the physical buffer
# addresses to the C++ extension; it returns once the transfers are done.
mmul.mmul(dma_base, mmul_base,
          L, M, N,
          A.physical_address, BT.physical_address, CT.physical_address)

print(A)
print(BT)
print(CT)

[[ 1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18]
 [19 20 21 22 23 24 25 26 27]
 [28 29 30 31 32 33 34 35 36]
 [37 38 39 40 41 42 43 44 45]
 [46 47 48 49 50 51 52 53 54]
 [55 56 57 58 59 60 61 62 63]
 [64 65 66 67 68 69 70 71 72]]
[[ 1 11 21 31 41 51 61 71 81]
 [ 2 12 22 32 42 52 62 72 82]
 [ 3 13 23 33 43 53 63 73 83]
 [ 4 14 24 34 44 54 64 74 84]
 [ 5 15 25 35 45 55 65 75 85]
 [ 6 16 26 36 46 56 66 76 86]
 [ 7 17 27 37 47 57 67 77 87]
 [ 8 18 28 38 48 58 68 78 88]
 [ 9 19 29 39 49 59 69 79 89]
 [10 20 30 40 50 60 70 80 90]]
[[ 2445  5766  9087 12408 15729 19050 22371 25692]
 [ 2490  5892  9294 12696 16098 19500 22902 26304]
 [ 2535  6018  9501 12984 16467 19950 23433 26916]
 [ 2580  6144  9708 13272 16836 20400 23964 27528]
 [ 2625  6270  9915 13560 17205 20850 24495 28140]
 [ 2670  6396 10122 13848 17574 21300 25026 28752]
 [ 2715  6522 10329 14136 17943 21750 25557 29364]
 [ 2760  6648 10536 14424 18312 22200 26088 29976]
 [ 2805  6774 10743 14712 18681 22650 26619 30588]

In [8]:
import subprocess

# Show the hardware result as an array, then the raw bytes at CT's
# physical address as seen through /dev/mem.
cmd = f"hexdump -C -s {CT.physical_address} /dev/mem | head"
print(CT)
dump = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)
print(dump.stdout.decode("utf-8"))

[[ 2445  5766  9087 12408 15729 19050 22371 25692]
 [ 2490  5892  9294 12696 16098 19500 22902 26304]
 [ 2535  6018  9501 12984 16467 19950 23433 26916]
 [ 2580  6144  9708 13272 16836 20400 23964 27528]
 [ 2625  6270  9915 13560 17205 20850 24495 28140]
 [ 2670  6396 10122 13848 17574 21300 25026 28752]
 [ 2715  6522 10329 14136 17943 21750 25557 29364]
 [ 2760  6648 10536 14424 18312 22200 26088 29976]
 [ 2805  6774 10743 14712 18681 22650 26619 30588]
 [ 2850  6900 10950 15000 19050 23100 27150 31200]]
16867000  8d 09 00 00 86 16 00 00  7f 23 00 00 78 30 00 00  |.........#..x0..|
16867010  71 3d 00 00 6a 4a 00 00  63 57 00 00 5c 64 00 00  |q=..jJ..cW..\d..|
16867020  ba 09 00 00 04 17 00 00  4e 24 00 00 98 31 00 00  |........N$...1..|
16867030  e2 3e 00 00 2c 4c 00 00  76 59 00 00 c0 66 00 00  |.>..,L..vY...f..|
16867040  e7 09 00 00 82 17 00 00  1d 25 00 00 b8 32 00 00  |.........%...2..|
16867050  53 40 00 00 ee 4d 00 00  89 5b 00 00 24 69 00 00  |S@...M...[..$i..|
16867060  14 0a

In [9]:
# Software reference: the transposed product, stored in a buffer with the
# same shape and element type as CT.
CT_ref = allocate(shape=(B.shape[1], A.shape[0]), dtype="u4")
CT_ref[:] = (A@B).T

# Fail loudly if the hardware result differs from the software reference.
np.testing.assert_array_equal(CT, CT_ref, err_msg="hardware result CT differs from (A@B).T")

In [10]:
import subprocess

# Show the software reference as an array, then the raw bytes at its
# physical address, for comparison with the dump of CT.
cmd = f"hexdump -C -s {CT_ref.physical_address} /dev/mem | head"
print(CT_ref)
dump = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)
print(dump.stdout.decode("utf-8"))

[[ 2445  5766  9087 12408 15729 19050 22371 25692]
 [ 2490  5892  9294 12696 16098 19500 22902 26304]
 [ 2535  6018  9501 12984 16467 19950 23433 26916]
 [ 2580  6144  9708 13272 16836 20400 23964 27528]
 [ 2625  6270  9915 13560 17205 20850 24495 28140]
 [ 2670  6396 10122 13848 17574 21300 25026 28752]
 [ 2715  6522 10329 14136 17943 21750 25557 29364]
 [ 2760  6648 10536 14424 18312 22200 26088 29976]
 [ 2805  6774 10743 14712 18681 22650 26619 30588]
 [ 2850  6900 10950 15000 19050 23100 27150 31200]]
1686c000  8d 09 00 00 86 16 00 00  7f 23 00 00 78 30 00 00  |.........#..x0..|
1686c010  71 3d 00 00 6a 4a 00 00  63 57 00 00 5c 64 00 00  |q=..jJ..cW..\d..|
1686c020  ba 09 00 00 04 17 00 00  4e 24 00 00 98 31 00 00  |........N$...1..|
1686c030  e2 3e 00 00 2c 4c 00 00  76 59 00 00 c0 66 00 00  |.>..,L..vY...f..|
1686c040  e7 09 00 00 82 17 00 00  1d 25 00 00 b8 32 00 00  |.........%...2..|
1686c050  53 40 00 00 ee 4d 00 00  89 5b 00 00 24 69 00 00  |S@...M...[..$i..|
1686c060  14 0a