# Timing analysis: Variable-size matrix multiplication

In [1]:
from pynq import Overlay, allocate
import numpy as np
import mmul

# Program the FPGA with the matrix-multiplier bitstream.
overlay = Overlay("/home/xilinx/overlays/mmul_v2_64.bit")

# Problem size: A is L x M, B is M x N.
L, M, N = 20, 21, 22

# Operands live in buffers obtained from pynq's allocate(), stored as
# unsigned 32-bit integers.
A = allocate(shape=(L, M), dtype="u4")
B = allocate(shape=(M, N), dtype="u4")

# Fill each operand with 1, 2, 3, ... in row-major order.
A[:] = np.arange(L * M).reshape(L, M) + 1
B[:] = np.arange(M * N).reshape(M, N) + 1

In [2]:
%%pybind11 mmul

#include <fcntl.h>
#include <sys/mman.h>
#include <termios.h>
#include <unistd.h>

#include <stdexcept>

#define MM2S_DMACR 0x00
#define MM2S_DMACR_RS 0x00000001
#define MM2S_DMACR_Reset 0x00000004
#define MM2S_DMASR 0x04
#define MM2S_DMASR_Halted 0x00000001
#define MM2S_DMASR_Idle 0x00000002
#define MM2S_SA 0x18
#define MM2S_SA_MSB 0x1c
#define MM2S_LENGTH 0x28

#define S2MM_DMACR 0x30
#define S2MM_DMACR_RS 0x00000001
#define S2MM_DMACR_Reset 0x00000004
#define S2MM_DMASR 0x34
#define S2MM_DMASR_Halted 0x00000001
#define S2MM_DMASR_Idle 0x00000002
#define S2MM_DA 0x48
#define S2MM_DA_MSB 0x4c
#define S2MM_LENGTH 0x58

#define printf py::print

#define dma_get(x) DMA_VADDR[x >> 2]
#define dma_set(x, y) DMA_VADDR[x >> 2] = y

#define mmul_set_L(x) MMUL_VADDR[0x10 >> 2] = x
#define mmul_set_M(x) MMUL_VADDR[0x18 >> 2] = x
#define mmul_set_N(x) MMUL_VADDR[0x20 >> 2] = x
#define mmul_start() MMUL_VADDR[0x00 >> 2] = 0x00000001

// Drive one run of the matrix-multiply core from user space.
//
// Both register windows are reached by mapping physical memory through
// /dev/mem, so the calling process needs permission to open that device.
//
// Parameters:
//   DMA_addr   physical base address of the DMA register window
//   MMUL_addr  physical base address of the multiplier's register window
//   L, M, N    sizes written to the multiplier's registers; they also size the
//              three transfers: L*M, N*M and N*L words of sizeof(unsigned int)
//   A_addr     physical address of the first buffer sent over MM2S
//   BT_addr    physical address of the second buffer sent over MM2S
//   CT_addr    physical address programmed as the S2MM destination
//
// Throws std::runtime_error if /dev/mem cannot be opened or either register
// window cannot be mapped. The function busy-waits on the DMA status registers
// and returns once both channels report idle.
void mmul(unsigned int DMA_addr, unsigned int MMUL_addr,
          unsigned int L, unsigned int M, unsigned int N,
          unsigned int A_addr, unsigned int BT_addr, unsigned int CT_addr)
{
    // Length passed to mmap/munmap for each register window.
    const size_t window_bytes = 65535;

    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0)
        throw std::runtime_error("mmul: cannot open /dev/mem");

    void *dma_window = mmap(NULL, window_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, DMA_addr);
    void *mmul_window = mmap(NULL, window_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, MMUL_addr);
    if (dma_window == MAP_FAILED || mmul_window == MAP_FAILED) {
        // Release whatever was acquired before reporting the failure, instead
        // of dereferencing MAP_FAILED further down.
        if (dma_window != MAP_FAILED)
            munmap(dma_window, window_bytes);
        if (mmul_window != MAP_FAILED)
            munmap(mmul_window, window_bytes);
        close(fd);
        throw std::runtime_error("mmul: cannot map the DMA/MMUL register windows");
    }

    // The accessor macros expect these exact local names.
    volatile unsigned int *DMA_VADDR = (volatile unsigned int *) dma_window;
    volatile unsigned int *MMUL_VADDR = (volatile unsigned int *) mmul_window;

    // Configure the problem size and start the core before any data moves.
    mmul_set_L(L);
    mmul_set_M(M);
    mmul_set_N(N);
    mmul_start();

    // Reset both DMA channels, then set their run/stop bits.
    dma_set(S2MM_DMACR, S2MM_DMACR_Reset);
    dma_set(MM2S_DMACR, MM2S_DMACR_Reset);

    dma_set(S2MM_DMACR, S2MM_DMACR_RS);
    dma_set(MM2S_DMACR, MM2S_DMACR_RS);

    // First outbound transfer: L*M words starting at A_addr. Writing the
    // length register comes last for each transfer.
    dma_set(MM2S_SA, A_addr);
    dma_set(MM2S_LENGTH, L*M*sizeof(unsigned int));

    // The MM2S channel is reused for the second operand, so wait for idle.
    while(!(dma_get(MM2S_DMASR) & MM2S_DMASR_Idle));

    // Second outbound transfer: N*M words starting at BT_addr.
    dma_set(MM2S_SA, BT_addr);
    dma_set(MM2S_LENGTH, N*M*sizeof(unsigned int));

    // Inbound transfer: N*L words written to CT_addr.
    dma_set(S2MM_DA, CT_addr);
    dma_set(S2MM_LENGTH, N*L*sizeof(unsigned int));

    // Done once both channels report idle.
    while(!(dma_get(MM2S_DMASR) & MM2S_DMASR_Idle));
    while(!(dma_get(S2MM_DMASR) & S2MM_DMASR_Idle));

    // Unmap the windows so repeated calls do not accumulate mappings.
    munmap(dma_window, window_bytes);
    munmap(mmul_window, window_bytes);
    close(fd);
}

In [3]:
%%timeit -r 10 -n 5
# Reference measurement: the `@` matrix product of A and B, with no call into
# the mmul extension module.
A@B

5 loops, best of 10: 2.37 ms per loop


In [4]:
%%timeit -r 10 -n 5
# Timed end to end: buffer preparation plus the mmul.mmul call.
# BT takes the reversed shape of B and is filled with B's transpose.
BT = allocate(shape=B.shape[::-1], dtype=B.dtype)
BT[:]=B.T

# CT has shape (columns of B, rows of A).
CT = allocate(shape=(B.shape[1], A.shape[0]), dtype="u4")
# Both register base addresses are read from the overlay object inside the
# timed body, so those attribute lookups are part of the measurement.
mmul.mmul(overlay.axi_dma_0.sendchannel._mmio.base_addr, overlay.mmul_v2_0.mmio.base_addr,
          L, M, N,
          A.physical_address, BT.physical_address, CT.physical_address)

5 loops, best of 10: 41.3 ms per loop


In [5]:
%%timeit -r 10 -n 5
# Times buffer preparation only: allocating BT and CT and copying B's
# transpose into BT. mmul.mmul is not called here.
BT = allocate(shape=B.shape[::-1], dtype=B.dtype)
BT[:]=B.T

CT = allocate(shape=(B.shape[1], A.shape[0]), dtype="u4")

5 loops, best of 10: 1.65 ms per loop


In [7]:
# Prepare the buffers once, outside any timed cell, so later measurements can
# reuse them.
# BT: same dtype as B, reversed shape, holding B's transpose.
BT = allocate(shape=tuple(reversed(B.shape)), dtype=B.dtype)
np.copyto(BT, B.T)

# CT: one row per column of B, one column per row of A.
CT = allocate(shape=(BT.shape[0], len(A)), dtype="u4")

In [8]:
%%timeit -r 10 -n 5
# Times the mmul.mmul call on buffers that already exist. The two register base
# addresses are still looked up through the overlay object inside the timed
# body, so those lookups are included in the measurement.

mmul.mmul(overlay.axi_dma_0.sendchannel._mmio.base_addr, overlay.mmul_v2_0.mmio.base_addr,
          L, M, N,
          A.physical_address, BT.physical_address, CT.physical_address)

5 loops, best of 10: 39.5 ms per loop


In [9]:
%%timeit -r 10 -n 5

mmul.mmul(0x40400000, 0x43c00000,
          L, M, N,
          A.physical_address, BT.physical_address, CT.physical_address)

5 loops, best of 10: 107 µs per loop
