# DMA loopback

This example provides insight into the streaming latencies when using DMA.

## DMA configuration 1

|Parameter | Value |
| --- | --- |
|Scatter-gather |Disabled |
|Buffer length register width |14 bits |
|Memory map data width |32 |
|Stream data width |32 |
|Max burst size |8 |

In [1]:
from pynq import Overlay, allocate
import numpy as np

overlay = Overlay("/home/xilinx/overlays/dma_loopback_1.bit")
overlay.download()

In [2]:
A = allocate(shape=(2**1,), dtype=np.uint32)
B = allocate(shape=(2**1,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [3]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.5 ms per loop


In [4]:
A = allocate(shape=(2**2,), dtype=np.uint32)
B = allocate(shape=(2**2,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [5]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.8 ms per loop


In [6]:
A = allocate(shape=(2**3,), dtype=np.uint32)
B = allocate(shape=(2**3,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [7]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.7 ms per loop


In [8]:
A = allocate(shape=(2**4,), dtype=np.uint32)
B = allocate(shape=(2**4,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [9]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.7 ms per loop


In [10]:
A = allocate(shape=(2**5,), dtype=np.uint32)
B = allocate(shape=(2**5,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [11]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.7 ms per loop


In [12]:
A = allocate(shape=(2**6,), dtype=np.uint32)
B = allocate(shape=(2**6,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [13]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.8 ms per loop


In [14]:
A = allocate(shape=(2**7,), dtype=np.uint32)
B = allocate(shape=(2**7,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [15]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.8 ms per loop


In [16]:
A = allocate(shape=(2**8,), dtype=np.uint32)
B = allocate(shape=(2**8,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [17]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.8 ms per loop


In [18]:
A = allocate(shape=(2**9,), dtype=np.uint32)
B = allocate(shape=(2**9,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [19]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


In [20]:
A = allocate(shape=(2**10,), dtype=np.uint32)
B = allocate(shape=(2**10,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [21]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [22]:
A = allocate(shape=(2**11,), dtype=np.uint32)
B = allocate(shape=(2**11,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [23]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


## DMA configuration 2

|Parameter | Value |
| --- | --- |
|Scatter-gather |Disabled |
|Buffer length register width |14 bits |
|Memory map data width |64 |
|Stream data width |64 |
|Max burst size |8 |

In [24]:
from pynq import Overlay, allocate
import numpy as np

overlay = Overlay("/home/xilinx/overlays/dma_loopback_1.bit")
overlay.download()

In [25]:
A = allocate(shape=(2**1,), dtype=np.uint32)
B = allocate(shape=(2**1,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [26]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


In [27]:
A = allocate(shape=(2**2,), dtype=np.uint32)
B = allocate(shape=(2**2,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [28]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


In [29]:
A = allocate(shape=(2**3,), dtype=np.uint32)
B = allocate(shape=(2**3,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [30]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [31]:
A = allocate(shape=(2**4,), dtype=np.uint32)
B = allocate(shape=(2**4,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [32]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [33]:
A = allocate(shape=(2**5,), dtype=np.uint32)
B = allocate(shape=(2**5,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [34]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


In [35]:
A = allocate(shape=(2**6,), dtype=np.uint32)
B = allocate(shape=(2**6,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [36]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [37]:
A = allocate(shape=(2**7,), dtype=np.uint32)
B = allocate(shape=(2**7,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [38]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [39]:
A = allocate(shape=(2**8,), dtype=np.uint32)
B = allocate(shape=(2**8,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [40]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 53.9 ms per loop


In [41]:
A = allocate(shape=(2**9,), dtype=np.uint32)
B = allocate(shape=(2**9,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [42]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.1 ms per loop


In [43]:
A = allocate(shape=(2**10,), dtype=np.uint32)
B = allocate(shape=(2**10,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [44]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


In [45]:
A = allocate(shape=(2**11,), dtype=np.uint32)
B = allocate(shape=(2**11,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [46]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54 ms per loop


## DMA configuration 3

|Parameter | Value |
| --- | --- |
|Scatter-gather |Disabled |
|Buffer length register width |24 bits |
|Memory map data width |64 |
|Stream data width |64 |
|Max burst size |8 |

In [47]:
from pynq import Overlay, allocate
import numpy as np

overlay = Overlay("/home/xilinx/overlays/dma_loopback_3.bit")
overlay.download()

In [48]:
A = allocate(shape=(2**1,), dtype=np.uint32)
B = allocate(shape=(2**1,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [49]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.4 ms per loop


In [50]:
A = allocate(shape=(2**2,), dtype=np.uint32)
B = allocate(shape=(2**2,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [51]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.3 ms per loop


In [52]:
A = allocate(shape=(2**3,), dtype=np.uint32)
B = allocate(shape=(2**3,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [53]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.4 ms per loop


In [54]:
A = allocate(shape=(2**4,), dtype=np.uint32)
B = allocate(shape=(2**4,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [55]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [56]:
A = allocate(shape=(2**5,), dtype=np.uint32)
B = allocate(shape=(2**5,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [57]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.4 ms per loop


In [58]:
A = allocate(shape=(2**6,), dtype=np.uint32)
B = allocate(shape=(2**6,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [59]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [60]:
A = allocate(shape=(2**7,), dtype=np.uint32)
B = allocate(shape=(2**7,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [61]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.4 ms per loop


In [62]:
A = allocate(shape=(2**8,), dtype=np.uint32)
B = allocate(shape=(2**8,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [63]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.4 ms per loop


In [64]:
A = allocate(shape=(2**9,), dtype=np.uint32)
B = allocate(shape=(2**9,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [65]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [66]:
A = allocate(shape=(2**10,), dtype=np.uint32)
B = allocate(shape=(2**10,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [67]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [68]:
A = allocate(shape=(2**11,), dtype=np.uint32)
B = allocate(shape=(2**11,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [69]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [70]:
A = allocate(shape=(2**12,), dtype=np.uint32)
B = allocate(shape=(2**12,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [71]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.6 ms per loop


In [72]:
A = allocate(shape=(2**13,), dtype=np.uint32)
B = allocate(shape=(2**13,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [73]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.6 ms per loop


In [74]:
A = allocate(shape=(2**14,), dtype=np.uint32)
B = allocate(shape=(2**14,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [75]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.5 ms per loop


In [76]:
A = allocate(shape=(2**15,), dtype=np.uint32)
B = allocate(shape=(2**15,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [77]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.6 ms per loop


In [78]:
A = allocate(shape=(2**16,), dtype=np.uint32)
B = allocate(shape=(2**16,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [79]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.6 ms per loop


In [80]:
A = allocate(shape=(2**17,), dtype=np.uint32)
B = allocate(shape=(2**17,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [81]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.6 ms per loop


In [82]:
A = allocate(shape=(2**18,), dtype=np.uint32)
B = allocate(shape=(2**18,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [83]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.9 ms per loop


In [84]:
A = allocate(shape=(2**19,), dtype=np.uint32)
B = allocate(shape=(2**19,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [85]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 54.8 ms per loop


In [86]:
A = allocate(shape=(2**20,), dtype=np.uint32)
B = allocate(shape=(2**20,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [87]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 55.7 ms per loop


In [88]:
A = allocate(shape=(2**21,), dtype=np.uint32)
B = allocate(shape=(2**21,), dtype=np.uint32)

A[:] = np.random.randint(100000, size=A.shape)

In [89]:
%%timeit -n 250 -r 3

overlay.axi_dma_0.sendchannel.transfer(A)
overlay.axi_dma_0.recvchannel.transfer(B)
overlay.axi_dma_0.recvchannel.wait()

250 loops, best of 3: 57 ms per loop


# Matrix multiplication for reference

In [90]:
A = np.random.randint(100000, size=(2**1,2**1), dtype=np.uint32)
B = np.random.randint(100000, size=(2**1,2**1), dtype=np.uint32)

In [91]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 20.9 µs per loop


In [92]:
A = np.random.randint(100000, size=(2**2,2**2), dtype=np.uint32)
B = np.random.randint(100000, size=(2**2,2**2), dtype=np.uint32)

In [93]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 22 µs per loop


In [94]:
A = np.random.randint(100000, size=(2**3,2**3), dtype=np.uint32)
B = np.random.randint(100000, size=(2**3,2**3), dtype=np.uint32)

In [95]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 27.4 µs per loop


In [96]:
A = np.random.randint(100000, size=(2**4,2**4), dtype=np.uint32)
B = np.random.randint(100000, size=(2**4,2**4), dtype=np.uint32)

In [97]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 64.7 µs per loop


In [98]:
A = np.random.randint(100000, size=(2**5,2**5), dtype=np.uint32)
B = np.random.randint(100000, size=(2**5,2**5), dtype=np.uint32)

In [99]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 317 µs per loop


In [100]:
A = np.random.randint(100000, size=(2**1,2**1), dtype=np.uint32)
B = np.random.randint(100000, size=(2**1,2**1), dtype=np.uint32)

In [101]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 24.9 µs per loop


In [102]:
A = np.random.randint(100000, size=(2**6,2**6), dtype=np.uint32)
B = np.random.randint(100000, size=(2**6,2**6), dtype=np.uint32)

In [103]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 2.26 ms per loop


In [104]:
A = np.random.randint(100000, size=(2**7,2**7), dtype=np.uint32)
B = np.random.randint(100000, size=(2**7,2**7), dtype=np.uint32)

In [105]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 56 ms per loop


In [106]:
A = np.random.randint(100000, size=(2**8,2**8), dtype=np.uint32)
B = np.random.randint(100000, size=(2**8,2**8), dtype=np.uint32)

In [107]:
%%timeit -n 250 -r 3

A@B

250 loops, best of 3: 546 ms per loop


In [108]:
A = np.random.randint(100000, size=(2**9,2**9), dtype=np.uint32)
B = np.random.randint(100000, size=(2**9,2**9), dtype=np.uint32)

In [109]:
%%timeit -n 5 -r 3

A@B

5 loops, best of 3: 11.7 s per loop


In [110]:
A = np.random.randint(100000, size=(2**10,2**10), dtype=np.uint32)
B = np.random.randint(100000, size=(2**10,2**10), dtype=np.uint32)

In [111]:
%%timeit

A@B

KeyboardInterrupt: 

In [None]:
A = np.random.randint(100000, size=(2**11,2**11), dtype=np.uint32)
B = np.random.randint(100000, size=(2**11,2**11), dtype=np.uint32)

In [None]:
%%timeit

A@B

In [None]:
A = np.random.randint(100000, size=(2**12,2**12), dtype=np.uint32)
B = np.random.randint(100000, size=(2**12,2**12), dtype=np.uint32)

In [None]:
%%timeit

A@B