In [23]:
from pynq import DefaultIP
import numpy as np
from pynq import allocate
import time

class MatMulDriver(DefaultIP):
    """PYNQ driver for the custom ``matmul`` IP core.

    All register accesses go through ``self.read`` / ``self.write`` (inherited
    from ``DefaultIP``), so each method acts on the IP instance it is called
    on instead of on a notebook-level global that happens to be named
    ``matmul``.

    Register offsets used here, with the names the notebook's register-dump
    cells give them:

        0x00  start
        0x04  done
        0x14  state of the FSM
        0x24  sanity word (the notebook expects it to read 0xdeadbeef)
    """

    # IP type string PYNQ matches against to pick this driver class.
    bindto = ['xilinx.com:user:matmul:1.0']

    # Byte offsets of the registers this driver touches.
    _REG_START = 0x0
    _REG_DONE = 0x4
    _REG_STATE = 0x14
    _REG_SANITY = 0x24

    def __init__(self, description):
        super().__init__(description=description)

    def reset(self):
        """Write 0 to both the "start" and the "done" register."""
        self.write(self._REG_START, 0)
        self.write(self._REG_DONE, 0)

    def start(self):
        """Trigger the multiplier by writing 1 to the "start" register."""
        self.write(self._REG_START, 1)

    def is_done(self):
        """Return the raw value read from the "done" register.

        Callers poll this in a ``while not ...`` loop, so any non-zero value
        counts as "done".
        """
        return self.read(self._REG_DONE)

    def clear_done(self):
        """Acknowledge a finished run.

        Writes 0 to the "start" register to clear it, then writes 1 to the
        "done" register to clear it.
        """
        self.write(self._REG_START, 0)
        self.write(self._REG_DONE, 1)

    def current_state(self):
        """Return the raw value of the FSM state register (offset 0x14)."""
        return self.read(self._REG_STATE)

    def check_sanity(self):
        """Return the sanity register (offset 0x24) formatted as a hex string."""
        return hex(self.read(self._REG_SANITY))

class BramDriver(DefaultIP):
    """PYNQ driver for the AXI BRAM controllers holding the 4x4 byte matrices.

    The same class is bound to every ``axi_bram_ctrl`` instance in the overlay.
    Each method now acts on the instance it is called on (``self``) rather than
    on the notebook globals ``bram_a`` / ``bram_b`` / ``bram_c``, so
    ``overlay.axi_bram_ctrl_a.write_a(a)`` works whatever the variables in the
    notebook are called.
    """

    # IP type string PYNQ matches against to pick this driver class.
    bindto = ['xilinx.com:ip:axi_bram_ctrl:4.1']

    def __init__(self, description):
        super().__init__(description=description)

    @staticmethod
    def _pack_word(byte0, byte1, byte2, byte3):
        """Combine four small integers into one 32-bit word, byte0 in bits 7:0.

        Each operand is converted to a Python ``int`` *before* shifting.
        Shifting a NumPy ``uint8`` scalar by 8/16/24 only gives the wide result
        under NumPy's legacy value-based promotion; under NEP 50 (NumPy 2) the
        result stays ``uint8`` and the shifted-out bits are lost.
        Values are expected to fit in 8 bits; anything larger carries into the
        neighbouring byte lane, exactly as the original ``+`` arithmetic did.
        """
        return (int(byte3) << 24) + (int(byte2) << 16) + (int(byte1) << 8) + int(byte0)

    def write_a(self, a):
        """Store the 4x4 matrix ``a`` one column per 32-bit word.

        Word ``j`` (byte offset ``4*j``) holds column ``j`` of ``a``:
        ``a[0, j]`` in bits 7:0 up to ``a[3, j]`` in bits 31:24.

        Args:
            a: object indexable as ``a[row, col]`` for rows/cols 0..3.
        """
        for col in range(4):
            self.write(4 * col, self._pack_word(a[0, col], a[1, col], a[2, col], a[3, col]))

    def write_b(self, b):
        """Store the 4x4 matrix ``b`` one row per 32-bit word.

        Word ``i`` (byte offset ``4*i``) holds row ``i`` of ``b``:
        ``b[i, 0]`` in bits 7:0 up to ``b[i, 3]`` in bits 31:24.

        Args:
            b: object indexable as ``b[row, col]`` for rows/cols 0..3.
        """
        for row in range(4):
            self.write(4 * row, self._pack_word(b[row, 0], b[row, 1], b[row, 2], b[row, 3]))

    def read_c(self):
        """Read four 32-bit words and unpack them into a 4x4 ``uint8`` array.

        Word ``i`` becomes row ``i``; byte ``k`` of the word (bits
        ``8*k+7 : 8*k``) becomes ``c[i, k]``.

        Returns:
            numpy.ndarray of shape (4, 4) and dtype ``uint8``.
        """
        # Uninitialised allocation is fine: every element is assigned below.
        c = np.ndarray([4, 4], dtype=np.uint8)
        for row in range(4):
            word = self.read(4 * row)
            for col in range(4):
                c[row, col] = (word >> (8 * col)) & 0xff
        return c

class CDMADriver(DefaultIP):
    """PYNQ driver for the AXI CDMA core, used in simple polled mode.

    All accesses go through ``self`` (``self.register_map`` and ``self.write``)
    instead of the notebook globals ``dma`` and ``dma_mmio``, so the driver
    does not need a separately constructed ``MMIO`` object to exist.
    """

    # IP type string PYNQ matches against to pick this driver class.
    bindto = ['xilinx.com:ip:axi_cdma:4.1']

    # Value written to CDMACR to reset the engine.
    _CDMACR_RESET = 0x0004
    # Bit index of the IDLE flag inside CDMASR.
    _CDMASR_IDLE_BIT = 1

    def __init__(self, description):
        super().__init__(description=description)

    def reset(self):
        """Reset the DMA engine by writing the reset value to CDMACR."""
        self.register_map.CDMACR = self._CDMACR_RESET

    def do_transfer(self, src_addr, dst_addr, nbytes):
        """Copy ``nbytes`` bytes from ``src_addr`` to ``dst_addr`` and wait.

        Args:
            src_addr: source address as seen by the CDMA.
            dst_addr: destination address as seen by the CDMA.
            nbytes: number of bytes to transfer.

        Blocks (busy-waits, no timeout) until the IDLE bit of CDMASR reads 1.
        """
        regs = self.register_map
        regs.CDMACR = self._CDMACR_RESET  # reset the DMA
        regs.SA = src_addr                # set source address
        # The generated register map exposes DA as read-only, so assigning
        # regs.DA fails; write the same offset through the IP's MMIO instead.
        self.write(regs.DA.address, dst_addr)  # set destination address
        regs.BTT = nbytes  # set byte count; this also triggers the transfer
        while not self.is_idle():  # spin until bit 1 (IDLE) becomes 1
            pass

    def is_idle(self):
        """Return True when bit 1 (IDLE) of CDMASR is set."""
        return (self.register_map.CDMASR[self._CDMASR_IDLE_BIT] == 1)

In [24]:
# Load the bitstream and grab handles to the IP blocks used in this notebook.
from pynq import Overlay
from pynq import MMIO
# NOTE(review): absolute, user-specific path — adjust when running on another board/account.
overlay = Overlay('/home/aman/overlays/design_1.bit')
# These global names are referenced directly inside the driver classes' methods
# (matmul, bram_a, bram_b, bram_c, dma, dma_mmio), so they must keep these names.
matmul = overlay.matmul_0
bram_a = overlay.axi_bram_ctrl_a
bram_b = overlay.axi_bram_ctrl_b
bram_c = overlay.axi_bram_ctrl_c
dma = overlay.axi_cdma_0
# Separate MMIO window over the CDMA's base address; used to write the DA register,
# which cannot be assigned through dma.register_map.
dma_mmio = MMIO(dma.mmio.base_addr, 0xffff)

In [25]:
def do_matmul(a, b, c, addr_a=0x40000000, addr_b=0x42000000, addr_c=0x43000000):
    """Run one matrix multiplication on the overlay, moving data with the CDMA.

    Sequence: reset the matmul IP and the DMA, copy ``a`` and ``b`` into their
    BRAMs, start the multiplier, busy-wait until it reports done, clear the
    done flag, then copy the result BRAM into ``c``.

    Args:
        a: DMA-capable buffer (needs ``device_address`` and ``nbytes``) copied
            to ``addr_a``.
        b: DMA-capable buffer copied to ``addr_b``.
        c: DMA-capable buffer that receives ``c.nbytes`` bytes from ``addr_c``.
        addr_a: address of bram_a from the CDMA's point of view.
        addr_b: address of bram_b from the CDMA's point of view.
        addr_c: address of bram_c from the CDMA's point of view.

    Returns:
        ``c`` (the same object that was passed in), filled with the result.

    Uses the notebook globals ``matmul`` and ``dma``. The wait loop has no
    timeout. The non-DMA alternative is ``bram_a.write_a(a)``,
    ``bram_b.write_b(b)`` and ``bram_c.read_c()``.
    """
    matmul.reset()
    dma.reset()
    dma.do_transfer(a.device_address, addr_a, a.nbytes)  # send to bram_a
    dma.do_transfer(b.device_address, addr_b, b.nbytes)  # send to bram_b
    matmul.start()
    while not matmul.is_done():
        pass
    matmul.clear_done()
    dma.do_transfer(addr_c, c.device_address, c.nbytes)  # bring back from bram_c
    return c

In [34]:
# Allocate three 4x4 uint8 buffers with pynq.allocate and fill a and b with
# random integers in 0..5 (random.randint is inclusive on both ends).
# NOTE(review): no random seed is set, so the data differs on every run.
import random
a = allocate(shape=(4,4), dtype=np.uint8)
b = allocate(shape=(4,4), dtype=np.uint8)
c = allocate(shape=(4,4), dtype=np.uint8)
for i in range(4):
    for j in range(4):
        a[i,j] = random.randint(0,5)
        b[i,j] = random.randint(0,5)
print(np.transpose(a)) # Think of the transposed matrix as the actual input matrix.
                       # The buffer (and hence the BRAM) holds the non-transposed matrix,
                       # i.e. the actual input matrix is stored in column-major order.
print(a.nbytes)
print(b)   
print(b.nbytes)
print(c)   
print(c.nbytes)

#a = np.random.randint(low=0, high=5,size=(4,4), dtype=np.uint8)
#print("a=",a)
#b = np.random.randint(low=0, high=5,size=(4,4), dtype=np.uint8)
#print("b=",b)

[[4 0 2 4]
 [2 2 4 0]
 [0 1 1 2]
 [0 0 3 1]]
16
[[5 0 4 0]
 [0 4 0 4]
 [2 1 3 5]
 [3 1 5 3]]
16
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
16


In [35]:
print("Result from overlay running on fpga:")
# time.perf_counter() is the high-resolution monotonic clock intended for timing
# short intervals; time.time() is wall-clock time and may be coarse or jump.
start_time = time.perf_counter()
do_matmul(a,b,c)
end_time = time.perf_counter()
print(c)
print("Time taken = ", (end_time-start_time))

Result from overlay running on fpga:
[[36  6 42 22]
 [18 12 20 28]
 [ 8  7 13 15]
 [ 9  4 14 18]]
Time taken =  0.0037488937377929688


In [38]:
print("Result from numpy running on cpu:")
# time.perf_counter() is the high-resolution monotonic clock intended for timing
# short intervals; time.time() is wall-clock time and may be coarse or jump.
start_time = time.perf_counter()
c_cpu = np.matmul(np.transpose(a),b) #need to use transposed matrix, because that's the actual input matrix.
end_time = time.perf_counter()
print("c=", c_cpu)
print("Time taken = ", (end_time-start_time))

Result from numpy running on cpu:
c= [[36  6 42 22]
 [18 12 20 28]
 [ 8  7 13 15]
 [ 9  4 14 18]]
Time taken =  0.0010440349578857422


In [None]:
#That's it

In [32]:
np.transpose(a)

PynqBuffer([[4, 4, 1, 3],
            [1, 4, 4, 0],
            [0, 1, 3, 4],
            [2, 4, 0, 3]], dtype=uint8)

In [None]:
matmul = overlay.matmul_0

In [None]:
bram_a = overlay.axi_bram_ctrl_a
bram_b = overlay.axi_bram_ctrl_b
bram_c = overlay.axi_bram_ctrl_c

In [None]:
# Initialize the BRAMs with fixed test patterns (raw 32-bit word writes).
# bram_c gets the recognisable values 3000..3003 so stale contents are easy to spot.
for i in range(4):
    #bram_a.write(i*4,i+1000) 
    #bram_b.write(i*4,i+2000)
    bram_c.write(i*4,i+3000)

# Hex literals replace int('0x...', 16): same values, no string parsing.
bram_a.write(0, 0x09050308)
bram_a.write(4, 0x01020304)
bram_a.write(8, 0x00010306)
bram_a.write(12, 0x05060708)
# NOTE(review): offset 8188 (0x1FFC) is presumably the last word of an 8 KiB BRAM — confirm.
bram_a.write(8188, 0x00000000)

bram_b.write(0, 0x00030101)
bram_b.write(4, 0x03040100)
bram_b.write(8, 0x01030503)
bram_b.write(12, 0x02030609)
bram_b.write(8188, 0x00000000)

In [37]:
# Test initialization: print the first four 32-bit words of each BRAM in hex,
# each group preceded by its heading.
for heading, bram in (("a=", bram_a), ("b=", bram_b), ("c=", bram_c)):
    print(heading)
    for i in range(4):
        print(hex(bram.read(i*4)))

a=
0x204
0x10200
0x3010402
0x1020004
b=
0x40005
0x4000400
0x5030102
0x3050103
c=
0x162a0624
0x1c140c12
0xf0d0708
0x120e0409


In [None]:
#just write 0 to register "start"
matmul.write(0x0,0) 

In [None]:
matmul.start()

In [None]:
#just write 0 to register "clear_done"
matmul.write(0x4,0)

In [None]:
# Read registers from the IP to check the initial/reset state.
# Offsets printed as plain integers, in this order:
#   0x0 start, 0x4 done, 0x8 addr_a, 0xc addr_b, 0x10 addr_c,
#   0x14 state of fsm, 0x18 rdata_a, 0x1C rdata_b, 0x20 rdata_c
for reg_offset in range(0x0, 0x24, 4):
    print(matmul.read(reg_offset))
# The last register is printed in hex; it should be deadbeef.
print(hex(matmul.read(0x24)))

In [None]:
matmul.clear_done()

In [None]:
# Read registers from the IP again (same dump as the earlier register-dump cell).
# NOTE(review): the original header said "check initial/reset state"; this copy follows the
# clear_done() cell, so it presumably inspects the state after clearing — confirm.
print(matmul.read(0x0)) #start 
print(matmul.read(0x4)) #done
print(matmul.read(0x8)) #addr_a
print(matmul.read(0xc)) #addr_b
print(matmul.read(0x10)) #addr_c
print(matmul.read(0x14)) #state of fsm
print(matmul.read(0x18)) #rdata_a
print(matmul.read(0x1C)) #rdata_b
print(matmul.read(0x20)) #rdata_c
print(hex(matmul.read(0x24))) #should be deadbeef

In [None]:
matmul.is_done()

In [None]:
# Read registers from the IP again (same dump as the earlier register-dump cells).
# NOTE(review): the original header said "check initial/reset state"; this copy follows the
# is_done() cell, so it presumably inspects the state after a run — confirm.
print(matmul.read(0x0)) #start 
print(matmul.read(0x4)) #done
print(matmul.read(0x8)) #addr_a
print(matmul.read(0xc)) #addr_b
print(matmul.read(0x10)) #addr_c
print(matmul.read(0x14)) #state of fsm
print(matmul.read(0x18)) #rdata_a
print(matmul.read(0x1C)) #rdata_b
print(matmul.read(0x20)) #rdata_c
print(hex(matmul.read(0x24))) #should be deadbeef

In [None]:
# Read BRAMs: print the first four 32-bit words of bram_a, bram_b and bram_c in hex.
# Same code as the "test initialization" cell; c holds the result words after a run.
print("a=")
for i in range(4):
    print(hex(bram_a.read(i*4)))
    
print("b=")
for i in range(4):    
    print(hex(bram_b.read(i*4)))
    
print("c=")    
for i in range(4):    
    print(hex(bram_c.read(i*4)))

In [None]:
overlay.ip_dict

In [None]:
matmul.check_sanity()
matmul.reset()

In [None]:
matmul.start()

In [None]:
# NOTE(review): `add_ip` is not defined anywhere in the visible notebook — presumably a
# leftover from a scalar-adder example; this cell raises NameError on a fresh kernel.
add_ip.write(0x8,2323)

In [None]:
add_ip.read(0x8)

In [None]:
# NOTE(review): neither `AddDriver` nor `overlay.scalar_add` is defined or used anywhere else
# in the visible notebook — presumably a leftover from a scalar-adder example.
# If it does run, it rebinds `a`, which other cells use for the input matrix.
a = AddDriver(overlay.scalar_add.description)
#overlay.scalar_add.add(4,5)

In [None]:
#Trying out DMA

In [1]:
# Standalone setup for the CDMA experiments (execution count restarts at 1 here).
import numpy as np
from pynq import allocate
from pynq import Overlay

# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
overlay = Overlay('/home/aman/overlays/design_1.bit')
dma = overlay.axi_cdma_0

from pynq import MMIO
# Raw MMIO window over the CDMA registers; used below to write DA, which cannot be
# assigned through dma.register_map.
dma_mmio = MMIO(dma.mmio.base_addr, 0xffff)

In [2]:
# Allocate two 5-element uint32 buffers and fill the input with 10..14.
input_buffer = allocate(shape=(5,), dtype=np.uint32)
output_buffer = allocate(shape=(5,), dtype=np.uint32)
for i in range(5):
    input_buffer[i] = i+10
print(input_buffer)
print(input_buffer.nbytes)
print(output_buffer)

[10 11 12 13 14]
20
[0 0 0 0 0]


In [5]:
# Re-allocate input/output as 4x4 uint32 buffers (64 bytes each), replacing the
# 5-element ones; input_buffer[i,j] is set to i+j+10.
input_buffer = allocate(shape=(4,4), dtype=np.uint32)
output_buffer = allocate(shape=(4,4), dtype=np.uint32)
for i in range(4):
    for j in range(4):
        input_buffer[i,j] = i+j+10
print(input_buffer)
print(input_buffer.nbytes)
print(output_buffer)        

[[10 11 12 13]
 [11 12 13 14]
 [12 13 14 15]
 [13 14 15 16]]
64
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [6]:
# Manual CDMA transfer: input_buffer -> output_buffer.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)  # status right after reset
dma.register_map.SA = input_buffer.device_address  # source address
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
# Destination address is written through the raw MMIO window instead.
dma_mmio.write(dma.register_map.DA.address, output_buffer.device_address) 
dma.register_map.BTT = input_buffer.nbytes  # byte count; writing it also triggers the transfer

0x2


In [7]:
print(dma.register_map.CDMASR)
print(output_buffer)    

0x1002
[[10 11 12 13]
 [11 12 13 14]
 [12 13 14 15]
 [13 14 15 16]]


In [None]:
#That's it

In [8]:
# Manual CDMA transfer: input_buffer -> bram_a.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)  # status right after reset
dma.register_map.SA = input_buffer.device_address  # source address
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
dma_mmio.write(dma.register_map.DA.address, 0x40000000)  #address of bram_a, from the POV of cdma
dma.register_map.BTT = input_buffer.nbytes  # byte count; writing it also triggers the transfer

0x2


In [9]:
print(dma.register_map.CDMASR)

0x1002


In [11]:
# Check the DMA result: print bram_a's base address, then its first 16 words,
# which should match the 4x4 input_buffer row by row.
bram_a = overlay.axi_bram_ctrl_a
print(hex(bram_a.mmio.base_addr))
print("a=")
for i in range(16):
    print(hex(bram_a.read(i*4)))

0x40000000
a=
0xa
0xb
0xc
0xd
0xb
0xc
0xd
0xe
0xc
0xd
0xe
0xf
0xd
0xe
0xf
0x10


In [14]:
# Seed the first four words of bram_c with 3000..3003 and read them back,
# so the next DMA transfer has known data to copy out.
bram_c = overlay.axi_bram_ctrl_c
for i in range(4):
    #bram_a.write(i*4,i+1000) 
    #bram_b.write(i*4,i+2000)
    bram_c.write(i*4,i+3000)
print("c=")
for i in range(4):
    print(hex(bram_c.read(i*4)))    

c=
0xbb8
0xbb9
0xbba
0xbbb


In [15]:
# Manual CDMA transfer: bram_c -> output_buffer.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)  # status right after reset
dma.register_map.SA = 0x43000000  # source: the data read back matches what was written to bram_c
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
# Destination address is written through the raw MMIO window instead.
dma_mmio.write(dma.register_map.DA.address, output_buffer.device_address) 
dma.register_map.BTT = output_buffer.nbytes  # byte count; writing it also triggers the transfer

0x2


In [16]:
print(dma.register_map.CDMASR)

0x1002


In [17]:
print(output_buffer)  

[[3000 3001 3002 3003]
 [   0    0    0    0]
 [   0    0    0    0]
 [   0    0    0    0]]


In [None]:
#dma?
dma.register_map
#dma.mmio.base_addr

In [None]:
print(dma.register_map.CDMASR[1])

In [None]:
def do_transfer(input_buffer, output_buffer):
    """Copy ``input_buffer`` into ``output_buffer`` with the CDMA and wait.

    Uses the notebook globals ``dma`` and ``dma_mmio``. Both arguments must
    expose ``device_address``; ``input_buffer.nbytes`` bytes are transferred.
    Busy-waits (no timeout) until bit 1 (IDLE) of CDMASR reads non-zero.
    """
    regs = dma.register_map
    # Reset the engine, then program the source address.
    regs.CDMACR = 0x0004
    regs.SA = input_buffer.device_address
    # The destination address goes through the raw MMIO window.
    dma_mmio.write(regs.DA.address, output_buffer.device_address)
    # Writing the byte count also triggers the transfer.
    regs.BTT = input_buffer.nbytes
    # Spin while the IDLE bit is still 0, i.e. until the engine is idle again.
    while not regs.CDMASR[1]:
        pass

In [None]:
do_transfer(input_buffer, output_buffer)

In [None]:
#Direct writes to MMIO

In [None]:
# Raw-offset equivalent of the register_map accesses above.
# NOTE(review): offset 0x0 is presumably CDMACR, value 0x04 the reset bit — confirm against the CDMA register map.
dma_mmio.write(0x0, 0x04)

In [None]:
# NOTE(review): offset 0x4 is presumably CDMASR (status) — confirm against the CDMA register map.
dma_mmio.read(0x4)

In [None]:
# NOTE(review): offset 0x18 is presumably SA (source address) — confirm against the CDMA register map.
dma_mmio.write(0x18, input_buffer.device_address)

In [None]:
# NOTE(review): offset 0x20 is presumably DA (destination address) — confirm against the CDMA register map.
dma_mmio.write(0x20, output_buffer.device_address)

In [None]:
# NOTE(review): offset 0x28 is presumably BTT (bytes to transfer). 20 bytes matches the
# 5-element uint32 input_buffer, not the later 4x4 one (64 bytes) — confirm which is intended.
dma_mmio.write(0x28, 20)

In [None]:
#That's it

In [None]:
n = np.ndarray([4,4],np.uint8)

In [None]:
print(a)
print(a[:,0])
print(a[0,0])
print(a[1,0])
print(b)
print(b[0,:])

In [None]:
int((a[3,0]<<24) + (a[2,0]<<16) + (a[1,0]<<8) + (a[0,0]))

In [None]:
# Fixed 4x4 test matrices and their CPU reference product.
# Note: this rebinds `a` and `b` to plain np.array objects (no explicit dtype),
# replacing any buffers created with allocate() under the same names.
arr = [[8,4,6,8],[3,3,3,7],[5,2,1,6],[9,1,0,5]]
a = np.array(arr)
print(a)
arr = [[1,1,3,0],[0,1,4,3],[3,5,3,1],[9,6,3,2]]
b = np.array(arr)
print(b)
print(np.matmul(a,b))

In [None]:
bram_a.write_a(a)

In [None]:
bram_b.write_b(b)

In [None]:
bram_c.read_c()

In [None]:
# NOTE(review): do_matmul as defined in this notebook requires a third argument `c`
# (the output buffer). This two-argument call looks like it predates that signature and
# would raise TypeError — update or remove.
c = do_matmul(a,b)
c

In [None]:
# Inspect word 0 of bram_c: decimal, hex, and its most significant byte (bits 31:24),
# which read_c() stores as c[0,3].
val = bram_c.read(0)
print(val)
print(hex(val))
print(hex((val & 0xff000000) >> 24))

In [None]:
res = np.dot(a,b)

In [None]:
res

In [None]:
np.matmul(a,b)

In [None]:
        #for x in np.nditer(a, order='F'):
        #    print(x)
        #extract each column
        #for x in np.nditer(a[:,0]):
        
        #for x in np.nditer(b, order='C'):
        #    print(x)