In [7]:
from pynq import DefaultIP
import numpy as np
from pynq import allocate
import time

class MatMulDriver(DefaultIP):
    """PYNQ driver for the custom ``matmul`` IP core.

    All register accesses go through ``self`` (the ``read``/``write`` helpers
    inherited from ``DefaultIP``), so the driver works on whichever IP
    instance it is bound to and does not depend on any notebook-level global.
    """

    bindto = ['xilinx.com:user:matmul:1.0']

    # Register offsets used by this driver.
    _REG_START = 0x0    # "start" register
    _REG_DONE = 0x4     # "done" register
    _REG_STATE = 0x14   # state of the core's FSM
    _REG_SANITY = 0x24  # constant sanity-check register

    def __init__(self, description):
        super().__init__(description=description)

    def reset(self):
        """Write 0 to both the "start" and the "done" register."""
        self.write(self._REG_START, 0)
        self.write(self._REG_DONE, 0)

    def start(self):
        """Trigger the matrix multiplier by writing 1 to the "start" register."""
        self.write(self._REG_START, 1)

    def is_done(self):
        """Return the raw value read from the "done" register.

        Callers poll this with ``while not is_done()``, so a non-zero value
        is treated as "finished".
        """
        return self.read(self._REG_DONE)

    def clear_done(self):
        """Acknowledge completion: clear "start", then write 1 to "done"."""
        # write 0 to the "start" register to clear it
        self.write(self._REG_START, 0)
        # write 1 to the "done" register to clear it
        self.write(self._REG_DONE, 1)

    def current_state(self):
        """Return the value of the FSM state register (offset 0x14)."""
        return self.read(self._REG_STATE)

    def check_sanity(self):
        """Return the sanity register (offset 0x24) formatted as a hex string."""
        return hex(self.read(self._REG_SANITY))

class BramDriver(DefaultIP):
    """PYNQ driver for an AXI BRAM controller holding one 8x8 byte matrix.

    The BRAM is treated as a sequence of 64-bit little-endian words, word
    ``k`` living at byte offset ``8 * k`` and holding eight matrix bytes.
    The inherited ``read``/``write`` helpers move one 32-bit value per call,
    so every 64-bit word is transferred as two 32-bit halves (low half at
    ``8 * k``, high half at ``8 * k + 4``).

    All accesses go through ``self``: the data lands in (or comes from) the
    BRAM controller this instance is bound to, not a notebook-level global.
    """

    bindto = ['xilinx.com:ip:axi_bram_ctrl:4.1']

    _DIM = 8  # matrices handled by this driver are _DIM x _DIM bytes

    def __init__(self, description):
        super().__init__(description=description)

    @staticmethod
    def _pack32(values):
        """Pack up to four byte values into one little-endian 32-bit integer."""
        word = 0
        for position, value in enumerate(values):
            # int() first: shifting numpy uint8 scalars directly can overflow
            # or change type depending on the numpy version.
            word |= (int(value) & 0xff) << (8 * position)
        return word

    def _write_word64(self, index, values):
        """Store eight byte values as 64-bit word ``index`` (two 32-bit writes)."""
        base = 8 * index
        self.write(base, self._pack32(values[:4]))
        self.write(base + 4, self._pack32(values[4:8]))

    def _read_word64(self, index):
        """Read 64-bit word ``index`` as a Python int (two 32-bit reads)."""
        base = 8 * index
        low = int(self.read(base)) & 0xffffffff
        high = int(self.read(base + 4)) & 0xffffffff
        return (high << 32) | low

    def write_a(self, a):
        """Write matrix ``a`` so that word ``i`` holds column ``i`` of ``a``.

        Byte 0 of word ``i`` is ``a[0, i]`` and byte 7 is ``a[7, i]``.
        ``a`` must support ``a[row, col]`` indexing and be at least 8x8.
        """
        for i in range(self._DIM):
            self._write_word64(i, [a[row, i] for row in range(self._DIM)])

    def write_b(self, b):
        """Write matrix ``b`` so that word ``i`` holds row ``i`` of ``b``.

        Byte 0 of word ``i`` is ``b[i, 0]`` and byte 7 is ``b[i, 7]``.
        ``b`` must support ``b[row, col]`` indexing and be at least 8x8.
        """
        for i in range(self._DIM):
            self._write_word64(i, [b[i, col] for col in range(self._DIM)])

    def read_c(self):
        """Read an 8x8 ``uint8`` matrix; row ``i`` comes from word ``i``.

        Returns:
            numpy.ndarray of shape (8, 8) and dtype uint8 where ``c[i, k]``
            is byte ``k`` of 64-bit word ``i``.
        """
        c = np.empty((self._DIM, self._DIM), dtype=np.uint8)
        for i in range(self._DIM):
            word = self._read_word64(i)
            for k in range(self._DIM):
                c[i, k] = (word >> (8 * k)) & 0xff
        return c

class CDMADriver(DefaultIP):
    """PYNQ driver for the Xilinx AXI CDMA (simple, polled transfers).

    All register accesses go through ``self`` so the driver works on the IP
    instance it is bound to and needs no notebook-level ``dma``/``dma_mmio``
    globals.
    """

    bindto = ['xilinx.com:ip:axi_cdma:4.1']

    _CDMACR_RESET = 0x0004  # value written to CDMACR to reset the DMA

    def __init__(self, description):
        super().__init__(description=description)

    def reset(self):
        """Reset the DMA by writing the reset value to CDMACR."""
        self.register_map.CDMACR = self._CDMACR_RESET

    def do_transfer(self, src_addr, dst_addr, nbytes):
        """Copy ``nbytes`` from ``src_addr`` to ``dst_addr`` and block until idle.

        Args:
            src_addr: source address as seen by the CDMA.
            dst_addr: destination address as seen by the CDMA.
            nbytes: number of bytes to transfer.
        """
        self.reset()                      # reset the DMA
        self.register_map.SA = src_addr   # set source address
        # DA is written through the raw MMIO path at the register's offset
        # instead of register_map assignment, as the original code did.
        self.write(self.register_map.DA.address, dst_addr)
        # writing BTT sets the byte count and also triggers the transfer
        self.register_map.BTT = nbytes
        # busy-wait while CDMASR bit 1 (IDLE) is 0, i.e. until the DMA is idle
        while not self.is_idle():
            pass

    def is_idle(self):
        """Return True when CDMASR bit 1 (IDLE) reads 1."""
        return (self.register_map.CDMASR[1] == 1)

In [13]:
from pynq import Overlay
from pynq import MMIO
# Open the overlay for the design and grab handles to the IPs used below.
# NOTE(review): the bitstream path is absolute and user-specific; adjust it
# when running on another board/account.
overlay = Overlay('/home/aman/overlays/design_1.bit')
# Custom matrix-multiply core (presumably bound to MatMulDriver via bindto).
matmul = overlay.matmul_0
# One BRAM controller per matrix: operands a and b, result c.
bram_a = overlay.axi_bram_ctrl_a
bram_b = overlay.axi_bram_ctrl_b
bram_c = overlay.axi_bram_ctrl_c
# Central DMA used to move data between allocated buffers and the BRAMs.
dma = overlay.axi_cdma_0
# Separate raw MMIO window over the CDMA registers; used to write the DA
# register directly (see the DMA cells further down).
dma_mmio = MMIO(dma.mmio.base_addr, 0xffff)

In [14]:
def do_matmul(a, b, c, addr_a=0x40000000, addr_b=0x42000000, addr_c=0x43000000):
    """Run one matrix multiplication on the FPGA using the CDMA for data movement.

    Uses the notebook-level ``matmul`` and ``dma`` driver objects.

    Args:
        a, b: input buffers; must expose ``device_address`` and ``nbytes``
            (e.g. buffers returned by ``pynq.allocate``).
        c: output buffer with the same attributes; it is filled in place.
        addr_a, addr_b, addr_c: bus addresses of bram_a, bram_b and bram_c as
            seen by the CDMA. The defaults are the values this notebook was
            written for.

    Returns:
        None. The result is written into ``c``.
    """
    matmul.reset()
    dma.reset()
    # Operands: buffer -> BRAM. (The MMIO alternative would be
    # bram_a.write_a(a) / bram_b.write_b(b).)
    dma.do_transfer(a.device_address, addr_a, a.nbytes)  # send to bram_a
    dma.do_transfer(b.device_address, addr_b, b.nbytes)  # send to bram_b
    matmul.start()
    # Busy-wait until the core reports completion, then acknowledge it.
    while not matmul.is_done():
        pass
    matmul.clear_done()
    # Result: BRAM -> buffer. (The MMIO alternative would be bram_c.read_c().)
    dma.do_transfer(addr_c, c.device_address, c.nbytes)  # bring from bram_c

In [15]:
import random
# Three 8x8 byte buffers from pynq.allocate: operands a and b, result c.
a, b, c = (allocate(shape=(8,8), dtype=np.uint8) for _ in range(3))
# Fill both operands element by element (row-major) with random digits 0..9.
for i, j in np.ndindex(8, 8):
    a[i,j] = random.randint(0,9)
    b[i,j] = random.randint(0,9)


In [16]:
#print(np.transpose(a))
# Layout note: treat the transposed matrix as the actual input matrix.
# The BRAM holds the non-transposed matrix, which means the actual input
# matrix is stored there in column-major order.

# Show each buffer followed by its size in bytes.
for matrix in (a, b, c):
    print(matrix)
    print(matrix.nbytes)

#a = np.random.randint(low=0, high=5,size=(4,4), dtype=np.uint8)
#print("a=",a)
#b = np.random.randint(low=0, high=5,size=(4,4), dtype=np.uint8)
#print("b=",b)

[[6 8 5 0 5 2 5 4]
 [2 8 5 6 0 2 9 6]
 [1 1 1 4 1 1 5 8]
 [8 7 5 2 0 4 2 2]
 [2 0 6 3 0 8 8 7]
 [3 0 8 2 8 1 8 6]
 [2 9 2 0 6 1 1 9]
 [6 9 8 1 7 8 6 6]]
64
[[0 3 1 2 2 1 9 8]
 [1 2 2 5 8 7 3 3]
 [1 0 9 3 5 4 9 3]
 [3 2 1 9 4 5 4 7]
 [4 2 5 2 4 8 3 2]
 [6 0 4 3 2 2 9 7]
 [4 5 6 4 5 1 6 5]
 [4 0 7 0 3 2 3 9]]
64
[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]
64


In [17]:
print("Result from overlay running on fpga:")
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock and can jump.
start_time = time.perf_counter()
do_matmul(a,b,c)
end_time = time.perf_counter()
print(c)
print("Time taken = ", (end_time-start_time))

Result from overlay running on fpga:
[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]
Time taken =  0.03927731513977051


In [19]:
print("Result from numpy running on cpu:")
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock and can jump.
start_time = time.perf_counter()
# The transposed matrix is used because that is the actual input matrix
# (the buffer holds it in column-major order).
# NOTE(review): if a and b are still the uint8 buffers allocated above, the
# product is also uint8 and wraps modulo 256 — confirm this matches the core.
c_cpu = np.matmul(np.transpose(a),b)
end_time = time.perf_counter()
print("c=", c_cpu)
print("Time taken = ", (end_time-start_time))

Result from numpy running on cpu:
c= [[230 146 198 233 203 137 255 133]
 [151  72 129 141 159 107 189  99]
 [ 93  53  52  63  66  62  79  45]
 [196 106 157 208 157 146 246 137]
 [199  71 111 146 132 140 188  65]
 [203 110 166 182 159 117 191  93]
 [200  41  93 149  98 170 194  54]
 [215  54 137 186 147 145 234  54]]
Time taken =  0.0012083053588867188


In [None]:
#That's it

In [None]:
np.transpose(a)

In [None]:
matmul = overlay.matmul_0

In [None]:
bram_a = overlay.axi_bram_ctrl_a
bram_b = overlay.axi_bram_ctrl_b
bram_c = overlay.axi_bram_ctrl_c

In [None]:
#initialize brams
# bram_c: first four 32-bit words get the marker values 3000..3003
# (the matching writes for bram_a / bram_b are intentionally not performed).
for word in range(4):
    bram_c.write(word*4, word+3000)

# (byte offset, 32-bit value) pairs; the last entry zeroes the word at 8188.
words_a = ((0, 0x09050308), (4, 0x01020304), (8, 0x00010306),
           (12, 0x05060708), (8188, 0x00000000))
words_b = ((0, 0x00030101), (4, 0x03040100), (8, 0x01030503),
           (12, 0x02030609), (8188, 0x00000000))

for offset, value in words_a:
    bram_a.write(offset, value)

for offset, value in words_b:
    bram_b.write(offset, value)

In [18]:
#test initialization
# Print the first sixteen 32-bit words of each BRAM in hex, a then b then c.
for label, bram in (("a=", bram_a), ("b=", bram_b), ("c=", bram_c)):
    print(label)
    for word in range(16):
        print(hex(bram.read(word*4)))

a=
0x1080906
0x4050205
0x50806
0x6090200
0x6050802
0x8050101
0x4010101
0x2020400
0x2050708
0x7080800
0x3060002
0x6080108
0x2080003
0x9010106
0x20902
0x6060807
b=
0x70004
0x8090102
0x2010300
0x3030708
0x5020201
0x3090405
0x3090001
0x7040504
0x9010203
0x2030804
0x2050204
0x7090202
0x3040006
0x5060105
0x4060504
0x9030203
c=
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0
0x0


In [None]:
#just write 0 to register "start"
matmul.write(0x0,0) 

In [None]:
matmul.start()

In [None]:
#just write 0 to register "clear_done"
matmul.write(0x4,0)

In [None]:
#read registers from the IP. check initial/reset state
# Offsets in order: start, done, addr_a, addr_b, addr_c, state of fsm,
# rdata_a, rdata_b, rdata_c.
for offset in (0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1C, 0x20):
    print(matmul.read(offset))
print(hex(matmul.read(0x24))) #should be deadbeef

In [None]:
matmul.clear_done()

In [None]:
#read registers from the IP. check initial/reset state
# Offsets in order: start, done, addr_a, addr_b, addr_c, state of fsm,
# rdata_a, rdata_b, rdata_c.
for offset in (0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1C, 0x20):
    print(matmul.read(offset))
print(hex(matmul.read(0x24))) #should be deadbeef

In [None]:
matmul.is_done()

In [None]:
#read registers from the IP. check initial/reset state
# Offsets in order: start, done, addr_a, addr_b, addr_c, state of fsm,
# rdata_a, rdata_b, rdata_c.
for offset in (0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1C, 0x20):
    print(matmul.read(offset))
print(hex(matmul.read(0x24))) #should be deadbeef

In [None]:
#read bram
# Print the first four 32-bit words of each BRAM in hex, a then b then c.
for label, bram in (("a=", bram_a), ("b=", bram_b), ("c=", bram_c)):
    print(label)
    for word in range(4):
        print(hex(bram.read(word*4)))

In [None]:
overlay.ip_dict

In [None]:
matmul.check_sanity()
matmul.reset()

In [None]:
matmul.start()

In [None]:
# NOTE(review): `add_ip` is not defined anywhere in the visible notebook —
# this cell looks like a leftover and will raise NameError on a fresh kernel.
add_ip.write(0x8,2323)

In [None]:
# NOTE(review): `add_ip` is not defined anywhere in the visible notebook —
# this cell looks like a leftover and will raise NameError on a fresh kernel.
add_ip.read(0x8)

In [None]:
# NOTE(review): `AddDriver` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. It would also rebind `a`,
# the name used for the first matmul operand — confirm whether it can go.
a = AddDriver(overlay.scalar_add.description)
#overlay.scalar_add.add(4,5)

In [None]:
#Trying out DMA

In [None]:
import numpy as np
from pynq import allocate
from pynq import Overlay

# Re-open the overlay and fetch the CDMA handle for the DMA experiments below.
# NOTE(review): the bitstream path is absolute and user-specific.
overlay = Overlay('/home/aman/overlays/design_1.bit')
dma = overlay.axi_cdma_0

from pynq import MMIO
# Raw MMIO window over the CDMA registers; used to write the DA register.
dma_mmio = MMIO(dma.mmio.base_addr, 0xffff)

In [None]:
# Two 5-element uint32 buffers; the source is filled with 10, 11, ..., 14.
input_buffer = allocate(shape=(5,), dtype=np.uint32)
output_buffer = allocate(shape=(5,), dtype=np.uint32)
input_buffer[:] = np.arange(10, 15)
print(input_buffer)
print(input_buffer.nbytes)
print(output_buffer)

In [None]:
# Two 4x4 uint32 buffers; source element [i, j] is set to i + j + 10.
input_buffer = allocate(shape=(4,4), dtype=np.uint32)
output_buffer = allocate(shape=(4,4), dtype=np.uint32)
input_buffer[:] = np.add.outer(np.arange(4), np.arange(4)) + 10
print(input_buffer)
print(input_buffer.nbytes)
print(output_buffer)

In [None]:
# Manual CDMA transfer: input_buffer -> output_buffer. This cell does not
# wait for completion; the status is checked in the next cell.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)    # status right after the reset
dma.register_map.SA = input_buffer.device_address  # source address
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
# Destination address is therefore written through the raw MMIO window.
dma_mmio.write(dma.register_map.DA.address, output_buffer.device_address) 
# Writing BTT sets the byte count and also triggers the transfer.
dma.register_map.BTT = input_buffer.nbytes

In [None]:
print(dma.register_map.CDMASR)
print(output_buffer)    

In [None]:
#That's it

In [None]:
# Manual CDMA transfer: input_buffer -> bram_a. This cell does not wait for
# completion; the status is checked in the next cell.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)    # status right after the reset
dma.register_map.SA = input_buffer.device_address  # source address
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
# Destination address is therefore written through the raw MMIO window.
dma_mmio.write(dma.register_map.DA.address, 0x40000000)  #address of bram_a, from the POV of cdma
# Writing BTT sets the byte count and also triggers the transfer.
dma.register_map.BTT = input_buffer.nbytes

In [None]:
print(dma.register_map.CDMASR)

In [None]:
bram_a = overlay.axi_bram_ctrl_a
print(hex(bram_a.mmio.base_addr))
print("a=")
for i in range(16):
    print(hex(bram_a.read(i*4)))

In [None]:
bram_c = overlay.axi_bram_ctrl_c
for i in range(4):
    #bram_a.write(i*4,i+1000) 
    #bram_b.write(i*4,i+2000)
    bram_c.write(i*4,i+3000)
print("c=")
for i in range(4):
    print(hex(bram_c.read(i*4)))    

In [None]:
# Manual CDMA transfer: 0x43000000 -> output_buffer (do_matmul reads the
# result from this same source address). This cell does not wait for
# completion; the status is checked in the next cell.
dma.register_map.CDMACR = 0x0004  # reset the DMA
print(dma.register_map.CDMASR)    # status right after the reset
dma.register_map.SA = 0x43000000  # source address
#dma.register_map.DA = output_buffer.device_address #doesn't work because the register is weirdly defined as RO
# Destination address is therefore written through the raw MMIO window.
dma_mmio.write(dma.register_map.DA.address, output_buffer.device_address) 
# Writing BTT sets the byte count and also triggers the transfer.
dma.register_map.BTT = output_buffer.nbytes

In [None]:
print(dma.register_map.CDMASR)

In [None]:
print(output_buffer)  

In [None]:
#dma?
dma.register_map
#dma.mmio.base_addr

In [None]:
print(dma.register_map.CDMASR[1])

In [None]:
def do_transfer(input_buffer, output_buffer):
    """Copy ``input_buffer`` to ``output_buffer`` with the CDMA and wait until it is idle.

    Uses the notebook-level ``dma`` and ``dma_mmio`` objects. Both arguments
    must expose ``device_address``; ``input_buffer.nbytes`` bytes are copied.
    """
    dma.register_map.CDMACR = 0x0004 #reset the DMA    
    dma.register_map.SA = input_buffer.device_address #set source address
    dma_mmio.write(dma.register_map.DA.address, output_buffer.device_address)  #set destination address
    dma.register_map.BTT = input_buffer.nbytes #set number of bytes to transfer and also trigger the DMA
    while (dma.register_map.CDMASR[1]==0): #keep polling while bit 1 (IDLE) is 0, i.e. until the DMA reports idle
        pass        

In [None]:
do_transfer(input_buffer, output_buffer)

In [None]:
#Direct writes to MMIO

In [None]:
dma_mmio.write(0x0, 0x04)

In [None]:
dma_mmio.read(0x4)

In [None]:
dma_mmio.write(0x18, input_buffer.device_address)

In [None]:
dma_mmio.write(0x20, output_buffer.device_address)

In [None]:
dma_mmio.write(0x28, 20)

In [None]:
#That's it

In [None]:
n = np.ndarray([4,4],np.uint8)

In [None]:
print(a)
print(a[:,0])
print(a[0,0])
print(a[1,0])
print(b)
print(b[0,:])

In [None]:
int((a[3,0]<<24) + (a[2,0]<<16) + (a[1,0]<<8) + (a[0,0]))

In [None]:
# Fixed 4x4 test operands and their numpy reference product.
a = np.array([
    [8, 4, 6, 8],
    [3, 3, 3, 7],
    [5, 2, 1, 6],
    [9, 1, 0, 5],
])
b = np.array([
    [1, 1, 3, 0],
    [0, 1, 4, 3],
    [3, 5, 3, 1],
    [9, 6, 3, 2],
])
for shown in (a, b, np.matmul(a, b)):
    print(shown)

In [None]:
bram_a.write_a(a)

In [None]:
bram_b.write_b(b)

In [None]:
bram_c.read_c()

In [None]:
# NOTE(review): do_matmul is defined above as do_matmul(a, b, c) and has no
# return statement, so this two-argument call raises TypeError; even with a
# third argument, `c` would be rebound to None. Looks like a leftover from an
# earlier version of the function — confirm and remove or update.
c = do_matmul(a,b)
c

In [None]:
val = bram_c.read(0)
print(val)
print(hex(val))
print(hex((val & 0xff000000) >> 24))

In [None]:
res = np.dot(a,b)

In [None]:
res

In [None]:
np.matmul(a,b)

In [None]:
        #for x in np.nditer(a, order='F'):
        #    print(x)
        #extract each column
        #for x in np.nditer(a[:,0]):
        
        #for x in np.nditer(b, order='C'):
        #    print(x)

In [None]:
#Testing difference between MMIO and DMA

In [None]:
# Time one CDMA transfer of a 32x32 uint32 buffer into bram_a.
a = allocate(shape=(32,32), dtype=np.uint32)
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock and can jump.
start_time = time.perf_counter()
dma.reset()
dma.do_transfer(a.device_address, bram_a.mmio.base_addr, a.nbytes) 
end_time = time.perf_counter()
print("Time taken = ", (end_time-start_time))

In [None]:
# Time writing the same 32x32 uint32 buffer into bram_a one word at a time
# over MMIO, for comparison with the CDMA transfer.
a = allocate(shape=(32,32), dtype=np.uint32)
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() is wall-clock and can jump.
start_time = time.perf_counter()
addr = 0  # byte offset into bram_a; advances by one 32-bit word per element
for i in range(32):
    for j in range(32):
        bram_a.write(addr, int(a[i][j]))
        addr += 4
end_time = time.perf_counter()
print("Time taken = ", (end_time-start_time))