# load bitstream

In [22]:

from pynq import Overlay
import numpy as np
from pynq import allocate
import time

# Program the FPGA with the bitstream. The IP blocks used later in this
# notebook (axi_bram_ctrl_0, qt3_tpu_v1_0) are reached through this overlay.
ol = Overlay('top_v2-7.bit')

# allocate DDR buffers (weight buffer and unified buffer)

In [2]:

# Physically contiguous buffer of 15,000,000 unsigned 32-bit words
# (15e6 * 4 bytes = 60 MB).
weight_buffer = allocate(shape=(15_000_000,), dtype=np.uint32)

# Physical address of the buffer; this is the value later written to the
# TPU's DDR base-address register.
weight_buffer.device_address


(378535936,
 PynqBuffer([3, 0, 7, 0, 5, 4, 0, 7, 2, 0, 2, 2, 1, 7, 3, 1, 0, 5, 9, 1, 6,
             1, 0, 2, 0, 7], dtype=uint32))

In [3]:
# Physically contiguous buffer of 10,000,000 unsigned 32-bit words
# (10e6 * 4 bytes = 40 MB).
unified_buffer = allocate(shape=(10_000_000,), dtype=np.uint32)

# Pre-fill the first 256 words with random digits in [0, 10) and flush them
# so that memory holds the same values as the CPU-side view.
unified_buffer[:256] = np.random.randint(0, 10, size=256)
unified_buffer.flush()

# Show the physical address together with a short preview of the contents.
unified_buffer.device_address, unified_buffer[:26]

(439353344,
 PynqBuffer([8, 0, 6, 7, 7, 5, 8, 1, 4, 5, 1, 6, 9, 8, 1, 5, 4, 9, 6, 5, 6,
             6, 1, 8, 7, 2], dtype=uint32))

# test1: read 16 bursts from DDR and inspect the result registers

In [10]:
## transfer tpu program

# Each instruction is one 64-bit word built as:
#   (opcode << 56) | (start address << 24) | (number of bursts << 8)
OP_READ_DDR = 0b00000001
OP_END = 0b00111111

binprog = [
    (OP_READ_DDR << 56) | (0 << 24) | (16 << 8),  # read ddr, start addr: 0, nburst: 16.
    (OP_END << 56),                               # end.
]

# Dump the program as 16-digit upper-case hex, one instruction per line.
for pc, word in enumerate(binprog):
    print(f"{pc}:", f"0x{word:016X}")

# Reinterpret the 64-bit instructions as pairs of 32-bit words so they can be
# copied word-by-word into the BRAM behind axi_bram_ctrl_0.
binprog = np.frombuffer(np.array(binprog, dtype=np.uint64), dtype=np.uint32)

program_bram = ol.axi_bram_ctrl_0.mmio.array
np.copyto(program_bram[:len(binprog)], binprog)


0: 0x0100000000001000
1: 0x3F00000000000000


In [None]:
## transfer input

# 256 random digits in [0, 10) placed at the start of the weight buffer,
# then flushed so the device reads the values just written.
stimulus = np.random.randint(0, 10, size=256)
weight_buffer[:256] = stimulus
weight_buffer.flush()

# Software reference: total of the words that were just written.
np.sum(weight_buffer[:256])

In [11]:
## run

tpu_regs = ol.qt3_tpu_v1_0.mmio.array

# Register writes in order, each followed by a 0.2 s pause:
#   reg1 (DDR_BASEADDR_REG) <- physical address of the weight buffer
#   reg0 (START_REG)        <- 1, then back to 0
for reg_index, value in (
    (1, weight_buffer.device_address),  # DDR_BASEADDR_REG
    (0, 1),                             # START_REG
    (0, 0),                             # START_REG
):
    tpu_regs[reg_index] = np.uint32(value)
    time.sleep(0.2)

# Read back register 2 and registers 5..9.
print("reg2: ", tpu_regs[2])
for reg_index in range(5, 10):
    print(f"reg{reg_index}: ", tpu_regs[reg_index])

reg2:  1118
reg5:  0
reg6:  0
reg7:  0
reg8:  0
reg9:  0


# test2: read 2 bursts from DDR, compute partial sums, then write them back to DDR

In [23]:
## transfer tpu program
## program: read 16 64-bit data words from ddr and compute 16 partial sums, one per cycle.

# Note: burst length: 8, burst size: 64 bits (so 2 bursts == 16 data words).
# Instruction layout: (opcode << 56) | (start address << 24) | (nburst << 8)
OP_READ_DDR = 0b00000001
OP_END = 0b00111111

binprog = [
    (OP_READ_DDR << 56) | (0 << 24) | (2 << 8),  # read ddr, start addr: 0, nburst: 2.
    (OP_END << 56),                              # end.
]

# Print every instruction as a zero-padded 64-bit hex value.
for pc, word in enumerate(binprog):
    print(f"{pc}:", f"0x{word:016X}")

# View the uint64 program as uint32 words and copy it into the program BRAM.
words64 = np.array(binprog, dtype=np.uint64)
binprog = np.frombuffer(words64, dtype=np.uint32)
np.copyto(ol.axi_bram_ctrl_0.mmio.array[:binprog.size], binprog)


0: 0x0100000000000200
1: 0x3F00000000000000


In [24]:
## transfer input

# Number of 32-bit words consumed by the program:
#   2 (nburst) * 8 (burst length) * 2 (32-bit words per 64-bit datum) == 32
n_input_words = 2 * 8 * 2

# Fill exactly that many words with random digits in [0, 10) and flush them
# so the device reads the values just written.
weight_buffer[:n_input_words] = np.random.randint(0, 10, size=n_input_words)
weight_buffer.flush()

# Software reference: total of the input words.
np.sum(weight_buffer[:n_input_words])

PynqBuffer(128, dtype=uint32)

In [25]:
## run

tpu_regs = ol.qt3_tpu_v1_0.mmio.array
PAUSE_S = 0.2  # pause after every register write

# Point the TPU at the weight buffer.
tpu_regs[1] = np.uint32(weight_buffer.device_address)  # DDR_BASEADDR_REG
time.sleep(PAUSE_S)

# Pulse START_REG: set it, wait, clear it, wait.
for start_value in (1, 0):
    tpu_regs[0] = np.uint32(start_value)  # START_REG
    time.sleep(PAUSE_S)

# Read back register 2.
print("reg2: ", tpu_regs[2])

reg2:  128


In [26]:
## transfer tpu program
## program: write the 16 partial sums (currently in the fifo) back to ddr.

# Note: burst length: 8, burst size: 64 bits.
# Instruction layout: (opcode << 56) | (start address << 24) | (nburst << 8)
OP_WRITE_DDR = 0b00000010
OP_END = 0b00111111

write_instr = (OP_WRITE_DDR << 56) | (0 << 24) | (2 << 8)  # write ddr, start addr: 0, nburst: 2.
end_instr = OP_END << 56                                   # end.
binprog = [write_instr, end_instr]

# Print every instruction as a zero-padded 64-bit hex value.
for pc, word in enumerate(binprog):
    print(f"{pc}:", f"0x{word:016X}")

# Reinterpret the 64-bit instructions as 32-bit words and load them into the
# BRAM behind axi_bram_ctrl_0.
binprog = np.frombuffer(np.array(binprog, dtype=np.uint64), dtype=np.uint32)
program_bram = ol.axi_bram_ctrl_0.mmio.array
np.copyto(program_bram[:len(binprog)], binprog)

0: 0x0200000000000200
1: 0x3F00000000000000


In [27]:
## run

tpu_regs = ol.qt3_tpu_v1_0.mmio.array

# Same sequence as the read run, but the DDR base address now points at the
# unified buffer, which is where the write-ddr instruction stores its data.
register_writes = [
    (1, unified_buffer.device_address),  # DDR_BASEADDR_REG
    (0, 1),                              # START_REG
    (0, 0),                              # START_REG
]
for reg_index, value in register_writes:
    tpu_regs[reg_index] = np.uint32(value)
    time.sleep(0.2)



In [28]:
# The TPU has just written its results straight into DDR. Invalidate the
# CPU-side cached copy first, otherwise this read may return the stale random
# values written when the buffer was initialised instead of the device output.
unified_buffer.invalidate()

# First 32 uint32 words of the result area. The next cells reinterpret them
# as 16 uint64 partial sums (each value is followed by a zero word here).
unified_buffer[:32]

PynqBuffer([  0,   0,   6,   0,   8,   0,  12,   0,  23,   0,  36,   0,
             41,   0,  54,   0,  60,   0,  69,   0,  77,   0,  87,   0,
             99,   0, 104,   0, 109,   0, 123,   0], dtype=uint32)

In [29]:

# Copy the first 32 result words into an ordinary host-side float64 array
# (np.zeros defaults to float64, hence the trailing dots in the display).
a = np.zeros(32, dtype=np.float64)
a[:] = unified_buffer[:32]
a

array([  0.,   0.,   6.,   0.,   8.,   0.,  12.,   0.,  23.,   0.,  36.,
         0.,  41.,   0.,  54.,   0.,  60.,   0.,  69.,   0.,  77.,   0.,
        87.,   0.,  99.,   0., 104.,   0., 109.,   0., 123.,   0.])

In [30]:
# Reinterpret the 32 uint32 result words as 16 uint64 partial sums.
# The array is rebuilt from unified_buffer rather than from the previous value
# of `a`, so re-running this cell always yields the same 16 values. Deriving
# it from `a` itself would re-pack the cell's own output on a second run.
a = np.array(unified_buffer[:32], dtype=np.uint32).view(np.uint64)
a

array([  0,   6,   8,  12,  23,  36,  41,  54,  60,  69,  77,  87,  99,
       104, 109, 123], dtype=uint64)

In [34]:
# Software reference for the values read back from the device: print the
# running total of the input after every second 32-bit word (2, 4, ..., 30).
for n_words in range(2, 32, 2):
    print(weight_buffer[:n_words].sum())

6
8
12
23
36
41
54
60
69
77
87
99
104
109
123
