In [1]:
from sa import *
import os, json
import numpy as np

WEIGHT_FILE = "./qm.json"
TEST_INPUT = "./img0.npy"
RANDOM_MEM_GEN = False
DO_PRINT = False
GENERATE_ISA = False
with open(os.path.join(WEIGHT_FILE)) as fp:
    QM_DATA = json.loads(fp.read())
INPUT_DATA = np.array(np.load(os.path.join(TEST_INPUT))[0])

ADDR_MIN = 0x0000_0000
ADDR_MAX = 0x0002_0000
OFF_MEM_WB_BASE_ADDR = 0x0000_0000
OFF_MEM_UB_BASE_ADDR = 0x0001_AA80
UB_RESLUT_BASE_ADDR  = 32

FC1_SIZE = 0x0001_8800
FC2_SIZE = 0x0000_2000
FC3_SIZE = 0x0000_0280
UB_SIZE  = 0x0000_0310

FC1_OUTPUT_GOLDEN = np.load("./FC1_GOLDEN_VECTOR.npy")
FC2_OUTPUT_GOLDEN = np.load("./FC2_GOLDEN_VECTOR.npy")
FC3_OUTPUT_GOLDEN = np.load("./FC3_GOLDEN_VECTOR.npy")

Q = 5   # Q-Num format (Q-2.5)

In [2]:
# Functions for Compiler

# DATA/DATA : {'data', 'from', 'to', 'row', 'col'}
# ADDR : LSB=4(2**4=16B=1WORD)

def LOAD_DATA(SA:SYSTOLIC_ARRAY, OFF_MEM:BRAM, DATA: dict, ADDR: int):
    addr = ADDR # Start address
    for i in range(0, DATA['col'], 16):
        OFF_MEM.write(addr=addr, val=encode(np.flip(DATA['data'][i:i+16]), 16, 8))
        addr = addr + 1
    return addr # Return end address


def LOAD_WEIGHT(SA:SYSTOLIC_ARRAY, OFF_MEM:BRAM, WEIGHT: dict, ADDR: int) -> int:
    addr = ADDR # Start address
    for i in range(0, WEIGHT['col'], 16):
        for j in range(0, WEIGHT['row'], 16):
            tmp = WEIGHT['data'][j:j+16]
            tmp = tmp.transpose()[i:i+16]
            for k in range(16):
                OFF_MEM.write(addr=addr, val=encode(tmp[k].tolist(), 16, 8))
                addr = addr + 1
    return addr # Return end address


def PREPARE_FIFO(SA: SYSTOLIC_ARRAY, OFF_MEM: BRAM, WEIGHT_ADDR_FROM: int, WEIGHT_ADDR_TO:int, DATA_ADDR_FROM: int, DATA_ADDR_TO: int) -> list:
    # Write weight to WB
    for i in range(WEIGHT_ADDR_TO - WEIGHT_ADDR_FROM):
        SA.AXI_TO_WB_INST(OFF_MEM, i, WEIGHT_ADDR_FROM + i)

    # Write data to UB
    for i in range(DATA_ADDR_TO - DATA_ADDR_FROM):
        SA.AXI_TO_UB_INST(OFF_MEM, i, DATA_ADDR_FROM + i)

    # Fill WEIGHT FIFO
    WEIGHT_ADDR = WEIGHT_ADDR_FROM
    for i in range(4):
        SA.LOAD_WEIGHT(WEIGHT_ADDR)
        WEIGHT_ADDR = WEIGHT_ADDR + 1

    # Fill DATA FIFO
    DATA_ADDR = DATA_ADDR_FROM
    for i in range(4):
        SA.UB_TO_DATA_FIFO_INST(DATA_ADDR)
        DATA_ADDR = DATA_ADDR + 1

    return WEIGHT_ADDR, DATA_ADDR


def MATMUL_CALC(SA:SYSTOLIC_ARRAY, DATA: dict, WEIGHT: dict, DATA_ADDR: int, WEIGHT_ADDR: int, UB_ADDR: int):
    ACC_ADDR_FROM = 0
    ACC_ADDR_TO = 0
    for i in range(0, WEIGHT['col'], 16):
        ACC_ADDR = ACC_ADDR_FROM
        for j in range(0, WEIGHT['row'], 16):
            # Load weight
            for k in range(16):
                SA.LOAD_WEIGHT(WEIGHT_ADDR)
                WEIGHT_ADDR = WEIGHT_ADDR + 1
            # Calc
            if (i == 0):
                SA.MAT_MUL(ACC_ADDR)
            else:
                SA.MAT_MUL_ACC(ACC_ADDR)
            ACC_ADDR = ACC_ADDR + 1
        ACC_ADDR_TO = ACC_ADDR - 1
        SA.UB_TO_DATA_FIFO_INST(DATA_ADDR)
        DATA_ADDR = DATA_ADDR + 1
    
    UB_ADDR_FROM = UB_ADDR
    for i in range(ACC_ADDR_TO-ACC_ADDR_FROM+1):
        SA.WRITE_RESULT(UB_ADDR, ACC_ADDR_FROM + i)
        UB_ADDR = UB_ADDR + 1
    UB_ADDR_TO = UB_ADDR - 1
    
    return UB_ADDR_FROM, UB_ADDR_TO


def WRITE_RESULT_UB(SA:SYSTOLIC_ARRAY, OUTPUT_SIZE: int, ADDRA: int, ADDRB: int):
    for i in range(0, OUTPUT_SIZE, 16):
        SA.WRITE_RESULT(ADDRA + i, ADDRB + i)


def WRITE_RESULT_AXI(SA:SYSTOLIC_ARRAY, OFF_MEM:BRAM, ADDRA: int, ADDRB: int):
    pass


In [3]:
# Instantiation
SA = SYSTOLIC_ARRAY(gen_isa=GENERATE_ISA, USE_Q_NUMBER=True, Q=4)
OFF_MEM = BRAM(depth=8192, data_num=16, nbits=8)

In [4]:
# Prepare OFF-MEM
DATA_ADDR_FROM       = 0
DATA_ADDR_TO         = LOAD_DATA(SA, OFF_MEM, { 'data':INPUT_DATA, 'from':0, 'to':28*28, 'row':1, 'col':28*28 }, DATA_ADDR_FROM) - 1
FC1_WEIGHT_ADDR_FROM = DATA_ADDR_TO + 1
FC1_WEIGHT_ADDR_TO   = LOAD_WEIGHT(SA, OFF_MEM, { 'data':np.array(QM_DATA['weight']['FC1']), 'from':FC1_WEIGHT_ADDR_FROM, 'to':FC1_WEIGHT_ADDR_FROM+(128*768), 'row':128, 'col':768 }, FC1_WEIGHT_ADDR_FROM) - 1
FC2_WEIGHT_ADDR_FROM = FC1_WEIGHT_ADDR_TO + 1
FC2_WEIGHT_ADDR_TO   = LOAD_WEIGHT(SA, OFF_MEM, { 'data':np.array(QM_DATA['weight']['FC2']), 'from':FC2_WEIGHT_ADDR_FROM, 'to':FC2_WEIGHT_ADDR_FROM+(128*768), 'row':64, 'col':128 }, FC2_WEIGHT_ADDR_FROM) - 1
FC3_WEIGHT_ADDR_FROM = FC2_WEIGHT_ADDR_TO + 1
FC3_WEIGHT_ADDR_TO   = LOAD_WEIGHT(SA, OFF_MEM, { 'data':np.array(QM_DATA['weight']['FC1']), 'from':FC3_WEIGHT_ADDR_FROM, 'to':FC3_WEIGHT_ADDR_FROM+(128*768), 'row':10, 'col':64 }, FC3_WEIGHT_ADDR_FROM) - 1

# Write data to UB/WB
PREPARE_FIFO(SA=SA, OFF_MEM=OFF_MEM, WEIGHT_ADDR_FROM=FC1_WEIGHT_ADDR_FROM, WEIGHT_ADDR_TO=FC1_WEIGHT_ADDR_TO, DATA_ADDR_FROM=DATA_ADDR_FROM, DATA_ADDR_TO=DATA_ADDR_TO)
UB_ADDR_FROM, UB_ADDR_TO = MATMUL_CALC(SA=SA, DATA=None, WEIGHT={ 'data':np.array(QM_DATA['weight']['FC1']), 'from':FC1_WEIGHT_ADDR_FROM, 'to':FC1_WEIGHT_ADDR_FROM+(128*768), 'row':128, 'col':768 }, DATA_ADDR=DATA_ADDR_FROM, WEIGHT_ADDR=FC1_WEIGHT_ADDR_FROM, UB_ADDR=0)

print(f'DATA [{DATA_ADDR_FROM}:{DATA_ADDR_TO}]')
print(f'FC1 WEIGHT [{FC1_WEIGHT_ADDR_FROM}:{FC1_WEIGHT_ADDR_TO}]')
print(f'FC2 WEIGHT [{FC2_WEIGHT_ADDR_FROM}:{FC2_WEIGHT_ADDR_TO}]')
print(f'FC3 WEIGHT [{FC3_WEIGHT_ADDR_FROM}:{FC3_WEIGHT_ADDR_TO}]')

RES = []
for ub_addr in range(UB_ADDR_FROM, UB_ADDR_TO + 1):
    RES += decode(SA.UB.data[ub_addr], 16, 8)
#print(FC1_OUTPUT_GOLDEN)

[0, 2, 4, 1, 2, -4, 11, 10, 4, -5, 3, 0, -10, -2, 5, -2]
[-9, -3, 2, 0, 5, -1, 5, 2, -2, 1, 1, 5, -2, -4, 8, 2]
[-8, -1, -6, 2, -2, 6, -24, -8, -2, -3, -4, 4, 4, -2, 1, 2]
[1, 1, 2, 6, -6, 10, -12, 2, 2, -6, -1, 1, 1, -3, -29, 6]
[2, 17, 8, 1, -1, 2, -3, -2, 8, -1, 0, -6, -5, -4, -9, 13]
[9, 6, 0, 0, -9, 1, 6, 7, -3, 5, 7, 5, 3, -1, -5, 9]
[9, -1, -7, -7, -2, -2, 6, 4, -12, 3, -2, -3, -1, 1, -8, 4]
[-8, 2, -1, -5, 2, -5, -2, 4, 12, 6, -13, -2, 5, -15, -3, 3]
[-5, 0, 5, 2, 8, -6, 7, 6, 6, -1, 4, -1, -5, -8, 7, -3]
[-10, -6, -1, -1, 2, -5, 4, 1, -2, 3, 2, 9, 2, -1, 5, 2]
[-5, 0, -4, 2, -4, 5, -21, -4, 2, -2, -1, 5, 3, 2, 1, 4]
[2, 3, 4, 6, -4, 9, -14, 0, 5, -6, 3, -1, 3, -5, -28, 5]
[2, 17, 8, -1, -3, 1, 0, -4, 7, -2, 3, -5, -3, -8, -8, 12]
[3, 9, 2, 3, -12, 2, 2, 9, -5, 9, 9, 5, -2, -6, -1, 6]
[4, 2, -5, -1, 1, -2, -2, -2, -11, 4, -5, 0, -4, 3, -5, 3]
[-13, 1, -8, 0, 5, -5, 0, 3, 5, 5, -13, -1, -2, -15, -8, 1]
[2, 3, 6, 7, 5, -1, 5, 21, 9, -2, 1, -1, -5, -4, 6, 3]
[-6, -4, -4, 6, 7, -1,

In [9]:
print(RES)
print(FC1_OUTPUT_GOLDEN[0])


[-36, 71, 56, 47, -1, -2, -8, 125, -39, 37, 127, 54, -26, -35, -72, -48, 35, -46, -9, 4, -12, -7, 56, -11, -61, 70, 110, 127, -74, 32, -16, 17, -38, -22, 28, -19, -72, 39, -18, 19, 8, 43, 23, -32, -44, -45, -57, 35, 10, 55, -2, 44, 1, 46, -100, 63, 58, -90, -75, 46, -14, -26, 63, 58, 33, 127, 127, 48, 41, 2, -54, 71, 1, -19, -38, -25, -26, 21, -52, 100, 4, -128, 2, 29, -90, 46, -8, -93, -92, -7, -8, 39, -20, -31, 62, 6, 16, 40, -9, 40, 92, -61, -13, -38, -42, 89, 22, -89, -21, 26, -27, 110, 11, -12, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16]
[  35   73   39   68  -11   57    4   90  -68    4   42   98  -47 -128
  104   85  122   74   48   -2  127  -14  -29  -18  -40   35    3  -16
  -21    6   -4   65  -24  -26   15  113   32 -102   52    4  127   41
  -61   79  -81   31  -16   24  -33   80  108   44   75  -12  -14   63
   98  -75   57  -79  -36 -105  -38  -80  -17  -98  -79   15   98  -29
  112   -4  -89  124  -24  -47   58  -21  -28   14  -50   23  -60  -62

#Load weight

Prepare off-mem file(.coe)

In [6]:
# Weights
addr = 0

# FC1
FC1_weight = np.array(QM_DATA['weight']['FC1'])
print(FC1_weight.shape)
FC1_FROM = addr*16
print(f"FC1 FROM {FC1_FROM}[{tohex(FC1_FROM, 8)}]")
for j in range(0, 784, 16):
    for i in range(0, 128, 16):
        tmp2 = FC1_weight[i:i+16]
        tmp2 = (tmp2.transpose()[j:j+16]).transpose()
        for k in range(15, -1, -1):
            OFF_MEM.write(addr=addr, val=encode(data=tmp2[k], data_num=16, nbits=8))
            addr = addr + 1
FC1_TO = (addr-1)*16
print(f"FC1 TO {FC1_TO}[{tohex(FC1_TO, 8)}]")

# Input Data
INPUT_DATA_FROM = addr*16
print(f"INPUT_DATA FROM {INPUT_DATA_FROM}[{tohex(INPUT_DATA_FROM, 8)}]")
for i in range(0, 784, 16):
#for i in range(784-16, -1, -16):
    OFF_MEM.write(addr=addr, val=encode(data=np.flip(INPUT_DATA[i:i+16]), data_num=16, nbits=8))
    addr = addr + 1
INPUT_DATA_TO = addr*16
print(f"INPUT_DATA TO {INPUT_DATA_TO}[{tohex(INPUT_DATA_TO, 8)}]")



(128, 784)
FC1 FROM 0[00]
FC1 TO 100336[187f0]
INPUT_DATA FROM 100352[18800]
INPUT_DATA TO 101136[18b10]


Execute ISAs

In [7]:
# 1) Write weight to WB
WB_ADDR_TO = int(FC3_TO/16)
for addr in range(WB_ADDR_TO):
    SA.AXI_TO_WB_INST(OFF_MEM, addr, addr)

# 2) Write data to UB
UB_ADDR_TO = int((INPUT_DATA_TO - INPUT_DATA_FROM)/16)
for addr in range(UB_ADDR_TO):
    SA.AXI_TO_UB_INST(OFF_MEM, addr, addr+int(INPUT_DATA_FROM/16))



NameError: name 'FC3_TO' is not defined

In [None]:
# 1. Initial loading for FIFO
UB_ADDR = 0
WB_ADDR = 0

for i in range(4):
    SA.LOAD_WEIGHT(UB_ADDR)
    UB_ADDR = UB_ADDR + 1

print(SA.WEIGHT_FIFO_PRINT(dec=True))

for i in range(4):
    SA.UB_TO_DATA_FIFO_INST(WB_ADDR)
    WB_ADDR = WB_ADDR + 1

#print(SA.DATA_FIFO_PRINT(dec=True))
print(FC1_weight.transpose()[15][0:16])

0:[-1, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0]
1:[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]
2:[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0]
3:[0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1]
[-1, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1]
[ 0  0  0  0  1  0 -1 -1  0  0 -1  0  0  1  0 -1]


In [None]:
# 1. Fully Connected Layer 1
ACC_ADDR = 0

for j in range(0, 784, 16):
    for i in range(0, 128, 16):
        # Prepare Weights
        for k in range(16):
            SA.LOAD_WEIGHT(WB_ADDR)
            WB_ADDR = WB_ADDR + 1

        # Matrix Multiplication
        ACC_ADDR = int(i/16)
        if (i == 0):
            SA.MAT_MUL(addra=ACC_ADDR)
        else:
            SA.MAT_MUL_ACC(addra=ACC_ADDR)
    # Load new data
    SA.UB_TO_DATA_FIFO_INST(UB_ADDR)
    UB_ADDR = UB_ADDR + 1        

for i in range(8):
    SA.WRITE_RESULT(UB_RESLUT_BASE_ADDR+i, i)


a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[-1, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[-1, -1, -1, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, 0]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[-1, 0, -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1]
a:[-16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16], w:[-1, -1, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, 0, 0]
a:[-16, -16, -16, -16, -1

In [None]:
FC1_out = list()
for i in range(8):
    FC1_out = FC1_out + decode(SA.UB.data[UB_RESLUT_BASE_ADDR+i], 16, 8)
FC1_out = np.array(FC1_out)
print(FC1_out)
print(FC1_OUTPUT_GOLDEN[0])

[   2    2    3    4    0    2    2   -1    4    3    5    1   -1    1
    3    2    3   57   18   -8   15   19   14   -3  -31   39  -17   12
   -1   33   26   32 -101  -32   28  -58  -14   56  -16  -48   46   44
    6  -20  -35    0  -35  -39   64   34  -11   -1  -12  -86  -50   78
   -4    6   -1   88   -5   23   -5   42   23   -2    2   47    7  -43
  -26  -21  -47    0  -35  -10   17  -22    6  -47   10   24   -5  -26
  -93  -61  -33  -91  -92 -111  -49  -15  118  103   62   40   30  -28
  -40  -51  -32   -7   25  -43  -11   64  119   30    2  -27  -45  -37
   45    4   27   65   54   -1   39   90   -2  -34  -39  -52 -128  -37
  -30  -16]
[  35   73   39   68  -11   57    4   90  -68    4   42   98  -47 -128
  104   85  122   74   48   -2  127  -14  -29  -18  -40   35    3  -16
  -21    6   -4   65  -24  -26   15  113   32 -102   52    4  127   41
  -61   79  -81   31  -16   24  -33   80  108   44   75  -12  -14   63
   98  -75   57  -79  -36 -105  -38  -80  -17  -98  -79   15   98

In [None]:
print(INPUT_DATA)
print(FC1_weight)
np.round(np.matmul(INPUT_DATA, FC1_weight.transpose())/16)

[-16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -12  11  16   5 -10 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -12   3  15  16  16  16  -5 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -12
  11  16  16  16  16  16  -5 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16   1  14  16  16  13   5  16  16  -5 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16   1
  15  16  16   2 -13  -3  16  16  -5 -16 -16 -16 -1

array([  35.,   72.,   39.,   68.,  -12.,   57.,    4.,   90.,  -69.,
          4.,   42.,   98.,  -48., -166.,  104.,   84.,  122.,   74.,
         48.,   -3.,  138.,  -15.,  -30.,  -19.,  -41.,   35.,    2.,
        -17.,  -22.,    6.,   -5.,   65.,  -25.,  -27.,   15.,  113.,
         32., -103.,   52.,    4.,  294.,   41.,  -62.,   79.,  -82.,
         31.,  -17.,   24.,  -34.,   80.,  108.,   44.,   75.,  -13.,
        -15.,   63.,   98.,  -76.,   57.,  -80.,  -37., -106.,  -39.,
        -81.,  -18.,  -99.,  -80.,   14.,   98.,  -30.,  112.,   -5.,
        -90.,  124.,  -25.,  -48.,   58.,  -22.,  -29.,   14.,  -51.,
         23.,  -61.,  -63.,  178.,   69., -137.,   -2.,   36.,  -70.,
        -27.,   16.,  -31.,  -80.,   29.,    7.,  -50.,  110.,   38.,
        117.,   75.,  102.,  -41.,  -42.,   10.,  -91.,  -30.,   77.,
        -84.,  -64., -313.,  -83.,   -6.,  156.,   13.,   -2.,   29.,
         70.,  -64.,   54.,   46.,  -50.,   -7.,   72.,  -21.,   54.,
        -28.,  -32.]