In [6]:
from MatRaptorClasses import PE
from MatRaptorClasses import csr_to_c2sr
from MatRaptorClasses import SpAL
from MatRaptorClasses import SpBL
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import time
from threading import Thread
from threading import Event

In [7]:
NUM_CHANNELS = 5
NUM_QUEUES = 10
endFlag = True
#dramIntersector = DRAMIntersector(True)
#llbIntersector = LLBIntersector(True)
#peArray = PEArray(12)
#peIntersectorList = []
#for x in range(0,12):
#    peIntersectorList.append(PEIntersector(True, LLB_TILE_SIZE, PE_TILE_SIZE,x))
#dramIntersector.setNext(llbIntersector)
#llbIntersector.setNext(peArray)
#peArray.setNext(peIntersectorList)


gen = np.random.default_rng()
data1 = gen.integers(1,100,1000)
row1 = gen.integers(0,1000,1000)
col1 = gen.integers(0,1000,1000)

data2 = gen.integers(1,100,1000)
row2 = gen.integers(0,1000,1000)
col2 = gen.integers(0,1000,1000)
i1 = csr_matrix(coo_matrix((data1, (row1, col1)), shape=(1000, 1000)).toarray())
i2 = csr_matrix(coo_matrix((data2, (row2, col2)), shape=(1000, 1000)).toarray())

inputA = csr_to_c2sr(i1.data,i1.indices,i1.indptr,NUM_CHANNELS)
inputB = csr_to_c2sr(i2.data,i2.indices,i2.indptr,NUM_CHANNELS)
    
print(i1.toarray())
print(i2.toarray())



[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# Code going here: have each individual set of (SpAL, SpBL, PE) in a thread
# remember that the events are used ONLY to wait for other PEs to stop
# Best idea might be to create a very simple wrapper class for all three of them to increase clarity
# We can basically have a wrapper class with an SpAL, SpBL, PE, as well as a "running" method that is used in the multithreading, and cycles
# each part once every time its allowed!

class Wrapper:
    def __init__(self, SpAL, SpBL, PE) -> None:
        self.SpAL = SpAL
        self.SpBL = SpBL
        self.PE = PE
        self.endFlag = False
    
    
    def running(self, event):
        while not self.endFlag:
            time.sleep(0.0001)  
            if not event.is_set():
                self.PE.cycle()
                self.SpBL.cycle()
                self.SpAL.cycle()
                self.endFlag = self.PE.endFlag and self.SpAL.endFlag and self.SpBL.endFlag
                event.set()
        event.set()
        
WrapperArray = []
WrapperEventArray = []
for x in range(NUM_CHANNELS):
    A = SpAL(x, NUM_CHANNELS)
    B = SpBL(x)
    P = PE(10,x)
    
    A.setNext(B)
    B.setNext(P)
    
    A.loadMatrixA(inputA)
    B.loadMatrixB(inputB)
    
    W = Wrapper(A, B, P)
    WrapperArray.append(W)
    WrapperEventArray.append(Event())
    Thread(target=W.running,args=[WrapperEventArray[-1]]).start()


In [9]:
endFlag = False
cycleCount = 0
while not endFlag:
    endFlag = True # if any of the wrappers have a false EndFlag, this turns false (since we do boolean and with each of them)
    cycleCount += 1
    for x in range(NUM_CHANNELS):
        WrapperEventArray[x].clear()
    for x in range(NUM_CHANNELS):
        if not WrapperArray[x].endFlag:
            WrapperEventArray[x].wait()
        endFlag = endFlag and WrapperArray[x].endFlag # IF it goes through all the wrappers and they are all at EOF, then endFlag is True at the end of the cycle
     
for x in range(NUM_CHANNELS):
    print("PE" + str(x) + ", Wasted Cycles: " + str(WrapperArray[x].PE.numWastedCycles) + " , Part I Wasted Cycles: " + str(WrapperArray[x].PE.partIWastedCycles) + " , Part II Wasted Cycles: " + str(WrapperArray[x].PE.partIIWastedCycles))
print("total Cycles", cycleCount)

PE0, Wasted Cycles: 26 , Part I Wasted Cycles: 3 , Part II Wasted Cycles: 85
PE1, Wasted Cycles: 52 , Part I Wasted Cycles: 2 , Part II Wasted Cycles: 89
PE2, Wasted Cycles: 27 , Part I Wasted Cycles: 6 , Part II Wasted Cycles: 109
PE3, Wasted Cycles: 41 , Part I Wasted Cycles: 2 , Part II Wasted Cycles: 90
PE4, Wasted Cycles: 32 , Part I Wasted Cycles: 4 , Part II Wasted Cycles: 88
total Cycles 473


In [15]:

r = []
c = []
v = []
for wrapper in WrapperArray:
    for o in wrapper.PE.outputBuffer:
        v.append(o[0])
        r.append(o[1])
        c.append(o[2])

est = coo_matrix((v,(r,c)),(1000,1000)).toarray()
actual = np.dot(i1.toarray(),i2.toarray())
print(np.equal(actual, est))
print(np.allclose(actual,est,0.0001,0.0001))



[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]
True


Test handwritten example on PE

In [11]:
for wrapper in WrapperArray:
    print(wrapper.PE.outputBuffer)

deque([(1840, 20, 930), (7821, 30, 416), (4582, 30, 569), (5124, 35, 25), (1848, 35, 166), (6468, 35, 745), (6300, 35, 834), (517, 55, 121), (4136, 55, 940), (513, 75, 192), (630, 75, 895), (546, 85, 347), (21, 85, 636), (1591, 100, 872), (3276, 105, 692), (2574, 105, 708), (40, 110, 819), (4712, 125, 626), (2288, 125, 617), (434, 125, 141), (5704, 125, 713), (22, 130, 257), (594, 130, 124), (2475, 130, 722), (1950, 145, 251), (2340, 150, 900), (1170, 160, 300), (2795, 160, 306), (3705, 160, 444), (4290, 160, 960), (1276, 170, 13), (1508, 170, 349), (3290, 180, 468), (5076, 180, 874), (3525, 180, 185), (3675, 185, 67), (3900, 185, 746), (5250, 185, 857), (1746, 190, 103), (105, 240, 591), (12, 240, 688), (2133, 240, 366), (3100, 240, 143), (3162, 240, 981), (1950, 245, 358), (1508, 245, 380), (1404, 245, 544), (30, 250, 716), (1500, 250, 247), (4750, 250, 686), (150, 250, 797), (2862, 250, 199), (255, 250, 11), (435, 250, 419), (5376, 285, 251), (1755, 285, 47), (6745, 290, 468), (4136

In [12]:
comp = PE(3,1)
comp.input(1,1,0,1,3)
comp.input(1,1,0,2,3)
comp.input(1,1,0,3,3)
comp.input(1,1,0,4,3)

comp.input(1,1,1,1,1)
comp.input(1,1,1,3,1)
comp.input(1,1,1,2,2)
comp.input(1,1,1,4,2)

comp.input(1,1,2,1,1)
comp.input(1,1,2,3,1)

comp.input(1,1,4,1,1)
comp.input(1,1,4,3,1)
comp.input(1,1,4,2,2)
comp.input(1,1,4,4,2)
comp.input(1,1,4,1,3)
comp.input(1,1,4,2,3)
comp.input(1,1,4,3,3)
comp.input(1,1,4,4,3)
comp.input(1,1,4,1,4)
comp.input(1,1,4,3,4)

comp.input(None,None,None,None,None)


In [13]:
for x in range(0,32): #Should finish running in 31 cycles
    print(x)
    comp.cycle()
    print(str(comp))

0
inputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 outputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 inputQueueLengths: 0: 0, 1: 0, 2: 0, 
 inputFlag: True
 inputBuffer: deque([(1, 1, 0, 1, 3), (1, 1, 0, 2, 3), (1, 1, 0, 3, 3), (1, 1, 0, 4, 3), (1, 1, 1, 1, 1), (1, 1, 1, 3, 1), (1, 1, 1, 2, 2), (1, 1, 1, 4, 2), (1, 1, 2, 1, 1), (1, 1, 2, 3, 1), (1, 1, 4, 1, 1), (1, 1, 4, 3, 1), (1, 1, 4, 2, 2), (1, 1, 4, 4, 2), (1, 1, 4, 1, 3), (1, 1, 4, 2, 3), (1, 1, 4, 3, 3), (1, 1, 4, 4, 3), (1, 1, 4, 1, 4), (1, 1, 4, 3, 4), (None, None, None, None, None)])
 prevI: -1
 currentI: -1
 currentQN: 1
 helperQN: 0
1
inputQueues: 0: deque([(1, 1)], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 outputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 inputQueueLengths: 0: 1, 1: 0, 2: 0, 
 inputFlag: True
 inputBuffer: deque([(1, 1, 0, 2, 3)

In [14]:
print(comp.outputBuffer)

deque([(1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 0, 4), (1, 1, 2), (1, 1, 4), (1, 1, 1), (1, 1, 3), (1, 2, 1), (1, 2, 3), (3, 4, 1), (2, 4, 2), (3, 4, 3), (2, 4, 4)])
