In [1]:
from MatRaptorClasses import PE
from MatRaptorClasses import csr_to_c2sr
from MatRaptorClasses import SpAL
from MatRaptorClasses import SpBL
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import time
from threading import Thread
from threading import Event

In [2]:
# --- Simulation configuration ------------------------------------------------
NUM_CHANNELS = 8    # number of channels handed to csr_to_c2sr below
NUM_QUEUES = 10     # not referenced in this cell; presumably the per-PE queue count -- TODO confirm
endFlag = True      # placeholder value; kept so the module-level name still exists

MATRIX_DIM = 1000     # both operands are MATRIX_DIM x MATRIX_DIM
NUM_SAMPLES = 10000   # random (row, col, value) triples drawn per operand
SEED = 0              # fixed seed so Restart & Run All reproduces the same inputs

# --- Random sparse operands --------------------------------------------------
gen = np.random.default_rng(SEED)

# Values are drawn from [1, 100) and coordinates from [0, MATRIX_DIM).
data1 = gen.integers(1, 100, NUM_SAMPLES)
row1 = gen.integers(0, MATRIX_DIM, NUM_SAMPLES)
col1 = gen.integers(0, MATRIX_DIM, NUM_SAMPLES)

data2 = gen.integers(1, 100, NUM_SAMPLES)
row2 = gen.integers(0, MATRIX_DIM, NUM_SAMPLES)
col2 = gen.integers(0, MATRIX_DIM, NUM_SAMPLES)

# COO -> CSR sums entries that share a coordinate, which is the same result the
# previous dense round-trip produced, without materialising a dense array.
i1 = coo_matrix((data1, (row1, col1)), shape=(MATRIX_DIM, MATRIX_DIM)).tocsr()
i2 = coo_matrix((data2, (row2, col2)), shape=(MATRIX_DIM, MATRIX_DIM)).tocsr()

# Re-pack the CSR arrays with the project helper csr_to_c2sr.
inputA = csr_to_c2sr(i1.data, i1.indices, i1.indptr, NUM_CHANNELS)
inputB = csr_to_c2sr(i2.data, i2.indices, i2.indptr, NUM_CHANNELS)

# numpy abbreviates large arrays, so these prints stay short.
print(i1.toarray())
print(i2.toarray())



[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0 26  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [30]:
# Each individual set of (SpAL, SpBL, PE) runs in its own thread.
# Remember that the events are used ONLY to wait for the other PEs to finish their cycle.
# The clearest approach is a very simple wrapper class for all three of them:
# it holds an SpAL, an SpBL and a PE, plus a "running" method that is used as the thread
# target and cycles each part once every time it is allowed to.

class Wrapper:
    """Bundle one SpAL, one SpBL and one PE so a single thread can step them.

    Attributes:
        SpAL, SpBL, PE: the three stage objects; each must expose ``cycle()``
            and an ``endFlag`` attribute.
        endFlag: True once all three stages report ``endFlag`` (or the worker
            failed, see ``error``).
        error: the exception that aborted ``running``, or None.
    """

    def __init__(self, SpAL, SpBL, PE) -> None:
        # The parameter names shadow the imported classes inside this method
        # only; they are kept so existing keyword callers keep working.
        self.SpAL = SpAL
        self.SpBL = SpBL
        self.PE = PE
        self.endFlag = False
        self.error = None

    def running(self, event):
        """Thread target: run one cycle of every stage each time ``event`` is cleared.

        The coordinator clears ``event`` to allow one cycle; this method sets it
        again when that cycle is done. ``event`` is always left set on exit, so
        a coordinator blocked in ``event.wait()`` is released even on failure.

        Args:
            event: a ``threading.Event`` shared with the coordinator.

        Raises:
            Whatever a stage's ``cycle()`` raises. The exception is stored in
            ``self.error`` and ``self.endFlag`` is forced to True before it is
            re-raised, so the coordinator does not wait on a dead worker.
        """
        try:
            while not self.endFlag:
                # An Event cannot be waited on for "cleared", so poll briefly.
                time.sleep(0.0001)
                if not event.is_set():
                    # Stages are stepped consumer-first: PE, then SpBL, then SpAL.
                    self.PE.cycle()
                    self.SpBL.cycle()
                    self.SpAL.cycle()
                    self.endFlag = self.PE.endFlag and self.SpAL.endFlag and self.SpBL.endFlag
                    event.set()
        except BaseException as exc:
            # Without this the coordinator would block forever on this lane.
            self.error = exc
            self.endFlag = True
            raise
        finally:
            event.set()
        
# Retire workers left over from an earlier run of this cell; otherwise their
# threads would keep polling in the background for the life of the kernel.
for stale in globals().get("WrapperArray", []):
    stale.endFlag = True

WrapperArray = []
WrapperEventArray = []
for x in range(NUM_CHANNELS):
    # Build one lane: SpAL feeds SpBL, which feeds the PE.
    A = SpAL(x, NUM_CHANNELS)
    B = SpBL(x)
    P = PE(NUM_QUEUES, x)  # same value as the former literal 10

    A.setNext(B)
    B.setNext(P)

    A.loadMatrixA(inputA)
    B.loadMatrixB(inputB)

    W = Wrapper(A, B, P)

    # Start with the event SET so the worker idles until the driver loop clears
    # it. A cleared event would let the worker run its first cycle right here,
    # outside the lockstep loop and uncounted by the driver's cycle counter.
    gate = Event()
    gate.set()

    WrapperArray.append(W)
    WrapperEventArray.append(gate)
    # daemon=True: a lane that never finishes must not block interpreter shutdown.
    Thread(target=W.running, args=[gate], daemon=True).start()


In [31]:
# Lockstep driver: each pass clears every lane's event (allowing one cycle),
# then blocks until every still-active lane has set its event again.
lanes = [(WrapperArray[x], WrapperEventArray[x]) for x in range(NUM_CHANNELS)]

cycleCount = 0
endFlag = False
while not endFlag:
    cycleCount += 1

    for _, lane_event in lanes:
        lane_event.clear()

    someone_still_running = False
    for lane, lane_event in lanes:
        # Finished lanes never set their event again, so only wait on active ones.
        if not lane.endFlag:
            lane_event.wait()
        # Read the flag after the wait so this cycle's result is what counts.
        if not someone_still_running and not lane.endFlag:
            someone_still_running = True

    # The run is over only when every lane reports its end flag.
    endFlag = not someone_still_running

for x in range(NUM_CHANNELS):
    pe = WrapperArray[x].PE
    report = "".join([
        "PE", str(x),
        ", Wasted Cycles: ", str(pe.numWastedCycles),
        " , Part I Wasted Cycles: ", str(pe.partIWastedCycles),
        " , Part II Wasted Cycles: ", str(pe.partIIWastedCycles),
    ])
    print(report)
print("total Cycles", cycleCount)

PE0, Wasted Cycles: 3 , Part I Wasted Cycles: 185 , Part II Wasted Cycles: 4194
PE1, Wasted Cycles: 3 , Part I Wasted Cycles: 145 , Part II Wasted Cycles: 3960
PE2, Wasted Cycles: 3 , Part I Wasted Cycles: 79 , Part II Wasted Cycles: 3851
PE3, Wasted Cycles: 3 , Part I Wasted Cycles: 99 , Part II Wasted Cycles: 3301
PE4, Wasted Cycles: 3 , Part I Wasted Cycles: 133 , Part II Wasted Cycles: 3937
PE5, Wasted Cycles: 3 , Part I Wasted Cycles: 104 , Part II Wasted Cycles: 3545
PE6, Wasted Cycles: 3 , Part I Wasted Cycles: 139 , Part II Wasted Cycles: 3501
PE7, Wasted Cycles: 3 , Part I Wasted Cycles: 92 , Part II Wasted Cycles: 3365
total Cycles 17084


In [25]:

# Collect every PE's output triples; each entry is read as (value, row, col).
r = []
c = []
v = []
for wrapper in WrapperArray:
    for value, row, col in wrapper.PE.outputBuffer:
        v.append(value)
        r.append(row)
        c.append(col)

# Take the result shape from the operands instead of repeating a literal size.
# coo_matrix sums triples that share a coordinate when converted to dense.
est = coo_matrix((v, (r, c)), shape=(i1.shape[0], i2.shape[1])).toarray()

# Reference product computed sparsely: the same exact integer result as the
# dense np.dot, without a dense integer matrix multiply.
actual = (i1 @ i2).toarray()

print(np.equal(actual, est))
print(np.allclose(actual,est,0.0001,0.0001))



[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]
True


Test a handwritten example on a single PE

In [21]:
# Print each PE's output buffer, one per line, in WrapperArray order.
for pe_results in (w.PE.outputBuffer for w in WrapperArray):
    print(pe_results)

deque([(4050, 16, 764), (9801, 32, 526), (5640, 32, 467), (2523, 40, 68), (7134, 40, 240), (2695, 48, 82), (3542, 48, 910), (1235, 64, 855), (224, 64, 195), (5980, 80, 916), (1040, 80, 986), (1775, 80, 696), (3458, 80, 548), (7189, 80, 760), (561, 88, 461), (1050, 88, 256), (1330, 96, 957), (1863, 104, 273), (2829, 104, 998), (1406, 120, 543), (1406, 120, 989), (3555, 120, 62), (1080, 120, 389), (1700, 128, 565), (104, 128, 370), (1656, 136, 618), (2328, 136, 636), (4510, 136, 259), (1320, 136, 553), (1140, 152, 445), (1617, 168, 502), (1232, 168, 960), (72, 192, 163), (646, 216, 680), (1280, 216, 324), (1060, 216, 813), (3735, 240, 307), (4814, 248, 367), (348, 248, 880), (3596, 256, 466), (525, 256, 231), (390, 256, 522), (915, 256, 689), (741, 256, 159), (3968, 272, 324), (3286, 272, 813), (5994, 280, 97), (1944, 280, 312), (4712, 288, 488), (4018, 296, 571), (7154, 296, 633), (6059, 296, 24), (2739, 296, 525), (2158, 296, 571), (3984, 296, 739), (7636, 296, 892), (7350, 320, 668), 

In [22]:
comp = PE(3,1)

# Handwritten input stream, fed to comp.input() in exactly this order.
# NOTE(review): the five fields look like (a_value, b_value, row, col, k),
# judging by the (value, row, col) results printed further down -- confirm
# against PE.input.
handwritten_inputs = [
    # third field == 0
    (1, 1, 0, 1, 3),
    (1, 1, 0, 2, 3),
    (1, 1, 0, 3, 3),
    (1, 1, 0, 4, 3),
    # third field == 1
    (1, 1, 1, 1, 1),
    (1, 1, 1, 3, 1),
    (1, 1, 1, 2, 2),
    (1, 1, 1, 4, 2),
    # third field == 2
    (1, 1, 2, 1, 1),
    (1, 1, 2, 3, 1),
    # third field == 4
    (1, 1, 4, 1, 1),
    (1, 1, 4, 3, 1),
    (1, 1, 4, 2, 2),
    (1, 1, 4, 4, 2),
    (1, 1, 4, 1, 3),
    (1, 1, 4, 2, 3),
    (1, 1, 4, 3, 3),
    (1, 1, 4, 4, 3),
    (1, 1, 4, 1, 4),
    (1, 1, 4, 3, 4),
    # all-None record; presumably the end-of-input marker -- TODO confirm
    (None, None, None, None, None),
]

for record in handwritten_inputs:
    comp.input(*record)


In [23]:
# Step the handwritten PE 32 times, printing the cycle index before each step
# and the PE's string form after it. The example should finish in 31 cycles.
TOTAL_STEPS = 32
for x in range(TOTAL_STEPS):
    print(x)
    comp.cycle()
    print(comp)

0
inputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 outputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 inputQueueLengths: 0: 0, 1: 0, 2: 0, 
 inputFlag: True
 inputBuffer: deque([(1, 1, 0, 1, 3), (1, 1, 0, 2, 3), (1, 1, 0, 3, 3), (1, 1, 0, 4, 3), (1, 1, 1, 1, 1), (1, 1, 1, 3, 1), (1, 1, 1, 2, 2), (1, 1, 1, 4, 2), (1, 1, 2, 1, 1), (1, 1, 2, 3, 1), (1, 1, 4, 1, 1), (1, 1, 4, 3, 1), (1, 1, 4, 2, 2), (1, 1, 4, 4, 2), (1, 1, 4, 1, 3), (1, 1, 4, 2, 3), (1, 1, 4, 3, 3), (1, 1, 4, 4, 3), (1, 1, 4, 1, 4), (1, 1, 4, 3, 4), (None, None, None, None, None)])
 prevI: -1
 currentI: -1
 currentQN: 1
 helperQN: 0
1
inputQueues: 0: deque([(1, 1)], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 outputQueues: 0: deque([], maxlen=20000), 1: deque([], maxlen=20000), 2: deque([], maxlen=20000), 
 inputQueueLengths: 0: 1, 1: 0, 2: 0, 
 inputFlag: True
 inputBuffer: deque([(1, 1, 0, 2, 3)

In [24]:
# Show what the handwritten PE produced; the verification cell reads these
# entries as (value, row, col).
print(comp.outputBuffer)

deque([(1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 0, 4), (1, 1, 2), (1, 1, 4), (1, 1, 1), (1, 1, 3), (1, 2, 1), (1, 2, 3), (3, 4, 1), (2, 4, 2), (3, 4, 3), (2, 4, 4)])
