In [1]:
# Standard-library text buffer plus the SciPy Matrix Market helpers.
from io import StringIO
from scipy.io import mmread
from scipy.io import mminfo
# Colab-only: mount Google Drive at /content/drive so files under MyDrive are readable.
from google.colab import drive
drive.mount("/content/drive")
# Directory prefix that is prepended to the matrix file name when loading.
datAddr = "/content/drive/MyDrive/CMSC818J_Test/"
# Sparse-matrix containers (CSR/CSC) and NumPy for the final validation step.
from scipy.sparse import csr_matrix
from scipy import sparse
import numpy as np
# sys.maxsize is used as the initial value of the per-PE minimum statistic.
import sys

Mounted at /content/drive


In [46]:
# Load the Matrix Market file from the mounted Drive folder, then keep two
# compressed views of it: a row-compressed (CSR) copy and a column-compressed
# (CSC) copy.
m = mmread(datAddr + 'spaceShuttleEntry_3.mtx')
mat_csr, mat_csrB = m.tocsr(), m.tocsc()

## Creating a small example to test on

In [2]:
# Dimensions of the hand-written test matrix.
row_mat = 10
cols_mat = 10

# Start from an all-zero dense matrix ...
mat1 = [[0] * cols_mat for _ in range(row_mat)]

# ... and fill in the first four columns of every row; the rest stay zero.
mat1[0][:4] = 0, 0, 8, 0
mat1[1][:4] = 7, 9, 6, 0
mat1[2][:4] = 0, 5, 0, 4
mat1[3][:4] = 4, 0, 0, 0
mat1[4][:4] = 7, 0, 5, 0
mat1[5][:4] = 4, 0, 0, 0
mat1[6][:4] = 4, 0, 0, 0
mat1[7][:4] = 4, 0, 0, 0
mat1[8][:4] = 4, 0, 0, 0
mat1[9][:4] = 4, 0, 0, 0
print(f'Matrix is {mat1}')

# The same dense matrix in both compressed layouts: CSR and CSC.
mat_csr = sparse.csr_matrix(mat1)
mat_csrB = sparse.csc_matrix(mat1)

Matrix is [[0, 0, 8, 0, 0, 0, 0, 0, 0, 0], [7, 9, 6, 0, 0, 0, 0, 0, 0, 0], [0, 5, 0, 4, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 5, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


## Optimized Architecture 
 

## Assumptions
1. Each multiplication and accumulation takes 3 cycles
2. Reading the rows from memory takes 2 cycles
3. Storing the result of multiplication and accumulation takes 1 cycle. NO merging required since we are implementing inner product. 
4. PEs write back the output in parallel and can read the input in parallel
5. Rows are read and sorted in batches of 8, on the assumption that the available memory bandwidth allows 8 rows to be fetched at a time

In [52]:
# Keys of the per-PE statistics dictionaries.
NUMBER_OF_CYCLES = 1
MAX_NUM_NON_ZEROS = 2
MIN_NUM_NON_ZEROS = 3

total_PEs= 2 # number of PEs available. Can change this number as required

# One statistics record per PE: zero cycles, zero as the running maximum and
# the largest representable int as the running minimum.
PE_stats = []
for i in range(total_PEs):
  PE_stat = {
      NUMBER_OF_CYCLES: 0,
      MAX_NUM_NON_ZEROS: 0,
      MIN_NUM_NON_ZEROS: sys.maxsize,
  }
  PE_stats.append(PE_stat)
PE_stats

[{1: 0, 2: 0, 3: 9223372036854775807}, {1: 0, 2: 0, 3: 9223372036854775807}]

In [53]:
# Simulates PE MAC operations
def PE_mult(output_col_num):
  """Simulate one PE computing a single inner product.

  Pops the next buffered row from the global ``PE_lst`` and its output row
  number from the global ``PE_output_index``, multiplies it against the column
  currently held in the global ``B_dict`` and writes the accumulated sum to
  ``output_mat2[row_num][output_col_num]``. Cycle and non-zero statistics are
  charged to ``PE_stats[PE_num]``.

  Args:
    output_col_num: column index of the output entry being produced.

  Raises:
    IndexError: if ``PE_lst`` or ``PE_output_index`` is empty.
  """
  PE_to_process = PE_lst.pop(0)
  row_num = PE_output_index.pop(0)
  stats = PE_stats[PE_num]

  # Inner-product optimisation: walk whichever operand has fewer non-zeros and
  # probe the other one for matching indices.
  if len(B_dict) < len(PE_to_process):
    smaller, larger = B_dict, PE_to_process
  else:
    smaller, larger = PE_to_process, B_dict

  accumulation = 0
  for col, data in smaller.items():
    if col in larger:
      # multiply and then accumulate: 3 cycles per matching pair
      accumulation += larger[col] * data
      stats[NUMBER_OF_CYCLES] += 3

  output_mat2[row_num][output_col_num] = accumulation
  # Storing the result costs 1 cycle.
  stats[NUMBER_OF_CYCLES] += 1

  # Track the largest and smallest row handled by this PE. The two checks are
  # independent: a row that sets a new maximum (e.g. the very first row) can
  # also be the minimum seen so far.
  num_non_zeros = len(PE_to_process)
  if num_non_zeros > stats[MAX_NUM_NON_ZEROS]:
    stats[MAX_NUM_NON_ZEROS] = num_non_zeros
  if num_non_zeros < stats[MIN_NUM_NON_ZEROS]:
    stats[MIN_NUM_NON_ZEROS] = num_non_zeros
    
      



In [54]:
# Read rows and group them in a dict whose keys are the number of non-zeros
# and whose values are the corresponding rows, then order them by those keys.

def read_rows(PE_rows_read, PE_rows_index):
  """Read up to 8 rows of ``mat_csr`` and bucket them by non-zero count.

  Each row is decoded into a ``{column: value}`` dict and appended to
  ``PE_rows_read[nnz]``; its output row number is appended to
  ``PE_rows_index[nnz]``. Reading stops early once ``pointer_index_A`` reaches
  ``mat_csrA_len``.

  The read cursors (``data_index_A``, ``col_index_A``, ``pointer_index_A``,
  ``output_row_index``) are module-level variables and are advanced in place.

  Args:
    PE_rows_read: dict mapping non-zero count -> list of row dicts (mutated).
    PE_rows_index: dict mapping non-zero count -> list of output row numbers
      (mutated).
  """
  # Without this declaration the `+=` statements below make the cursors local
  # names and the first read of them raises UnboundLocalError.
  global data_index_A, col_index_A, pointer_index_A, output_row_index

  for _ in range(8):
    if pointer_index_A >= mat_csrA_len:
      break # if there are no more rows to be read

    numOfElems = (mat_csr.indptr[pointer_index_A + 1] - mat_csr.indptr[pointer_index_A])
    data_row_A = mat_csr.data[data_index_A:data_index_A + numOfElems]
    # Bound the column slice with its own cursor rather than the data cursor.
    row_A = mat_csr.indices[col_index_A:col_index_A + numOfElems]

    PE_rows_read.setdefault(numOfElems, []).append(dict(zip(row_A, data_row_A)))
    PE_rows_index.setdefault(numOfElems, []).append(output_row_index)

    data_index_A += numOfElems
    col_index_A += numOfElems
    pointer_index_A += 1
    output_row_index += 1

def sort_computer_rows(data_index_A, col_index_A, pointer_index_A, output_row_index, batch_size=8):
  """Read a batch of rows from ``mat_csr`` and queue them, densest first.

  Up to ``batch_size`` rows are decoded into ``{column: value}`` dicts and
  grouped by their number of non-zeros. The groups are then appended to the
  global ``PE_lst`` in descending order of non-zero count, with the matching
  output row numbers appended to the global ``PE_output_index``. Rows with the
  same non-zero count keep their original relative order.

  Args:
    data_index_A: offset of the next unread entry in ``mat_csr.data``.
    col_index_A: offset of the next unread entry in ``mat_csr.indices``.
    pointer_index_A: index of the next unread row in ``mat_csr.indptr``.
    output_row_index: output row number of the next unread row.
    batch_size: maximum number of rows to read in this call (default 8).

  Returns:
    Tuple ``(data_index_A, col_index_A, pointer_index_A, output_row_index)``
    with every cursor advanced past the rows that were read.
  """
  PE_rows_read = dict()   # non-zero count -> list of row dicts
  PE_rows_index = dict()  # non-zero count -> list of output row numbers

  for _ in range(batch_size):
    if pointer_index_A >= mat_csrA_len:
      break # if there are no more rows to be read

    numOfElems = (mat_csr.indptr[pointer_index_A + 1] - mat_csr.indptr[pointer_index_A])
    data_row_A = mat_csr.data[data_index_A:data_index_A + numOfElems]
    # Bound the column slice with its own cursor rather than the data cursor.
    row_A = mat_csr.indices[col_index_A:col_index_A + numOfElems]

    PE_rows_read.setdefault(numOfElems, []).append(dict(zip(row_A, data_row_A)))
    PE_rows_index.setdefault(numOfElems, []).append(output_row_index)

    data_index_A += numOfElems
    col_index_A += numOfElems
    pointer_index_A += 1
    output_row_index += 1

  # Hand the densest rows to the PEs first.
  for k in sorted(PE_rows_read, reverse=True):
    PE_lst.extend(PE_rows_read[k])
    PE_output_index.extend(PE_rows_index[k])

  return (data_index_A, col_index_A, pointer_index_A, output_row_index)





In [55]:
# A compressed matrix's indptr has one more entry than it has compressed
# rows (CSR) or columns (CSC).
mat_csrB_len = len(mat_csrB.indptr) - 1  # number of columns of B
mat_csrA_len = len(mat_csr.indptr) - 1   # number of rows of A
output_mat_len = len(mat_csrB.indptr) - 1
PE_num = 0
# acts as the buffer that stores rows to be allocated to PEs
PE_lst = []
PE_output_index = []
# matrix to store the final output: one row per row of A, one column per
# column of B (previously both dimensions used the column count of B, which is
# only correct for square operands)
output_mat2 = [[0 for _ in range(mat_csrB_len)] for _ in range(mat_csrA_len)]
data_index_B = 0
row_index_B = 0
pointer_index_B = 0
output_col_index = 0
output_row_index = 0

# NOTE: PE_stats is not reset here, so re-running this cell without re-running
# the PE_stats initialisation cell accumulates cycles across runs.
for j in range(mat_csrB_len):
  # Decode col of matrix B from csc format
  numOfElems_B = (mat_csrB.indptr[pointer_index_B + 1] - mat_csrB.indptr[pointer_index_B])
  data_row_B = mat_csrB.data[data_index_B:data_index_B + numOfElems_B]
  # Bound the row-index slice with its own cursor rather than the data cursor.
  col_B = mat_csrB.indices[row_index_B:row_index_B + numOfElems_B]

  # The decoded row indices and corresponding data values are zipped into a
  # dictionary to easily pattern match with rows when computing inner product
  B_dict = dict(zip(col_B, data_row_B))
  data_index_B += numOfElems_B
  row_index_B += numOfElems_B
  pointer_index_B += 1

  # Every column of B is multiplied against all rows of A, so the read cursors
  # over A restart from the beginning for each column.
  data_index_A = 0
  col_index_A = 0
  pointer_index_A = 0
  output_row_index = 0
  for i in range(mat_csrA_len):
    # Refill the row buffer with the next sorted batch once it has drained
    if not PE_lst:
      data_index_A, col_index_A, pointer_index_A, output_row_index = sort_computer_rows(data_index_A, col_index_A, pointer_index_A, output_row_index)
    # allocate the next buffered row to a PE using round-robin strategy
    PE_mult(output_col_index)
    PE_num = (PE_num + 1) % total_PEs
  output_col_index += 1



In [56]:
# Dense view of the simulator's result.
output_mat2_reshape = np.asarray(output_mat2)
# Reference product built from the operands the simulator actually consumed
# (mat_csr x mat_csrB). Using `m` here is wrong whenever the small-example cell
# has replaced mat_csr/mat_csrB, and it also fails if `m` was never loaded.
correct_output = mat_csr.dot(mat_csrB).toarray()
# Validates if matrix multiplication is done correctly
print(np.allclose(output_mat2_reshape, correct_output))
# prints the number of cycles per PE, max and min number of non-zeros per PE
print(PE_stats)

True
[{1: 6769529, 2: 1700, 3: 4}, {1: 6626732, 2: 23, 3: 2}]


## Ideas


1. Need to read some rows together (8 for now based on our memory bandwidth)
2. Each of those rows read together should be inserted in sorted order in the PE_lst. 


