In [6]:
from io import StringIO
from scipy.io import mmread
from scipy.io import mminfo
from google.colab import drive
drive.mount("/content/drive")
datAddr = "/content/drive/MyDrive/CMSC818J_Test/"
from scipy.sparse import csr_matrix
from scipy import sparse
import numpy as np
import sys
import os
import csv

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Optimized Architecture 
 

## Assumptions
1. Each multiplication and accumulation takes 3 cycles
2. Reading the rows from memory takes 2 cycles
3. Storing the result of multiplication and accumulation takes 1 cycle. No merging is required, since we are implementing the inner-product dataflow.
4. PEs write back the output in parallel and can read the input in parallel
5. We sort 8 rows at a time, because we assume that the available memory bandwidth allows us to read 8 rows at a time.

In [None]:
# Integer keys used to index each PE's statistics dictionary.
BUSY_CYCLES, MAX_NUM_NON_ZEROS, MIN_NUM_NON_ZEROS, IDLE_CYCLES = 1, 2, 3, 4
# Upper bound on the number of rows read per batch by sort_computer_rows.
# NOTE(review): a later cell reassigns this to 16; whichever cell ran last wins.
FIFO_BUFFER_LENGTH = 8

# Number of PEs available. Can change this number as required.
total_PEs = 2
# One independent statistics dictionary per PE; the minimum starts at the
# largest representable size so that any observed row length can replace it.
PE_stats = [
  {
    BUSY_CYCLES: 0,
    MAX_NUM_NON_ZEROS: 0,
    MIN_NUM_NON_ZEROS: sys.maxsize,
    IDLE_CYCLES: 0,
  }
  for _ in range(total_PEs)
]
PE_stats

[{1: 0, 4: 0, 2: 0, 3: 9223372036854775807},
 {1: 0, 4: 0, 2: 0, 3: 9223372036854775807}]

In [9]:
# Integer keys used to index each PE's statistics dictionary.
BUSY_CYCLES, MAX_NUM_NON_ZEROS, MIN_NUM_NON_ZEROS, IDLE_CYCLES = range(1, 5)
# Upper bound on the number of rows read per batch by sort_computer_rows.
FIFO_BUFFER_LENGTH = 16

# Cycle costs used by the simulation: the initial read, one multiplication
# (charged per element), and one accumulation.
CYCLES_TO_READ, CYCLES_TO_MUL, CYCLES_TO_ACCUMULATE = 8, 4, 2

In [3]:
def add_idle_cycles (cycles_num):
  """Charge `cycles_num` idle cycles to each of the first `total_PEs` PEs.

  Reads the globals `total_PEs`, `PE_stats` and `IDLE_CYCLES`, and updates
  the `PE_stats` entries in place.
  """
  pe_index = 0
  while pe_index < total_PEs:
    stats = PE_stats[pe_index]
    stats[IDLE_CYCLES] = stats[IDLE_CYCLES] + cycles_num
    pe_index += 1

In [4]:
# Simulates PE MAC operations
def PE_mult(output_col_num, TOTAL_NUMBER_OF_CYCLES, PE_num):
  """Simulate one PE computing a single inner product (row of A . column of B).

  Pops the next queued row dictionary from the global `PE_lst` and its output
  row number from `PE_output_index`, multiplies it against the current column
  dictionary `B_dict`, stores the result in `output_mat2`, and updates the
  statistics of PE `PE_num` in `PE_stats`.

  Args:
    output_col_num: column of `output_mat2` that receives the result.
    TOTAL_NUMBER_OF_CYCLES: accepted for call compatibility; not used here.
    PE_num: index into `PE_stats` of the PE doing the work.
  """
  PE_to_process = PE_lst.pop(0)
  row_num = PE_output_index.pop(0)
  stats = PE_stats[PE_num]

  # Optimization for inner product: iterate over whichever operand has fewer
  # non-zeros and probe the other one.
  if len(B_dict) < len(PE_to_process):
    shorter, longer = B_dict, PE_to_process
  else:
    shorter, longer = PE_to_process, B_dict

  accumulation = 0
  for col, data in shorter.items():
    if col in longer:
      # Multiply and then accumulate; only matching indices cost busy cycles.
      accumulation += longer[col] * data
      stats[BUSY_CYCLES] += CYCLES_TO_MUL

  output_mat2[row_num][output_col_num] = accumulation
  # The accumulation cost is booked under the idle counter, as before.
  stats[IDLE_CYCLES] += CYCLES_TO_ACCUMULATE

  # Track max and min independently. With the previous if/elif, a row that set
  # a new maximum could never update the minimum, so a PE that saw a single
  # row (or strictly growing rows) kept MIN at sys.maxsize.
  num_non_zeros = len(PE_to_process)
  stats[MAX_NUM_NON_ZEROS] = max(stats[MAX_NUM_NON_ZEROS], num_non_zeros)
  stats[MIN_NUM_NON_ZEROS] = min(stats[MIN_NUM_NON_ZEROS], num_non_zeros)
    
      



In [5]:
def sort_computer_rows(data_index_A, col_index_A, pointer_index_A, output_row_index, TOTAL_NUMBER_OF_CYCLES):
  """Read the next batch of rows of A and queue them for the PEs, densest first.

  Decodes up to `FIFO_BUFFER_LENGTH` rows from the global CSR matrix `mat_csr`
  (stopping early once `mat_csrA_len` rows have been consumed), buckets them
  by their number of non-zeros, and appends them to the global `PE_lst` /
  `PE_output_index` queues in descending non-zero order. Rows with the same
  count keep their read order.

  Cycle accounting: rows are charged in groups of `total_PEs`; each group
  costs `max_nnz * CYCLES_TO_MUL + CYCLES_TO_ACCUMULATE`. A group closes when
  `output_row_index` reaches a multiple of `total_PEs`; a partly filled group
  left at the end of the batch is charged as well.

  Args:
    data_index_A: offset of the next row's first value in `mat_csr.data`.
    col_index_A: offset of the next row's first entry in `mat_csr.indices`.
    pointer_index_A: index of the next row in `mat_csr.indptr`.
    output_row_index: output row number of the next row to be read.
    TOTAL_NUMBER_OF_CYCLES: running cycle total to add this batch's cost to.

  Returns:
    Tuple `(data_index_A, col_index_A, pointer_index_A, output_row_index,
    TOTAL_NUMBER_OF_CYCLES)` advanced past the rows that were read.
  """
  rows_by_nnz = dict()     # non-zero count -> list of {column: value} rows
  indices_by_nnz = dict()  # non-zero count -> output row numbers, same order
  max_nonzero_per_PE = 0
  pending_rows = 0         # rows of the current group not yet charged

  # Because of FIFO buffer length, there are lower number of cycles
  for _ in range(FIFO_BUFFER_LENGTH):
    if pointer_index_A >= mat_csrA_len:
      break  # no more rows to be read

    # Cast to a Python int so the running cycle total stays an arbitrary
    # precision integer instead of inheriting the index array's narrow dtype.
    numOfElems = int(mat_csr.indptr[pointer_index_A + 1] - mat_csr.indptr[pointer_index_A])
    data_row_A = mat_csr.data[data_index_A:data_index_A + numOfElems]
    # Slice the column indices with col_index_A on both bounds (the end bound
    # previously used data_index_A).
    row_A = mat_csr.indices[col_index_A:col_index_A + numOfElems]

    rows_by_nnz.setdefault(numOfElems, []).append(dict(zip(row_A, data_row_A)))
    indices_by_nnz.setdefault(numOfElems, []).append(output_row_index)

    data_index_A += numOfElems
    col_index_A += numOfElems
    pointer_index_A += 1
    output_row_index += 1

    # Fold this row into the group's maximum BEFORE testing for the group
    # boundary, so the row that closes the group is not left out.
    max_nonzero_per_PE = max(max_nonzero_per_PE, numOfElems)
    pending_rows += 1
    if output_row_index % total_PEs == 0:
      TOTAL_NUMBER_OF_CYCLES += max_nonzero_per_PE * CYCLES_TO_MUL
      TOTAL_NUMBER_OF_CYCLES += CYCLES_TO_ACCUMULATE  # For accumulation
      max_nonzero_per_PE = 0
      pending_rows = 0

  # Charge a partly filled trailing group; its maximum is local to this call
  # and would otherwise be silently dropped.
  if pending_rows:
    TOTAL_NUMBER_OF_CYCLES += max_nonzero_per_PE * CYCLES_TO_MUL
    TOTAL_NUMBER_OF_CYCLES += CYCLES_TO_ACCUMULATE

  # Hand the rows to the PE queue, densest rows first.
  for nnz in sorted(rows_by_nnz, reverse=True):
    for row_dict, row_index in zip(rows_by_nnz[nnz], indices_by_nnz[nnz]):
      PE_lst.append(row_dict)
      PE_output_index.append(row_index)

  return (data_index_A, col_index_A, pointer_index_A, output_row_index, TOTAL_NUMBER_OF_CYCLES)





In [None]:
# Sweep over PE counts. For every matrix file found under datAddr, simulate the
# inner-product computation of the matrix with itself, validate the product
# against NumPy, and write one CSV row of cycle/PE statistics per matrix.
directory = os.fsencode(datAddr)
for pe_num in range(2, 10, 2):
  # Derive the FIFO depth in the file name from the constant actually in
  # effect, so the results cannot be mislabeled (yields "..._16fifo.csv" when
  # FIFO_BUFFER_LENGTH is 16).
  results_path = ('/content/drive/MyDrive/CMSC818J_Data/Optimized_PE' + str(pe_num)
                  + '_' + str(FIFO_BUFFER_LENGTH) + 'fifo.csv')
  # Keep the CSV handle under its own name; it used to be shadowed by the
  # directory-listing loop variable.
  with open(results_path, 'w', newline='') as results_file:
    writer = csv.writer(results_file)
    for entry in os.listdir(directory):
      filename = os.fsdecode(entry)
      print(filename)

      ## Read the file
      m = mmread(datAddr+filename)
      mat_csr = m.tocsr()   # rows of A are decoded from this
      mat_csrB = m.tocsc()  # columns of B are decoded from this

      total_PEs = pe_num # number of PEs available. Can change this number as required
      # Fresh statistics for every matrix, one dictionary per PE.
      PE_stats = [
        {
          BUSY_CYCLES: 0,
          MAX_NUM_NON_ZEROS: 0,
          MIN_NUM_NON_ZEROS: sys.maxsize,
          IDLE_CYCLES: 0,
        }
        for _ in range(total_PEs)
      ]

      mat_csrB_len = len(mat_csrB.indptr) - 1
      mat_csrA_len = len(mat_csr.indptr) - 1
      output_mat_len = len(mat_csrB.indptr) - 1
      PE_num = 0
      # acts as the buffer that stores rows to be allocated to PEs
      PE_lst = []
      PE_output_index = []
      # matrix to store the file output
      output_mat2 = [[0 for _ in range(output_mat_len)] for _ in range(output_mat_len)]
      data_index_B = 0
      row_index_B = 0
      pointer_index_B = 0
      output_col_index = 0
      output_row_index = 0

      # Total number of cycles is whichever PE took the longest
      TOTAL_NUMBER_OF_CYCLES = 0

      # For reading elements, one time cost rest of them are amortized with PE computations
      TOTAL_NUMBER_OF_CYCLES += CYCLES_TO_READ
      add_idle_cycles(CYCLES_TO_READ)

      for _ in range(mat_csrB_len):
        # Decode col of matrix B from csc format
        numOfElems_B = (mat_csrB.indptr[pointer_index_B + 1] - mat_csrB.indptr[pointer_index_B])
        data_row_B = mat_csrB.data[data_index_B:data_index_B + numOfElems_B]
        # Both slice bounds use row_index_B (the end bound previously mixed in
        # data_index_B).
        col_B = mat_csrB.indices[row_index_B:row_index_B + numOfElems_B]

        # The decoded cols and corresponding data values are zipped into a dictionary
        # data structure to easily pattern match with rows when computing inner product
        B_dict = dict(zip(col_B, data_row_B))
        data_index_B += numOfElems_B
        row_index_B += numOfElems_B
        pointer_index_B += 1

        # Restart the walk over A for every column of B.
        data_index_A = 0
        col_index_A = 0
        pointer_index_A = 0
        output_row_index = 0
        for _ in range(mat_csrA_len):
          # Refill the queue with the next sorted batch of rows once it has drained
          if not PE_lst:
            (data_index_A, col_index_A, pointer_index_A,
             output_row_index, TOTAL_NUMBER_OF_CYCLES) = sort_computer_rows(
                data_index_A, col_index_A, pointer_index_A,
                output_row_index, TOTAL_NUMBER_OF_CYCLES)
          # allocate the next row to a PE using round-robin strategy
          PE_mult(output_col_index, TOTAL_NUMBER_OF_CYCLES, PE_num)
          PE_num = (PE_num + 1) % total_PEs
        output_col_index += 1

      output_mat2_reshape = np.reshape(output_mat2, (output_mat_len, output_mat_len))
      dense = m.todense()
      correct_output = np.dot(dense, dense)
      # Validates if matrix multiplication is done correctly
      print(np.allclose(output_mat2_reshape, correct_output))

      # Busy cycles of the busiest PE
      max_num = max(PE_stats[pe][BUSY_CYCLES] for pe in range(pe_num))

      # CSV row: file name, total cycles, then per PE: busy cycles, max nnz,
      # min nnz, idle cycles.
      row = [filename, TOTAL_NUMBER_OF_CYCLES]
      for pe in range(pe_num):
        stats = PE_stats[pe]
        # assumption is that the rest of the PEs will wait for the busiest PE
        idle_cycles_to_add = max_num - stats[BUSY_CYCLES]
        row.extend([
          stats[BUSY_CYCLES],
          stats[MAX_NUM_NON_ZEROS],
          stats[MIN_NUM_NON_ZEROS],
          stats[IDLE_CYCLES] + idle_cycles_to_add,
        ])
      writer.writerow(row)



