# Generating Architectures

In [1]:
pattern = """architecture:
  version: 0.4
  nodes:
  - !Container
    name: System
    attributes:
      technology: "40nm"
      global_cycle_seconds: 1e-9

  - !Component
    name: MainMemory
    class: DRAM
    attributes:
      width: 256
      datawidth: 8

  - !Component
    name: GlobalBuffer
    class: SRAM
    attributes:
      depth: {_globalBufferSize}
      width: 64
      datawidth: 8

  - !Container
    name: PE
    spatial: {{meshX: {_sizeX}, meshY: {_sizeY}}}

  - !Component
    name: RegisterFile
    class: regfile
    attributes:
      depth: {_regFileSize}
      width: 8
      datawidth: 8

  - !Component
    name: MACC
    class: intmac
    attributes:
      datawidth: 8"""

architectureNamePattern = "customArchitectures/{_sizeX}_{_sizeY}_{_globalBufferSize}_{_regFileSize}.yaml"

In [27]:
pattern = """architecture:
  # ============================================================
  # Architecture Description
  # ============================================================
  version: 0.4
  nodes: # Top-level is hierarchical
  - !Container # Top-level system
    name: system
    attributes:
      technology: "32nm"
      global_cycle_seconds: 1e-9
  
  - !Component # DRAM main memory
    name: DRAM
    class: DRAM
    attributes:
      type: "LPDDR4"
      width: 64
      datawidth: 8

  - !Container # Eyeriss accelerator
    name: eyeriss

  - !Component # Global buffer for inputs & outputs
    name: shared_glb
    class: smartbuffer_SRAM
    attributes:
      depth: {_globalBufferSize}
      width: 64
      n_banks: 32
      datawidth: 8
      read_bandwidth: 16
      write_bandwidth: 16
    constraints:
      dataspace: {{keep: [Inputs, Outputs], bypass: [Weights]}}

  - !Container # Each column of PEs produces a different psum row
    name: PE_column
    spatial: {{meshX: {_sizeX}}}
    constraints:
      spatial:
        permutation: [N, C, P, R, S, Q, M]
        factors: [N=1, C=1, P=1, R=1, S=1]
        split: 7

  - !Container # Each PE in the column receives a different filter row
    name: PE
    spatial: {{meshY: {_sizeY}}}
    constraints:
      spatial:
        split: 4
        permutation: [N, P, Q, R, S, C, M]
        factors: [N=1, P=1, Q=1, R=1]

  - !Parallel # Input/Output/Weight scratchpads in parallel
    nodes:
    - !Component # Input scratchpad
      name: ifmap_spad
      class: smartbuffer_RF
      attributes:
        depth: {_inputBufferSize}
        width: 16
        datawidth: 8
        read_bandwidth: 2
        write_bandwidth: 2
      constraints:
        dataspace: {{keep: [Inputs]}}
        temporal:
          permutation: [N, M, C, P, Q, R, S]
          factors: [N=1, M=1, C=1, P=1, Q=1, R=1, S=1]

    - !Component # Weight scratchpad
      name: weights_spad
      class: smartbuffer_RF
      attributes:
        depth: {_weightBufferSize}
        width: 16
        datawidth: 8
        read_bandwidth: 2
        write_bandwidth: 2
      constraints:
        dataspace: {{keep: [Weights]}}
        temporal:
          permutation: [N, M, P, Q, S, C, R]
          factors: [N=1, M=1, P=1, Q=1, S=1]

    - !Component # Output scratchpad
      name: psum_spad
      class: smartbuffer_RF
      attributes:
        depth: {_outputBufferSize}
        width: 16
        update_fifo_depth: 2
        datawidth: 16
        read_bandwidth: 2
        write_bandwidth: 2
      constraints:
        dataspace: {{keep: [Outputs]}}
        temporal:
          permutation: [N, C, P, Q, R, S, M] 
          factors: [N=1, C=1, R=1, S=1, P=1, Q=1]

  - !Component # MAC unit
    name: mac
    class: intmac
    attributes:
      multiplier_width: 8
      adder_width: 16"""

architectureNamePattern = "customArchitectures/{_sizeX}_{_sizeY}_{_globalBufferSize}_{_bufferSize}.yaml"

In [28]:
peSizes = [(4,64), (8,32), (16,16), (32,8), (64,4)]
globalBufferSizes = [1024, 2048, 4096, 8192, 16384, 32768]
bufferSizes = [2,4,8,16,24,32,64,128,192,256]

In [29]:
for (sizeX, sizeY) in peSizes:
    for globalBufferSize in globalBufferSizes:
        for bufferSize in bufferSizes:
            architecture = pattern.format(_sizeX = sizeX,
                                            _sizeY = sizeY,
                                            _globalBufferSize = globalBufferSize,
                                            _inputBufferSize = bufferSize,
                                            _weightBufferSize = bufferSize,
                                            _outputBufferSize = bufferSize)
            fileName = architectureNamePattern.format(_sizeX = sizeX,
                                            _sizeY = sizeY,
                                            _globalBufferSize = globalBufferSize,
                                            _bufferSize = bufferSize)
            f = open(fileName, "w")
            f.write(architecture)
            f.close()

# Calculating Results

In [31]:
import os
import timeloopfe.v4 as tl
from joblib import Parallel, delayed
from tqdm.auto import tqdm
#vgg16
#layers = ["00", "02", "04", "07", "10"]
#resnet18
#layers = ["00", "01", "06", "11", "16"]
#all
layers = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"]
#area
#layers = ["00"]

THIS_SCRIPT_DIR = os.getcwd()

print(THIS_SCRIPT_DIR)

architectureDirectory = os.path.join(THIS_SCRIPT_DIR, "customArchitectures5/")
outputDirectory = os.path.join(THIS_SCRIPT_DIR, "outputs5/")

#architectureDirectory = os.path.join(THIS_SCRIPT_DIR, "customArchitectures3Input/")

def run_mapper_with_spec(file, layer):
    filename = file[:-5]
    if not os.path.isdir(os.path.join(THIS_SCRIPT_DIR, outputDirectory + filename + "/" + layer)):
        spec = tl.Specification.from_yaml_files(
                os.path.join(architectureDirectory, file),
                os.path.join("components/*.yaml"),
                os.path.join(THIS_SCRIPT_DIR, "example_designs/layer_shapes/resnet18/" + layer + ".yaml"),
                os.path.join(THIS_SCRIPT_DIR, "mapper.yaml"),
            )
        tl.call_mapper(spec, output_dir=os.path.join(THIS_SCRIPT_DIR, outputDirectory + filename + "/" + layer))

filenames = os.listdir(architectureDirectory)
if ".ipynb_checkpoints" in filenames:
    filenames.remove(".ipynb_checkpoints")

unique_combinations = []
 
for i in range(len(filenames)):
    for j in range(len(layers)):
        unique_combinations.append((filenames[i], layers[j]))

Parallel(n_jobs=8)(
  delayed(run_mapper_with_spec)(file, layer) for file, layer in tqdm(unique_combinations)
)

/home/workspace


  0%|          | 0/126 [00:00<?, ?it/s]

input file: /home/workspace/outputs5/simpleCSMaxDim32x32/16/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/15/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/17/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/13/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/14/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/18/parsed-processed-input.yaml
input file: /home/workspace/outputs5/simpleCSMaxDim32x32/19/parsed-processed-input.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy /home/workspace/outputs5/simpleCSMaxDim32x32/17/parsed

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [148]:
def parseStatsFile(file):
    with open(file) as f:
        lines = [line.rstrip() for line in f]
    idx = lines.index("Summary Stats")
    
    utilizationStr = lines[idx+3]
    utilizationStr = utilizationStr[13:]
    utilizationStr = utilizationStr.strip('%')
    utilization = float(utilizationStr)/100

    cyclesStr = lines[idx+4]
    cyclesStr = cyclesStr[8:]
    cycles = int(cyclesStr)

    energyStr = lines[idx+5]
    energyStr = energyStr[8:]
    energyStr = energyStr[:-3]
    energy = float(energyStr)

    edpStr = lines[idx+6]
    edpStr = edpStr[14:]
    (edpValueStr, edpExponentStr) = edpStr.split("e")
    edpExponent = int(edpExponentStr)
    edpValue = float(edpValueStr)
    edp = edpValue * (10**edpExponent)
    
    return {"utilization": utilization, "cycles": cycles, "energy": energy, "edp": edp}

stats = parseStatsFile(os.path.join(THIS_SCRIPT_DIR, "outputs4/simpleOSMaxDim64x64/11/timeloop-mapper.stats.txt"))
print(stats)

{'utilization': 0.0479, 'cycles': 589824, 'energy': 254.95, 'edp': 150.0}


In [149]:
def calculateDelays(cyclesPerSecond, folder):
    architectureFolders = os.listdir(folder)
    if ".ipynb_checkpoints" in architectureFolders:
        architectureFolders.remove(".ipynb_checkpoints")
    delays = {}
    for architectureFolder in architectureFolders:
        layerFolders = os.listdir(folder + "/" + architectureFolder)
        if ".ipynb_checkpoints" in layerFolders:
            layerFolders.remove(".ipynb_checkpoints")
        layerDelays = []
        for layerFolder in layerFolders:
            stats = parseStatsFile(folder + "/" + architectureFolder + "/" + layerFolder + "/" + "timeloop-mapper.stats.txt")
            delay = stats["cycles"]/cyclesPerSecond
            layerDelays.append(delay)
        delays[architectureFolder] = layerDelays
    return delays

delays = calculateDelays(100000000, os.path.join(THIS_SCRIPT_DIR, "outputs5"))

del delays["simpleCSMaxDim"]
del delays["simpleCSMaxDim64x64"]
del delays["simpleOSMaxDim"]
del delays["simpleOSMaxDim64x64"]

print(delays)

{'simpleCSMaxDim32x32': [0.01229312, 0.00112896, 0.00112896, 0.00112896, 0.00112896, 0.00056448, 0.00112896, 6.272e-05, 0.00112896, 0.00112896, 0.00056448, 0.00112896, 6.272e-05, 0.00112896, 0.00112896, 0.00056448, 0.00112896, 6.272e-05, 0.00112896, 0.00112896, 5.12e-06], 'simpleOSMaxDim32x32': [0.00150528, 0.00147456, 0.00147456, 0.00147456, 0.00147456, 0.00073728, 0.00147456, 8.192e-05, 0.00147456, 0.00147456, 0.00294912, 0.00589824, 0.00032768, 0.00589824, 0.00589824, 0.01179648, 0.02359296, 0.00131072, 0.02359296, 0.02359296, 0.00512]}


In [242]:
def findArchSwitches(delays, currentArch, i):
    for j in range(i, len(delays[currentArch]) + 1):

        if j == len(delays[currentArch]):
            return [[]]
        
        listOfPartialLists = []
        for arch in delays:
            if not arch == currentArch:
                if delays[arch][j] < delays[currentArch][j]:
                    res = findArchSwitches(delays, arch, j+1)
                    for partialList in res:
                        partialList.insert(0, (arch, j))
                        listOfPartialLists.append(partialList)
                            
        if not listOfPartialLists == []:
            for partialList in findArchSwitches(delays, currentArch, j+1):
                listOfPartialLists.append(partialList)
            break
                
    return listOfPartialLists

def findBestTime(delays, reconfigTime):
    listDelays = []
    for startingArch in delays:        
        
        switchList = findArchSwitches(delays, startingArch, 0)

        #remove paths where architecture is switched before layer 0
        
        for l in switchList:
            for arch in delays:
                if (arch, 0) in l:
                    switchList.remove(l)
        

        for l in switchList:
            delaySum = 0
            currentArch = startingArch
            prevSwitch = 0
            for (arch, i) in l:
                for i in range(prevSwitch, i):
                    delaySum = delaySum + delays[currentArch][i]
                delaySum = delaySum + reconfigTime
                currentArch = arch
                prevSwitch = i + 1
            for i in range(prevSwitch, len(delays[currentArch])):
                    delaySum = delaySum + delays[currentArch][i]

            listDelays.append((delaySum, l, startingArch))

    print(listDelays)   
    print(len(listDelays))

    print("------------------")
    print(min(listDelays))
    print("------------------")

#findBestTime(delays, 0.02)

list_a = [1,  1,  5, 5,  10, 10]
list_b = [10, 10, 2, 2,  10, 10]
list_c = [10, 10, 10, 10, 3,  3]
reconfigTime = 4

testDelays = {}
testDelays["A"] = list_a
testDelays["B"] = list_b
testDelays["C"] = list_c

'''
list_d = [1, 2, 4, 1, 2, 4]
list_e = [2, 4, 1, 2, 4, 1]
list_f = [4, 1, 2, 4, 1, 2]

testDelays2 = {}
testDelays2["D"] = list_d
testDelays2["E"] = list_e
testDelays2["F"] = list_f
#3: 20 4: 46 5: 102 6: 225
'''

findBestTime(testDelays, reconfigTime)

[(20, [('B', 2), ('C', 4)], 'A'), (27, [('B', 2), ('C', 5)], 'A'), (30, [('B', 2)], 'A'), (23, [('B', 3), ('C', 4)], 'A'), (30, [('B', 3), ('C', 5)], 'A'), (33, [('B', 3)], 'A'), (22, [('C', 4)], 'A'), (29, [('C', 5)], 'A'), (32, [], 'A'), (30, [('A', 0), ('B', 2), ('C', 5)], 'B'), (26, [('A', 0), ('B', 3), ('C', 4)], 'B'), (36, [('A', 0), ('B', 3)], 'B'), (32, [('A', 0), ('C', 5)], 'B'), (33, [('A', 1), ('B', 2), ('C', 4)], 'B'), (40, [('A', 1), ('B', 2), ('C', 5)], 'B'), (43, [('A', 1), ('B', 2)], 'B'), (36, [('A', 1), ('B', 3), ('C', 4)], 'B'), (43, [('A', 1), ('B', 3), ('C', 5)], 'B'), (46, [('A', 1), ('B', 3)], 'B'), (35, [('A', 1), ('C', 4)], 'B'), (42, [('A', 1), ('C', 5)], 'B'), (45, [('A', 1)], 'B'), (34, [('C', 4)], 'B'), (41, [('C', 5)], 'B'), (44, [], 'B'), (30, [('A', 0), ('B', 2), ('C', 5)], 'C'), (26, [('A', 0), ('B', 3), ('C', 4)], 'C'), (36, [('A', 0), ('B', 3)], 'C'), (32, [('A', 0), ('C', 5)], 'C'), (33, [('A', 1), ('B', 2), ('C', 4)], 'C'), (40, [('A', 1), ('B', 2),