In [1]:
import pandas as pd
import numpy as np
from loaders import *

In [2]:
# Display the architecture template of the zero_parallel design.
# NOTE(review): show_config is not defined in this notebook; presumably it
# comes from the wildcard `from loaders import *` above — confirm.
show_config('designs/zero_parallel/arch.yaml')

# Please do not modify this file. If there are double-curly-brace-enclosed
# statements, they are placeholders that should be set from the notebooks.
architecture:
  version: 0.4
  nodes:
  - !Container
    name: zero_parallel_arch
    attributes:
      # Top-level attributes inherited by all components unless overridden
      technology: "45nm"
      global_cycle_seconds: 1e-9
      datawidth: 16

  - !Component
    name: disk                 # disk is the source of all datatypes
    class: DRAM                # assume DRAM is large enough to store all the data, so no depth specification needed
    attributes:
      width: 64                # width in bits
      datawidth: datawidth 

  - !Component
    name: other_memories       # other_memories are the sources of weights not stored in self
    class: DRAM                # assume DRAM is large enough to store all the data, so no depth specification needed
    attributes:
      width: 64                # width in bits
      datawidth:

In [3]:
# Display the mapping template of the zero_parallel design. The
# double-curly-brace placeholders in its output are the values that the
# notebook supplies when the model is run.
show_config('designs/zero_parallel/map.yaml')

# Please do not modify this file. If there are double-curly-brace-enclosed
# statements, they are placeholders that should be set from the notebooks.
mapping:
- target: disk
  type: temporal
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{disk_factor_N}}
  - M={{disk_factor_M}}
  - C={{disk_factor_C}}
  permutation: [S, R, Q, P, C, M, N] # don't change this

- target: other_memories
  type: dataspace
  keep: [Weights]
  bypass: [Inputs, Outputs]
- target: other_memories
  type: temporal
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{other_memories_factor_N}}
  - M={{other_memories_factor_M}}
  - C={{other_memories_factor_C}}
  permutation: [S, R, Q, P, C, M, N] # don't change this
  
- target: GPU
  type: spatial  # spatial constraint specification
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{GPU_spatial_factor_N}}
  - M={{GPU_spatial_factor_M}}
  - C={{GPU_spatial_factor_C}}
  permutation: [N, C, M, R, S, P, Q]
  # tells at which index should the dimensions be mapped t

In [4]:
# Mesh-size parameters for the architecture. They are merged with the mapping
# factors into a single dict before being handed to the model run.
# NOTE(review): presumably gpu_mesh* sizes the GPU array and pe_mesh* the PE
# array inside each GPU — confirm against the architecture template.
ARCH_CONFIG = dict(
    gpu_meshX=16,
    gpu_meshY=1,
    pe_meshX=4,
    pe_meshY=4,
)

In [5]:
# Mapping factors that fill the double-curly-brace placeholders of the YAML
# templates (e.g. {{disk_factor_N}} in map.yaml).
config = dict( # Do not change this configuration!
    disk_factor_N=1,
    disk_factor_M=1,
    disk_factor_C=1,
    other_memories_factor_N=1,
    other_memories_factor_M=1,
    other_memories_factor_C=1,
    GPU_spatial_factor_M=1,
    GPU_spatial_factor_C=1,
    GPU_spatial_factor_N=16,
    self_memory_factor_N=1,
    self_memory_factor_M=2,
    self_memory_factor_C=1,
    PE_spatial_factor_M=4,
    PE_spatial_factor_C=4,
    scratchpad_factor_N=1,
)

# Single dict with both the mapping factors and the mesh sizes. On a key
# collision the ARCH_CONFIG entry would win because it is unpacked last
# (the two dicts currently share no keys).
full_config = {
    **config,
    **ARCH_CONFIG
}

# Evaluate the zero_parallel design on the conv2 layer shape.
result = run_timeloop_model(
    full_config,
    architecture='designs/zero_parallel/arch.yaml',
    mapping='designs/zero_parallel/map.yaml',
    problem='layer_shapes/conv2.yaml'
)

# Read the stats report through a context manager so the file handle is
# closed deterministically instead of being left to garbage collection.
# NOTE(review): the report is presumably written by the run above — confirm.
with open('./output_dir/timeloop-model.stats.txt', 'r') as stats_file:
    stats = stats_file.read()
mapping = result.mapping

[INFO] 2025-04-14 03:45:31,713 - pytimeloop.accelergy_interface - Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


INFO:pytimeloop.accelergy_interface:Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


In [6]:
# Show the mapping taken from the model result (`result.mapping`); it prints
# as a loop nest with one section per storage level.
print(mapping)

disk [ Weights:800 (800) Inputs:65536 (65536) Outputs:100352 (100352) ] 
other_memories [ Weights:800 (800) ] 
inter_GPU_spatial [ ] 
---------------------
| for N in [0:16) (Spatial-X)

self_memory [ Weights:800 (800) Inputs:4096 (4096) Outputs:6272 (6272) ] 
------------------------------------------------------------------------
|   for M in [0:2)

inter_PE_spatial [ ] 
--------------------
|     for M in [0:4) (Spatial-Y)
|       for C in [0:4) (Spatial-X)

scratchpad [ Weights:25 (25) ] 
------------------------------
|         for R in [0:5)
|           for S in [0:5)
|             for P in [0:28)
|               for Q in [0:28)

weight_reg [ Weights:1 (1) ] 
input_activation_reg [ Inputs:1 (1) ] 
output_activation_reg [ Outputs:1 (1) ] 
---------------------------------------
|                 << Compute >>



In [7]:
# Dump the raw text of ./output_dir/timeloop-model.stats.txt that was read
# into `stats` after the model run.
print(stats)

Buffer and Arithmetic Levels
----------------------------
Level 0
-------
=== mac ===

    SPECS
    -----
    Word bits             : 16
    Instances             : 256 (64*4)
    Compute energy        : 3.27 pJ

    STATS
    -----
    Utilized instances      : 256
    Computes (total)        : 10035200
    Cycles                  : 39200
    Energy (total)          : 32865280.00 pJ
    Area (total)            : 441984.00 um^2

Level 1
-------
=== output_activation_reg ===

    SPECS
    -----
        Technology                      : SRAM
        Size                            : 1
        Word bits                       : 16
        Block size                      : 1
        Cluster size                    : 1
        Instances                       : 256 (64*4)
        Shared bandwidth                : -
        Read bandwidth                  : -
        Write bandwidth                 : -
        Multiple buffering              : 1.00
        Effective size                  : 1