In [1]:
import pandas as pd
import numpy as np
from loaders import *
from workloads import *

In [2]:
# Display the architecture template. Its {{...}} placeholders (gpu_meshX, pe_meshX, ...)
# are meant to be filled in from this notebook rather than edited in the file.
show_config('designs/data_parallel/arch.yaml')

# Please do not modify this file. If there are double-curly-brace-enclosed
# statements, they are placeholders that should be set from the notebooks.
architecture:
  version: 0.4
  nodes:
  - !Container
    name: data_parallel_arch
    attributes:
      # Top-level attributes inherited by all components unless overridden
      technology: "45nm"
      global_cycle_seconds: 1e-9
      datawidth: 16

  - !Component
    name: disk                 # disk is the source of all datatypes
    class: DRAM                
    attributes:
      width: 64                # width in bits
      datawidth: datawidth 
      depth: 999999

  - !Container
    name: GPU
    spatial: {meshX: {{gpu_meshX}}, meshY: {{gpu_meshY}}}
    
  - !Component
    name: self_memory
    class: SRAM
    attributes:
      width: 128
      depth: 999999
      datawidth: datawidth
      n_banks: 1
      n_rdwr_ports: 2

  - !Container
    name: PE
    spatial: {meshX: {{pe_meshX}}, meshY: {{pe_meshY}}}

  - !Component
    n

In [3]:
# Display the mapping template. Its {{...}} placeholders (disk_factor_N, GPU_spatial_factor_M, ...)
# are meant to be filled in from this notebook rather than edited in the file.
show_config('designs/data_parallel/map.yaml')

# Please do not modify this file. If there are double-curly-brace-enclosed
# statements, they are placeholders that should be set from the notebooks.
mapping:
- target: disk
  type: temporal
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{disk_factor_N}}
  - M={{disk_factor_M}}
  - C={{disk_factor_C}}
  permutation: [S, R, Q, P, C, M, N] # don't change this

- target: GPU
  type: spatial  # spatial constraint specification
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{GPU_spatial_factor_N}}
  - M={{GPU_spatial_factor_M}}
  - C={{GPU_spatial_factor_C}}
  permutation: [N, C, M, R, S, P, Q]
  # tells at which index should the dimensions be mapped to Y (GPU cols),
  # the dimensions before that index all should map to X (GPU rows)
  split: 1
  
- target: self_memory
  type: temporal
  factors: 
  - P=1
  - Q=1
  - R=1
  - S=1
  - N={{self_memory_factor_N}}
  - M={{self_memory_factor_M}}
  - C={{self_memory_factor_C}}
  permutation: [S, R, Q, P, C, M, N] # don't change this

- tar

In [4]:
# Mesh sizes for the {{gpu_meshX}}/{{gpu_meshY}} and {{pe_meshX}}/{{pe_meshY}}
# placeholders of designs/data_parallel/arch.yaml: a 1x1 GPU mesh holding a 4x4 PE mesh.
ARCH_CONFIG = {
    'gpu_meshX': 1,
    'gpu_meshY': 1,
    'pe_meshX': 4,
    'pe_meshY': 4,
}

In [5]:
# Do not change this configuration!
# Values for the {{...}} factor placeholders of designs/data_parallel/map.yaml,
# grouped by the mapping target they belong to.
config = {
    # disk (temporal)
    'disk_factor_N': 1,
    'disk_factor_M': 1,
    'disk_factor_C': 1,
    # remaining memory levels
    'other_memories_factor_N': 1,
    'other_memories_factor_M': 1,
    'other_memories_factor_C': 1,
    # GPU (spatial)
    'GPU_spatial_factor_M': 1,
    'GPU_spatial_factor_C': 1,
    'GPU_spatial_factor_N': 1,
    # self_memory (temporal)
    'self_memory_factor_N': 1,
    'self_memory_factor_M': 2,
    'self_memory_factor_C': 1,
    # PE array (spatial)
    'PE_spatial_factor_M': 4,
    'PE_spatial_factor_C': 4,
    # scratchpad
    'scratchpad_factor_N': 1,
}

# Build the complete parameter set handed to the model run.
# Sources are layered in order, so a later source wins on any key clash:
# mapping factors -> architecture mesh sizes -> conv2 workload -> batch_size override.
full_config = dict(config)
full_config.update(ARCH_CONFIG)
full_config.update(conv2)
full_config['batch_size'] = 1  # overwrite conv2 batch_size

# Evaluate the data-parallel design with the merged configuration; the
# architecture and mapping templates receive their placeholder values from full_config.
result = run_timeloop_model(
    full_config,
    architecture='designs/data_parallel/arch.yaml',
    mapping='designs/data_parallel/map.yaml',
    problem='layer_shapes/workload.yaml'
)
# Read the stats report (presumably written into ./output_dir/ by the run above)
# inside a context manager so the file handle is closed deterministically
# instead of being left open for the lifetime of the kernel.
with open('./output_dir/timeloop-model.stats.txt', 'r') as stats_file:
    stats = stats_file.read()
mapping = result.mapping

[INFO] 2025-04-15 06:06:21,995 - pytimeloop.accelergy_interface - Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


INFO:pytimeloop.accelergy_interface:Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


In [6]:
# Print the text form of the mapping returned by the run (one loop-nest section per storage level).
print(mapping)

disk [ Weights:800 (800) Inputs:4096 (4096) Outputs:6272 (6272) ] 
self_memory [ Weights:800 (800) Inputs:4096 (4096) Outputs:6272 (6272) ] 
------------------------------------------------------------------------
| for M in [0:2)

inter_PE_spatial [ ] 
--------------------
|   for M in [0:4) (Spatial-Y)
|     for C in [0:4) (Spatial-X)

scratchpad [ Weights:25 (25) ] 
------------------------------
|       for R in [0:5)
|         for S in [0:5)
|           for P in [0:28)
|             for Q in [0:28)

weight_reg [ Weights:1 (1) ] 
input_activation_reg [ Inputs:1 (1) ] 
output_activation_reg [ Outputs:1 (1) ] 
---------------------------------------
|               << Compute >>



In [7]:
def get_energy(result, workload, access_energy_factor=1e-9):
    """Scale a simulated energy result up to the workload's batch size.

    The value returned is::

        batch_size * (result.energy - result.per_component_energy['disk'])
            + (batch_size - 1) * get_weight_size(workload) * access_energy_factor

    i.e. the non-disk energy of the simulated run repeated ``batch_size``
    times, plus a weight-size-proportional term that vanishes when
    ``batch_size`` is 1.
    NOTE(review): the extra term presumably models moving the weights between
    the data-parallel replicas -- confirm against the assignment text.

    Args:
        result: Model result exposing ``energy`` and a ``per_component_energy``
            mapping that contains a ``'disk'`` entry.
        workload: Mapping with a ``'batch_size'`` entry; it is also passed to
            ``get_weight_size``.
        access_energy_factor: Energy charged per unit of weight size in the
            extra term. The default ``1e-9`` is the placeholder value that
            was marked "Change this"; pass a different value to override it
            without editing this function.

    Returns:
        The scaled energy, in the same units as ``result.energy``.

    Raises:
        KeyError: If ``workload`` has no ``'batch_size'`` entry or
            ``result.per_component_energy`` has no ``'disk'`` entry.
    """
    batch_size = workload['batch_size']
    # The disk component is excluded from the part that scales with the batch.
    non_disk_energy = result.energy - result.per_component_energy['disk']
    weight_access_energy = (batch_size - 1) * get_weight_size(workload) * access_energy_factor
    return batch_size * non_disk_energy + weight_access_energy

In [8]:
def get_latency(result, workload, access_latency_factor=1e-9):
    """Add a batch-dependent weight-access term to a simulated latency.

    The value returned is::

        result.latency
            + ((batch_size - 1) / batch_size) * get_weight_size(workload) * access_latency_factor

    The extra term is zero when ``batch_size`` is 1 and approaches
    ``get_weight_size(workload) * access_latency_factor`` as the batch grows.
    NOTE(review): the extra term presumably models the time spent moving the
    weights between data-parallel replicas -- confirm against the assignment text.

    Args:
        result: Model result exposing ``latency``.
        workload: Mapping with a ``'batch_size'`` entry; it is also passed to
            ``get_weight_size``.
        access_latency_factor: Latency charged per unit of weight size in the
            extra term. The default ``1e-9`` is the placeholder value that
            was marked "Change this"; pass a different value to override it
            without editing this function.

    Returns:
        The adjusted latency, in the same units as ``result.latency``.

    Raises:
        KeyError: If ``workload`` has no ``'batch_size'`` entry.
        ZeroDivisionError: If ``workload['batch_size']`` is 0.
    """
    batch_size = workload['batch_size']
    # Fraction (batch_size - 1) / batch_size: 0 for a single sample, tends to 1 for large batches.
    weight_share = (batch_size - 1) / batch_size
    return result.latency + weight_share * get_weight_size(workload) * access_latency_factor

In [9]:
# Energy for conv2 at conv2's own batch_size; the simulated run itself used batch_size=1.
get_energy(result, conv2)

0.000584933953536

In [10]:
# Latency for conv2 at conv2's own batch_size; the simulated run itself used batch_size=1.
get_latency(result, conv2)

3.995e-05