# Homework 08 - Implementation
### Scott Chase Waggener

# Problem 4

In [1]:
import tensorflow as tf
print('Requires Tensorflow 2.0 Alpha')
print('Tensorflow: %s' % tf.__version__)

Requires Tensorflow 2.0 Alpha
Tensorflow: 2.0.0-dev20190401


### Defining the Model

We can use high level APIs in Tensorflow to build one of several well known models.
Here we use MobileNetV2.

In [2]:
input_tensor = tf.keras.Input(shape=(512, 1024, 3), batch_size=1)

# Set weights=None or input_shape must match trained shape
# Set include_top=False to generate a headless model
model = tf.keras.applications.MobileNetV2(
    input_tensor=input_tensor,
    weights=None  
)
output_tensor = model(input_tensor)

We can then iterate over the model layers to retrieve shape and type information

In [3]:
FMT = "%-22s : %15s -> %-15s"
for layer in model.layers:
    name = type(layer).__name__
    inp= layer.input_shape 
    out= layer.output_shape
    msg = FMT % (name, inp, out)
    print(msg)

InputLayer             : [(1, 512, 1024, 3)] -> [(1, 512, 1024, 3)]
ZeroPadding2D          : (1, 512, 1024, 3) -> (1, 513, 1025, 3)
Conv2D                 : (1, 513, 1025, 3) -> (1, 256, 512, 32)
BatchNormalization     : (1, 256, 512, 32) -> (1, 256, 512, 32)
ReLU                   : (1, 256, 512, 32) -> (1, 256, 512, 32)
DepthwiseConv2D        : (1, 256, 512, 32) -> (1, 256, 512, 32)
BatchNormalization     : (1, 256, 512, 32) -> (1, 256, 512, 32)
ReLU                   : (1, 256, 512, 32) -> (1, 256, 512, 32)
Conv2D                 : (1, 256, 512, 32) -> (1, 256, 512, 16)
BatchNormalization     : (1, 256, 512, 16) -> (1, 256, 512, 16)
Conv2D                 : (1, 256, 512, 16) -> (1, 256, 512, 96)
BatchNormalization     : (1, 256, 512, 96) -> (1, 256, 512, 96)
ReLU                   : (1, 256, 512, 96) -> (1, 256, 512, 96)
ZeroPadding2D          : (1, 256, 512, 96) -> (1, 257, 513, 96)
DepthwiseConv2D        : (1, 257, 513, 96) -> (1, 128, 256, 96)
BatchNormalization     : (1, 128, 25

This takes us through the graph portion of the problem. 

### Hardware Parameters

Here we add the given hardware parameters directly as a `dict`

In [4]:
class Flags():
    def __init__(self, init):
        for k, v in init.items():
            if type(v) is dict and type(next(iter(v.keys()))) is not int:
                init[k] = Flags(v)
        self.__dict__ = init

hw = Flags({
    'ddr': {
        'bits': 64,
        'freq': 3200e6,
        'avail': 0.5,
        'eff': 0.8
    },
    'mem': {
        'off': 1e10,
        'on': 4e6
    },
    'comp': {
        'freq': 1e9,
        'mat': {
            8: (32, 32, 32),
            16: (32, 16, 16),
            32: (32, 8, 8)
        },
        'ticks_per_tile': 32,
        'vec': {
            8: 32,
            16: 16,
            32: 8
        },
        'ticks_per_vec': 1
    }
})

### Computation

Next we need functions to compute the various required metrics. Specifically,
we need functions that map Keras layers to their corresponding serial 
and parallel times

In [5]:
# Hardcode
precision = 16
mat_tile = hw.comp.mat[precision]
vec_tile = hw.comp.vec[precision]

def time(layer) -> float:
    name =type(layer).__name__
    if name in ['Conv2D', 'DepthwiseConv2D', 'Dense']:
        return matrix_time(layer)
    else:
        return vector_time(layer)


def matrix_time(layer) -> (float, float):
    name = type(layer).__name__

    # Set Fr, Fc for depthwise vs regular
    if name is 'Conv2D':
        Fr, Fc = (3, 3)
    else:
        Fr, Fc = (1, 1)

    # Set convolution dimensions
    Lr, Lc = layer.output_shape[1], layer.output_shape[2]
    No, Ni = layer.output_shape[-1], layer.input_shape[-1]
    M_conv = No 
    K_conv = Ni * Fr * Fc
    N_conv = Lr * Lc

    # Tile dimensions of matrix matrix multiply primitive
    M, N, K = mat_tile

    # Calculate required tiles as ceil of conv2d dims / primitive dims
    tiles = [
            (c // m + 1) 
            for c, m in zip([M_conv, N_conv, K_conv], [M, N, K])
    ]

    # Calculate compute time
    tile_product = 1
    for tile_count in tiles:
        tile_product *= tile_count 
    return tile_product * hw.comp.ticks_per_tile / hw.comp.freq * 1e6


def data_time(layer) -> float:

    # Weights all marked off device
    weight_time = precision / 8 * layer.count_params() / hw.ddr.freq

    if name in ['Add']:
        feature_size = precision / 8 * layer.input[0].shape.num_elements()
    elif name in ['InputLayer']:
        feature_size = precision / 8 * tf.TensorShape(layer.input.shape).num_elements()
    else:
        feature_size = precision / 8 * layer.input.shape.num_elements()

    if feature_size > hw.mem.on or name is 'InputLayer':
        feature_time = feature_size / hw.ddr.freq
    else:
        feature_time = 0

    return (weight_time + feature_time) * 1e6


def vector_time(layer) -> (float, float):

    # Add gives two input shapes so pull out one
    if name is 'Add':
        shape = layer.input_shape[0][1:]
    else:
        shape = layer.input_shape

    N = shape[0]
    return (N // vec_tile + 1) * hw.comp.ticks_per_tile / hw.comp.freq * 1e6

### Result

We can then loop over the layers and extract the relevant times

In [6]:
HEAD = "%-22s : %7s / %7s" % ('Layer', 'Comp(us)', 'Data(us)')
print(HEAD)
print('=' * len(HEAD))

FMT = "%-22s : %-8.2f / %8.2f"
comp_total = 0
data_total = 0
for layer in model.layers: #1: skips input layer
    name = type(layer).__name__

    if name in ['Conv2D', 'DepthwiseConv2D']:
        comp = matrix_time(layer)
    elif name in ['InputLayer']:
        comp = 0
    else:
        comp = vector_time(layer)

    data = data_time(layer)
    msg = FMT % (name, comp, data)
    print(msg)

    comp_total += comp
    data_total += data

print('=' * len(HEAD))
print(FMT % ('Total', comp_total, data_total))

Layer                  : Comp(us) / Data(us)
InputLayer             : 0.00     /   983.04
ZeroPadding2D          : 0.03     /     0.00
Conv2D                 : 1048.70  /     0.54
BatchNormalization     : 0.03     /  2621.52
ReLU                   : 0.03     /  2621.44
DepthwiseConv2D        : 1573.06  /  2621.62
BatchNormalization     : 0.03     /  2621.52
ReLU                   : 0.03     /  2621.44
Conv2D                 : 4981.34  /  2621.76
BatchNormalization     : 0.03     /  1310.76
Conv2D                 : 10487.04 /  1311.68
BatchNormalization     : 0.03     /  7864.56
ReLU                   : 0.03     /  7864.32
ZeroPadding2D          : 0.03     /  7864.32
DepthwiseConv2D        : 1835.90  /  7911.00
BatchNormalization     : 0.03     /  1966.32
ReLU                   : 0.03     /  1966.08
Conv2D                 : 3606.24  /  1967.52
BatchNormalization     : 0.03     /     0.06
Conv2D                 : 4589.76  /     2.16
BatchNormalization     : 0.03     /  2949.48
ReLU      

One can the trivially compute bounds in the serial and parallel case by considering
the sum of times for serial operations and the max of times for parallel operations.