# Answers

## Section 2. Tensor Cores

### Import libraries and setup

In [None]:
import tensorflow as tf
from tensorflow import nn as nn
import os

# Set CUDA_VISIBLE_DEVICES to "0" so that only the first GPU is exposed to this process.
# NOTE(review): this runs after `import tensorflow`; presumably it still takes effect
# because the GPU is not initialized until first use -- confirm on the target setup.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Dilation, padding, and stride are not accounted for!
# They are assumed to be default values
def get_conv2d_MNK(in_tensor, filter, layout):
    """Return the (M, N, K) sizes of the matrix multiply equivalent to a 2D convolution.

    The output size is computed as ``in - (filter - 1)`` per spatial axis, i.e.
    for stride 1, dilation 1 and no padding.

    Args:
        in_tensor: Object with a 4-element ``shape`` laid out according to ``layout``.
        filter: Object with a 4-element ``shape`` of
            (filter_height, filter_width, in_channels, out_channels).
        layout: Either ``"NHWC"`` or ``"NCHW"``.

    Returns:
        Tuple ``(M, N, K)`` where ``M = batch * out_height * out_width``,
        ``N = out_channels`` and ``K = in_channels * filter_width * filter_height``.

    Raises:
        ValueError: If ``layout`` is neither ``"NHWC"`` nor ``"NCHW"``.
    """
    # The channel dimension of the input is intentionally discarded: K is
    # derived from the filter's in_channels below (as the original code did).
    if layout == "NHWC":
        batch_size, in_height, in_width, _ = in_tensor.shape
    elif layout == "NCHW":
        batch_size, _, in_height, in_width = in_tensor.shape
    else:
        # Fail early with a clear message instead of an UnboundLocalError later.
        raise ValueError(
            "layout must be 'NHWC' or 'NCHW', got %r" % (layout,)
        )

    filter_height, filter_width, in_channels, out_channels = filter.shape

    out_height = in_height - (filter_height - 1)
    out_width = in_width - (filter_width - 1)

    M = batch_size * out_height * out_width
    N = out_channels
    K = in_channels * filter_width * filter_height

    return (M, N, K)


def approximate_gflops(in_tensor, filters, layout, time):
    """Estimate the throughput of a 2D convolution in GFLOPS.

    The operation count is taken as ``2 * M * N * K`` for the (M, N, K)
    sizes returned by ``get_conv2d_MNK``.

    Args:
        in_tensor: Convolution input, forwarded to ``get_conv2d_MNK``.
        filters: Convolution filters, forwarded to ``get_conv2d_MNK``.
        layout: Data layout string, forwarded to ``get_conv2d_MNK``.
        time: Elapsed time of one convolution; must be positive.

    Returns:
        ``2 * M * N * K / time`` scaled down by 10**9.

    Raises:
        ValueError: If ``time`` is zero or negative.
    """
    if time <= 0:
        raise ValueError("time must be greater than 0")

    flop_per_gflop = 1000000000
    m_dim, n_dim, k_dim = get_conv2d_MNK(in_tensor, filters, layout)

    # One multiply and one add per (m, n, k) triple.
    total_flop = 2 * m_dim * n_dim * k_dim
    return total_flop / time / flop_per_gflop

### Problem 2

We will apply a 2D convolution with the following settings.

- Layout: NCHW, Batch size = 8
- Input tensor: height = width = 64, channel = 19
- Filter: 3x3
- Output tensor: channel = 131

**Task:** Increase the throughput of the convolution to over 11500 GFLOPS by adjusting the following values.
- Input channel should be in the range `[12, 30]`.
- Output channel should be in the range `[128, 192]`.
- Tensor layout should be `NHWC` or `NCHW`.

In [None]:
# Answer for Problem 2
# Builds a float16 input tensor and filter, times nn.conv2d on them with %timeit,
# and prints the approximate throughput in GFLOPS.

### DO NOT CHANGE THESE VALUES ###
batch_size = 8
in_height = 64
in_width = 64
filter_height = 3
filter_width = 3

### CHANGE HERE! ###
in_channels = 16    # or can be 24
out_channels = 128  # or can be 136, 144, ..., 192 (multiple of 8)
layout = "NHWC"     # NHWC is faster than NCHW in tensor core computation
# The input shape must follow `layout`: here [batch, height, width, channels] for NHWC.
# If `layout` is changed to "NCHW", reorder the dimensions to [batch, channels, height, width].
in_tensor = tf.random.uniform([batch_size, in_height, in_width, in_channels], dtype=tf.dtypes.float16)
# Filter shape is [filter_height, filter_width, in_channels, out_channels].
filters = tf.random.uniform([filter_height, filter_width, in_channels, out_channels], dtype=tf.dtypes.float16)

### DO NOT CHANGE BELOW ###
# Warming up: run the same convolution once through %timeit and discard the result
%timeit -q nn.conv2d(in_tensor, filters, 1, "VALID", data_format=layout)

# Profile! `-o` makes %timeit return its result object; `.average` is used as the
# elapsed time per convolution (presumably in seconds -- see the IPython docs).
elapsed_time = %timeit -o -q nn.conv2d(in_tensor, filters, 1, "VALID", data_format=layout)
print("out_channels %d: %f GFLOPS" % (out_channels, approximate_gflops(in_tensor, filters, layout, elapsed_time.average)))

### Plot for input channel 16, output channel 128 ~ 156

![Graph for Problem 2](graphs/ex2.png "Graph for Problem 2")