<a href="https://colab.research.google.com/github/Suraj-Sedai/hardware-aware-nn-inference/blob/main/notebooks/experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Environment Setup

In [17]:
import os
# Pin the OpenMP, MKL and OpenBLAS thread pools to a single thread each so the
# timings in this notebook reflect single-core performance.
# NOTE(review): these variables are normally read when the BLAS/OpenMP runtime
# is first loaded, so this cell presumably has to run before numpy is imported
# in the kernel -- confirm on a fresh runtime.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
    }
)

In [18]:
import numpy as np
import time

In [19]:
hidden = 4096
batch = 1
layers = 3  # informational only: forward() spells out its three matmuls explicitly

# Seed the global NumPy RNG so the input and weights are identical on every
# "Restart & Run All"; without a seed each run benchmarks different data.
seed = 0
np.random.seed(seed)

# Input: one row of `hidden` features, stored as float32.
x = np.random.randn(batch, hidden).astype(np.float32)

# Weights: three square (hidden x hidden) float32 matrices, created once up
# front so allocation cost stays out of the timed loop.
W1 = np.random.randn(hidden, hidden).astype(np.float32)
W2 = np.random.randn(hidden, hidden).astype(np.float32)
W3 = np.random.randn(hidden, hidden).astype(np.float32)

# Uninitialised (batch x hidden) buffers, one per layer.
# NOTE(review): the forward() defined in the next cell allocates its own
# results and never writes to these -- confirm nothing else uses them before
# removing.
Y1 = np.empty((batch, hidden), dtype=np.float32)
Y2 = np.empty((batch, hidden), dtype=np.float32)
Y3 = np.empty((batch, hidden), dtype=np.float32)

In [20]:
def forward(x):
    """Baseline forward pass: three matmuls with ReLU after the first two.

    Computes ``relu(relu(x @ W1) @ W2) @ W3`` using the module-level weights
    W1, W2 and W3. Every matmul and every ReLU returns a freshly allocated
    array; the argument itself is not modified.

    Args:
        x: Array that can be matrix-multiplied with W1.

    Returns:
        The result of the final matmul with W3 (no activation applied).
    """
    activation = x
    for weight in (W1, W2):
        # Hidden layer: linear map followed by ReLU.
        activation = np.maximum(activation @ weight, 0)
    return activation @ W3

In [21]:
# Untimed warm-up passes, run before the measurement cell so that one-off
# first-call costs are not included in the reported latency.
warmup_iters = 10
for _ in range(warmup_iters):
    y = forward(x)

In [22]:
iters = 100

# A single pair of clock reads brackets the whole loop, so the reported figure
# is the mean over `iters` passes and includes Python loop/call overhead.
start = time.perf_counter()
for _ in range(iters):
    y = forward(x)
end = time.perf_counter()

elapsed = end - start  # seconds spent on all passes
avg_latency = elapsed / iters
print(f"Average latency per forward pass: {avg_latency*1000:.3f} ms")

Average latency per forward pass: 15.377 ms


In [23]:
# One (batch x hidden) @ (hidden x hidden) matmul costs a multiply and an add
# per weight element per batch row; the ReLU work is not counted.
flops_per_layer = 2 * batch * hidden**2
# 3 = number of matmuls hard-wired into forward().
total_flops = 3 * flops_per_layer

# avg_latency is in seconds, so this is FLOP/s scaled down to GFLOP/s.
gflops_per_sec = total_flops / avg_latency / 1e9
print(f"Achieved GFLOPs/sec: {gflops_per_sec:.2f}")

Achieved GFLOPs/sec: 6.55


## Preallocate ALL Buffers

In [24]:
hidden = 4096
batch = 1

# Seed the global NumPy RNG so the input and weights are identical on every
# "Restart & Run All"; without a seed each run benchmarks different data.
seed = 0
np.random.seed(seed)

# Input: one row of `hidden` features, stored as float32.
x = np.random.randn(batch, hidden).astype(np.float32)

# Weights: three square (hidden x hidden) float32 matrices.
W1 = np.random.randn(hidden, hidden).astype(np.float32)
W2 = np.random.randn(hidden, hidden).astype(np.float32)
W3 = np.random.randn(hidden, hidden).astype(np.float32)

# Preallocated activations: one uninitialised (batch x hidden) float32 buffer
# per layer, written in place by forward_prealloc() so the timed loop does not
# allocate output arrays.
A1 = np.empty((batch, hidden), dtype=np.float32)
A2 = np.empty((batch, hidden), dtype=np.float32)
A3 = np.empty((batch, hidden), dtype=np.float32)

In [25]:
def forward_prealloc(x):
    """Forward pass that writes every layer's result into a preallocated buffer.

    Same computation as ``relu(relu(x @ W1) @ W2) @ W3``, but each matmul
    stores its result in the module-level buffers A1, A2 and A3 via ``out=``,
    and each ReLU is applied in place on its buffer.

    Args:
        x: Array that can be matrix-multiplied with W1 into a result of A1's
            shape.

    Returns:
        A3 itself (not a copy). The next call overwrites its contents, so copy
        the result if it has to outlive the call.
    """
    source = x
    for weight, buffer in ((W1, A1), (W2, A2)):
        np.matmul(source, weight, out=buffer)
        np.maximum(buffer, 0, out=buffer)  # in-place ReLU
        source = buffer

    # Final layer: linear only, no activation.
    np.matmul(source, W3, out=A3)
    return A3

In [26]:
# Untimed warm-up passes, run before the measurement cell so that one-off
# first-call costs are not included in the reported latency.
warmup_iters = 10
for _ in range(warmup_iters):
    forward_prealloc(x)

In [27]:
iters = 100

# A single pair of clock reads brackets the whole loop, so the reported figure
# is the mean over `iters` passes and includes Python loop/call overhead.
start = time.perf_counter()
for _ in range(iters):
    forward_prealloc(x)
end = time.perf_counter()

elapsed = end - start  # seconds spent on all passes
avg_latency = elapsed / iters
print(f"Average latency: {avg_latency*1000:.3f} ms")

Average latency: 15.106 ms


In [28]:
# One (batch x hidden) @ (hidden x hidden) matmul costs a multiply and an add
# per weight element per batch row; the ReLU work is not counted.
flops_per_layer = 2 * batch * hidden**2
# 3 = number of matmuls hard-wired into forward_prealloc().
total_flops = 3 * flops_per_layer

# avg_latency is in seconds, so this is FLOP/s scaled down to GFLOP/s.
gflops = total_flops / avg_latency / 1e9
print(f"Achieved GFLOPs/sec: {gflops:.2f}")

Achieved GFLOPs/sec: 6.66
