## Environment Setup

In [10]:
import os
import sys
import warnings

# Force single-threaded BLAS: pin each backend NumPy might be linked against
# (OpenMP, MKL, OpenBLAS) to one thread so the benchmark measures one core.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
    }
)

# These variables are generally read once, when the threading runtime is
# initialised. If NumPy is already loaded in this kernel (e.g. the cell was
# run out of order) the limits may be silently ignored, so flag it rather
# than report misleading numbers.
if "numpy" in sys.modules:
    warnings.warn(
        "NumPy was imported before the BLAS thread limits were set; "
        "restart the kernel and run this cell first for them to take effect.",
        RuntimeWarning,
    )

In [11]:
import numpy as np
import time

In [12]:
hidden = 4096  # width of every layer (rows and columns of each weight matrix)
batch = 1      # number of input rows per forward pass
layers = 3     # number of weight matrices (W1..W3)

# Seeded generator so the input and weights are identical on every
# Restart & Run All. Drawing float32 directly also avoids the float64
# temporary that `randn(...).astype(np.float32)` would allocate.
SEED = 0
rng = np.random.default_rng(SEED)

# Input
x = rng.standard_normal((batch, hidden), dtype=np.float32)

# Weights: one square (hidden x hidden) matrix per layer
W1 = rng.standard_normal((hidden, hidden), dtype=np.float32)
W2 = rng.standard_normal((hidden, hidden), dtype=np.float32)
W3 = rng.standard_normal((hidden, hidden), dtype=np.float32)

# Preallocated (batch x hidden) buffers, one per layer.
# NOTE(review): no cell visible here reads or writes Y1..Y3; kept in case a
# later cell uses them as `out=` targets -- confirm or remove.
Y1 = np.empty((batch, hidden), dtype=np.float32)
Y2 = np.empty((batch, hidden), dtype=np.float32)
Y3 = np.empty((batch, hidden), dtype=np.float32)

In [13]:
def forward(x):
    """Run the input through the three-layer network and return the result.

    Computes ``relu(relu(x @ W1) @ W2) @ W3`` using the module-level weight
    matrices ``W1``, ``W2`` and ``W3``. There is no bias term and no
    activation after the last layer. The argument is not modified; each
    step produces a new array.

    Parameters
    ----------
    x : array whose last dimension matches the first dimension of ``W1``
        (the setup cell builds it with shape ``(batch, hidden)``).

    Returns
    -------
    The output of the final matrix product.
    """
    hidden_1 = np.maximum(x @ W1, 0)  # layer 1 + ReLU
    hidden_2 = np.maximum(hidden_1 @ W2, 0)  # layer 2 + ReLU
    return hidden_2 @ W3  # layer 3, linear output

In [14]:
# Warm-up: run a few untimed passes before measuring, presumably so that
# one-time start-up costs do not land in the timed loop.
n_warmup = 10
for _ in range(n_warmup):
    y = forward(x)

In [15]:
# Timed run: call forward() `iters` times back to back and report the mean
# wall-clock time per call.
iters = 100
# Only the loop sits between the two perf_counter() reads, so the measured
# interval covers the forward passes plus Python loop overhead.
start = time.perf_counter()
for _ in range(iters):
    y = forward(x)
end = time.perf_counter()

# Mean seconds per forward pass; printed in milliseconds.
avg_latency = (end - start) / iters
print(f"Average latency per forward pass: {avg_latency*1000:.3f} ms")

Average latency per forward pass: 14.932 ms


In [16]:
# Each layer is a (batch x hidden) @ (hidden x hidden) product: one multiply
# and one add per term, i.e. 2 * batch * hidden * hidden FLOPs. The ReLU
# element-wise work is not counted.
flops_per_layer = 2 * batch * hidden * hidden
# Use the configured layer count instead of a literal 3 so this stays in sync
# with the setup cell (forward() must apply exactly `layers` matmuls).
total_flops = flops_per_layer * layers

# avg_latency is in seconds, so this is FLOPs per second scaled to 1e9.
gflops_per_sec = (total_flops / avg_latency) / 1e9
print(f"Achieved GFLOPs/sec: {gflops_per_sec:.2f}")

Achieved GFLOPs/sec: 6.74
