In [1]:
import torch
import torch.nn as nn
import numpy as np
from scipy import signal
from pathlib import Path

In [38]:
from typing import Dict, Any


def save_var(path, name, npy_arrays: Dict[str, Any], it):
    """
    Helper function to save Numpy arrays to use for testing HLS components.

    Every array is written to ``<path>/<name>_<key>_<it>.npy`` and a confirmation
    line is printed per array.

    path: location of folder where files are saved to (str or pathlib.Path).
        The folder, including missing parents, is created if it does not exist.
    name: prefix of the file name of every Numpy file
    npy_arrays: Dict of numpy arrays to save, key is used to name file, value is the Numpy array, not PyTorch tensor!.
    it: iteration number, appended to end of file name
    """
    # The "/" join below only works on Path objects, so accept plain strings too.
    path = Path(path)
    # Avoid FileNotFoundError when the tests folder has not been created yet.
    path.mkdir(parents=True, exist_ok=True)

    for key, val in npy_arrays.items():
        with open(path / f"{name}_{key}_{it}.npy", "wb") as f:
            np.save(f, val)

        print(f"Saved variable {key} to {path}")


def pad_with(tensor, pad_width, iaxis, kwargs):
    """
    Helper function to pad matrices and tensors, meant to be passed to np.pad as the padding callable.
    The 1-D slice `tensor` is modified in place; nothing is returned.

    Note: when np.pad is called with a single scalar width this pads all dimensions, not just the H and W
    channels which are usually what we want in 2D convolution. Pass per-axis widths with (0, 0) for the
    axes that must stay untouched.
    See https://numpy.org/doc/stable/reference/generated/numpy.pad.html

    tensor: 1-D slice of the already enlarged array along the current axis
    pad_width: (before, after) number of padded entries on this axis
    iaxis: axis currently being processed (unused)
    kwargs: dict of extra keyword arguments given to np.pad, "padder" is the fill value (default 10)
    """
    pad_value = kwargs.get("padder", 10)
    before, after = pad_width[0], pad_width[1]
    if before > 0:
        tensor[:before] = pad_value
    # tensor[-0:] selects the WHOLE slice, so a zero trailing width must be skipped explicitly
    if after > 0:
        tensor[-after:] = pad_value

In [39]:
# ReLU testdata generation


# shape = (256, 128, 32, 32) to test HLS. Beware of stack overflow for big tensors, use std containers in C++!
shape = (4, 3, 2, 2)  # Shape: [Batch size, input data dimensions]

# PyTorch ground truth: forward through ReLU, backward through an MSE loss
x = torch.randn(*shape, dtype=torch.float, requires_grad=True)
relu = nn.ReLU()
out = relu(x)

target = torch.randn(*shape)  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)
loss.backward()

# Validation: rebuild dx by hand from the MSE derivative and the ReLU mask
dout = 2 * (out - target) / np.prod(shape)  # derivative of MSELoss
cache = x.clone().detach()  # Work on a copy so x itself is never modified
cache.masked_fill_(cache <= 0, 0)
dx = cache  # Same tensor as cache, turned into a 0/1 mask in place
dx.masked_fill_(dx > 0, 1)
dx_truth = dout * dx
print(
    f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}"
)

# File name: shape_variable_testNum
test_folder = Path("../../mtt_components/relu/tests")
name = f"relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}"
it = 1
tensors_to_save = {"x": x, "dx": x.grad, "out": out, "dout": dout}
npy_arrays = {
    key: tensor.detach().numpy() for key, tensor in tensors_to_save.items()
}

save_var(test_folder, name, npy_arrays, it)

x gradients are approximately the same: True
Saved variable x to ../../mtt_components/relu/tests
Saved variable dx to ../../mtt_components/relu/tests
Saved variable out to ../../mtt_components/relu/tests
Saved variable dout to ../../mtt_components/relu/tests


In [40]:
# Affine/Linear layer testdata generation (random tensors)
# Remember to make dimensions divisible by block shape (usually 2^n)

in_shape = (8, 3, 2, 2)  # Shape: [Batch size, input data dimensions]
layer_shape = (
    int(np.prod(in_shape[1:])),  # plain int instead of np.int64 for nn.Linear / reshape
    4,
)  # Shape: [flattened input, output channel]

# Reshape to [batch, flattened input]
x = torch.randn(*in_shape, requires_grad=True).reshape(
    (in_shape[0], layer_shape[0])
)
x.retain_grad()  # x is the (non-leaf) result of reshape, so its gradient must be retained explicitly

linear = nn.Linear(*layer_shape)
out_truth = linear(x)  # PyTorch forward pass, ground truth for "out"

target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values
loss_fn = nn.MSELoss()
loss = loss_fn(out_truth, target)  # Compute loss
loss.backward()

# Validation: hand-derived forward and backward pass
dout_val = (
    2 * (out_truth - target) / np.prod((in_shape[0], layer_shape[1]))
)  # derivative of MSELoss (mean reduction), gradient flowing into the linear layer
dx_val = dout_val @ linear.weight
dw_val = dout_val.T @ x
db_val = torch.sum(dout_val, dim=0)
out_val = x @ linear.weight.T + linear.bias
print(
    f"x gradients are approximately the same: {torch.allclose(dx_val, x.grad)}"
)
print(
    f"w gradients are approximately the same: {torch.allclose(dw_val, linear.weight.grad)}"
)
print(
    f"b gradients are approximately the same: {torch.allclose(db_val, linear.bias.grad)}"
)
print(
    f"out values are approximately the same: {torch.allclose(out_val, out_truth)}"
)


test_folder = Path("../../mtt_components/linear_forward//tests")
name = f"in_{in_shape[0]}_{in_shape[1]}_{in_shape[2]}_{in_shape[3]}_out_{layer_shape[1]}"
it = 1
npy_arrays = {
    "x": x.detach().numpy(),
    "dx": x.grad.detach().numpy(),
    "w": linear.weight.detach().numpy(),
    "dw": linear.weight.grad.detach().numpy(),
    "b": linear.bias.detach().numpy(),
    "db": linear.bias.grad.detach().numpy(),
    # Save the output of THIS layer; a bare `out` would be a leftover from another cell
    "out": out_truth.detach().numpy(),
    "dout": dout_val.detach().numpy(),
}

save_var(test_folder, name, npy_arrays, it)

x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True
out values are approximately the same: True
Saved variable x to ../../mtt_components/linear_forward/tests
Saved variable dx to ../../mtt_components/linear_forward/tests
Saved variable w to ../../mtt_components/linear_forward/tests
Saved variable dw to ../../mtt_components/linear_forward/tests
Saved variable b to ../../mtt_components/linear_forward/tests
Saved variable db to ../../mtt_components/linear_forward/tests
Saved variable out to ../../mtt_components/linear_forward/tests
Saved variable dout to ../../mtt_components/linear_forward/tests


In [41]:
# Cross Entropy loss function

in_shape = (4, 10)  # Shape [N, C] (C=logits, preactivation)

# PyTorch Ground Truth
out = torch.randn(
    in_shape, requires_grad=True
)  # Imaginary output of model, logits
target = torch.empty(in_shape[0], dtype=torch.long).random_(
    in_shape[1]
)  # Imaginary class labels in [0, C). Must be type torch.long for CE function

loss_fn = nn.CrossEntropyLoss()  # Loss function, default reduction is 'mean'
loss_truth = loss_fn(out, target)  # Compute loss
loss_truth.backward()  # Compute gradient
dout_truth = out.grad  # PyTorch dout

# Validation
y_out = out.detach().numpy()  # Logits are not whole numbers!
y_truth = target.detach().numpy().astype(np.int64)  # Equivalent to torch.long

# Forward pass
# Alternative Karpathy version: https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part4_backprop.ipynb
sample_idx = np.arange(y_out.shape[0])
y_truth_one_hot = np.zeros_like(y_out, dtype=int)  # Same shape as model output
y_truth_one_hot[sample_idx, y_truth] = 1  # Assign index of label as 1, others are 0
# Subtract the row maximum before exponentiating for numerical stability
y_out_exp = np.exp(y_out - np.max(y_out, axis=1, keepdims=True))
y_out_probs = y_out_exp / np.sum(
    y_out_exp, axis=1, keepdims=True
)  # Softmax probabilities, cached for backwards
loss_val = -y_truth_one_hot * np.log(y_out_probs)
loss_val = loss_val.sum(axis=1).mean()

# Backwards pass
dout_val = y_out_probs.copy()  # Copy so the cached probabilities stay intact
dout_val[sample_idx, y_truth] -= 1
dout_val /= y_out.shape[0]  # Hand calculated dout to validate (mean reduction)

print(
    f"Losses are approximately the same: {np.allclose(loss_val, loss_truth.detach().numpy())}"
)
print(
    f"out gradients are approximately the same: {np.allclose(dout_val, dout_truth.detach().numpy())}"
)

# File name: shape_variable_testNum
test_folder = Path("../../mtt_components/CE_forward/tests")
name = f"CE_in_{in_shape[0]}_{in_shape[1]}"
it = 1
npy_arrays = {
    "y_truth": y_truth.astype(np.float32),  # Convert to float for HLS
    "y_out": y_out,  # Logits fed into the loss (not a gradient left over from another cell)
    "loss": loss_truth.detach().numpy(),
    "dout": dout_truth.detach().numpy(),
}

save_var(test_folder, name, npy_arrays, it)

Losses are approximately the same: True
out gradients are approximately the same: True
Saved variable y_truth to ../../mtt_components/CE_forward/tests
Saved variable y_out to ../../mtt_components/CE_forward/tests
Saved variable loss to ../../mtt_components/CE_forward/tests
Saved variable dout to ../../mtt_components/CE_forward/tests


In [42]:
# Convolution layer testdata generation
# Scipy functions used because similar libraries exist in HLS

# Parameters
padding = 1
stride = 1
kernel = 3
use_bias = True  # Whether the conv layer has a bias term
N, Cin, Hin, Win = (7, 3, 5, 5)  # Input dimensions (N, Cin, Hin, Win)
Cout = 4  # Output dimensions (N, Cout, Hout, Wout)
Hout = (Hin + 2 * padding - kernel) // stride + 1
Wout = (Win + 2 * padding - kernel) // stride + 1

# The scipy reference below moves the kernel one pixel at a time
if stride != 1:
    raise NotImplementedError(
        "The scipy reference implementation in this cell only supports stride == 1"
    )

# PyTorch Ground Truth
x = torch.randn((N, Cin, Hin, Win), requires_grad=True)
conv = nn.Conv2d(
    Cin, Cout, kernel_size=kernel, stride=stride, padding=padding, bias=use_bias
)
out_truth = conv(x)
out_truth.retain_grad()  # Needed to retain gradient during autograd

# Validation

# Calculate loss through linear layer, then use CE
lin_features = 7  # Number of out channels, for testing
flatten = nn.Flatten()
flattened = flatten(out_truth)
linear = nn.Linear(Cout * Hout * Wout, lin_features)
lin_out = linear(flattened)
# Imaginary target values. Same shape as the logits, so CE treats them as per-class
# targets; they only serve to produce a gradient.
target = torch.randn(lin_out.shape)
loss_fn = nn.CrossEntropyLoss()  # Loss function, default reduction is 'mean'
loss_truth = loss_fn(lin_out, target)  # Compute loss
loss_truth.backward()  # Compute gradient
dout_truth = (
    out_truth.grad.detach()
)  # Incoming gradient into conv layer, not linear!

# Useful numpy arrays
# x shape [N, Cin, Hin, Win]
# x padded shape [N, Cin, Hin + 2 * padding, Win + 2 * padding]
# dw shape [Cout, Cin, kernel, kernel]
# dout shape [N, Cout, Hout, Wout]
# bias shape [Cout]
dout_npy = dout_truth.detach().numpy()
weights = conv.weight.detach().numpy()
dw_truth = conv.weight.grad.detach().numpy()
dx_truth = x.grad.detach().numpy()
if use_bias:
    bias = conv.bias.detach().numpy()
    db_truth = conv.bias.grad.detach().numpy()
else:
    # No bias parameter exists on the layer; a zero bias leaves the forward pass unchanged
    bias = np.zeros(Cout, dtype=weights.dtype)
    db_truth = None
# Zero-pad only the H and W axes, N and C stay untouched (also valid for padding == 0)
x_padded = np.pad(
    x.detach().numpy(),
    ((0, 0), (0, 0), (padding, padding), (padding, padding)),
    mode="constant",
    constant_values=0,
)

# Forward pass
out_val = np.zeros((N, Cout, Hout, Wout))
for n in range(N):
    for cout in range(Cout):  # Iterate over out kernels
        for cin in range(Cin):
            out_val[n, cout] += signal.correlate2d(
                x_padded[n, cin], weights[cout, cin], "valid"
            )
        out_val[n, cout] += bias[cout]

out_truth_npy = out_truth.detach().numpy()  # Convert to Numpy array
# Out values are sometimes very small, use a more relaxed relative tolerance (0.1% tolerance)
print(
    f"Convolution outputs are similar {np.allclose(out_truth_npy, out_val, rtol=1e-3, atol=1e-5)}"
)

# Backward pass

# Calculate dw and dx
dw_val = np.zeros(weights.shape)
dx_val = np.zeros(dx_truth.shape)
for n in range(N):
    for cout in range(Cout):
        for cin in range(Cin):
            dw_val[cout, cin] += signal.correlate2d(
                x_padded[n, cin], dout_npy[n, cout], "valid"
            )
            # The "full" convolution yields the gradient of the padded input; keep only the
            # Hin x Win window that belongs to x. Explicit end indices keep this valid for
            # padding == 0, where a `padding:-padding` slice would be empty.
            dx_padded = signal.convolve2d(
                dout_npy[n, cout], weights[cout, cin], "full"
            )
            dx_val[n, cin] += dx_padded[
                padding : padding + Hin, padding : padding + Win
            ]

print(
    f"Gradient of weights are similar {np.allclose(dw_val, dw_truth, rtol=1e-3, atol=1e-5)}"
)
print(
    f"Gradient of x are similar {np.allclose(dx_val, dx_truth, rtol=1e-3, atol=1e-5)}"
)

# Calculate db: sum of dout over everything except the output channel axis
if use_bias:
    db_val = dout_npy.sum(axis=(0, 2, 3))

    print(
        f"Gradient of bias are similar {np.allclose(db_val, db_truth, rtol=1e-3, atol=1e-5)}"
    )

Convolution outputs are similar True
Gradient of weights are similar True
Gradient of x are similar True
Gradient of bias are similar True


In [43]:
# Instance Norm (GroupNorm with groups=num of channels) testdata generation
# GroupNorm normalizes with the statistics of the current input, it keeps no running statistics

(N, C, H, W) = (6, 3, 4, 4)

# PyTorch Ground Truth
x = torch.randn((N, C, H, W), requires_grad=True)

norm = nn.GroupNorm(C, C, affine=True)  # Could use InstanceNorm layer, but MTT uses this approach
out_truth = norm(x)
out_truth.retain_grad()  # Needed to retain gradient during autograd

# Validation

# Calculate loss through linear layer, then use CE
lin_features = 7  # Number of out channels, for testing
flatten = nn.Flatten()
flattened = flatten(out_truth)
linear = nn.Linear(C * H * W, lin_features)
lin_out = linear(flattened)
target = torch.randn(lin_out.shape)  # Imaginary target values
loss_fn = nn.CrossEntropyLoss()  # Loss function, default reduction is 'mean'
loss_truth = loss_fn(lin_out, target)  # Compute loss
loss_truth.backward()  # Compute gradient
dout_truth = (
    out_truth.grad.detach()
)  # Incoming gradient into norm layer, not linear!

# Forward pass

# Read eps and the affine parameters from the layer so the check still holds if they
# differ from the defaults (eps=1e-5, weight=1, bias=0)
eps = norm.eps
gamma = norm.weight.detach().view(1, -1, 1, 1)  # Broadcast over x as (1, num_features, 1, 1)
beta = norm.bias.detach().view(1, -1, 1, 1)
x_in = x.detach()  # The hand calculation does not need autograd
mean = x_in.mean(dim=(2, 3), keepdim=True)  # Statistics per sample and per channel
var = x_in.var(dim=(2, 3), keepdim=True, unbiased=False)
inv_std = 1.0 / torch.sqrt(var + eps)
x_centered = x_in - mean
x_normalized = x_centered * inv_std  # Normalized input BEFORE the affine transform, cached for backwards
out_val = x_normalized * gamma + beta

print(f"Normalized output is same : {torch.allclose(out_truth, out_val, rtol=1e-3)}")

# Backward pass

# dgamma must use the normalized input before scale and shift are applied
dgamma = (dout_truth * x_normalized).sum(dim=(0, 2, 3), keepdim=True)  # Gradient of weights
dbeta = dout_truth.sum(dim=(0, 2, 3), keepdim=True)  # Gradient of bias
dx_normalized = dout_truth * gamma
dvar = -0.5 * (dx_normalized * x_centered * inv_std ** 3).sum(dim=(2, 3), keepdim=True)
dmean = (-dx_normalized * inv_std).sum(dim=(2, 3), keepdim=True) + dvar * (
    -2.0 / (H * W) * x_centered.sum(dim=(2, 3), keepdim=True)
)
dx_val = dx_normalized * inv_std + dvar * 2.0 * x_centered / (H * W) + dmean / (H * W)

print(f"Gradient of x is same: {torch.allclose(dx_val, x.grad, rtol=1e-3)}")
print(f"Gradient of weights is same: {torch.allclose(dgamma.view(-1), norm.weight.grad, rtol=1e-3)}")
print(f"Gradient of bias is same: {torch.allclose(dbeta.view(-1), norm.bias.grad, rtol=1e-3)}")

Normalized output is same : True
Gradient of x is same: True
Gradient of weights is same: True
Gradient of bias is same: True


In [None]:
# Implementation of Average Pooling