In [2]:
import torch
import torch.nn as nn
import numpy as np
from pathlib import Path

In [2]:
"""
TODO:
- MSEloss for all layer tests
- Proper data generation method (multiple datasets needed to test everything)
- Test for ints and big float as well

"""

'\nTODO:\n- MSEloss for all layer tests\n- Proper data generation method (multiple datasets needed to test everything)\n- Test for ints and big float as well\n\n'

In [3]:
# Taken from the I2DL exercises. TODO: add a proper citation.

# def eval_numerical_gradient_array(f, x, df, h=1e-5):
#     """
#     Evaluate a numeric gradient for a function that accepts a numpy
#     array and returns a numpy array.
#     """
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         ix = it.multi_index

#         oldval = x[ix]
#         x[ix] = oldval + h
#         pos = f(x).copy()
#         x[ix] = oldval - h
#         neg = f(x).copy()
#         x[ix] = oldval

#         grad[ix] = np.sum((pos - neg) * df) / (2 * h)
#         it.iternext()
#     return grad

In [3]:
from typing import Dict, Any

def save_var(path, name, npy_arrays: Dict[str, Any], it):
    """
    Helper function to save Numpy arrays to use for testing HLS components.

    Every array is written to ``<path>/<name>_<key>_<it>.npy`` and a
    confirmation line is printed per array.

    path: folder the files are saved to (``str`` or ``pathlib.Path``). The folder,
        including missing parents, is created if it does not exist yet.
    name: prefix of the file name of every Numpy file
    npy_arrays: Dict of numpy arrays to save, key is used to name file, value is the Numpy array, not PyTorch tensor!.
    it: iteration number, appended to end of file name
    """
    # Accept plain strings too: the `/` join below only works on Path objects.
    path = Path(path)
    # open(..., "wb") does not create missing folders, so make sure the target exists.
    path.mkdir(parents=True, exist_ok=True)

    for key, val in npy_arrays.items():
        with open(path / f"{name}_{key}_{it}.npy", "wb") as f:
            np.save(f, val)

        print(f"Saved variable {key} to {path}")

# See https://numpy.org/doc/stable/reference/generated/numpy.pad.html
# Only pads the Channels, does not add batches
def pad_with(tensor, pad_width, iaxis, kwargs):
    """
    Padding callback for ``np.pad``: fills the pad region of one 1-D slice in place.

    tensor: 1-D slice of the already enlarged array along one axis, modified in place
    pad_width: pair ``(before, after)`` with the number of padded entries at each end
    iaxis: axis currently being processed (unused)
    kwargs: dict of extra keyword arguments; ``'padder'`` is the fill value (default 10)
    """
    pad_value = kwargs.get('padder', 10)
    before, after = pad_width[0], pad_width[1]
    if before > 0:
        tensor[:before] = pad_value
    # A zero width must be skipped explicitly: tensor[-0:] equals tensor[0:],
    # which would overwrite the whole slice, real data included.
    if after > 0:
        tensor[-after:] = pad_value

# def pad_with(tensor, pad_width, iaxis, kwargs):
#     pad_value = kwargs.get('padder', 10)
#     vector[:pad_width[0]] = pad_value
#     vector[-pad_width[1]:] = pad_value


In [18]:
# ReLU test-data generation

# Leading entry is the batch size, the remaining entries are the data dimensions.
# The sizes are based on the input shape of the MTT ReLU layers.
# shape = (256, 128, 32, 32) # Beware of stack overflow for big tensors, used vectors in C++!
shape = (4, 3, 2, 2)

# Forward pass through the PyTorch reference layer
x = torch.randn(*shape, dtype=torch.float, requires_grad=True)
relu = nn.ReLU()
out = relu(x)

# A made-up target gives the loss something to compare against and fixes the shape of dout
target = torch.randn(*shape)
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)
loss.backward()  # fills x.grad

# Validation: rebuild the input gradient by hand and compare it with autograd
dout = (out - target) * 2 / np.prod(shape)  # derivative of MSELoss w.r.t. out
cache = x.detach().clone()  # independent copy, x itself must stay untouched
cache.masked_fill_(cache <= 0, 0)
dx = cache  # same tensor object as cache
dx.masked_fill_(dx > 0, 1)  # 1 where the input was positive, 0 elsewhere
dx_truth = dout * dx
print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")


# File name: shape_variable_testNum
test_folder = Path('../../mtt_components/relu/tests')
name = f'relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}'
it = 1
npy_arrays = dict(
    x=x.detach().numpy(),
    dx=x.grad.detach().numpy(),
    out=out.detach().numpy(),
    dout=dout.detach().numpy(),
)

save_var(test_folder, name, npy_arrays, it)


x gradients are approximately the same: True
Saved variable x to ../../mtt_components/relu/tests
Saved variable dx to ../../mtt_components/relu/tests
Saved variable out to ../../mtt_components/relu/tests
Saved variable dout to ../../mtt_components/relu/tests


In [None]:
# # Affine/Linear layer testdata generation (using arange to create specific weights)

# in_shape = (3, 2, 2) # First val is batch size, rest are input dimensions
# layer_shape = (np.prod(in_shape[1:]), 14) # Shape of linear layer, in_features and out_features

# x = torch.arange(np.prod(in_shape), dtype=torch.float, requires_grad=True)
# x = x.reshape((in_shape[0], np.prod(in_shape[1:]))) # Reshape to shape [batch, prod(input_shape)]
# x.retain_grad() # Needed to calculate gradients for tensor views, i.e. reshape

# linear = nn.Linear(*layer_shape)
# w = reversed(torch.arange(np.prod(linear.weight.shape), dtype=torch.float))
# w = w.reshape(linear.weight.shape)
# linear.weight = torch.nn.Parameter(w)
# b = torch.full(linear.bias.shape, 2, dtype=torch.float)
# linear.bias = torch.nn.Parameter(b)
# out = linear(x) # Forward pass

# target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
# loss_fn = nn.MSELoss()
# loss = loss_fn(out, target)  # Compute loss
# loss.backward()

# # Validation
# dout = 2 * (out - target) / np.prod((in_shape[0], layer_shape[1])) # derivative of MSELoss
# dx_truth = dout @ w
# dw_truth = dout.T @ x
# db_truth = torch.sum(dout.T, axis=1)
# print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
# print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
# print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")


In [25]:
# Affine/Linear layer test-data generation (random tensors)
# Remember to make dimensions divisible by block shape (usually 2^n)

in_shape = (8, 3, 2, 2) # Batch size first, then the input dimensions
layer_shape = (np.prod(in_shape[1:]), 4) # (in_features, out_features) of the linear layer

# Flatten every sample to one row: [batch, prod(input dimensions)]
x = torch.randn(*in_shape, requires_grad=True).reshape(in_shape[0], -1)
x.retain_grad() # The reshaped tensor is not a leaf, so its gradient must be retained explicitly

linear = nn.Linear(*layer_shape)
out = linear(x) # Forward pass

target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)
loss.backward()

# Validation (TODO: rename variables to val, not truth)
dout = (out - target) * 2 / out.numel() # derivative of MSELoss w.r.t. out
dx_truth = torch.matmul(dout, linear.weight)
dw_truth = torch.matmul(dout.T, x)
db_truth = dout.sum(dim=0)
out_truth = torch.matmul(x, linear.weight.T) + linear.bias
print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")
print(f"out values are approximately the same: {torch.allclose(out_truth, out)}")


test_folder = Path('../../mtt_components/linear_forward//tests')
name = f'in_{in_shape[0]}_{in_shape[1]}_{in_shape[2]}_{in_shape[3]}_out_{layer_shape[1]}'
it = 1
npy_arrays = dict(
    x=x.detach().numpy(),
    dx=x.grad.detach().numpy(),
    w=linear.weight.detach().numpy(),
    dw=linear.weight.grad.detach().numpy(),
    b=linear.bias.detach().numpy(),
    db=linear.bias.grad.detach().numpy(),
    out=out.detach().numpy(),
    dout=dout.detach().numpy(),
)

save_var(test_folder, name, npy_arrays, it)

# # File name: shape_variable_testNum
# name = f'in_{in_shape[0]}_{in_shape[1]}_{in_shape[2]}_{in_shape[3]}_out_{layer_shape[1]}'
# with open(f'linear_{name}_x_t{it}.npy', 'wb') as f:
#     np.save(f, x.detach().numpy())
# with open(f'linear_{name}_w_t{it}.npy', 'wb') as f:
#     np.save(f, linear.weight.detach().numpy())
# with open(f'linear_{name}_b_t{it}.npy', 'wb') as f:
#     np.save(f, linear.bias.detach().numpy())
# with open(f'linear_{name}_out_t{it}.npy', 'wb') as f:
#     np.save(f, out.detach().numpy())
# with open(f'linear_{name}_dx_t{it}.npy', 'wb') as f:
#     np.save(f, x.grad.detach().numpy())
# with open(f'linear_{name}_dw_t{it}.npy', 'wb') as f:
#     np.save(f, linear.weight.grad.detach().numpy())
# with open(f'linear_{name}_db_t{it}.npy', 'wb') as f:
#     np.save(f, linear.bias.grad.detach().numpy())
# with open(f'linear_{name}_dout_t{it}.npy', 'wb') as f:
#     np.save(f, dout.detach().numpy()) # Note this is what we computed manually

x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True
out values are approximately the same: True
Saved variable x to ../../mtt_components/linear_forward/tests
Saved variable dx to ../../mtt_components/linear_forward/tests
Saved variable w to ../../mtt_components/linear_forward/tests
Saved variable dw to ../../mtt_components/linear_forward/tests
Saved variable b to ../../mtt_components/linear_forward/tests
Saved variable db to ../../mtt_components/linear_forward/tests
Saved variable out to ../../mtt_components/linear_forward/tests
Saved variable dout to ../../mtt_components/linear_forward/tests


In [65]:
# Cross Entropy loss function

in_shape = (4, 10) # Shape [N, C] N= batch, C = channels (logits, i.e. preactivation)

# PyTorch
out = torch.randn(in_shape, requires_grad=True) # Imaginary output of model, logits
# Imaginary target labels in [0, C). Must be type torch.long for the CE function.
# The class count comes from in_shape so labels stay valid when C is changed.
target = torch.empty(in_shape[0], dtype=torch.long).random_(in_shape[1])

loss_fn = nn.CrossEntropyLoss() # Loss function, default reduction is 'mean'
loss_truth = loss_fn(out, target)  # Compute loss
loss_truth.backward() # Compute gradient
dout_truth = out.grad # PyTorch dout

# Validation
y_out = out.detach().numpy() # Logits are not whole numbers!
y_truth = target.detach().numpy().astype(np.int64) # Equivalent to torch.long

# Forward pass
y_truth_one_hot = np.zeros_like(y_out, dtype=int) # Same shape as model output
y_truth_one_hot[np.arange(y_out.shape[0]), y_truth] = 1 # Assign index of label as 1, others are 0
# Subtract the row maximum before exponentiating for numerical stability
y_out_exp = np.exp(y_out - np.max(y_out, axis=1, keepdims=True))
y_out_probs = y_out_exp / np.sum(y_out_exp, axis=1, keepdims=True) # Softmax probabilities, cached for backwards
loss_val = -y_truth_one_hot * np.log(y_out_probs)
loss_val = loss_val.sum(axis=1).mean() # Sum over classes, mean over the batch

# Backwards pass
# Work on a copy so the cached probabilities are not overwritten by the gradient
dout_val = y_out_probs.copy()
dout_val[np.arange(y_out.shape[0]), y_truth] -= 1 # softmax - one_hot
dout_val /= y_out.shape[0] # Hand calculated dout to validate ('mean' reduction divides by N)

print(f"Losses are approximately the same: {np.allclose(loss_val, loss_truth.detach().numpy())}")
print(f"out gradients are approximately the same: {np.allclose(dout_val, dout_truth.detach().numpy())}")

# File name: shape_variable_testNum
test_folder = Path('../../mtt_components/CE_forward/tests')
name = f'CE_in_{in_shape[0]}_{in_shape[1]}'
it = 1
npy_arrays = {
    'y_truth' : y_truth.astype(np.float32), # Convert to float for HLS
    # Save the logits of THIS cell. The previous version stored x.grad, a
    # leftover variable from another cell with unrelated contents and shape.
    'y_out' : y_out,
    'loss' : loss_truth.detach().numpy(),
    'dout' : dout_truth.detach().numpy()
}

save_var(test_folder, name, npy_arrays, it)

Losses are approximately the same: True
out gradients are approximately the same: True
Saved variable y_truth to ../../mtt_components/CE_forward/tests
Saved variable y_out to ../../mtt_components/CE_forward/tests
Saved variable loss to ../../mtt_components/CE_forward/tests
Saved variable dout to ../../mtt_components/CE_forward/tests


In [54]:
# Show the hand-computed gradient followed by the PyTorch one for a visual comparison
print(dout_val, dout_truth, sep="\n")

[[ 0.00193277  0.00477011  0.00308699  0.00672569  0.09920373  0.03577068
   0.05166655  0.03266254 -0.24683122  0.01101214]
 [ 0.16031457  0.01290597  0.03026139  0.00292414  0.00321966  0.00724543
  -0.24004994  0.01285327  0.00656767  0.00375785]
 [ 0.02543066 -0.24391416  0.01562056  0.05151001  0.0659417   0.01045887
   0.01792915  0.02642985  0.01992954  0.01066382]
 [ 0.01856777  0.00481262  0.04748836  0.00946206  0.00825876  0.02232138
   0.02003792  0.03949658 -0.23803462  0.06758916]]
tensor([[ 0.0019,  0.0048,  0.0031,  0.0067,  0.0992,  0.0358,  0.0517,  0.0327,
         -0.2468,  0.0110],
        [ 0.1603,  0.0129,  0.0303,  0.0029,  0.0032,  0.0072, -0.2400,  0.0129,
          0.0066,  0.0038],
        [ 0.0254, -0.2439,  0.0156,  0.0515,  0.0659,  0.0105,  0.0179,  0.0264,
          0.0199,  0.0107],
        [ 0.0186,  0.0048,  0.0475,  0.0095,  0.0083,  0.0223,  0.0200,  0.0395,
         -0.2380,  0.0676]])


In [78]:
# Convolution layer testdata generation

padding = 1
stride = 1
kernel = 3
use_bias = True # conv.bias is read unconditionally below, so this has to stay True
# Input dimensions (N, Cin, Hin, Win)
N, Cin, Hin, Win = (7,3,5,5)
# Output dimensions (N, Cout, Hout, Wout)
Cout = 4
# Integer arithmetic avoids the float round trip of int(1 + (...) / stride)
Hout = (Hin + 2 * padding - kernel) // stride + 1
Wout = (Win + 2 * padding - kernel) // stride + 1

# Truth
x = torch.randn((N, Cin, Hin, Win), requires_grad=True)
conv = nn.Conv2d(Cin, Cout, kernel_size=kernel, stride=stride, padding=padding, bias=use_bias)
out_truth = conv(x)
out_truth.retain_grad() # Needed to retain gradient during autograd (out_truth is not a leaf)

# Validation

weights = conv.weight.detach().numpy()
bias = conv.bias.detach().numpy()
# Zero-pad only the spatial axes (H, W); batch and channel axes get no padding.
# Unlike "pad everything, then strip with [padding:-padding]" this also works for padding = 0.
x_padded = np.pad(
    x.detach().numpy(),
    ((0, 0), (0, 0), (padding, padding), (padding, padding)),
    mode='constant',
    constant_values=0,
)

# Forward pass
out_val = np.zeros((N, Cout, Hout, Wout))

for n in range(N): # Iterate over batch
    for c in range(Cout): # Iterate over out kernels
        for h in range(Hout):
            for w in range(Wout):
                # Window of all input channels under the kernel, multiplied elementwise with kernel c
                out_val[n, c, h, w] = np.sum(x_padded[n, :, h * stride: h * stride + kernel, w * stride : w * stride + kernel] * weights[c]) + bias[c]

out_truth_npy = out_truth.detach().numpy() # Convert to Numpy array
# Out values are sometimes very small, use a more relaxed relative tolerance (0.1% tolerance)
print(f"Convolution outputs are similar {np.allclose(out_truth_npy, out_val, rtol=1e-3, atol=1e-5)}")

# Backward pass
# Calculate loss through linear layer
lin_out_features = 7 # Number of out channels, for testing
flatten = nn.Flatten()
flattened = flatten(out_truth)
linear = nn.Linear(Cout * Hout * Wout, lin_out_features)
lin_out = linear(flattened)
# NOTE(review): target has the same shape as the logits, so CrossEntropyLoss treats it as
# per-class probabilities; randn values are not valid probabilities. It appears to be used
# only to obtain a non-trivial incoming gradient - confirm this is intended.
target = torch.randn(lin_out.shape)  # Imaginary target values
loss_fn = nn.CrossEntropyLoss() # Loss function, default reduction is 'mean'
loss_truth = loss_fn(lin_out, target)  # Compute loss
loss_truth.backward() # Compute gradient
dout_truth = out_truth.grad.detach() # Incoming gradient into conv layer, not linear!

# Useful numpy arrays

x_npy = x.detach().numpy()
dout_npy = dout_truth.detach().numpy()
dw_truth = conv.weight.grad.detach().numpy()
db_truth = conv.bias.grad.detach().numpy()
dx_truth = x.grad.detach().numpy()

# x shape [N, Cin, Hin, Win]
# x padded shape [N, Cin, Hin + 2 * padding, Win + 2 * padding]
# dw shape [Cout, Cin, kernel, kernel]
# dout shape [N, Cout, Hout, Wout]

# Calculate dw

dw = np.zeros(weights.shape)

# Loops through dw (note that loops do batch N in parallel)
for cout in range(Cout):
    for cin in range(Cin):
        for h in range(kernel):
            for w in range(kernel):
                # All input pixels that kernel element (h, w) touched, one per output position
                x_slice = x_padded[:, cin, h:h + Hout * stride:stride, w:w + Wout * stride:stride]
                dout_slice = dout_npy[:, cout]
                dw[cout, cin, h, w] = np.sum(x_slice * dout_slice)

print(f"Gradient of weights are similar {np.allclose(dw, dw_truth, rtol=1e-3, atol=1e-5)}")

# Calculate db

db = np.zeros(bias.shape)
for c in range(Cout):
    # Bias gradient is the incoming gradient summed over batch and spatial positions
    db[c] = np.sum(dout_npy[:, c, :, :])

print(f"Gradient of bias are similar {np.allclose(db, db_truth, rtol=1e-3, atol=1e-5)}")

# Calculate dx
# TODO: not implemented yet - dx is only a zero placeholder and is NOT validated against dx_truth.

dx = np.zeros(dx_truth.shape)







Convolution outputs are similar True
Gradient of weights are similar True
Gradient of bias are similar True


In [67]:
# print(dw[0][-1])
# print(dw_truth[0][-1])

# print(out_truth_npy[0][-1][0][18])
# print(out_val[0][-1][0][18])

# print(out_truth_npy[0][-1][0][18]  - out_val[0][-1][0][18])

0.28572786
0.2857277989387512
5.960464477539063e-08
