In [40]:
import torch
import torch.nn as nn
import numpy as np
from pathlib import Path

In [None]:
"""
TODO:
- MSEloss for all layer tests
- Proper data generation method (multiple datasets needed to test everything)
- Test for ints and big float as well

"""

In [None]:
# Taken from I2DL exercises, need to cite!

# def eval_numerical_gradient_array(f, x, df, h=1e-5):
#     """
#     Evaluate a numeric gradient for a function that accepts a numpy
#     array and returns a numpy array.
#     """
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         ix = it.multi_index

#         oldval = x[ix]
#         x[ix] = oldval + h
#         pos = f(x).copy()
#         x[ix] = oldval - h
#         neg = f(x).copy()
#         x[ix] = oldval

#         grad[ix] = np.sum((pos - neg) * df) / (2 * h)
#         it.iternext()
#     return grad

In [None]:
# ReLU testdata generation

# First val is batch size, rest are dimensions of data.
# Numbers are absed on input shape of MTT ReLU layers
# shape = (256, 128, 32, 32)
shape = (4, 3, 2, 2)

x = torch.randn(*shape, dtype=torch.float, requires_grad=True)
relu = nn.ReLU()
out = relu(x)  # Forward pass

target = torch.randn(*shape)  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()  # Backward pass

# Validation
dout = 2 * (out - target) / np.prod(shape) # derivative of MSELoss
cache = x.clone().detach() # Must clone x so it isn't overwritten
cache[cache <= 0] = 0
dx = cache
dx[dx > 0] = 1
dx_truth = dout * dx
print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")

# File name: shape_variable_testNum
with open(f'relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}_x_t1.npy', 'wb') as f:
    np.save(f, x.detach().numpy())
with open(f'relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}_dx_t1.npy', 'wb') as f:
    np.save(f, x.grad.detach().numpy())
with open(f'relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}_out_t1.npy', 'wb') as f:
    np.save(f, out.detach().numpy())
with open(f'relu_{shape[0]}_{shape[1]}_{shape[2]}_{shape[3]}_dout_t1.npy', 'wb') as f:
    np.save(f, dout.detach().numpy())


In [None]:
# print(x.grad)
# print(cache)
# print(dout)

In [None]:
# Affine/Linear layer testdata generation (using arange to create specific weights)

in_shape = (3, 2, 2) # First val is batch size, rest are input dimensions
layer_shape = (np.prod(in_shape[1:]), 14) # Shape of linear layer, in_features and out_features

x = torch.arange(np.prod(in_shape), dtype=torch.float, requires_grad=True)
x = x.reshape((in_shape[0], np.prod(in_shape[1:]))) # Reshape to shape [batch, prod(input_shape)]
x.retain_grad() # Needed to calculate gradients for tensor views, i.e. reshape

linear = nn.Linear(*layer_shape)
w = reversed(torch.arange(np.prod(linear.weight.shape), dtype=torch.float))
w = w.reshape(linear.weight.shape)
linear.weight = torch.nn.Parameter(w)
b = torch.full(linear.bias.shape, 2, dtype=torch.float)
linear.bias = torch.nn.Parameter(b)
out = linear(x) # Forward pass

target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()

# Validation
dout = 2 * (out - target) / np.prod((in_shape[0], layer_shape[1])) # derivative of MSELoss
dx_truth = dout @ w
dw_truth = dout.T @ x
db_truth = torch.sum(dout.T, axis=1)
print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")


In [None]:
# Affine/Linear layer testdata generation (random tensors)
# Remember to make dimensions divisible by block shape (usually 2^n)

in_shape = (8, 3, 2, 2) # First val is batch size, rest are input dimensions
layer_shape = (np.prod(in_shape[1:]), 4) # Shape of linear layer, in_features and out_features

x = torch.randn(*in_shape, requires_grad=True)
x = x.reshape((in_shape[0], np.prod(in_shape[1:]))) # Reshape to shape [batch, prod(input_shape)]
x.retain_grad() # Needed to calculate gradients for tensor views, i.e. reshape

linear = nn.Linear(*layer_shape)
out = linear(x) # This output will also be our dout, since loss is just a sum

target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()

# Validation
dout = 2 * (out - target) / np.prod((in_shape[0], layer_shape[1])) # derivative of MSELoss
dx_truth = dout @ linear.weight
dw_truth = dout.T @ x
db_truth = torch.sum(dout, axis=0)
out_truth = x @ linear.weight.T + linear.bias
print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")
print(f"out values are approximately the same: {torch.allclose(out_truth, out)}")

it = 1
# File name: shape_variable_testNum
name = f'in_{in_shape[0]}_{in_shape[1]}_{in_shape[2]}_{in_shape[3]}_out_{layer_shape[1]}'
with open(f'linear_{name}_x_t{it}.npy', 'wb') as f:
    np.save(f, x.detach().numpy())
with open(f'linear_{name}_w_t{it}.npy', 'wb') as f:
    np.save(f, linear.weight.detach().numpy())
with open(f'linear_{name}_b_t{it}.npy', 'wb') as f:
    np.save(f, linear.bias.detach().numpy())
with open(f'linear_{name}_out_t{it}.npy', 'wb') as f:
    np.save(f, out.detach().numpy())
with open(f'linear_{name}_dx_t{it}.npy', 'wb') as f:
    np.save(f, x.grad.detach().numpy())
with open(f'linear_{name}_dw_t{it}.npy', 'wb') as f:
    np.save(f, linear.weight.grad.detach().numpy())
with open(f'linear_{name}_db_t{it}.npy', 'wb') as f:
    np.save(f, linear.bias.grad.detach().numpy())
with open(f'linear_{name}_dout_t{it}.npy', 'wb') as f:
    np.save(f, dout.detach().numpy()) # Note this is what we computed manually

In [None]:

# print(dout.T.shape)
# print(dout.T)
# print(linear.weight.grad)
# print(linear.bias.grad)

# print(test[0][0][1][1])
# print(test.T[1][1][0][0])


In [55]:
# Cross Entropy loss function

in_shape = (4, 10) # Shape [N, C] N= batch, C = channels (logits, i.e. preactivation)

# PyTorch
out = torch.randn(in_shape, requires_grad=True) # Imaginary output of model, logits
target = torch.empty(in_shape[0], dtype=torch.long).random_(10)  # Imaginary target values. Must be type torch.long for CE function
# print(out)
# print(target)

loss_fn = nn.CrossEntropyLoss() # Loss function, default reduction is 'mean'
loss_truth = loss_fn(out, target)  # Compute loss
loss_truth.backward() # Compute gradient
dout = out.grad
print(loss_truth)

# # Validation
y_out = out.detach().numpy() # Logits are not whole numbers!
y_truth = target.detach().numpy().astype(np.int64) # Equivalent to torch.long
# print(y_out)
# print(y_truth)

y_truth_one_hot = np.zeros_like(y_out, dtype=int) # Same shape as model output
y_truth_one_hot[np.arange(y_out.shape[0]), y_truth] = 1 # Assign index of label as 1, others are 0
y_out_exp = np.exp(y_out - np.max(y_out, axis=1, keepdims=True))
y_out_probs = y_out_exp / np.sum(y_out_exp, axis=1, keepdims=True)
loss_val = -y_truth_one_hot * np.log(y_out_probs)
loss_val = loss_val.sum(axis=1).mean()

print(loss_val)

it = 1
# File name: shape_variable_testNum
test_folder = Path('../../mtt_components/CE_forward/tests')
name = f'in_{in_shape[0]}_{in_shape[1]}'
with open(test_folder / f'CE_{name}_y_truth_t{it}.npy', 'wb') as f:
    np.save(f, y_truth.astype(np.float32)) # Convert to float for HLS
with open(test_folder / f'CE_{name}_y_out_t{it}.npy', 'wb') as f:
    np.save(f, y_out)
with open(test_folder / f'CE_{name}_loss_t{it}.npy', 'wb') as f:
    np.save(f, loss_truth.detach().numpy())

tensor(2.9119, grad_fn=<NllLossBackward0>)
2.9119134545326233


In [56]:
print(y_out)
print(y_truth)
print(loss_truth)

[[-1.3533213  -0.17553462  0.47401047 -1.0226315  -0.13071758 -0.31051773
  -0.31433147 -1.016058   -0.9099716  -1.1300509 ]
 [ 2.191882    0.71398014  0.85098857  0.48754644  0.2830935  -1.0685333
  -0.88551867  1.300485    0.1930931   0.26251826]
 [-0.62347263  0.86463994  0.83694935  0.7673269  -0.31105527 -0.26043183
   0.3485074  -0.06242946  0.49741066  1.0055048 ]
 [ 1.2515155  -2.0488918  -0.51894236 -1.0815811  -1.4843855  -0.6622613
  -0.59916055 -0.88804454 -0.2516484  -1.3877795 ]]
[8 6 8 6]
tensor(2.9119, grad_fn=<NllLossBackward0>)
