In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
# TODO: add a proper citation -- the helper below is taken from the I2DL exercises.

# def eval_numerical_gradient_array(f, x, df, h=1e-5):
#     """
#     Evaluate a numeric gradient for a function that accepts a numpy
#     array and returns a numpy array.
#     """
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         ix = it.multi_index

#         oldval = x[ix]
#         x[ix] = oldval + h
#         pos = f(x).copy()
#         x[ix] = oldval - h
#         neg = f(x).copy()
#         x[ix] = oldval

#         grad[ix] = np.sum((pos - neg) * df) / (2 * h)
#         it.iternext()
#     return grad

In [51]:
# ReLU testdata generation
#
# Pushes a random batch through nn.ReLU and an MSE loss, cross-checks autograd's
# input gradient against a hand-derived one, and then writes x, dx, out and
# dout to .npy files.

# First val is batch size, rest are dimensions of data.
# Numbers are based on input shape of MTT ReLU layers
# shape = (256, 128, 32, 32)
shape = (4, 3, 2, 2)

# A private, seeded generator makes the written files identical on every run
# without touching torch's global RNG state.
SEED = 0
generator = torch.Generator().manual_seed(SEED)

x = torch.randn(*shape, generator=generator, dtype=torch.float, requires_grad=True)
relu = nn.ReLU()
out = relu(x)  # Forward pass

target = torch.randn(*shape, generator=generator)  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()  # Backward pass

# Validation
# Derivative of the mean-reduced MSE loss w.r.t. out; detached so the reference
# values are not part of the autograd graph.
dout = 2 * (out.detach() - target) / out.numel()
cache = x.clone().detach()  # Must clone x so it isn't overwritten
cache[cache <= 0] = 0
dx = cache  # Alias: the next line turns `cache` itself into the 0/1 ReLU mask
dx[dx > 0] = 1
dx_truth = dout * dx
grads_match = torch.allclose(dx_truth, x.grad)
print(f"x gradients are approximately the same: {grads_match}")
# Refuse to write test data that failed its own cross-check.
assert grads_match, "hand-derived ReLU gradient disagrees with autograd"

# File name: relu_<shape dims joined by "_">_<variable>_t<testNum>.npy
# Joining the dims keeps the name correct for any rank, not just 4-D shapes.
shape_tag = "_".join(str(dim) for dim in shape)
arrays_to_save = {"x": x, "dx": x.grad, "out": out, "dout": dout}
for variable, values in arrays_to_save.items():
    np.save(f"relu_{shape_tag}_{variable}_t1.npy", values.detach().numpy())


x gradients are approximately the same: True


In [52]:
# Show the gradient that backward() stored on x.
# `cache` and `dout` can be printed the same way when debugging.
x_grad = x.grad
print(x_grad)

tensor([[[[ 0.0000,  0.0000],
          [-0.0088,  0.0525]],

         [[ 0.0000,  0.0000],
          [ 0.0750,  0.0000]],

         [[ 0.0094,  0.0726],
          [ 0.1179,  0.0531]]],


        [[[ 0.0516,  0.0000],
          [ 0.0000,  0.0424]],

         [[ 0.1115,  0.0000],
          [ 0.1554,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0397,  0.1478]]],


        [[[ 0.0000, -0.0853],
          [ 0.0153,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0388,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.1355]]],


        [[[ 0.0000,  0.0079],
          [-0.0631,  0.0778]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.0038]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.0000]]]])


In [None]:
# Affine/Linear layer testdata generation (using arange to create specific weights)
#
# Every tensor in this cell is deterministic, so the layer output and all three
# gradients can be checked by hand.

in_shape = (3, 2, 2) # First val is batch size, rest are input dimensions
batch_size = in_shape[0]
in_features = int(np.prod(in_shape[1:])) # Flattened size of one sample, as a plain int
layer_shape = (in_features, 2) # Shape of linear layer, in_features and out_features

# Build x in its final [batch, prod(input_shape)] layout first and only then
# switch on gradient tracking: x stays a leaf tensor, so backward() fills
# x.grad without needing retain_grad().
x = torch.arange(batch_size * in_features, dtype=torch.float)
x = x.reshape(batch_size, in_features).requires_grad_()

linear = nn.Linear(*layer_shape)
# Weights count down from numel - 1 to 0; flip(0) reverses the 1-D arange.
w = torch.arange(linear.weight.numel(), dtype=torch.float).flip(0)
w = w.reshape(linear.weight.shape)
linear.weight = torch.nn.Parameter(w)
b = torch.full(linear.bias.shape, 2, dtype=torch.float)
linear.bias = torch.nn.Parameter(b)
out = linear(x) # Forward pass

loss = out.sum() # Using a sum for loss makes backprop extremely easy, dout is just a tensor of ones
loss.backward()

# Hand-derived gradients of out = x @ w.T + b for an upstream gradient of ones.
ones = torch.ones_like(out)
dx_truth = ones @ w
dw_truth = ones.T @ x.detach()
db_truth = ones.sum(dim=0)

# Print every result, and fail loudly instead of letting a mismatch scroll past.
for _label, _expected, _actual in (
    ("x", dx_truth, x.grad),
    ("w", dw_truth, linear.weight.grad),
    ("b", db_truth, linear.bias.grad),
):
    _matches = torch.allclose(_expected, _actual)
    print(f"{_label} gradients are approximately the same: {_matches}")
    assert _matches, f"hand-derived {_label} gradient disagrees with autograd"


In [None]:
# Affine/Linear layer testdata generation (random tensors)
#
# Same gradient check as the arange variant, but with a random input and the
# layer's default random initialisation.

in_shape = (4, 3, 2) # First val is batch size, rest are input dimensions
batch_size = in_shape[0]
in_features = int(np.prod(in_shape[1:])) # Flattened size of one sample, as a plain int
layer_shape = (in_features, 10) # Shape of linear layer, in_features and out_features

# Reshape to [batch, prod(input_shape)] before enabling gradient tracking so
# that x stays a leaf tensor and backward() fills x.grad without retain_grad().
x = torch.randn(*in_shape).reshape(batch_size, in_features).requires_grad_()

linear = nn.Linear(*layer_shape)
out = linear(x) # Forward pass

loss = out.sum() # Loss is a plain sum, so dout is a tensor of ones shaped like out
loss.backward()

# Hand-derived gradients of out = x @ weight.T + bias for an upstream gradient of ones.
ones = torch.ones_like(out)
dx_truth = ones @ linear.weight.detach()
dw_truth = ones.T @ x.detach()
db_truth = ones.sum(dim=0)

# Print every result, and fail loudly instead of letting a mismatch scroll past.
for _label, _expected, _actual in (
    ("x", dx_truth, x.grad),
    ("w", dw_truth, linear.weight.grad),
    ("b", db_truth, linear.bias.grad),
):
    _matches = torch.allclose(_expected, _actual)
    print(f"{_label} gradients are approximately the same: {_matches}")
    assert _matches, f"hand-derived {_label} gradient disagrees with autograd"