In [2]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
"""
TODO:
- Use MSELoss for all layer tests
- Proper data generation method (multiple datasets are needed to test everything)
- Test with ints and large floats as well

"""

In [2]:
# Taken from the I2DL exercises. TODO: add a proper citation.

# def eval_numerical_gradient_array(f, x, df, h=1e-5):
#     """
#     Evaluate a numeric gradient for a function that accepts a numpy
#     array and returns a numpy array.
#     """
#     grad = np.zeros_like(x)
#     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
#     while not it.finished:
#         ix = it.multi_index

#         oldval = x[ix]
#         x[ix] = oldval + h
#         pos = f(x).copy()
#         x[ix] = oldval - h
#         neg = f(x).copy()
#         x[ix] = oldval

#         grad[ix] = np.sum((pos - neg) * df) / (2 * h)
#         it.iternext()
#     return grad

In [51]:
# ReLU testdata generation
#
# Pushes a random batch through nn.ReLU followed by nn.MSELoss, compares
# autograd's input gradient with a hand-derived one, and (only if they agree)
# writes x, dx, out and dout as .npy files into the current working directory.

# First val is batch size, rest are dimensions of data.
# Numbers are based on input shape of MTT ReLU layers
# shape = (256, 128, 32, 32)
shape = (4, 3, 2, 2)

x = torch.randn(*shape, dtype=torch.float, requires_grad=True)
relu = nn.ReLU()
out = relu(x)  # Forward pass

target = torch.randn(*shape)  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()  # Backward pass

# Validation
# nn.MSELoss() averages the squared error over every element, hence
# d(loss)/d(out) = 2 * (out - target) / number_of_elements.
dout = 2 * (out - target) / np.prod(shape)
cache = x.clone().detach()  # Detached copy of the forward input
# Local derivative of ReLU: 1 where the input was strictly positive, 0 elsewhere.
dx = (cache > 0).to(cache.dtype)
dx_truth = dout * dx
grads_match = torch.allclose(dx_truth, x.grad)
print(f"x gradients are approximately the same: {grads_match}")
# Stop here on a mismatch so inconsistent reference data is never written to disk.
assert grads_match, "hand-derived ReLU gradient does not match autograd; test data not saved"

# File name: relu_<shape dims joined by '_'>_<variable>_t<testNum>.npy
# Joining the dims keeps the naming valid for shapes of any rank.
shape_tag = "_".join(str(dim) for dim in shape)
arrays_to_save = {"x": x, "dx": x.grad, "out": out, "dout": dout}
for var_name, tensor in arrays_to_save.items():
    with open(f'relu_{shape_tag}_{var_name}_t1.npy', 'wb') as f:
        np.save(f, tensor.detach().numpy())


x gradients are approximately the same: True


In [52]:
# print(x.grad)
# print(cache)
# print(dout)

tensor([[[[ 0.0000,  0.0000],
          [-0.0088,  0.0525]],

         [[ 0.0000,  0.0000],
          [ 0.0750,  0.0000]],

         [[ 0.0094,  0.0726],
          [ 0.1179,  0.0531]]],


        [[[ 0.0516,  0.0000],
          [ 0.0000,  0.0424]],

         [[ 0.1115,  0.0000],
          [ 0.1554,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0397,  0.1478]]],


        [[[ 0.0000, -0.0853],
          [ 0.0153,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0388,  0.0000]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.1355]]],


        [[[ 0.0000,  0.0079],
          [-0.0631,  0.0778]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.0038]],

         [[ 0.0000,  0.0000],
          [ 0.0000,  0.0000]]]])


In [52]:
# Affine/Linear layer testdata generation (using arange to create specific weights)
#
# Input, weight and bias are all deterministic (arange / constant) so the
# expected numbers are easy to reproduce by hand; only the target is random.

in_shape = (3, 2, 2)  # (batch size, *input dimensions)
layer_shape = (np.prod(in_shape[1:]), 14)  # (in_features, out_features)

# Inputs 0, 1, 2, ... laid out as [batch, prod(input_shape)].
x = torch.arange(np.prod(in_shape), dtype=torch.float, requires_grad=True)
x = x.reshape((in_shape[0], layer_shape[0]))
x.retain_grad()  # x is a non-leaf after reshape; keep its .grad after backward()

linear = nn.Linear(*layer_shape)

# Weights count down from (numel - 1) to 0; every bias entry is 2.
w = torch.arange(np.prod(linear.weight.shape), dtype=torch.float).flip(0)
w = w.reshape(linear.weight.shape)
b = torch.full(linear.bias.shape, 2, dtype=torch.float)
linear.weight = torch.nn.Parameter(w)
linear.bias = torch.nn.Parameter(b)

out = linear(x)  # Forward pass

loss_fn = nn.MSELoss()
target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
loss = loss_fn(out, target)  # Compute loss
loss.backward()  # Backward pass

# Validation: hand-derived gradients of the affine layer under MSELoss.
dout = (out - target) * 2 / out.numel()  # derivative of MSELoss
dx_truth = torch.matmul(dout, w)
dw_truth = torch.matmul(dout.T, x)
db_truth = dout.T.sum(dim=1)

x_ok = torch.allclose(dx_truth, x.grad)
w_ok = torch.allclose(dw_truth, linear.weight.grad)
b_ok = torch.allclose(db_truth, linear.bias.grad)
print(f"x gradients are approximately the same: {x_ok}")
print(f"w gradients are approximately the same: {w_ok}")
print(f"b gradients are approximately the same: {b_ok}")


x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True


In [5]:
# Affine/Linear layer testdata generation (random tensors)
#
# Pushes a random batch through a freshly initialised nn.Linear followed by
# nn.MSELoss, compares autograd's gradients and the forward output with
# hand-derived values, and (only if everything agrees) writes the inputs,
# parameters, output and all gradients as .npy files into the current
# working directory.

in_shape = (17, 7, 3, 2) # First val is batch size, rest are input dimensions
layer_shape = (np.prod(in_shape[1:]), 5) # Shape of linear layer, in_features and out_features

x = torch.randn(*in_shape, requires_grad=True)
x = x.reshape((in_shape[0], np.prod(in_shape[1:]))) # Reshape to shape [batch, prod(input_shape)]
x.retain_grad() # x is a non-leaf after reshape; without this its .grad is not kept after backward()

linear = nn.Linear(*layer_shape)
out = linear(x) # Forward pass

target = torch.randn(in_shape[0], layer_shape[1])  # Imaginary target values, defines shape of dout
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)  # Compute loss
loss.backward()  # Backward pass

# Validation
# nn.MSELoss() averages over every output element, hence
# d(loss)/d(out) = 2 * (out - target) / (batch * out_features).
dout = 2 * (out - target) / np.prod((in_shape[0], layer_shape[1])) # derivative of MSELoss
dx_truth = dout @ linear.weight
dw_truth = dout.T @ x
db_truth = torch.sum(dout.T, axis=1)
out_truth = x @ linear.weight.T + linear.bias

x_ok = torch.allclose(dx_truth, x.grad)
w_ok = torch.allclose(dw_truth, linear.weight.grad)
b_ok = torch.allclose(db_truth, linear.bias.grad)
out_ok = torch.allclose(out_truth, out)
print(f"x gradients are approximately the same: {x_ok}")
print(f"w gradients are approximately the same: {w_ok}")
print(f"b gradients are approximately the same: {b_ok}")
print(f"out values are approximately the same: {out_ok}")
# Stop here on a mismatch so inconsistent reference data is never written to disk.
assert x_ok and w_ok and b_ok and out_ok, \
    "hand-derived Linear values do not match PyTorch; test data not saved"

it = 1  # Test number, becomes the t<N> suffix of every file name
# File name: linear_<in_shape dims joined by '_'>_<variable>_t<testNum>.npy
# Joining the dims keeps the naming valid for input shapes of any rank.
shape_tag = "_".join(str(dim) for dim in in_shape)
arrays_to_save = {
    # Forward-pass data
    "x": x,
    "w": linear.weight,
    "b": linear.bias,
    "out": out,
    # Backward-pass data (upstream gradient and the resulting gradients), saved
    # the same way the ReLU cell saves dout/dx so backward passes can be tested too.
    "dout": dout,
    "dx": x.grad,
    "dw": linear.weight.grad,
    "db": linear.bias.grad,
}
for var_name, tensor in arrays_to_save.items():
    with open(f'linear_{shape_tag}_{var_name}_t{it}.npy', 'wb') as f:
        np.save(f, tensor.detach().numpy())

x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True
out values are approximately the same: True


In [13]:
# Debug aid: dump the weight matrix of `linear`. This cell does not define
# `linear`; it relies on whichever Linear-layer cell was executed last.
print(linear.weight)

# test = torch.randn(*in_shape)
# print(test.shape)
# print(test.T.shape)

# print(test[0][0][1][1])
# print(test.T[1][1][0][0])


Parameter containing:
tensor([[-0.0009,  0.0519, -0.0952,  0.0587,  0.1172,  0.1493, -0.0471, -0.0320,
         -0.0374, -0.0195,  0.0899,  0.1536, -0.0031, -0.0105, -0.1482, -0.1202,
         -0.1418, -0.1122, -0.0889,  0.1231,  0.0137, -0.0442,  0.0121,  0.0925,
          0.0998, -0.0376,  0.0428, -0.1287,  0.0246, -0.0839, -0.0283, -0.0336,
          0.0380, -0.0930, -0.0779,  0.0025, -0.0913, -0.0164,  0.1535, -0.0677,
         -0.1474,  0.0769],
        [-0.0817, -0.0388,  0.1100,  0.0509, -0.1261, -0.0598,  0.0954, -0.0478,
          0.0448, -0.0449,  0.0140,  0.0750, -0.0934,  0.0240,  0.0266,  0.1399,
          0.0580,  0.0172, -0.1061,  0.0334,  0.1092,  0.1407,  0.0427, -0.0187,
         -0.1498,  0.1530,  0.0888, -0.1271,  0.0596,  0.0105, -0.1154,  0.0004,
          0.1420,  0.1227, -0.1536, -0.1067, -0.1014,  0.0750,  0.0532,  0.0105,
         -0.0728, -0.0335],
        [ 0.0091,  0.0265,  0.0566, -0.1167, -0.1236, -0.1133,  0.0863,  0.1491,
         -0.0916, -0.0705,  0.1