In [1]:
import torch
import torch.nn as nn
import numpy as np

In [41]:
# Taken from the I2DL exercises. TODO: add a proper citation.

def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Estimate, by central differences, the gradient of ``f`` at ``x``
    weighted by the upstream gradient ``df``.

    Every entry of ``x`` is nudged by ``+h`` and ``-h`` in turn; the change
    in ``f(x)`` is multiplied elementwise with ``df``, summed, and divided
    by ``2 * h``.

    Args:
        f: Callable that accepts ``x`` and returns an object supporting
            ``.copy()`` and array arithmetic (e.g. a numpy array).
        x: Numpy array at which the gradient is evaluated. It is perturbed
            in place while iterating; each entry is written back to its
            original value before the next one is touched.
        df: Upstream gradient, multiplied with the difference of the two
            outputs of ``f``.
        h: Step size of the finite difference.

    Returns:
        An array created with ``np.zeros_like(x)`` that holds the numeric
        gradient for every entry of ``x``.
    """
    numeric_grad = np.zeros_like(x)
    cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])

    while not cursor.finished:
        idx = cursor.multi_index
        original = x[idx]

        # Evaluate f on both sides of the current entry.
        x[idx] = original + h
        f_plus = f(x).copy()
        x[idx] = original - h
        f_minus = f(x).copy()

        # Undo the perturbation so later evaluations see the unmodified x.
        x[idx] = original

        weighted_change = np.sum((f_plus - f_minus) * df)
        numeric_grad[idx] = weighted_change / (2 * h)
        cursor.iternext()

    return numeric_grad

In [42]:
# ReLU test data generation: random input -> ReLU -> MSE loss against a random target

shape = (2, 2, 2)

# x is created with requires_grad=True so that backward() fills x.grad
x = torch.randn(shape, requires_grad=True)
relu = nn.ReLU()
out = relu(x)  # forward pass

# Made-up target values, only needed to obtain a non-trivial loss
target = torch.randn(shape)
loss_fn = nn.MSELoss()
loss = loss_fn(out, target)
loss.backward()  # backward pass

# Show the input, the loss, and the shape and values of the input gradient
print(x, loss, x.grad.shape, x.grad, sep="\n")

tensor([[[-1.2218, -1.6739],
         [-0.1923,  0.5284]],

        [[ 0.4480,  1.6512],
         [-0.9808,  0.1878]]], requires_grad=True)
tensor(2.0314, grad_fn=<MseLossBackward0>)
torch.Size([2, 2, 2])
tensor([[[0.0000, 0.0000],
         [0.0000, 0.2294]],

        [[0.5159, 0.3379],
         [0.0000, 0.3695]]])


In [128]:
# Affine/Linear layer test data generation (deterministic values built with arange)

in_shape = (3, 2, 2)  # (batch size, *input dimensions)
layer_shape = (np.prod(in_shape[1:]), 2)  # (in_features, out_features)

# Inputs 0..N-1, flattened to [batch, prod(input dimensions)].
# After the reshape x is no longer the tensor created by arange, so
# retain_grad() is needed for x.grad to be populated by backward().
x = torch.arange(np.prod(in_shape), dtype=torch.float, requires_grad=True).reshape(in_shape[0], -1)
x.retain_grad()

linear = nn.Linear(*layer_shape)

# Weights count down from N-1 to 0; every bias entry is 2
w = torch.arange(np.prod(linear.weight.shape), dtype=torch.float).flip(0).reshape(linear.weight.shape)
b = torch.full(linear.bias.shape, 2, dtype=torch.float)
linear.weight = nn.Parameter(w)
linear.bias = nn.Parameter(b)

out = linear(x)  # forward pass

# With a plain sum as loss, the upstream gradient of `out` is a tensor of ones
loss = out.sum()
loss.backward()
dout = torch.ones(*out.shape, dtype=torch.float)

# Closed-form gradients of the affine layer, compared against autograd
dx_truth = dout @ w
dw_truth = dout.T @ x
db_truth = dout.sum(dim=0)

print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")


x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True


In [6]:
# Affine/Linear layer test data generation (random tensors)

in_shape = (4, 3, 2)  # (batch size, *input dimensions)
layer_shape = (np.prod(in_shape[1:]), 10)  # (in_features, out_features)

# Random inputs, flattened to [batch, prod(input dimensions)].
# After the reshape x is no longer the tensor created by randn, so
# retain_grad() is needed for x.grad to be populated by backward().
x = torch.randn(in_shape, requires_grad=True).reshape(in_shape[0], -1)
x.retain_grad()

linear = nn.Linear(*layer_shape)
out = linear(x)  # forward pass

# With a plain sum as loss, the upstream gradient of `out` is a tensor of ones
loss = out.sum()
loss.backward()
dout = torch.ones(*out.shape, dtype=torch.float)

# Closed-form gradients of the affine layer, compared against autograd
dx_truth = dout @ linear.weight
dw_truth = dout.T @ x
db_truth = dout.sum(dim=0)

print(f"x gradients are approximately the same: {torch.allclose(dx_truth, x.grad)}")
print(f"w gradients are approximately the same: {torch.allclose(dw_truth, linear.weight.grad)}")
print(f"b gradients are approximately the same: {torch.allclose(db_truth, linear.bias.grad)}")

x gradients are approximately the same: True
w gradients are approximately the same: True
b gradients are approximately the same: True


In [35]:
# Push a flattened 4-D float32 input through a single Linear layer and
# backpropagate a sum loss, then report the input and weight shapes.

x = torch.from_numpy(np.random.rand(4, 3, 10, 10).astype(np.float32))
x.requires_grad_(True)

# Placeholder array; only its size is used to choose the layer's out_features
out = np.zeros(100, dtype=np.float32)

layer = nn.Linear(np.prod(x.shape), np.prod(out.shape))
out = layer(x.view(-1))  # the input is flattened to a single vector

loss = out.sum()
loss.backward()

print(x.shape, layer.weight.shape, sep="\n")


torch.Size([4, 3, 10, 10])
torch.Size([100, 1200])
