In [2]:
import torch

# Fixed input vector and all-zero target for a one-layer model.
x = torch.ones(5)
y = torch.zeros(3)

# Trainable parameters: requires_grad=True asks autograd to track every
# operation on them so gradients can be computed later.
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# Forward pass: logits, then binary cross-entropy against the target.
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

$w$ and $b$ are the parameters to be optimized. We need to compute the gradients of the loss with respect to those parameters, so "requires_grad" is set to True for them. A function that we apply to tensors to construct the computational graph is an object of class "Function". This object knows how to compute the function in the forward direction and how to compute its derivative in the backward direction. A reference to the backward propagation function is stored in the tensor's "grad_fn" property.

In [3]:
# Look up the backward function that autograd recorded for each result.
# NOTE(review): "Gradint" in the labels is a typo for "Gradient"; it is kept
# unchanged here so the printed text still matches the recorded output.
z_backward = z.grad_fn
loss_backward = loss.grad_fn
print(f'Gradint function for z = {z_backward}')
print(f'Gradint function for loss = {loss_backward}')

Gradint function for z = <AddBackward0 object at 0x7fccb85c48d0>
Gradint function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7fccb85b7090>


In [4]:
# Compute gradient
# Backpropagate from the loss; afterwards the gradients are available in the
# `.grad` attribute of w and b (the tensors created with requires_grad=True).
loss.backward()
for param in (w, b):
    print(param.grad)

tensor([[0.2544, 0.2658, 0.1443],
        [0.2544, 0.2658, 0.1443],
        [0.2544, 0.2658, 0.1443],
        [0.2544, 0.2658, 0.1443],
        [0.2544, 0.2658, 0.1443]])
tensor([0.2544, 0.2658, 0.1443])


We can only perform gradient calculations using "backward" once on a given graph. If we need to do several backward calls on the same graph, we need to pass "retain_graph=True" to the "backward" call.

In [6]:
# Disabling gradient tracking

# Baseline: a result computed from tracked parameters is itself tracked.
z = x @ w + b
print(z.requires_grad)

# Option 1 - no_grad(): nothing computed inside the context is tracked.
with torch.no_grad():
    z = x @ w + b
    print(z.requires_grad)

# Option 2 - detach(): compute as usual, then cut the result off from the graph.
z = (x @ w + b).detach()
print(z.requires_grad)

True
False
False


Reasons to disable gradient tracking:
1. To mark some parameters as frozen, so that they are not updated during training.
2. To speed up computation when only the forward pass is needed.

In [10]:
# Demonstrates that repeated backward() calls add into `.grad`.
inp = torch.eye(4, 5, requires_grad=True)
out = ((inp + 1) ** 2).t()
# `out` is not a scalar, so backward() needs an explicit upstream gradient;
# the same all-ones tensor is reused for every call below.
seed_grad = torch.ones_like(out)

# retain_graph=True keeps the graph alive so backward() can be called again.
out.backward(seed_grad, retain_graph=True)
print(f"First call\n{inp.grad}")

# No reset in between: this call's gradient is added to the stored one.
out.backward(seed_grad, retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

# Zero the stored gradient in place; the next call reproduces the first result.
inp.grad.zero_()
out.backward(seed_grad, retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])


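PyTorch accumulates the gradients! Each "backward" call adds the newly computed gradient to the value already stored in "grad", which is why the second call shows doubled values. To get the gradient of a single backward pass, zero out "grad" first.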