In [15]:
import torch
# torch.autograd supplies automatic differentiation for every
# operation performed on Tensors.
# requires_grad=True -> all operations on the tensor are tracked.
x = torch.randn(3, requires_grad=True)
y = x + 2

# y is the result of an operation, so it carries a grad_fn attribute.
# grad_fn references the Function that created the Tensor.
# x was created by the user -> its grad_fn is None.
print(x, y, y.grad_fn, sep="\n")

tensor([ 0.0841, -0.4883, -0.4887], requires_grad=True)
tensor([2.0841, 1.5117, 1.5113], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7f22c87eb358>


In [32]:
# Chain further operations onto y; every intermediate result records
# the Function that produced it in grad_fn.
z = y * y * 3
print(z)
z = torch.mean(z)
print(z)
z = z / 2
print(z)

tensor([13.0299,  6.8560,  6.8525], grad_fn=<MulBackward0>)
tensor(8.9128, grad_fn=<MeanBackward0>)
tensor(4.4564, grad_fn=<DivBackward0>)


In [33]:
# Backpropagation: once the forward computation is finished, calling
# .backward() computes all gradients automatically.
# Each gradient is accumulated into the .grad attribute of its tensor.
# It is the partial derivative of the function w.r.t. that tensor.

z.backward()
print(x.grad)  # dz/dx

tensor([2.0841, 1.5117, 1.5113])


In [38]:
# Two independent user-created tensors, both tracked by autograd.
a = torch.randn(6, requires_grad=True)
b = torch.randn(6, requires_grad=True)

# Element-wise sum, then reduce to a scalar so backward() needs no argument.
c = a + b
print(c)
c = torch.mean(c)
print(a, b, c, sep="\n")

# c = mean(a + b) over 6 elements, so every entry of a.grad and b.grad is 1/6.
c.backward()
print(a.grad, b.grad, sep="\n")

tensor([-0.5306, -2.1249, -0.3306,  0.6597, -1.5669, -1.8413],
       grad_fn=<AddBackward0>)
tensor([-2.2768, -1.6085, -0.7734,  0.4668, -0.4574, -1.6058],
       requires_grad=True)
tensor([ 1.7462, -0.5164,  0.4428,  0.1929, -1.1096, -0.2355],
       requires_grad=True)
tensor(-0.9558, grad_fn=<MeanBackward0>)
tensor([0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667])
tensor([0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667])


In [56]:
# Generally speaking, torch.autograd is an engine for computing
# vector-Jacobian products.
# It computes partial derivatives by applying the chain rule.

# -------------
# Model with non-scalar output:
# If a Tensor is non-scalar (has more than one element), backward() must be
# given a gradient argument: a tensor whose shape matches the output.
# That tensor is the vector of the vector-Jacobian product.

x = torch.randn(3, requires_grad=True)
y = x * 2

# Ten further doublings, so overall y = x * 2**11 = x * 2048.
for _ in range(10):
    y = y * 2
print(y, y.shape, sep="\n")

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)  # v * 2048

tensor([ -206.7797, -2248.5393,  1711.2466], grad_fn=<MulBackward0>)
torch.Size([3])
tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [70]:
# -------------
# Stopping a tensor from tracking history:
# e.g. inside a training loop, the weight update itself must not become
# part of the gradient computation. The options are:
# - x.requires_grad_(False)
# - x.detach()
# - wrap the code in 'with torch.no_grad():'

# .requires_grad_(...) changes the existing flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = (a * 3) / (a - 1)
print(b.grad_fn)  # a was not tracked when b was computed
a.requires_grad_(True)
print(a.requires_grad)
b = torch.sum(a * a)
print(b.grad_fn)
b.backward()
print(a.grad)  # d(sum(a * a))/da = 2 * a

False
None
True
<SumBackward0 object at 0x7f2275ae2278>
tensor([[ 1.6903,  1.6002],
        [-0.1939, -1.8499]])


In [85]:
# .detach() returns a new Tensor with the same content that does not
# take part in gradient computation; the original keeps its flag.
a = torch.randn(2, 2, requires_grad=True)
b = a.detach()
print(a.requires_grad, b.requires_grad, sep="\n")

True
False


In [97]:
# Wrap code in 'with torch.no_grad():' so that results computed inside the
# block are not tracked. The requires_grad flag of already existing
# tensors is left unchanged.
a = torch.randn(2, 2, requires_grad=True)
# x is created in this cell instead of being picked up from whatever an
# earlier cell left in the kernel, so the cell is self-contained and
# reproducible after "Restart Kernel -> Run All".
x = torch.randn(1, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)  # result computed under no_grad -> not tracked
    print(x)                       # the existing tensor keeps requires_grad=True
    print((a*2).requires_grad)     # result computed under no_grad -> not tracked
    print(a.requires_grad)         # flag of the existing tensor is unchanged
print(a)

True
False
tensor([-0.3300], requires_grad=True)
False
True
tensor([[-1.6050, -0.0854],
        [-0.4412, -0.5626]], requires_grad=True)


In [104]:
# -------------
# backward() accumulates the gradient of a tensor into its .grad attribute.
# !!! Be careful with this during optimization !!!
# Call .zero_() on the gradients before starting a new optimization step!
weights = torch.ones(4, requires_grad=True)
print(weights)
for epoch in range(3):
    # dummy stand-in for a model's forward pass
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

    # optimization step, i.e. adjust the weights; done under no_grad so the
    # update itself is not tracked
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # this is important! It affects the final weights & output
    weights.grad.zero_()

print(weights, model_output, sep="\n")

# Optimizers provide a zero_grad() method for the same purpose:
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)
