In [1]:

import torch
# The autograd package provides automatic differentiation 
# for all operations on Tensors

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [9]:
# requires_grad=True tells autograd to record every operation applied to the tensor.
x = torch.randn((3,), requires_grad=True)
y = torch.add(x, 2)

In [10]:
y
# y was produced by an operation on a tensor that requires grad, so it has a grad_fn attribute.
# grad_fn references the Function that created the tensor (AddBackward0 for the addition x + 2).

tensor([2.6775, 1.4151, 3.0411], grad_fn=<AddBackward0>)

In [13]:
# y is a vector (3 elements), not a scalar. Without an explicit `gradient`
# argument, backward() can only be called on a single-value (scalar) output,
# so this call raises RuntimeError.
# The error is the point of this cell: catch and print it so that
# "Restart Kernel -> Run All" does not stop here.
try:
    y.backward()
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: grad can be implicitly created only for scalar outputs

In [14]:
# No backward() call has completed successfully yet, so nothing has been
# accumulated into x.grad and it is still None.
print(x.grad)

None


In [15]:
# x was created directly by the user, so its grad_fn is None;
# y came out of an operation, so it carries the Function that produced it.
print(x, y, y.grad_fn, sep="\n")

tensor([ 0.6775, -0.5849,  1.0411], requires_grad=True)
tensor([2.6775, 1.4151, 3.0411], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x000001B94AB7AEC0>


In [16]:
# Apply further operations to y: z = 3 * y^2, element-wise.
z = y.mul(y).mul(3)  # still a vector, not a scalar
print(z)

tensor([21.5065,  6.0071, 27.7453], grad_fn=<MulBackward0>)


In [17]:
z = torch.mean(z)  # reduce to a single value -> scalar output
print(z)


tensor(18.4197, grad_fn=<MeanBackward0>)


In [19]:
z
# z is now the single-value tensor returned by z.mean(); its grad_fn is MeanBackward0.

tensor(18.4197, grad_fn=<MeanBackward0>)

In [18]:
# Compute the gradients with backpropagation.
# Once the forward computation is finished, calling .backward() on the scalar
# result computes all gradients automatically.
# Each gradient is accumulated into the .grad attribute of its leaf tensor;
# it is the partial derivative of the result with respect to that tensor.
# Here z = mean(3 * (x + 2)^2), so dz/dx = 2 * (x + 2) = 2 * y.
z.backward()
print(x.grad)  # dz/dx

tensor([5.3549, 2.8301, 6.0823])


In [20]:

# torch.autograd is, generally speaking, an engine for computing vector-Jacobian products.
# It computes partial derivatives by applying the chain rule.
# -------------
# Model with non-scalar output:
# If a tensor is non-scalar (has more than one element), backward() needs a
# `gradient` argument: a tensor whose shape matches the output.
# That tensor is the vector in the vector-Jacobian product.

x = torch.randn((3,), requires_grad=True)
print(x)  # vector

y = torch.mul(x, 2)
print(y)  # vector

# Ten more doublings: y ends up equal to x * 2**11.
for _ in range(10):
    y = torch.mul(y, 2)

print(y)  # still a vector
print(y.size())

tensor([ 0.4278, -1.0727, -1.3087], requires_grad=True)
tensor([ 0.8555, -2.1455, -2.6173], grad_fn=<MulBackward0>)
tensor([  876.0814, -2196.9807, -2680.1462], grad_fn=<MulBackward0>)
torch.Size([3])


In [21]:
# v is the vector of the vector-Jacobian product; it must match y's shape.
# Since y = x * 2**11, the resulting x.grad is v * 2048.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [None]:

# -------------
# Stop a tensor from tracking history:
# For example, when the weights are updated inside a training loop, the update
# itself should not become part of the gradient computation. Three options:
# - x.requires_grad_(False)
# - x.detach()
# - wrap the code in 'with torch.no_grad():'

# .requires_grad_(...) changes the flag of an existing tensor in-place.
a = torch.randn((2, 2))
print(a, a.requires_grad, sep="\n")
b = torch.div(torch.mul(a, 3), torch.sub(a, 1))
print(b)  # 2x2 matrix
print(b.grad_fn)  # None: a does not require grad, so nothing was recorded


tensor([[-0.9152, -0.1333],
        [-0.0485, -0.4178]])
False
tensor([[1.4335, 0.3530],
        [0.1387, 0.8840]])
None


In [24]:
# Switch gradient tracking on for the existing tensor, in-place.
a.requires_grad_(requires_grad=True)
print(a.requires_grad)
b = torch.sum(a * a)
print(b)  # scalar
print(b.grad_fn)

True
tensor(1.0322, grad_fn=<SumBackward0>)
<SumBackward0 object at 0x000001B94B017C10>


In [25]:

# .detach() returns a new tensor with the same content that is excluded from gradient computation.
a = torch.randn((2, 2), requires_grad=True)
print(a, a.requires_grad, sep="\n")
b = a.detach()
print(b, b.requires_grad, sep="\n")

tensor([[ 0.0027,  0.3182],
        [-0.4651,  0.3628]], requires_grad=True)
True
tensor([[ 0.0027,  0.3182],
        [-0.4651,  0.3628]])
False


In [27]:
# Third option: wrap the code in 'with torch.no_grad():'.
# Start from a fresh tensor that does track gradients.
a = torch.randn((2, 2), requires_grad=True)
print(a, a.requires_grad, sep="\n")


tensor([[-0.7106, -0.8682],
        [ 0.8596, -0.8539]], requires_grad=True)
True


In [28]:
# Inside no_grad, results of operations do not require grad even though `a` does.
with torch.no_grad():
    print(torch.pow(a, 2).requires_grad)
    print(torch.pow(a, 2))

False
tensor([[0.5050, 0.7537],
        [0.7388, 0.7292]])


One step in training = one optimization step.

In [30]:
# -------------
# backward() accumulates (adds) the gradient into the tensor's .grad attribute.
# !!! Be careful with this during optimization !!!
# Call .zero_() on the gradient to clear it before each new optimization step.
weights = torch.ones((4,), requires_grad=True)

In [31]:
weights
# Displays the parameter tensor created with torch.ones(4, requires_grad=True).

tensor([1., 1., 1., 1.], requires_grad=True)

In [35]:

for epoch in range(3):
    # Dummy "model": a scalar that is linear in the weights.
    model_output = torch.sum(weights * 3)  # scalar
    print('model output', model_output)

    # Backpropagate: d(model_output)/d(weights) is 3 for every element.
    model_output.backward()
    print(weights.grad)

    # Optimize the model, i.e. adjust the weights. The update runs under
    # no_grad so that it does not become part of the gradient computation.
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    print('weights', weights)

    # Important: clear the gradient before the next optimization step.
    # backward() accumulates into .grad, so skipping this changes the
    # final weights and output.
    weights.grad.zero_()


# An optimizer provides a zero_grad() method for the same purpose:
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

model output tensor(-20.4000, grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
weights tensor([-2., -2., -2., -2.], requires_grad=True)
model output tensor(-24., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
weights tensor([-2.3000, -2.3000, -2.3000, -2.3000], requires_grad=True)
model output tensor(-27.6000, grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
weights tensor([-2.6000, -2.6000, -2.6000, -2.6000], requires_grad=True)
