In [1]:
import torch
# The autograd package provides automatic differentiation for all operations on Tensors

# requires_grad = True -> tracks all operations on the tensor. 
x = torch.tensor([3.0], requires_grad=True)
print(x)
y = x*x + 2

tensor([3.], requires_grad=True)


In [2]:
# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor
print(x) # created by the user -> grad_fn is None
print(y)
print(y.grad_fn)

tensor([3.], requires_grad=True)
tensor([11.], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x00000270830E3A90>


In [3]:
y.backward()
print(x.grad)

tensor([6.])


### Example 2

In [4]:
x = torch.tensor([1.0,2.0,3.0], requires_grad=True)
y = x + 2
print(x)
print(y)

tensor([1., 2., 3.], requires_grad=True)
tensor([3., 4., 5.], grad_fn=<AddBackward0>)


In [5]:
# more operation on y
z = y * y * 3
print(z)
z = z.mean()
print(z)

tensor([27., 48., 75.], grad_fn=<MulBackward0>)
tensor(50., grad_fn=<MeanBackward0>)


Gradients are calculated by tracing the graph from the root to the leaf and multiplying every gradient in the way using the chain rule.

In [6]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivate of the function w.r.t. the tensor

z.backward()
print(x.grad) # dz/dx

tensor([ 6.,  8., 10.])


### Example

In [7]:
import torch  
x=torch.tensor(2.0, requires_grad=True)  
z=torch.tensor(4.0, requires_grad=True)  
print(x)
print(z)

tensor(2., requires_grad=True)
tensor(4., requires_grad=True)


In [8]:
y=x**2+z**3  
y.backward()  
print(x.grad)
print(z.grad) 

tensor(4.)
tensor(48.)


### Example

In [9]:
a = torch.tensor([[1,2,3],[4,5,6]], dtype=torch.float, requires_grad=True)
print(a)
for i in range(2):
  for j in range(3):
    out = a[i,j] * a[i,j]
    out.backward()
print(a.grad)

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)
tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]])


In [10]:
a = torch.tensor([[1,2,3],[4,5,6]], dtype=torch.float,requires_grad=True)
print(a)
print(a.shape)
out = a * a
out.backward(a)
print(a.grad)

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)
torch.Size([2, 3])
tensor([[ 2.,  8., 18.],
        [32., 50., 72.]])


### Example

In [11]:

x = torch.randn(3, requires_grad=True)
print(x)

tensor([-0.9312, -0.5709, -1.9604], requires_grad=True)


In [12]:
y = x * 2
for _ in range(10):
    y = y * 2

print(y)
print(y.shape)

tensor([-1907.0762, -1169.1670, -4014.8333], grad_fn=<MulBackward0>)
torch.Size([3])


In [13]:
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(v)
print(x.grad)

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


## Stop a tensor from tracking history:

### For example during our training loop when we want to update our weights
### then this update operation should not be part of the gradient computation
### - x.requires_grad_(False)
### - x.detach()
### - wrap in 'with torch.no_grad():'


In [14]:
# .requires_grad_(...) changes an existing flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = ((a * 3) / (a - 1))
print(b.grad_fn)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x0000027086B0C550>


In [15]:
# .detach(): get a new Tensor with the same content but no gradient computation:
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

True
False


In [16]:
# wrap in 'with torch.no_grad():'
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

True
False


### Example

In [17]:
# -------------
# backward() accumulates the gradient for this tensor into .grad attribute.
# !!! We need to be careful during optimization !!!
# Use .zero_() to empty the gradients before a new optimization step!
weights = torch.ones(4, requires_grad=True)

In [18]:
for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

    # optimize model, i.e. adjust weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # this is important! It affects the final weights & output
    weights.grad.zero_()

print(weights)
print(model_output)

# Optimizer has zero_grad() method
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


### Example : backprop

In [19]:
import torch

x = torch.tensor(1.0)
y = torch.tensor(2.0)

# This is the parameter we want to optimize -> requires_grad=True
w = torch.tensor(1.0, requires_grad=True)

print(x)
print(y)
print(w)

tensor(1.)
tensor(2.)
tensor(1., requires_grad=True)


In [20]:
# forward pass to compute loss
y_predicted = w * x
print(y_predicted)
loss = (y_predicted - y)**2
print(loss)

tensor(1., grad_fn=<MulBackward0>)
tensor(1., grad_fn=<PowBackward0>)


In [21]:
# backward pass to compute gradient dLoss/dw
loss.backward()
print(w.grad)

tensor(-2.)


In [22]:

# update weights
# next forward and backward pass...

# continue optimizing:
# update weights, this operation should not be part of the computational graph
with torch.no_grad():
    w -= 0.01 * w.grad
# don't forget to zero the gradients
w.grad.zero_()

tensor(0.)

### Example : Gradient descent 

In [23]:
import numpy as np 

# Compute every step manually

# Linear regression
# f = w * x 

# here : f = 2 * x
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)

w = 0.0

print(X)
print(Y)
print(w)

[1. 2. 3. 4.]
[2. 4. 6. 8.]
0.0


In [24]:
# model output
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

# J = MSE = 1/N * (w*x - y)**2
# dJ/dw = 1/N * 2x(w*x - y)
def gradient(x, y, y_pred):
    return np.dot(2*x, y_pred - y).mean()

print(f'Prediction before training: f(5) = {forward(5):.3f}')


Prediction before training: f(5) = 0.000


In [25]:
# Training
learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # predict = forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)
    
    # calculate gradients
    dw = gradient(X, Y, y_pred)

    # update weights
    w -= learning_rate * dw

    if epoch % 2 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
     
print(f'Prediction after training: f(5) = {forward(5):.3f}')

epoch 1: w = 1.200, loss = 30.00000000
epoch 3: w = 1.872, loss = 0.76800019
epoch 5: w = 1.980, loss = 0.01966083
epoch 7: w = 1.997, loss = 0.00050332
epoch 9: w = 1.999, loss = 0.00001288
epoch 11: w = 2.000, loss = 0.00000033
epoch 13: w = 2.000, loss = 0.00000001
epoch 15: w = 2.000, loss = 0.00000000
epoch 17: w = 2.000, loss = 0.00000000
epoch 19: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


### Example: Gradient descent with autograd

In [26]:
import torch

# Here we replace the manually computed gradient with autograd

# Linear regression
# f = w * x 

# here : f = 2 * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

print(X)
print(Y)
print(w)

tensor([1., 2., 3., 4.])
tensor([2., 4., 6., 8.])
tensor(0., requires_grad=True)


In [27]:
# model output
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

print(f'Prediction before training: f(5) = {forward(5).item():.3f}')

Prediction before training: f(5) = 0.000


In [28]:
# Training
learning_rate = 0.01
n_iters = 100

for epoch in range(n_iters):
    # predict = forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # calculate gradients = backward pass
    l.backward()

    # update weights
    #w.data = w.data - learning_rate * w.grad
    with torch.no_grad():
        w -= learning_rate * w.grad
    
    # zero the gradients after updating
    w.grad.zero_()

    if epoch % 10 == 0:
        print(f'epoch {epoch+1}: w = {w.item():.3f}, loss = {l.item():.8f}')

print(f'Prediction after training: f(5) = {forward(5).item():.3f}')

epoch 1: w = 0.300, loss = 30.00000000
epoch 11: w = 1.665, loss = 1.16278565
epoch 21: w = 1.934, loss = 0.04506890
epoch 31: w = 1.987, loss = 0.00174685
epoch 41: w = 1.997, loss = 0.00006770
epoch 51: w = 1.999, loss = 0.00000262
epoch 61: w = 2.000, loss = 0.00000010
epoch 71: w = 2.000, loss = 0.00000000
epoch 81: w = 2.000, loss = 0.00000000
epoch 91: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000
