# Gradient

In [1]:
import torch
# Passing the float 4.0 (rather than the int 4) makes arange return a
# floating-point tensor, which is what autograd needs in order to track gradients.
x = torch.arange(4.0)
x

tensor([0., 1., 2., 3.])

In [2]:
# Ask autograd to track x so that backward() has a place to store the gradient: x.grad.
# The two statements below are equivalent spellings of the same switch
# (in-place method vs. attribute assignment); either one alone is sufficient.
x.requires_grad_(True)
x.requires_grad=True

In [3]:
# Called without an argument, requires_grad_() enables tracking and returns the
# tensor itself, so the cell displays x with requires_grad=True.
x.requires_grad_()

tensor([0., 1., 2., 3.], requires_grad=True)

In [4]:
# The computation graph is built implicitly while the forward pass runs:
# y = 2 * (x . x) is a scalar, and its grad_fn records how to compute the gradient.
y = 2*torch.dot(x,x)
y

tensor(28., grad_fn=<MulBackward0>)

## Back Propagation

In [5]:
y.backward() # backpropagate from the scalar y; the gradient dy/dx is stored in x.grad
x.grad

tensor([ 0.,  4.,  8., 12.])

In [6]:
# Analytic check: y = 2 * sum(x_i^2), hence dy/dx = 4x (compared element-wise).
x.grad == 4*x

tensor([True, True, True, True])

In [7]:
# PyTorch accumulates (adds) gradients into x.grad on every backward() call,
# so the old gradient has to be cleared manually before a new computation.
x.grad.zero_() # a trailing underscore marks an in-place method: it modifies the tensor itself
y = x.sum()
y.backward()
x.grad

tensor([1., 1., 1., 1.])

In [21]:
# Element-wise square: here y is a vector with one entry per element of x, not a scalar.
y = x*x
y

tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>)

In [22]:
# backward() is normally called on a scalar.
# When y is a vector or a matrix, we usually reduce it first (y.sum()) and backpropagate from that.
x.grad.zero_()
# By default backward() frees the graph's intermediate buffers once it has run,
# so every backward pass needs its own preceding forward pass.
y.sum().backward() 
x.grad

tensor([0., 2., 4., 6.])

## Partial Propagation

In [10]:
# Sometimes we want to "freeze" part of a network, i.e. let gradients flow through only some of the computation.
# detach() cuts the backward path at that point.
x.grad.zero_()
y = x*x
u = y.detach() # y is a function of x, but u is not: u holds the same values as y and is treated as a constant
z = u*x

# part 1: z = u * x with u constant, so dz/dx = u (and not 3 * x^2)
z.sum().backward()
x.grad == u

tensor([True, True, True, True])

In [11]:
# part 2: y itself is still attached to x, so its gradient flows back: d(sum(x*x))/dx = 2x
x.grad.zero_()
y.sum().backward()

x.grad == 2*x

tensor([True, True, True, True])

### PyTorch implicitly constructs the computation graph (it tracks every operation you perform). This is quite handy, but relatively slow compared with constructing the computation graph explicitly.