In [89]:
import torch
import numpy as np

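### Creating data
1. `x` is created with `requires_grad=True`, so autograd tracks every operation applied to it.
2. `y` is created with `requires_grad=False`, so no gradient is ever computed for it.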

In [90]:
# Leaf tensor tracked by autograd: after backward(), x.grad holds dout/dx.
x = torch.tensor(
    [[1, 2, 3], [4, 5, 6], [1, 2, 3]],
    dtype=torch.float32,
    requires_grad=True,
)
x

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [1., 2., 3.]], requires_grad=True)

In [91]:
# Constant input: requires_grad=False means autograd never fills in y.grad.
y = torch.tensor(
    [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
    dtype=torch.float32,
    requires_grad=False,
)
y

tensor([[1., 2., 3.],
        [1., 2., 3.],
        [1., 2., 3.]])

### Building the forward path
Retain the gradient of the first intermediate tensor (`w`) but not of the second one (`z`).

In [92]:
# First intermediate (non-leaf) tensor: matrix product x @ y plus an
# element-wise cubic polynomial in x.
w = torch.mm(x, y) + x.pow(3) + 3 * x.pow(2) + 2 * x
# Non-leaf tensors do not keep .grad by default; retain_grad() asks autograd
# to store dout/dw in w.grad during backward().
w.retain_grad()


In [93]:
# Second intermediate (non-leaf) tensor. retain_grad() is NOT called on z,
# so autograd does not store dout/dz in z.grad after backward().
z = w.pow(2) + 2 * w

In [94]:
# Reduce z to a single scalar so backward() can be called without arguments.
out = torch.mean(z)
out

tensor(26450., grad_fn=<MeanBackward0>)

### Backward propagation

In [95]:
# Backpropagate from the scalar `out` through the graph built above.
# retain_graph=True keeps the graph's saved buffers alive so backward() can be
# called on this graph again; note that repeated calls accumulate into .grad.
out.backward(retain_graph=True)

In [96]:
# y was created with requires_grad=False, so no gradient is stored for it.
print(y.grad)

None


### Checking the gradients at every node, i.e. `x.grad` is dout/dx and `w.grad` is dout/dw

In [98]:
# dout/dx: populated by backward() because x is a leaf tensor created with
# requires_grad=True.
x.grad

tensor([[  103.7778,   285.7778,   897.1111],
        [ 2628.4443,  6122.4443, 12785.7773],
        [  103.7778,   285.7778,   897.1111]])

In [99]:
# dout/dw: available on this non-leaf tensor only because w.retain_grad()
# was called before backward().
w.grad

tensor([[ 2.8889,  8.2222, 17.5556],
        [30.2222, 53.5556, 84.8889],
        [ 2.8889,  8.2222, 17.5556]])

In [100]:
# y.grad is None because y was created with requires_grad=False: autograd
# does not track y, so no gradient is ever computed for it.
print(y.grad)

None


In [101]:
# z is a non-leaf tensor and retain_grad() was not called on it, so autograd
# does not store dout/dz: z.grad is None (PyTorch may emit a UserWarning when
# .grad is accessed on such a tensor).
z.grad

  """Entry point for launching an IPython kernel.


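#### When calling `backward()`, pass `retain_graph=True` if you intend to run backprop through the same graph again. Otherwise the graph's saved intermediate buffers are freed after the first backward pass, and a second `backward()` call fails because those details are no longer available.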