In [None]:
import torch

# Problem 1

### $$f(X) =  \frac{\|X\|_2^2}{n}$$, where $X\in \mathbb{R}^n$.
### Find the gradient $f_X$, which is analytically $\frac{2X}{n}$.


### Generally, there are two ways:


1.   backward()
2.   autograd.grad()




In [None]:
# Problem 1 setup: f(X) = ||X||_2^2 / n, whose analytic gradient is 2X/n.
# n is a real variable (not just a comment) so the dimension is set in one place
# and the autograd result can be compared against 2 * X / n.
n = 10
X = torch.randn(n, requires_grad=True)  # leaf tensor; autograd tracks operations on it
Y = torch.square(X)  # element-wise square, the same as X**2
Ym = torch.mean(Y)  # scalar f(X) = sum_i X_i^2 / n
print(X)
print(Y)
print(Ym)

tensor([ 2.2156, -0.2405, -0.1481, -0.0384, -0.5599,  0.8631,  0.1970,  0.9722,
        -0.0123, -0.2951], requires_grad=True)
tensor([4.9088e+00, 5.7862e-02, 2.1948e-02, 1.4720e-03, 3.1347e-01, 7.4499e-01,
        3.8795e-02, 9.4508e-01, 1.5204e-04, 8.7070e-02],
       grad_fn=<PowBackward0>)
tensor(0.7120, grad_fn=<MeanBackward0>)


### 1. backward()

In [None]:
# Back-propagate from the scalar Ym; the resulting gradient is accumulated into X.grad.
# retain_graph=True keeps the graph alive so Ym can be differentiated again
# (by re-running this cell or by the autograd.grad() cell that follows).
Ym.backward(retain_graph=True)
print(X.grad)
# Zero the stored gradient in place; otherwise a re-run would add onto the old value.
X.grad.zero_()
# NOTE(review): presumably placed last so the notebook does not echo the zeroed
# tensor returned by zero_() as the cell's output; it only prints a blank line.
print()

tensor([ 0.4431, -0.0481, -0.0296, -0.0077, -0.1120,  0.1726,  0.0394,  0.1944,
        -0.0025, -0.0590])



### 2. torch.autograd.grad()

In [None]:
# autograd.grad returns a tuple with one entry per input; unpack the single entry.
# Unlike backward(), nothing is written to X.grad.
(X_grad,) = torch.autograd.grad(
    outputs=Ym,
    inputs=X,
    create_graph=True,  # keep the gradient itself differentiable
    retain_graph=True,  # leave the forward graph intact so the cell can be re-run
)
print(X_grad)

tensor([ 0.4431, -0.0481, -0.0296, -0.0077, -0.1120,  0.1726,  0.0394,  0.1944,
        -0.0025, -0.0590], grad_fn=<MulBackward0>)


* `retain_graph = True` keeps the hidden computational graph from being freed after `backward()` runs, so that we can run this line again and again.

* `create_graph = True` is used when we want to do further operations on gradients, so that the autograd engine can create a backpropable graph for operations done on gradients.

* `grad.zero_()` resets the stored gradient so that it does not accumulate when we run this block again.

* `torch.autograd.grad()` does not have the gradient-accumulation issue: it returns the gradient instead of adding it to `.grad`.


# Problem 2

### $$F(A,X) = AX$$, where $A\in\mathbb{R}^{m\times n}$ and $X\in\mathbb{R}^{n\times p}$

### Find the gradient $F_A$ and $F_X$

### By matrix calculus, the gradient should be in the form of a tensor, but what PyTorch is actually calculating is the gradient of the scalar $J = \sum_{i,j} F_{ij}$.

### $J_A$ and $J_X$ have the same shape as $A$ and $X$, respectively, and we have $$J = \sum_{i,j}F_{ij} = \sum_{i,j}\sum_{k}A_{ik}X_{kj}$$

### Hence, we would expect that $$(J_A)_{ql} = \dfrac{dJ}{dA_{ql}} = \sum_{i,j}\sum_{k}\dfrac{dA_{ik}}{dA_{ql}}X_{kj} = \sum_j X_{lj}$$, which is the sum of $l$-th row of $X$ and is independent of $q$, so for each column of $J_A$, the components are the same.

### Similarly, $$(J_X)_{ql} = \dfrac{dJ}{dX_{ql}} = \sum_{i,j}\sum_{k}A_{ik}\dfrac{dX_{kj}}{dX_{ql}} = \sum_i A_{iq}$$, which is the sum of $q$-th column of $A$ and is independent of $l$, so for each row of $J_X$, the components are the same.

In [None]:
# Problem 2 setup: random leaf matrices A (4x3) and X (3x2) and their product F (4x2).
A = torch.randn(4, 3, requires_grad=True)
X = torch.randn(3, 2, requires_grad=True)
F = A @ X  # operator form of torch.matmul(A, X)

for shown in (A, X, F):
    print(shown)

tensor([[-1.4157, -0.5494, -0.3957],
        [-0.2589,  0.4123,  0.0835],
        [-1.2110,  1.4903, -1.2146],
        [ 0.2153,  1.4981, -0.0765]], requires_grad=True)
tensor([[ 1.3992,  1.1037],
        [ 1.5777, -0.2932],
        [ 0.6247, -0.8932]], requires_grad=True)
tensor([[-3.0949, -1.0479],
        [ 0.3404, -0.4812],
        [-0.1021, -0.6886],
        [ 2.6170, -0.1333]], grad_fn=<MmBackward0>)


### 1. backward()

In [None]:
# F is a matrix, so backward() needs an upstream gradient of the same shape.
# A matrix of ones makes the result the gradient of J = sum of all entries of F.
F.backward(gradient=torch.ones_like(F), retain_graph=True)

for leaf in (A, X):
    print(leaf.grad)
# Zero both buffers in place so that re-running the cell does not accumulate.
for leaf in (A, X):
    leaf.grad.zero_()
print()

tensor([[ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685]])
tensor([[-2.6703, -2.6703],
        [ 2.8512,  2.8512],
        [-1.6033, -1.6033]])



### 2. autograd.grad()

In [None]:
# A single call differentiates F with respect to both leaves; the returned tuple
# follows the order of `inputs`. The all-ones grad_outputs plays the role of dJ/dF.
A_grad, X_grad = torch.autograd.grad(
    outputs=F,
    inputs=(A, X),
    grad_outputs=torch.ones_like(F),
    create_graph=True,
    retain_graph=True,
)

print(A_grad)
print(X_grad)

tensor([[ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685],
        [ 2.5029,  1.2845, -0.2685]], grad_fn=<MmBackward0>)
tensor([[-2.6703, -2.6703],
        [ 2.8512,  2.8512],
        [-1.6033, -1.6033]], grad_fn=<MmBackward0>)


* Even though both functions calculate the gradient of $J$, a scalar, our $F$ is defined as a matrix, so we need to specify this by passing `torch.ones_like(F)`; otherwise, an error will be raised.

* We will get the same result if we replace `F` with `F.sum()`; in that case the output is already a scalar, so `torch.ones_like(F)` is no longer needed.

# Problem 3

### $$f(x) = x^3$$

### Find the second derivative of this function.

In [None]:
# Problem 3 setup: scalar leaf x = 3 and f(x) = x^3.
x = torch.tensor(3.0, requires_grad=True)
f = torch.pow(x, 3)  # same as x**3

for shown in (x, f):
    print(shown)

tensor(3., requires_grad=True)
tensor(27., grad_fn=<PowBackward0>)


### 1. backward()

In [None]:
# Second derivative of f via two backward() passes.
# First pass: create_graph=True records the backward computation itself, so the
# value stored in x.grad stays connected to x and can be differentiated again.
# NOTE(review): the recorded output includes a message from the autograd engine's
# run_backward call; PyTorch warns that backward(create_graph=True) can create a
# reference cycle between a tensor and its gradient. Confirm, and prefer the
# autograd.grad() variant when that matters.
f.backward(create_graph=True,retain_graph=True)
# Multiplying by 1.0 produces a new tensor instead of an alias of x.grad, while
# keeping it attached to the first-derivative graph. Zeroing x.grad on the next
# line therefore does not wipe f_prime.
f_prime = x.grad * torch.tensor(1.0)
# Clear the first derivative in place so the second pass does not add onto it.
x.grad.zero_()
# Second pass: differentiates f_prime and accumulates the result into x.grad.
f_prime.backward(retain_graph=True)
print(x.grad)
# Reset again so that re-running the cell starts from a zero gradient.
x.grad.zero_()
# NOTE(review): presumably placed last so the notebook does not echo the tensor
# returned by zero_(); it only prints a blank line.
print()

tensor(18., grad_fn=<ZeroBackward0>)



  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



* if `f_prime = x.grad`, then the result is the addition of the first derivative and the second derivative.
  * `f_prime` is **sharing** value with `x.grad`, so when we use `x.grad.zero_()` to avoid the addition of two derivatives, the value of `f_prime` is also changed to zero, which is not wanted.


* This multiplication with `torch.tensor(1.0)` does not change the value but establishes a computational graph that connects the `f_prime` with the first derivative, and `f_prime` is not sharing value with `x.grad` anymore.



### 2. autograd.grad()

In [None]:
# First derivative, kept differentiable (create_graph=True) so it can be
# differentiated once more.
(f_prime,) = torch.autograd.grad(
    outputs=f,
    inputs=x,
    create_graph=True,
    retain_graph=True,
)
# Second derivative: differentiate the first derivative with respect to x.
(f_prime_prime,) = torch.autograd.grad(
    outputs=f_prime,
    inputs=x,
    retain_graph=True,
)
print(f_prime_prime)

tensor(18.)


In [None]:
# Gradient accumulation demo: backward() adds to whatever x.grad already holds.
x = torch.rand(5, requires_grad=True)

# First pass populates x.grad, which is then wiped in place.
h = x.exp().sum()
h.backward()
x.grad.zero_()

# Put ones into the gradient buffer by hand.
x.grad.add_(torch.ones_like(x))

# Second pass on a freshly built graph: exp(x) is added on top of the ones.
h = x.exp().sum()
h.backward()
print(x.grad)  # gradient after the second pass: 1 + exp(x)

tensor([3.3913, 2.0521, 2.1245, 2.3662, 2.8886])


In [None]:
# exp(x) for comparison: the gradient printed by the previous cell exceeds it by exactly 1.
print(x.exp())

tensor([2.3913, 1.0521, 1.1245, 1.3662, 1.8886], grad_fn=<ExpBackward0>)
