# learn_pytorch_2

https://github.com/jcjohnson/pytorch-examples#pytorch-control-flow--weight-sharing

### 1. numpy
simple example of computing loss and update the pparameters

In [2]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)   # x: 64*1000
y = np.random.randn(N, D_out)  # y: 64*10 
y

array([[ 7.70349673e-01,  1.79280391e-01, -6.96710162e-01,
        -1.14228180e-01, -3.08002866e-01,  6.85322422e-01,
         1.45770309e-01, -6.66895425e-01, -1.01443969e+00,
        -5.78587595e-01],
       [-2.01750704e+00,  1.66101096e+00,  4.98564741e-01,
         1.22624533e+00, -1.70643765e-01,  1.04489762e+00,
        -6.19827305e-01, -8.43647291e-01,  5.75757094e-01,
        -1.01830699e+00],
       [ 8.09649362e-01,  5.71259032e-01, -1.19817443e+00,
         5.08620704e-01, -1.18922211e-01,  5.14077119e-01,
        -4.98174729e-02,  5.51777947e-01,  3.50358282e-01,
         9.87681029e-03],
       [-2.13125139e+00, -3.86758697e-01, -1.71001802e+00,
        -2.15537480e-01, -2.65420166e-01,  1.46555331e+00,
         4.74331893e-01,  5.04207708e-01,  3.68349631e-01,
        -4.26210365e-01],
       [ 5.85489836e-01, -6.79339975e-01,  2.17388526e-01,
         5.79294184e-01, -1.00830019e+00,  4.70337307e-01,
        -6.40373155e-01,  5.78902549e-01, -3.45054749e+00,
        -1.

In [3]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H)  # w1: 1000*100 
w2 = np.random.randn(H, D_out) # w2: 100*10 
w1

array([[-0.19794508,  1.7146676 ,  0.33770145, ..., -0.65731461,
        -0.08338042, -1.7265136 ],
       [ 0.18382642, -2.05639795,  1.43849016, ..., -0.40818699,
         2.38534632,  0.97114409],
       [-0.22319933, -0.86668278,  1.15879985, ..., -0.85409537,
         2.19809992, -2.07591792],
       ...,
       [ 0.95420475,  0.12119237,  0.10893154, ...,  1.29326188,
        -0.24381702,  0.59580941],
       [-0.65865732,  0.61537999, -1.3215875 , ...,  0.60836248,
        -1.24109654, -0.53255857],
       [-0.49592592, -1.63950215,  1.18904836, ..., -0.26110635,
         0.53750091,  0.80991245]])

In [4]:
learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y  
  h = x.dot(w1)  #   x: 64*1000   w1: 1000*100  h: 64*100
  h_relu = np.maximum(h, 0)   # h_relu: 64*100  see below
  y_pred = h_relu.dot(w2)  # h_relu: 64*100, w2: 100*10  y_pred: 64*10 
  
  # Compute and print loss
  loss = np.square(y_pred - y).sum()  # loss is l2 distance between pred and truth
  print(t, loss)   # the loss id decreasing as t grows 
  
  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.T.dot(grad_y_pred)  # h_relu.T: 100*64,  grad_y_pred: 64*10    grad_w2: 100*10
  grad_h_relu = grad_y_pred.dot(w2.T)  # grad_h_relu: 64*100
  grad_h = grad_h_relu.copy()   # grad_h: 64*100
  grad_h[h < 0] = 0
  grad_w1 = x.T.dot(grad_h)  # grad_w1: 1000*100
 
  # Update weights
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 35178433.61696746
1 32620380.320239883
2 33263417.114559308
3 31032198.911823377
4 24040336.438538853
5 15098615.766270481
6 8230408.230059323
7 4357294.872738287
8 2508483.29369117
9 1639092.9661269647
10 1198270.7507276158
11 942664.7889935717
12 773082.3105497458
13 648732.3293901454
14 551817.6736236213
15 473544.7604183501
16 409019.36447082256
17 355139.8282096833
18 309724.927521129
19 271209.67033188965
20 238350.48561374773
21 210158.8148721893
22 185872.28206784825
23 164887.90839093397
24 146655.5810265264
25 130778.03118464816
26 116902.06184717942
27 104724.50842266085
28 94013.34230981107
29 84591.16720518943
30 76260.39791630031
31 68879.68314079306
32 62323.97013481367
33 56489.5434437861
34 51280.362942743624
35 46622.9902986083
36 42484.52154533852
37 38775.7095908881
38 35437.3936772661
39 32428.189130234412
40 29709.55205353855
41 27251.69769170634
42 25025.02579047752
43 23006.261367099294
44 21172.384538535523
45 19503.067802882666
46 17982.82607923039
47 16596.

417 0.00014930652834844272
418 0.0001428133142933717
419 0.00013660359180027478
420 0.0001306653488626499
421 0.00012498740638429098
422 0.0001195540653968683
423 0.00011435789066417053
424 0.00010938920294928259
425 0.00010463641261506822
426 0.00010009133692176798
427 9.574382396674097e-05
428 9.158558545076386e-05
429 8.760928730340498e-05
430 8.380591524785286e-05
431 8.016822557356092e-05
432 7.669027254401557e-05
433 7.336278359448741e-05
434 7.017927184316342e-05
435 6.713552609798038e-05
436 6.422344995385095e-05
437 6.143797765864524e-05
438 5.8773970319902567e-05
439 5.622577828913395e-05
440 5.37891117053479e-05
441 5.145764941865652e-05
442 4.922816990001467e-05
443 4.709542390546338e-05
444 4.5055102512540674e-05
445 4.310333908873505e-05
446 4.123685002090657e-05
447 3.9450921399565346e-05
448 3.7742727350850636e-05
449 3.6108823660941996e-05
450 3.454550873437591e-05
451 3.3050459089736154e-05
452 3.1620464284172643e-05
453 3.0252395537140067e-05
454 2.8943635505906946e-

#### np.maximum and np.max    
##### &ensp; np.max：(a, axis=None, out=None, keepdims=False)     
&ensp;&ensp;求序列的最值   
&ensp;&ensp;最少接收一个参数   
&ensp;&ensp;axis：默认为列向（也即 axis=0），axis = 1 时为行方向的最值；   

##### &ensp;np.maximum：(X, Y, out=None)   
&ensp;&ensp;X 与 Y 逐位比较取其大者；   
&ensp;&ensp;最少接收两个参数

In [5]:
np.max([-2, -1, 0, 1, 2])

2

In [6]:
np.maximum([-2, -1, 0, 1, 2], 0)

array([0, 0, 0, 1, 2])

### 2. tensors
almost same, except for changing some names of operaitons   
但是暂时没法运行，问题应该是 jupyter 模式下的编译路径不同。。

In [None]:
# Code in file tensor/two_layer_net_tensor.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
  # of shape (); we can get its value as a Python number with loss.item().
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  grad_h = grad_h_relu.clone()
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

  # Update weights using gradient descent
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

### 3. Autograd     
可以自动向后求导（反向求导一句话解决）

###### before：

In [None]:
# Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  grad_h = grad_h_relu.clone()
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

###### after：

In [None]:
# Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Tensors with requires_grad=True.
  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

### 4. Defining new autograd functions

可以自己定义并且call一下，大致意思明白，具体appy函数，以及传递的参数，可以再看看|

### 5. Static Graphs

主要是和tf的比较

### 6. nn   
对于大的神经网络，autograd可能不够用    
In TensorFlow, packages like Keras, TensorFlow-Slim, and TFLearn provide higher-level abstractions over raw computational graphs that are useful for building neural networks.

In PyTorch, the nn package serves this same purpose.

In [None]:
# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())
  
  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. Each parameter is a Tensor, so
  # we can access its data and gradients like we did before.
  with torch.no_grad():
    for param in model.parameters():
      param.data -= learning_rate * param.grad

可使用多种nn中已有的层的类型函数和求loss的函数

### 7. optim
The optim package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.    
对更新权重的算法的优化，在一个包里

### 8. Custom nn Modules    
you can define your own Modules by subclassing nn.Module and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.   
可以自己定义一些复杂的模型

In [None]:
# Code in file nn/two_layer_net_module.py
import torch

class TwoLayerNet(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    """
    In the constructor we instantiate two nn.Linear modules and assign them as
    member variables.
    """
    super(TwoLayerNet, self).__init__()   # super是调用父类成员的一种方法
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    In the forward function we accept a Tensor of input data and we must return
    a Tensor of output data. We can use Modules defined in the constructor as
    well as arbitrary (differentiable) operations on Tensors.
    """
    h_relu = self.linear1(x).clamp(min=0)
    y_pred = self.linear2(h_relu)
    return y_pred

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
loss_fn = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
  # Forward pass: Compute predicted y by passing x to the model
  y_pred = model(x)

  # Compute and print loss
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero gradients, perform a backward pass, and update the weights.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

### 9. Control Flow + Weight Sharing
As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.   
可以很方便实现一个随机层数的奇怪的model