# Torch Study
## Three main features of PyTorch
 - Tensor: similar to numpy.array but can run on GPUs
 - Autograd: automatic differentiation for all operations on Tensors
 - NN: the `torch.nn` package (`nn.Module`), a framework for building neural networks easily

## Materials
 - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
 - [PyTorch Examples](https://github.com/jcjohnson/pytorch-examples)

## Examples
 - 3-Layer Neural Network
   - Input Layer: 1000 neurons
   - Hidden Layer: 100 neurons
      - ReLU Activation Function
   - Output Layer: 10 neurons
- Training Data: 100 samples
   - Learning Rate: 1e-6
   - Training Iterations: 500

In [None]:
# Numpy Version
# Trains a two-weight-matrix network (linear -> ReLU -> linear) on random data
# with hand-written forward and backward passes and plain gradient descent.
import numpy as np

# Batch size, input width, hidden width, output width.
# D_out is 10 so this cell uses the same network as the other versions
# in this notebook (10 output neurons).
N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate the training data
x = np.random.randn(N, D_in)        # N, D_in
y = np.random.randn(N, D_out)       # N, D_out

# init the weights
w1 = np.random.randn(D_in, D_h)     # D_in, D_h
w2 = np.random.randn(D_h, D_out)    # D_h, D_out

learning_rate = 1e-6

for t in range(500):
    # forward inference
    h = x.dot(w1)                   # N, D_h
    h_relu = np.maximum(h, 0)       # N, D_h
    y_pred = h_relu.dot(w2)         # N, D_out

    # calculate the loss (sum of squared errors)
    loss = np.square(y_pred - y).sum()      # scalar
    print(t, loss)

    # back-propagation: apply the chain rule through each forward step
    grad_y_pred = 2.0 * (y_pred - y)        # N, D_out
    grad_w2 = h_relu.T.dot(grad_y_pred)     # D_h, D_out
    grad_h_relu = grad_y_pred.dot(w2.T)     # N, D_h
    grad_h = grad_h_relu.copy()             # N, D_h
    grad_h[h < 0] = 0                       # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)               # D_in, D_h

    # update weights (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
# Tensor Version
# Same network as the numpy cell, but built on torch tensors; gradients are
# still derived by hand.
import torch

# use the first GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# batch size, input width, hidden width, output width
N, D_in, D_h, D_out = 100, 1000, 100, 10

# random training inputs and targets
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# randomly initialised weight matrices
w1 = torch.randn(D_in, D_h, device=device)
w2 = torch.randn(D_h, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # forward pass: linear -> ReLU -> linear
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # sum of squared errors
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # backward pass, written out by hand
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_h = grad_h_relu.clone()
    grad_h.masked_fill_(h < 0, 0)   # no gradient where the ReLU input was negative
    grad_w1 = torch.mm(x.t(), grad_h)

    # gradient-descent step, applied in place
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

In [None]:
# Autograd Version
# Same network as the Tensor version, but the gradients are produced by
# autograd (loss.backward()) instead of being derived by hand.
import torch

# cpu or cuda
device = torch.device('cpu')

# batch size, input width, hidden width, output width
N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights; requires_grad=True makes autograd track operations on them
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # calculate the loss (sum of squared errors)
    loss = (y_pred - y).pow(2).sum()
    # print the plain Python number rather than the tensor repr,
    # matching the other versions in this notebook
    print(t, loss.item())

    # use autograd to do the back-propagation; fills w1.grad and w2.grad
    loss.backward()

    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads, since backward() accumulates into .grad
        w1.grad.zero_()
        w2.grad.zero_()

In [None]:
# MyReLU Version
# Same training loop as the Autograd version, with the ReLU step replaced by
# MyReLU (imported from my_relu) applied via .apply().
import torch
from my_relu import MyReLU

# cpu or cuda
device = torch.device('cpu')

# batch size, input width, hidden width, output width
N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights; requires_grad=True makes autograd track operations on them
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference, with MyReLU
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)

    # calculate the loss (sum of squared errors)
    loss = (y_pred - y).pow(2).sum()
    # print the plain Python number rather than the tensor repr,
    # matching the other versions in this notebook
    print(t, loss.item())

    # use autograd to do the back-propagation
    # gonna call MyReLU.backward()
    loss.backward()

    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads, since backward() accumulates into .grad
        w1.grad.zero_()
        w2.grad.zero_()

In [None]:
# NN Version
# Builds the network from torch.nn layers and uses torch.nn.MSELoss, while
# still applying the gradient-descent step by hand.
import torch

# cpu or cuda
device = torch.device('cpu')

# batch size, input width, hidden width, output width
N, D_in, D_h, D_out = 100, 1000, 100, 10

# random training inputs and targets
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# define the model: a sequential container that runs its layers in order
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, D_h),     # first fully connected layer: D_in -> D_h
    torch.nn.ReLU(),                # ReLU activation
    torch.nn.Linear(D_h, D_out),    # second fully connected layer: D_h -> D_out
)
model = model.to(device)

# mean-squared-error loss, summed over all elements instead of averaged
loss_fn = torch.nn.MSELoss(reduction='sum')

# training, use bigger learning rate
learning_rate = 1e-4
for t in range(500):
    # forward pass
    y_pred = model(x)

    # loss between prediction and target
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # clear old gradients, then back-propagate
    model.zero_grad()
    loss.backward()

    # gradient descent on every parameter: param = param - learning_rate * grad
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

In [1]:
# Custom Module Version
# Same training loop as the NN version, with the model supplied by
# TwoLayerNet (imported from my_nn).
import torch
from my_nn import TwoLayerNet

# cpu or cuda
device = torch.device('cpu')

# batch size, input width, hidden width, output width
N, D_in, D_h, D_out = 100, 1000, 100, 10

# random training inputs and targets
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# build the model and move it to the chosen device
model = TwoLayerNet(D_in, D_h, D_out)
model = model.to(device)

# mean-squared-error loss, summed over all elements instead of averaged
loss_fn = torch.nn.MSELoss(reduction='sum')

# training, use bigger learning rate
learning_rate = 1e-4
for t in range(500):
    # forward pass
    y_pred = model(x)

    # loss between prediction and target
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # clear old gradients, then back-propagate
    model.zero_grad()
    loss.backward()

    # gradient descent on every parameter: param = param - learning_rate * grad
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 1005.9903564453125
1 929.0047607421875
2 863.9293823242188
3 807.3326416015625
4 757.4134521484375
5 712.4595947265625
6 671.5608520507812
7 633.9004516601562
8 598.9429321289062
9 566.4400634765625
10 535.914794921875
11 507.2608337402344
12 479.9627685546875
13 454.00067138671875
14 429.159912109375
15 405.43109130859375
16 382.64862060546875
17 360.8590087890625
18 340.0908203125
19 320.2925109863281
20 301.37420654296875
21 283.4112548828125
22 266.4350891113281
23 250.31593322753906
24 235.05264282226562
25 220.65667724609375
26 207.07933044433594
27 194.26718139648438
28 182.20199584960938
29 170.85586547851562
30 160.17042541503906
31 150.13290405273438
32 140.72592163085938
33 131.90960693359375
34 123.63257598876953
35 115.87843322753906
36 108.61011505126953
37 101.78626251220703
38 95.39813995361328
39 89.419189453125
40 83.84007263183594
41 78.62237548828125
42 73.74696350097656
43 69.18798828125
44 64.92525482177734
45 60.9373664855957
46 57.20482635498047
47 53.71139907