# Torch Study
## Three main features of PyTorch
 - Tensor: similar to numpy.array but can run on GPUs
 - Autograd: automatic differentiation for all operations on Tensors
 - NN: nn.module, framework to build neural network easily

## Materials
 - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
 - [PyTorch Examples](https://github.com/jcjohnson/pytorch-examples)

## Examples
 - 3 Layers Neural Network
   - Input Layer: 1000 neurons
   - Hidden Layer: 100 neurons
      - ReLU Activation Function
   - Output Layer: 10 neurons
- Training Data: 100 samples
   - Learning Rate: 1e-6
   - Training Iterations: 500

In [1]:
# Numpy Version
import numpy as np

N, D_in, D_h, D_out = 100, 1000, 100, 1

# generate the training data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# init the weights
w1 = np.random.randn(D_in, D_h)
w2 = np.random.randn(D_h, D_out)

learning_rate = 1e-6

for t in range(500):
    # forward inference
    h = x.dot(w1)                                                           # N, D_h
    h_relu = np.maximum(h, 0)                                   # N, D_h
    y_pred = h_relu.dot(w2)                                         # N, D_out

    # calculate the loss
    loss = np.square(y_pred - y).sum()                      # scalar
    print(t, loss)

    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)                                        # N, D_out
    grad_w2 = h_relu.T.dot(grad_y_pred)                                 # D_h, D_out
    grad_h_relu = grad_y_pred.dot(w2.T)                                 # N, D_h
    grad_h = grad_h_relu.copy()                                             # N, D_h
    grad_h[h < 0] = 0                                                               # N, D_h
    grad_w1 = x.T.dot(grad_h)                                               # D_in, D_h

    # update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 7786461.329196663
1 31653739.27209103
2 174139653.20773974
3 748611884.9003179
4 660017659.67774
5 1409401.1021944047
6 1094314.3589107117
7 861348.2275402917
8 686124.1930547106
9 552430.196416511
10 449261.62310414284
11 368991.71385069535
12 306016.8931984958
13 256270.6603296833
14 216707.94649342413
15 185036.1500692812
16 159532.07268017298
17 138888.4788328411
18 122055.51839409303
19 108245.90365259226
20 96853.06345443486
21 87369.27764761692
22 79414.02051803091
23 72689.07021809494
24 66959.97876360305
25 62040.055094904135
26 57779.91969219708
27 54061.90806452672
28 50790.89877960367
29 47892.04253141788
30 45305.86945023097
31 42981.5948964014
32 40878.544284637544
33 38964.76887245237
34 37214.02403589538
35 35604.03251395711
36 34117.11497562927
37 32738.61810156912
38 31455.84771363692
39 30258.287984807408
40 29136.62211627128
41 28083.65250147671
42 27093.315229974425
43 26159.067981556687
44 25276.00210460947
45 24440.0222138235
46 23647.367466968175
47 22894.6803

In [1]:
# Tensor Version
import torch

# cpu or cuda
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate the training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device)
w2 = torch.randn(D_h, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # forward inference
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 49954264.0
1 125549504.0
2 495385408.0
3 767501824.0
4 15511727.0
5 9328604.0
6 6337955.0
7 4604965.0
8 3498550.0
9 2746234.0
10 2212082.0
11 1820547.25
12 1525997.625
13 1299669.5
14 1122867.5
15 982837.0625
16 870294.9375
17 778595.75
18 702857.9375
19 639709.5625
20 586447.0625
21 541004.0625
22 501734.375
23 467522.0
24 437451.0625
25 410769.6875
26 386917.875
27 365490.1875
28 346132.1875
29 328551.21875
30 312489.9375
31 297740.15625
32 284136.90625
33 271539.75
34 259829.8125
35 248911.09375
36 238714.609375
37 229162.71875
38 220210.796875
39 211790.296875
40 203855.09375
41 196367.96875
42 189292.078125
43 182593.71875
44 176248.203125
45 170229.734375
46 164520.1875
47 159102.53125
48 153959.5
49 149060.171875
50 144387.203125
51 139928.796875
52 135662.375
53 131582.46875
54 127685.3359375
55 123967.015625
56 120404.5859375
57 116995.609375
58 113724.9453125
59 110587.296875
60 107573.4375
61 104681.515625
62 101906.1328125
63 99235.0234375
64 96667.15625
65 94195.390625
6

In [2]:
# Autograd Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # use autograd to do the back-propagation
    loss.backward()

    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(40031508., grad_fn=<SumBackward0>)
1 tensor(71834512., grad_fn=<SumBackward0>)
2 tensor(3.2308e+08, grad_fn=<SumBackward0>)
3 tensor(1.0019e+09, grad_fn=<SumBackward0>)
4 tensor(1.6766e+08, grad_fn=<SumBackward0>)
5 tensor(26556596., grad_fn=<SumBackward0>)
6 tensor(15460080., grad_fn=<SumBackward0>)
7 tensor(10649197., grad_fn=<SumBackward0>)
8 tensor(7943660., grad_fn=<SumBackward0>)
9 tensor(6229142., grad_fn=<SumBackward0>)
10 tensor(5051521.5000, grad_fn=<SumBackward0>)
11 tensor(4200188., grad_fn=<SumBackward0>)
12 tensor(3566603.5000, grad_fn=<SumBackward0>)
13 tensor(3075790., grad_fn=<SumBackward0>)
14 tensor(2683527.5000, grad_fn=<SumBackward0>)
15 tensor(2365857.2500, grad_fn=<SumBackward0>)
16 tensor(2105503.7500, grad_fn=<SumBackward0>)
17 tensor(1887601.5000, grad_fn=<SumBackward0>)
18 tensor(1703584.8750, grad_fn=<SumBackward0>)
19 tensor(1547180.7500, grad_fn=<SumBackward0>)
20 tensor(1411691.7500, grad_fn=<SumBackward0>)
21 tensor(1293940.6250, grad_fn=<SumBac