PyTorch Introduction

Part 1: Simple 2-layer Neural Network with Numpy

In [0]:
import numpy as np

In [0]:
M, D_in, H, D_out = 64,1000,100,10
# M is batch size, H is hidden layer
# D_in is input dimensions, D_out is output dimensions

In [0]:
# Creating random input data
x = np.random.randn(M, D_in)
y = np.random.randn(M, D_out)

In [0]:
# Random weight initialization
w1 = np.random.randn(D_in, H)
w2 = np.random.rand(H, D_out)

In [0]:
learning_rate = 1e-6
for t in range(500):
  # Forward Pass
  h = x.dot(w1)
  h_relu = np.maximum(h, 0) #ReLU activation
  y_pred = h_relu.dot(w2)
  
  # Loss Calculation
  loss = np.square(y_pred - y).sum()
  print(t, loss)
  
  # Backpropogation for Gradient Calculation
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.T.dot(grad_y_pred)
  grad_h_relu = grad_y_pred.dot(w2.T)
  grad_h = grad_h_relu.copy()
  grad_h[h<0] = 0
  grad_w1 = x.T.dot(grad_h)
  
  #Gradient Update
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2
  

0 265602416.49789023
1 194876675.06837878
2 6579227.193580503
3 1204930.9571036925
4 603671.5639759166
5 481836.7977267602
6 422687.194061568
7 377274.0888024503
8 338816.6012423005
9 305642.3792991593
10 276841.62182270095
11 251721.1281765333
12 229716.1914972036
13 210360.61928955244
14 193268.76395770826
15 178117.8938230524
16 164637.4459833201
17 152600.04246042413
18 141814.41916093134
19 132118.65555273567
20 123375.16567518748
21 115466.73500257412
22 108291.16170426011
23 101762.43800167326
24 95806.12319514062
25 90357.48547177091
26 85361.23775593509
27 80768.77119314129
28 76537.70638768845
29 72631.20313609758
30 69016.76350415945
31 65666.343943712
32 62554.39715881121
33 59658.44429438087
34 56958.76951660212
35 54437.87197296563
36 52080.383517153175
37 49872.10918468415
38 47800.47152893711
39 45854.37188482243
40 44023.72820850794
41 42299.529581347844
42 40673.520754304445
43 39138.363853390125
44 37687.4516540474
45 36314.55891046632
46 35014.08121449092
47 33780.8

Part 2: Using Pytorch( Autograg, torch.nn and optim)

In [0]:
import torch

In [0]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU


In [0]:
N, D_in, H, D_out = 64,1000,100,10
# M is batch size, H is hidden layer
# D_in is input dimensions, D_out is output dimensions

In [0]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [0]:
model = torch.nn.Sequential(
  torch.nn.Linear(D_in, H),
  torch.nn.ReLU(),
  torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction = 'sum')


In [0]:
learning_rate = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
  y_pred = model(x)
  
  # Compute and print Loss
  loss = loss_fn(y_pred, y)
  print(t, loss.item())
  
  optimizer.zero_grad() # Zero all gradients
  loss.backward() # Backward Pass
  optimizer.step() # Update Parameters

0 672.6079711914062
1 672.434814453125
2 672.26171875
3 672.088623046875
4 671.9155883789062
5 671.7425537109375
6 671.5695190429688
7 671.3971557617188
8 671.224853515625
9 671.052734375
10 670.880615234375
11 670.7085571289062
12 670.5365600585938
13 670.3645629882812
14 670.192626953125
15 670.0206909179688
16 669.848876953125
17 669.6771240234375
18 669.5054321289062
19 669.333740234375
20 669.162109375
21 668.990478515625
22 668.8189086914062
23 668.6473388671875
24 668.47607421875
25 668.3048095703125
26 668.1334838867188
27 667.962158203125
28 667.7908325195312
29 667.61962890625
30 667.4483642578125
31 667.2772216796875
32 667.1060791015625
33 666.9349365234375
34 666.763916015625
35 666.5928344726562
36 666.4217529296875
37 666.2507934570312
38 666.0800170898438
39 665.9092407226562
40 665.738525390625
41 665.5678100585938
42 665.3970947265625
43 665.2264404296875
44 665.0558471679688
45 664.88525390625
46 664.7147216796875
47 664.5442504882812
48 664.373779296875
49 664.20336