## Gradient Descent with PyTorch: defining tensors and using autograd


References: https://github.com/jcjohnson/pytorch-examples


In [None]:
import torch
from torch.autograd import Variable

In [21]:
# Train a two-layer fully connected network (linear -> ReLU -> linear) with
# hand-written gradient descent, using autograd only to obtain the gradients.

dtype = torch.FloatTensor

# N: batch size, D_in: input width, H: hidden width, D_out: output width
N, D_in, H, D_out = 64, 100, 100, 10

# Random inputs and targets. They are data, not parameters, so autograd does
# not need to track gradients for them (requires_grad stays False).
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialised weights. requires_grad_() tells autograd to record the
# operations applied to them so that loss.backward() can populate .grad.
# Since PyTorch 0.4 tensors carry autograd state themselves, so no Variable
# wrapper is needed.
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

lr = 1e-5  # gradient-descent step size
T = 500    # number of update steps

for t in range(T):

    # forward pass: clamp(min=0) is the ReLU non-linearity
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # loss: sum of squared errors over the whole batch
    loss = (y_pred - y).pow(2).sum()

    if t % 100 == 0:
        # loss is a 0-dim tensor; .item() extracts the Python number.
        # (Indexing it with [0] raises IndexError on PyTorch >= 0.5.)
        print(t, loss.item())

    # backward pass: fills .grad of every tensor created with requires_grad=True
    loss.backward()

    # The update itself must not be recorded in the graph, hence no_grad().
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad

        # backward() accumulates into .grad, so reset it after each step
        w1.grad.zero_()
        w2.grad.zero_()
    

(0, 2942884.75)
(100, 173.30369567871094)
(200, 3.2803146839141846)
(300, 0.11890672147274017)
(400, 0.005051522050052881)


### Build models using the `nn` module

In [19]:
import torch.nn as nn

# The same architecture as the hand-written network (linear -> ReLU -> linear),
# expressed with nn layers. Unlike the bare weight matrices, nn.Linear also
# holds a bias term, and the layers own and initialise their parameters.
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# Summed (not averaged) squared error, i.e. the same quantity as
# (y_pred - y).pow(2).sum(). reduction='sum' replaces the deprecated
# size_average=False spelling.
loss_fn = nn.MSELoss(reduction='sum')

In [25]:
# Plain gradient descent on the parameters of `model`, written out by hand.
# NOTE(review): this loop continues from whatever state `model` is in. The
# recorded output starts at ~1e-11, which suggests it was produced on an
# already-trained model; re-create `model` first to train from scratch.
lr = 1e-4  # step size (also read by later cells that build an optimizer)
T = 500    # number of update steps

for t in range(T):
    # forward
    y_pred = model(x)
    # loss
    loss = loss_fn(y_pred, y)

    if t % 100 == 0:
        # loss is a 0-dim tensor; .item() extracts the Python number.
        # (Indexing it with [0] raises IndexError on PyTorch >= 0.5.)
        print(t, loss.item())

    # clear gradients left over from the previous step, then recompute them
    model.zero_grad()
    loss.backward()

    # The parameter update must not be tracked by autograd.
    with torch.no_grad():
        for p in model.parameters():
            p.sub_(lr * p.grad)
    

(0, 8.97107388375673e-12)
(100, 9.212602902763933e-12)
(200, 8.818723529202543e-12)
(300, 8.562300174430604e-12)
(400, 8.671823675809875e-12)


### Optimization using the `optim` module

In [27]:
# Adam optimizer over every parameter of `model`; the step size is whatever
# the notebook-level `lr` currently holds.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [63]:
# Train `model` for 500 steps, letting the optimizer perform the updates.
for t in range(500):
    # forward
    y_pred = model(x)

    # loss
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        # loss is a 0-dim tensor; .item() extracts the Python number.
        # (Indexing it with [0] raises IndexError on PyTorch >= 0.5.)
        print(t, loss.item())

    # gradients accumulate across backward() calls, so clear them first
    optimizer.zero_grad()

    # backward
    loss.backward()

    # Calling the step function on an Optimizer makes an update
    optimizer.step()

(0, 0.001861593802459538)
(100, 2.819776454998646e-06)
(200, 8.927323463803205e-11)
(300, 1.1604898465800151e-11)
(400, 1.3015999736354367e-11)


### Dynamic Computational Graph

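Define a module whose `forward` method applies the hidden-to-hidden layer repeatedly. The number of repetitions can be chosen at run time (for example at random), because PyTorch rebuilds the computational graph on every forward pass.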

In [74]:
import random

# Define Dynamic Neural Nets via overriding nn.Module
class DN(torch.nn.Module):
    """Fully connected net that re-applies one hidden layer a variable number of times.

    The network is ``IH -> ReLU -> (HH -> ReLU) * k -> HO``. The same ``HH``
    layer (shared weights) is used for every one of the ``k`` hidden passes.

    Args:
        D_in: width of the input.
        H: width of the hidden representation.
        D_out: width of the output.
        min_passes: smallest number of hidden passes ``k`` (default 5).
        max_passes: largest number of hidden passes. ``None`` (default) means
            "same as ``min_passes``", i.e. a fixed depth. When it is larger
            than ``min_passes``, ``k`` is drawn uniformly from
            ``[min_passes, max_passes]`` on every call to ``forward``.

    Raises:
        ValueError: if not ``0 <= min_passes <= max_passes``.
    """

    def __init__(self, D_in, H, D_out, min_passes=5, max_passes=None):
        super(DN, self).__init__()
        if max_passes is None:
            max_passes = min_passes
        if not 0 <= min_passes <= max_passes:
            raise ValueError(
                "expected 0 <= min_passes <= max_passes, got %r and %r"
                % (min_passes, max_passes)
            )
        self.IH = torch.nn.Linear(D_in, H)   # input  -> hidden
        self.HH = torch.nn.Linear(H, H)      # hidden -> hidden (reused)
        self.HO = torch.nn.Linear(H, D_out)  # hidden -> output
        self.min_passes = min_passes
        self.max_passes = max_passes

    def forward(self, x):
        """Return the prediction for ``x``, building the graph for this call's depth."""
        h_relu = self.IH(x).clamp(min=0)
        if self.min_passes == self.max_passes:
            # Fixed depth: do not touch the global `random` state.
            n_passes = self.min_passes
        else:
            n_passes = random.randint(self.min_passes, self.max_passes)
        for _ in range(n_passes):
            h_relu = self.HH(h_relu).clamp(min=0)
        return self.HO(h_relu)
    
# Build a DN with a very narrow hidden layer and train it with Adam.
# NOTE: this rebinds the notebook-level names `H` and `optimizer`; cells that
# build or train the earlier `model` must be re-run from their own
# definitions afterwards.
H = 2
dn = DN(D_in, H, D_out)
# Summed squared error; reduction='sum' replaces the deprecated
# size_average=False spelling.
criterion = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(dn.parameters(), lr=lr)

for t in range(500):

    # forward
    y_pred = dn(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 0:
        # loss is a 0-dim tensor; .item() extracts the Python number.
        # (Indexing it with [0] raises IndexError on PyTorch >= 0.5.)
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

(0, 797.9113159179688)
(100, 791.23046875)
(200, 785.10205078125)
(300, 779.4408569335938)
(400, 774.1824340820312)
