## PyTorch: Autograd

A PyTorch Variable is a node in a computational graph:
* `x.data` is a Tensor
* `x.grad` is a Variable of gradients (same shape as `x.data`)
* `x.grad.data` is a Tensor of gradients

In [1]:
import torch
from torch.autograd import Variable

In [3]:
# Problem sizes (they fix the shapes of the Variables created below).
N = 64        # rows of x and y
D_in = 1000   # columns of x, rows of w1
H = 100       # columns of w1, rows of w2
D_out = 10    # columns of w2 and y

# Variables expose the same API as Tensors, and additionally record how
# they were produced so that gradients can be back-propagated through them.

# Data: requires_grad=False, because we do not want gradients of the loss
# with respect to the data.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Weights: requires_grad=True, because we do want gradients of the loss
# with respect to the weights.
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out), requires_grad=True)

In [10]:
# Plain gradient descent on w1 and w2 for a fixed number of steps.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: looks exactly like the Tensor version, but every
    # operand is a Variable, so autograd records the graph.
    # x.mm(w1) -> clamp at 0 (ReLU) -> .mm(w2)
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # Loss: sum of squared differences between prediction and target.
    loss = (y_pred - y).pow(2).sum()

    # Zero the gradients left over from the previous step before calling
    # backward(). `hasattr(w, 'grad')` is always true for a Variable, and
    # `.grad` is None until the first backward() call, so test for None
    # explicitly; otherwise the first iteration fails on `None.data`.
    if w1.grad is not None:
        w1.grad.data.zero_()
    if w2.grad is not None:
        w2.grad.data.zero_()

    # Compute gradients of loss with respect to w1 and w2.
    loss.backward()

    # Gradient descent step, applied in place to the underlying Tensors.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data