In [6]:
# Coding examples from Learning PyTorch with Examples:  https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
# By Justin Johnson
# -*- coding: utf-8 -*-

import numpy as np
import torch

## Warm Up with Numpy

This example "use[s] numpy to fit a two-layer network to random data by manually implementing the forward and backward passes through the network using numpy operations". The downside of NumPy is that it cannot use a GPU for processing, which can often provide speedups of 50x or more. In this day and age, that just won't cut it.

This example fits a two-layer network, implemented with NumPy arrays, to random data.

In [8]:
# Fit a two-layer ReLU network to random data using NumPy only, with the
# forward and backward passes written out by hand.

# Seed NumPy's global RNG so that re-running this cell reproduces the same
# data, initial weights and loss trajectory (the seed value is arbitrary).
np.random.seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)   # inputs,  shape (N, D_in)
y = np.random.randn(N, D_out)  # targets, shape (N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)   # input  -> hidden
w2 = np.random.randn(H, D_out)  # hidden -> output

learning_rate = 1e-6
for t in range(20):
    # Forward pass: compute predicted y
    h = x.dot(w1)              # hidden pre-activations, shape (N, H)
    h_relu = np.maximum(h, 0)  # ReLU
    y_pred = h_relu.dot(w2)    # predictions, shape (N, D_out)

    # Compute and print loss (sum of squared errors over the whole batch)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)      # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)   # d(loss)/d(w2)
    grad_h_relu = grad_y_pred.dot(w2.T)   # d(loss)/d(h_relu)
    # ReLU backward: work on a copy so grad_h_relu is left intact, then zero
    # the gradient wherever the pre-activation was negative.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)             # d(loss)/d(w1)

    # Update weights with one step of plain gradient descent (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31934664.907030087
1 29262251.370950304
2 30550062.900464088
3 30775812.879441254
4 26664045.71094605
5 19078209.303244997
6 11372572.342969213
7 6163892.985219691
8 3345251.829693421
9 1972914.6740151828
10 1300782.4717671033
11 948175.8639766811
12 741833.878485787
13 606792.5871946111
14 509805.56756730226
15 435403.1724673868
16 375882.76025044604
17 326981.9962360215
18 286187.88615136244
19 251734.63199337316


## Enter, the Tensor

In PyTorch, the Tensor is the same concept as an n-dimensional array in NumPy. Just as with NumPy arrays, "Tensors do not know anything about deep learning or computational graphs or gradients; they are a generic tool for scientific computing". However, unlike NumPy arrays, Tensors can be set to run their computations on a GPU, significantly reducing their run time.

This example fits the same two-layer network to random data using Tensors on the CPU.

In [9]:
# -*- coding: utf-8 -*-

# Fit a two-layer ReLU network to random data using PyTorch Tensors, with the
# forward and backward passes written out by hand (no autograd).

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Seed PyTorch's RNG so that re-running this cell reproduces the same data,
# initial weights and loss trajectory (the seed value is arbitrary).
torch.manual_seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)   # inputs,  shape (N, D_in)
y = torch.randn(N, D_out, device=device, dtype=dtype)  # targets, shape (N, D_out)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)   # input  -> hidden
w2 = torch.randn(H, D_out, device=device, dtype=dtype)  # hidden -> output

learning_rate = 1e-6
for t in range(20):
    # Forward pass: compute predicted y
    h = x.mm(w1)             # hidden pre-activations, shape (N, H)
    h_relu = h.clamp(min=0)  # ReLU
    y_pred = h_relu.mm(w2)   # predictions, shape (N, D_out)

    # Compute and print loss (sum of squared errors over the whole batch);
    # .item() extracts the scalar as a plain Python number for printing.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)       # d(loss)/d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)   # d(loss)/d(w2)
    grad_h_relu = grad_y_pred.mm(w2.t())   # d(loss)/d(h_relu)
    # ReLU backward: work on a clone so grad_h_relu is left intact, then zero
    # the gradient wherever the pre-activation was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)             # d(loss)/d(w1)

    # Update weights using gradient descent (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29707844.0
1 24897342.0
2 22956510.0
3 20891364.0
4 17536918.0
5 13276195.0
6 9146043.0
7 5932256.0
8 3772372.25
9 2445405.0
10 1656964.125
11 1187660.0
12 898856.5
13 711649.4375
14 582671.125
15 488636.40625
16 416546.75
17 359188.5
18 312236.4375
19 273093.03125
