### Deep Learning Tool:

- A replacement for NumPy to use the power of GPUs
- Speed
- The biggest difference between TensorFlow and PyTorch is that TensorFlow’s computational graphs are static, whereas PyTorch uses dynamic computational graphs.


In [17]:
# Tensors: allocate a 5x3 tensor without initialising its contents, so the
# printed values are arbitrary leftovers rather than meaningful numbers.
import torch

x = torch.empty((5, 3))
print(x)

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 1.4630e-42, 0.0000e+00],
        [0.0000e+00, 3.1786e-01, 0.0000e+00]])


In [18]:
# A 5x3 tensor filled with random samples.
x = torch.rand((5, 3))
print(x)

tensor([[0.4510, 0.3780, 0.0409],
        [0.1885, 0.5373, 0.3761],
        [0.4619, 0.6069, 0.8293],
        [0.0044, 0.6458, 0.7222],
        [0.0360, 0.1425, 0.3986]])


In [19]:
# A 5x3 tensor of zeros stored as 64-bit integers (torch.long).
x = torch.zeros((5, 3), dtype=torch.long)

In [20]:
# Display the integer zero tensor built in the previous cell.
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [21]:
# Rebuild the 5x3 integer (torch.long) zero tensor and display it.
x = torch.zeros((5, 3), dtype=torch.long)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [22]:
# Build a tensor directly from a Python list of values.
y=torch.tensor([5.5,3])
# NOTE(review): this prints x (still the 5x3 zero tensor from the cell above,
# as the recorded output shows), not the newly created y -- confirm whether
# print(y) was intended.
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [25]:
# Report the shape of x as a torch.Size.
print(x.size())

torch.Size([5, 3])


In [27]:
# Addition, operator form: element-wise sum of two random 5x3 tensors.
r = torch.rand((5, 3))
y = torch.rand((5, 3))
print(r + y)

tensor([[1.6834, 0.1664, 1.2262],
        [0.2617, 1.0652, 0.9610],
        [0.4439, 1.0061, 1.1181],
        [0.6359, 0.6359, 1.2315],
        [1.2801, 0.8869, 0.4285]])


In [29]:
# Addition, functional form: the sum of r and y is written into a
# pre-allocated output tensor passed through the `out` argument.
result = torch.empty((5, 3))
torch.add(r, y, out=result)
print(result)

tensor([[1.6834, 0.1664, 1.2262],
        [0.2617, 1.0652, 0.9610],
        [0.4439, 1.0061, 1.1181],
        [0.6359, 0.6359, 1.2315],
        [1.2801, 0.8869, 0.4285]])


In [31]:
# Addition, in-place form: add_ stores r + y back into y itself, which is why
# the printed y matches the sums shown by the two previous cells.
y.add_(r)
print(y)

tensor([[1.6834, 0.1664, 1.2262],
        [0.2617, 1.0652, 0.9610],
        [0.4439, 1.0061, 1.1181],
        [0.6359, 0.6359, 1.2315],
        [1.2801, 0.8869, 0.4285]])


In [35]:
# Resize
# NOTE(review): y holds 5 * 3 = 15 elements, so a 16-element view cannot be
# formed and this call raises the RuntimeError recorded below. A shape whose
# element count is 15, such as y.view(15) or y.view(-1, 5), would be accepted.
y.view(16)

RuntimeError: shape '[16]' is invalid for input of size 15

In [40]:
# Converting a Torch tensor to a NumPy array.
# Step 1: start from a one-dimensional tensor holding five ones.
a = torch.ones((5,))
a

tensor([1., 1., 1., 1., 1.])

In [43]:
# Step 2: obtain the tensor's contents as a NumPy array (float32, as the
# displayed result shows).
b=a.numpy()
b

array([1., 1., 1., 1., 1.], dtype=float32)

In [44]:
# Converting a NumPy array to a Torch tensor.
import numpy as np

a = np.ones(5)
b = torch.from_numpy(a)

# Increment the array in place; the tensor printed afterwards shows the same
# updated values, i.e. b reflects the change made to a.
a += 1
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [49]:
# CUDA tensors: this cell only does any work when PyTorch can see a GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")              # handle for the CUDA device
    x = x.to(device)                           # move the existing tensor (``.to("cuda")`` also works)
    y = torch.ones_like(x, device=device)      # create a tensor of ones directly on the GPU
    z = x + y
    print(z)
    # ``.to`` can change device and dtype in a single call.
    print(z.to(device="cpu", dtype=torch.double))

tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]], device='cuda:0')
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)


## AUTOGRAD: AUTOMATIC DIFFERENTIATION
- The autograd package provides automatic differentiation for all operations on Tensors. It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.

In [50]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
# No category or module filter is passed, so every warning raised after this
# point is suppressed.
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

## NEURAL NETWORKS
- Neural networks can be constructed using the torch.nn package.
- Now that you have had a glimpse of autograd: nn depends on autograd to define models and differentiate them. An nn.Module contains layers, and a method forward(input) that returns the output.

- A typical training procedure for a neural network is as follows:

- Define the neural network that has some learnable parameters (or weights)
- Iterate over a dataset of inputs
- Process input through the network
- Compute the loss (how far is the output from being correct)
- Propagate gradients back into the network’s parameters
- Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional network with 10 outputs.

    Two 3x3 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. The first fully connected layer expects
    16 * 6 * 6 = 576 flattened features per sample.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutions: 1 input channel -> 6 feature maps -> 16 feature maps,
        # both with square 3x3 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
        # Affine layers (y = Wx + b): 576 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the fc3 outputs."""
        # Stage 1: convolution, ReLU, then max pooling over a (2, 2) window.
        activated = F.relu(self.conv1(x))
        x = F.max_pool2d(activated, kernel_size=(2, 2))
        # Stage 2: same pattern; a square window can be given as one number.
        activated = F.relu(self.conv2(x))
        x = F.max_pool2d(activated, kernel_size=2)
        # Flatten everything except the batch dimension for the linear layers.
        flat = x.view(-1, self.num_flat_features(x))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        count = 1
        for extent in x.size()[1:]:  # skip the leading (batch) dimension
            count *= extent
        return count


# Instantiate the network and print its layer-by-layer summary.
net = Net()
print(net)


Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## LEARNING PYTORCH WITH EXAMPLES

In [53]:
# First with numpy
# -*- coding: utf-8 -*-
# A two-layer ReLU network fitted to random data, with the forward and
# backward passes written out by hand in NumPy.
import numpy as np

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for both layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported on every iteration.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w2, then w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # No gradient flows through units whose pre-activation was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28488330.17273628
1 23160107.485640302
2 21440473.825084373
3 20170631.96667608
4 17907018.94185795
5 14371937.665065382
6 10424199.277712353
7 6957917.080656442
8 4455364.230568311
9 2839839.644624836
10 1863542.3076728838
11 1281068.2590723503
12 929178.9098681512
13 707951.3410113093
14 561584.0934746913
15 459170.6687467587
16 383707.1668537024
17 325623.7394783383
18 279315.322160861
19 241519.33996148966
20 210147.97241601805
21 183732.84776705957
22 161284.31619827184
23 142088.96632008257
24 125606.85944487911
25 111383.27384966268
26 99036.72979089525
27 88269.73363597243
28 78856.16408410935
29 70601.47571865082
30 63341.99872060927
31 56938.75880459038
32 51279.29797709978
33 46265.75816106407
34 41814.67339789143
35 37855.05094837535
36 34325.357555395196
37 31175.76473511497
38 28356.539473770958
39 25827.84459744816
40 23556.60822320757
41 21514.120543008677
42 19673.441152657135
43 18012.04123963671
44 16510.52886747797
45 15151.671073780893
46 13920.539135400777
47 12

360 0.0025390312758252326
361 0.0024332215686139642
362 0.0023318782210032356
363 0.002234788392053818
364 0.002141776423947085
365 0.002052679074264272
366 0.001967323418787636
367 0.001885552007115511
368 0.001807216444319872
369 0.0017321810864582374
370 0.0016602807877082895
371 0.00159139297026871
372 0.001525395282014003
373 0.001462192847615378
374 0.0014016077115504707
375 0.0013435588214515952
376 0.0012879459996306855
377 0.0012346493121920897
378 0.0011835797124959586
379 0.001134646114904999
380 0.0010877553426372488
381 0.0010428212791113254
382 0.0009997669780462998
383 0.000958509429788452
384 0.0009189652737653444
385 0.0008810691301015428
386 0.0008447542779769586
387 0.0008099519242676911
388 0.0007766103404227148
389 0.0007446438528237425
390 0.0007140058966166885
391 0.0006846380103365884
392 0.0006564914596118989
393 0.00062951388202236
394 0.0006036547717253509
395 0.0005788682425347252
396 0.0005551143688681527
397 0.0005323408326354943
398 0.0005105108836353511


## PyTorch: Tensors
- Numpy is a great framework, but it cannot utilize GPUs to accelerate its numerical computations. For modern deep neural networks, GPUs often provide speedups of 50x or greater, so unfortunately numpy won’t be enough for modern deep learning.
- Unlike numpy, PyTorch Tensors can utilize GPUs to accelerate their numeric computations. To run a PyTorch Tensor on a GPU, you simply need to place it on a CUDA device, for example by creating it with device=torch.device("cuda:0") or by calling .to("cuda") on it.

In [54]:
# -*- coding: utf-8 -*-
# The same two-layer ReLU network, now on PyTorch tensors, still with the
# forward and backward passes written out by hand.

import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random starting weights for both layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a Python float, reported every 100th step.
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()
    if t % 100 == 99:
        print(t, loss)

    # Backward pass: gradients of the loss with respect to w2, then w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # No gradient flows through units whose pre-activation was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 520.3456420898438
199 2.1759235858917236
299 0.015723928809165955
399 0.00035596173256635666
499 5.817860801471397e-05


## PyTorch: Tensors and autograd

- In the above examples, we had to manually implement both the forward and backward passes of our neural network. Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.

- Thankfully, we can use automatic differentiation to automate the computation of backward passes in neural networks. The autograd package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

In [55]:
# -*- coding: utf-8 -*-
# The two-layer ReLU network once more, with autograd computing the backward
# pass instead of hand-written gradient formulas.
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are created without requires_grad, so no gradients are
# computed with respect to them during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# The weights are created with requires_grad=True, so backward() will
# populate w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. The intermediate values need no
    # names of their own because the backward pass is not written by hand.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squares loss. `loss` is a zero-dimensional Tensor; loss.item()
    # extracts the Python scalar it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    if t % 100 == 99:
        print(t, loss.item())

    # Let autograd compute the gradient of loss with respect to every Tensor
    # that has requires_grad=True; the results land in w1.grad and w2.grad.
    loss.backward()

    # Gradient-descent step. The in-place updates run under torch.no_grad()
    # so that they are not themselves recorded by autograd. (torch.optim.SGD
    # is the packaged alternative to doing this by hand.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the gradients after the update so that the next backward()
        # call starts from zero.
        w1.grad.zero_()
        w2.grad.zero_()

99 710.630615234375
199 3.054933547973633
299 0.020029891282320023
399 0.0003448449424467981
499 4.533149331109598e-05


## PyTorch: Defining new autograd functions

- In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing Tensors containing input data.

- In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network: