In [1]:
import torch
import numpy as np

In [2]:
#With random or constant values:
#shape is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor.

shape = (2,3,)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Random Tensor: 
 tensor([[0.5572, 0.0545, 0.0020],
        [0.5545, 0.4033, 0.9121]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [3]:
#Tensor attributes 
tensor = torch.rand(3,4)
print(tensor.shape)
print(tensor.dtype)
print(tensor.device)

torch.Size([3, 4])
torch.float32
cpu


In [4]:
#Tensor Operations
# https://pytorch.org/docs/stable/torch.html

#Standard numpy-like indexing and slicing:
tensor = torch.ones(4, 4)
tensor[:,1] = 0
print(tensor)

#Join tensor
t1 = torch.cat([tensor, tensor, tensor], dim=1)
print(t1)

tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])
tensor([[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]])


In [5]:
#Multiplying tensors
# This computes the element-wise product
print(f"tensor.mul(tensor) \n {tensor.mul(tensor)} \n")
# Alternative syntax:
print(f"tensor * tensor \n {tensor * tensor}")

tensor.mul(tensor) 
 tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]]) 

tensor * tensor 
 tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])


In [6]:
#This computes the matrix multiplication between two tensors

print(f"tensor.matmul(tensor.T) \n {tensor.matmul(tensor.T)} \n")
# Alternative syntax:
print(f"tensor @ tensor.T \n {tensor @ tensor.T}")

tensor.matmul(tensor.T) 
 tensor([[3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.]]) 

tensor @ tensor.T 
 tensor([[3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.],
        [3., 3., 3., 3.]])


TORCH.AUTOGRAD

torch.autograd tracks operations on all tensors which have their requires_grad flag set to True. For tensors that don’t require gradients, setting this attribute to False excludes it from the gradient computation DAG.

The output tensor of an operation will require gradients even if only a single input tensor has requires_grad=True.

In [7]:
'''TORCH.AUTOGRAD
torch.autograd is PyTorch’s automatic differentiation engine that powers neural 
network training. In this section, you will get a conceptual understanding of 
how autograd helps a neural network train. '''

a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)


$$
Q = 3a^3 - b^2
$$

In [8]:
Q = 3*a**3 - b**2
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)

#Gradients are now deposited in a.grad and b.grad
# check if collected gradients are correct
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


Neural Network

Neural networks can be constructed using the torch.nn package.

*Now that you had a glimpse of autograd, nn depends on autograd to define models and differentiate them. An nn.Module contains layers, and a method forward(input) that returns the output.*


A typical training procedure for a neural network is as follows:

- Define the neural network that has some learnable parameters (or weights)
- Iterate over a dataset of inputs
- Process input through the network
- Compute the loss (how far is the output from being correct)
- Propagate gradients back into the network’s parameters
- Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

---

Let’s understand PyTorch through a more practical lens. Learning theory is good, but it isn’t much use if you don’t put it into practice!

A PyTorch implementation of a neural network looks exactly like a NumPy implementation. The goal of this section is to showcase the equivalent nature of PyTorch and NumPy. For this purpose, let’s create a simple three-layered network having 5 nodes in the input layer, 3 in the hidden layer, and 1 in the output layer. We will use only one training example with one row which has five features and one target.

In [9]:
#Basic Neural net
n_input , n_hidden , n_output = 5 , 3 , 1

The first step is to do parameter initialization. Here, the weights and bias parameters for each layer are initialized as the tensor variables. Tensors are the base data structures of PyTorch which are used for building different types of neural networks. They can be considered as the generalization of arrays and matrices; in other words, tensors are N-dimensional matrices

In [10]:
## initialize tensor for inputs, and outputs 
x = torch.randn((1, n_input))
y = torch.randn((1, n_output))
print(x)
print(y)
## initialize tensor variables for weights 
w1 = torch.randn(n_input, n_hidden) # weight for hidden layer
w2 = torch.randn(n_hidden, n_output) # weight for output layer
print(w1)
print(w2)
## initialize tensor variables for bias terms 
b1 = torch.randn((1, n_hidden)) # bias for hidden layer
b2 = torch.randn((1, n_output)) # bias for output layer
print(b1)
print(b2)

tensor([[ 1.3645,  1.2006, -0.1160, -1.4808, -0.3872]])
tensor([[-0.7573]])
tensor([[-0.0557,  0.3002,  1.5590],
        [ 0.1163,  0.0174,  1.3291],
        [-0.1112,  0.5747, -1.6353],
        [-2.2377, -1.3319, -1.8946],
        [ 0.2566, -0.3503,  0.9630]])
tensor([[-0.4073],
        [ 0.3621],
        [ 0.6168]])
tensor([[-0.7392, -0.5973, -0.8756]])
tensor([[-1.5617]])


After the parameter initialization step, a neural network can be defined and trained in four key steps:

- Forward Propagation
- Loss computation
- Backpropagation
- Updating the parameters

Let’s see each of these steps in a bit more detail.

***Forward Propagation:*** In this step, activations are calculated at every layer using the two steps shown below. These activations flow in the forward direction from the input layer to the output layer in order to generate the final output.

1. *z = weight * input + bias*
2. *a = activation_function (z)*

The following code blocks show how we can write these steps in PyTorch. Notice that most of the functions, such as exponential and matrix multiplication, are similar to the ones in NumPy.

In [11]:
#Sigmoid function using pytorch

def sigmoid_activation(z):
    return 1 / (1 + torch.exp(-z))

In [12]:
#activation of hidden layers
#https://pytorch.org/docs/stable/generated/torch.mm.html
z1 = torch.mm(x , w1) + b1
a1 = sigmoid_activation(z1)

print(z1)
print(a1)


tensor([[2.5516, 1.8745, 5.4696]])
tensor([[0.9277, 0.8670, 0.9958]])


In [13]:
#Activation for out put layer
z2 = torch.mm(a1 , w2) + b2
output = sigmoid_activation(z2)

print(z2)
print(output)

tensor([[-1.0115]])
tensor([[0.2667]])



**Loss Computation:** In this step, the error (also called loss) is calculated in the output layer. A simple loss function can tell the difference between the actual value and the predicted value. Later, we will look at different loss functions available in PyTorch

In [14]:
loss = y - output
print(loss)

tensor([[-1.0240]])


**Backpropagation**: The aim of this step is to minimize the error in the output layer by making marginal changes in the bias and the weights. These marginal changes are computed using the derivatives of the error term.

Based on the Calculus principle of the Chain rule, the delta changes are back passed to hidden layers where corresponding changes in their weights and bias are made. This leads to an adjustment in the weights and bias until the error is minimized.

In [15]:
#Function to calculate the derivative
def sigmoid_delta(x):
  return x*(1-x)

In [16]:
#Computes derivative of error terms
delta_output = sigmoid_delta(output)
delta_hidden =  sigmoid_delta(a1)

print(delta_output)
print(delta_hidden)

tensor([[0.1956]])
tensor([[0.0671, 0.1153, 0.0042]])


In [17]:
##backpass the changes to previous layers
d_outp = loss * delta_output
loss_h = torch.mm(d_outp, w2.t())
d_hidn = loss_h*delta_hidden

print(d_outp)
print(loss_h)
print(d_hidn)

tensor([[-0.2003]])
tensor([[ 0.0816, -0.0725, -0.1235]])
tensor([[ 0.0055, -0.0084, -0.0005]])


Updating the Parameters: Finally, the weights and bias are updated using the delta changes received from the above backpropagation step.



In [18]:
learning_rate = 0.1

In [19]:
w2 += torch.mm(a1.t(), d_outp) * learning_rate
w1 += torch.mm(x.t(), d_hidn) * learning_rate

print(w2)
print(w1)

tensor([[-0.4259],
        [ 0.3447],
        [ 0.5968]])
tensor([[-0.0549,  0.2991,  1.5590],
        [ 0.1170,  0.0164,  1.3290],
        [-0.1113,  0.5748, -1.6353],
        [-2.2385, -1.3307, -1.8945],
        [ 0.2564, -0.3500,  0.9630]])


In [20]:
b2 +=d_outp.sum()*learning_rate
b1 +=d_hidn.sum()*learning_rate
print(b2)
print(b1)

tensor([[-1.5817]])
tensor([[-0.7395, -0.5977, -0.8759]])


Finally, when these steps are executed for a number of epochs with a large number of training examples, the loss is reduced to a minimum value. The final weight and bias values are obtained which can then be used to make predictions on the unseen data

**ConvNet**

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    #1 inpuut image channel , 6 output,3x3 square Convolution kernel
    self.conv1 = nn.Conv2d(1,6,3)
    self.conv2 = nn.Conv2d(6,16,3)

    self.fc1 = nn.Linear(16 * 6 * 6 , 120)
    self.fc2 = nn.Linear(120 , 84)
    self.fc3 = nn.Linear(84 , 10)

    def forward(self,x):
      #Max pooling over (2,2) window
      x = F.max_pool2d(F.relu(self.conv1(x)),(2,2))
      x = F.max_pool2d(F.relu(self.conv2(x)),2)
      x = x.view(-1 , self.num_flat_features(x))
      x = F.relu(self.fc1(x))
      x = F.relu(self.fc2(x))
      x = self.fc3(x)
      return x
    
    def num_flat_features(self ,x):
      size = x.size()[1:]
      num_features = 1
      for s in size:
        num_features *= s
      return num_features

net = Net()
print(net)


Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [22]:
params = list(net.parameters())
print(len(params))
print(params[0].size())

10
torch.Size([6, 1, 3, 3])


Let’s try a random 32x32 input. Note: expected input size of this net (LeNet) is 32x32. To use this net on the MNIST dataset, please resize the images from the dataset to 32x32.

---

- torch.Tensor - A multi-dimensional array with support for autograd operations like backward(). Also holds the gradient w.r.t. the tensor.
- nn.Module - Neural network module. Convenient way of encapsulating parameters, with helpers for moving them to GPU, exporting, loading, etc.
- nn.Parameter - A kind of Tensor, that is automatically registered as a parameter when assigned as an attribute to a Module.
- autograd.Function - Implements forward and backward definitions of an autograd operation. Every Tensor operation creates at least a single Function node that connects to functions that created a Tensor and encodes its history.


In [23]:
'''
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss
'''

'\ninput -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d\n      -> view -> linear -> relu -> linear -> relu -> linear\n      -> MSELoss\n      -> loss\n'

Simple Neural Network
At its core, PyTorch provides two main features:

- An n-dimensional Tensor, similar to numpy but can run on GPUs
- Automatic differentiation for building and training neural networks

In [24]:
# -*- coding: utf-8 -*-
# Using Numpy
import numpy as np
import math

# Create random input and output data
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Randomly initialize weights
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 293.10181301084606
199 199.9850253756452
299 137.48055379370865
399 95.49062710676895
499 67.25855462373715
599 48.26016518234365
699 35.46400309849854
799 26.837320119857818
899 21.01600694947048
999 17.083925448170366
1099 14.425286600108127
1199 12.625828575036008
1299 11.406616679636533
1399 10.579664210261377
1499 10.018159061343574
1599 9.636473420996552
1699 9.376731011666974
1799 9.19977311959191
1899 9.079077630996775
1999 8.99666209842717
Result: y = -0.010380849961281679 + 0.8478679324111694 x + 0.0017908693564100595 x^2 + -0.09206829476482772 x^3


PyTorch: Tensors


Numpy is a great framework, but it cannot utilize GPUs to accelerate its numerical computations. For modern deep neural networks, GPUs often provide speedups of 50x or greater, so unfortunately numpy won’t be enough for modern deep learning.

Here we introduce the most fundamental PyTorch concept: the Tensor. A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Behind the scenes, Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.

Also unlike numpy, PyTorch Tensors can utilize GPUs to accelerate their numeric computations. To run a PyTorch Tensor on GPU, you simply need to specify the correct device.

In [25]:
# -*- coding: utf-8 -*-

import torch
import math


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 902.4918823242188
199 614.76318359375
299 420.1158752441406
399 288.30194091796875
499 198.94354248046875
599 138.30067443847656
699 97.1005859375
799 69.07864379882812
899 49.998409271240234
999 36.99180603027344
1099 28.115312576293945
1199 22.05055809020996
1299 17.902122497558594
1399 15.061223030090332
1499 13.113500595092773
1599 11.776597023010254
1699 10.857935905456543
1799 10.225961685180664
1899 9.790727615356445
1999 9.490652084350586
Result: y = 0.02253175526857376 + 0.8711710572242737 x + -0.003887103172019124 x^2 + -0.09538295865058899 x^3


Autograd

In the above examples, we had to manually implement both the forward and backward passes of our neural network. Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.

Thankfully, we can use automatic differentiation to automate the computation of backward passes in neural networks. The autograd package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

In [26]:
# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 172.9054718017578
199 120.36891174316406
299 84.73808288574219
399 60.54649353027344
499 44.10340118408203
599 32.91468048095703
699 25.29262924194336
799 20.094390869140625
899 16.54515266418457
999 14.119000434875488
1099 12.458662033081055
1199 11.321118354797363
1299 10.5408296585083
1399 10.005022048950195
1499 9.636653900146484
1599 9.383135795593262
1699 9.208451271057129
1799 9.087963104248047
1899 9.004761695861816
1999 8.947249412536621
Result: y = -0.010156458243727684 + 0.862734317779541 x + 0.0017521579284220934 x^2 + -0.09418290853500366 x^3
