<a href="https://colab.research.google.com/github/SarthakNarayan/DL-and-ML/blob/master/googlecolab/Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
def sigmoid(x):
    activation = 1/(1 + torch.exp(-x))
    return activation

In [0]:
torch.manual_seed(23)
weights = torch.randn((5,1))
print(weights)
print(weights.shape)
weights = torch.reshape(weights , (1,5))
print("sum of all the weights {}".format(torch.sum(weights)))
print("sum of all the weights {}".format(weights.sum()))
features = torch.randn((5,1))
bias = torch.randn((1,1))
# More advisable to use mm since it is strict about the shapes whereas matmul supports broadcasting
y_hat = torch.add(torch.mm(weights,features),bias)
print("y_hat before activation",y_hat)
y_hat = sigmoid(y_hat)
print("y_hat after activation",y_hat)
print("so we see sigmoid converts any value to between 1 and 0")

tensor([[-0.8733],
        [ 0.4376],
        [-0.4866],
        [-0.7840],
        [-0.2983]])
torch.Size([5, 1])
sum of all the weights -2.004563570022583
sum of all the weights -2.004563570022583
y_hat before activation tensor([[1.2492]])
y_hat after activation tensor([[0.7772]])
so we see sigmoid converts any value to between 1 and 0


In [0]:
# normal * does element wise multiplication
print(torch.tensor([1,2,3])*torch.tensor([1,2,3]))
# torch.dot performs element wise multiplication and sums them
print(torch.dot(torch.tensor([1,2,3]),torch.tensor([1,2,3])))
a = torch.tensor([[1,2,3] , [1,2,3]])
print("Sum along the columns {}".format(torch.sum(a , dim = 0).numpy()))
print("Sum along the rows {}".format(torch.sum(a , dim = 1).numpy()))

tensor([1, 4, 9])
tensor(14)
Sum along the columns [2 4 6]
Sum along the rows [6 6]


In [0]:
a = torch.tensor([[1,2,3],[1,2,3]])
print(a)
print(a[0][0].item())
# .item() only works for a scalar value and not any array
# so a[0].item() will also not work

tensor([[1, 2, 3],
        [1, 2, 3]])
1


In [0]:
# multilayer perceptron
torch.manual_seed(23)
# w*x + b
def multilayer_perceptron(no_of_features,no_of_hidden_units,no_output_nodes):
    x = torch.randn((no_of_features , 1))
    weights = torch.randn((no_of_hidden_units , no_of_features))
    bias = torch.randn((no_of_hidden_units , 1))
    h12 = torch.add(torch.mm(weights,x),bias)
    print(h12)
    h12 = sigmoid(h12)
    print(h12)
    weights = torch.randn((no_output_nodes , h12.shape[0]))
    bias = torch.randn((no_output_nodes , 1))
    h = torch.add(torch.mm(weights,h12),bias)
    print(h)
    y_hat = sigmoid(h)
    print(y_hat)

multilayer_perceptron(2,3,1)

tensor([[ 0.2699],
        [-1.5522],
        [ 2.2264]])
tensor([[0.5671],
        [0.1748],
        [0.9026]])
tensor([[-1.2120]])
tensor([[0.2293]])


In [0]:
# bridging numpy array with pytorch
import numpy as np
a = np.random.randn(1,2)
print(a)
b = torch.from_numpy(a)
print(b)
print("Back to numpy")
print(b.numpy())

[[ 0.22598667 -0.71195098]]
tensor([[ 0.2260, -0.7120]], dtype=torch.float64)
Back to numpy
[[ 0.22598667 -0.71195098]]


In [0]:
# Using mnist data on my perceptron network
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor()])

# to get the test set you set train = False
trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.MNIST(root='./data', train=False,
                                       download=True, transform=transform)

In [0]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=True)

In [114]:
images , labels = next(iter(testloader))
print(images.shape)
images = images.reshape((64,-1))
print(images.shape)
torch.manual_seed(23)

# x*w + b
def multilayer_for_mnist(features,no_of_hidden_units,no_of_outputs):
    x = features
    w1 = torch.randn((features.shape[1] , no_of_hidden_units))
    b1 = torch.randn((features.shape[0] , no_of_hidden_units))
    h12 = torch.add(torch.matmul(x,w1) , b1)
    h12 = sigmoid(h12)
    w2 = torch.randn((h12.shape[1] , no_of_outputs))
    b2 = torch.randn((h12.shape[0] , no_of_outputs))
    h = torch.add(torch.matmul(h12,w2) , b2)
    y_hat = sigmoid(h)
    print(y_hat)
    print(y_hat.shape)
    return y_hat

prediction = multilayer_for_mnist(images , 256 , 10)

torch.Size([64, 1, 28, 28])
torch.Size([64, 784])
tensor([[1.0140e-06, 1.6277e-03, 1.7026e-08, 9.8876e-01, 9.9998e-01, 1.0787e-03,
         1.8629e-03, 4.0380e-02, 1.0000e+00, 3.9304e-03],
        [2.5650e-08, 7.9059e-01, 3.6959e-02, 9.9790e-01, 9.7231e-01, 6.0422e-03,
         3.6543e-02, 1.5973e-04, 9.9998e-01, 2.9609e-06],
        [3.0122e-08, 2.8184e-02, 3.5271e-03, 8.3564e-01, 9.2835e-02, 9.7794e-06,
         1.4428e-02, 1.1955e-03, 9.9934e-01, 1.4494e-06],
        [8.5594e-05, 1.5545e-03, 1.6076e-02, 1.0348e-02, 9.6728e-01, 2.2726e-06,
         1.2918e-04, 2.7442e-07, 1.0000e+00, 9.5435e-02],
        [5.0165e-09, 1.0841e-03, 2.7352e-04, 9.9992e-01, 9.9986e-01, 9.4051e-04,
         1.0000e+00, 1.7940e-08, 9.9997e-01, 9.3537e-03],
        [5.3375e-06, 2.4105e-02, 2.1962e-04, 3.9990e-01, 9.9632e-01, 5.2315e-01,
         1.0000e+00, 8.4743e-12, 1.0000e+00, 2.4772e-03],
        [1.5910e-05, 3.1016e-08, 3.9229e-07, 9.2637e-01, 9.9994e-01, 6.8571e-04,
         9.3886e-01, 3.6040e-06, 9.

In [115]:
def softmax(x):
    x = torch.exp(x)
    values = []
    print(x.shape)
    for i in range(x.shape[0]):
    x[i] = x[i]/torch.sum(x[i] , dim = 0)
    print(x.shape)
    return x

pred = softmax(prediction)
print(pred)
print(pred[0].sum(dim = 0))

torch.Size([64, 10])
torch.Size([64, 10])
tensor([[0.0659, 0.0660, 0.0659, 0.1771, 0.1791, 0.0660, 0.0660, 0.0686, 0.1791,
         0.0662],
        [0.0611, 0.1348, 0.0634, 0.1658, 0.1616, 0.0615, 0.0634, 0.0611, 0.1661,
         0.0611],
        [0.0759, 0.0781, 0.0762, 0.1751, 0.0833, 0.0759, 0.0770, 0.0760, 0.2063,
         0.0759],
        [0.0742, 0.0743, 0.0754, 0.0750, 0.1952, 0.0742, 0.0742, 0.0742, 0.2017,
         0.0816],
        [0.0592, 0.0593, 0.0592, 0.1610, 0.1610, 0.0593, 0.1610, 0.0592, 0.1610,
         0.0598],
        [0.0612, 0.0627, 0.0612, 0.0912, 0.1656, 0.1032, 0.1662, 0.0612, 0.1662,
         0.0613],
        [0.0605, 0.0605, 0.0605, 0.1528, 0.1645, 0.0606, 0.1547, 0.0605, 0.1619,
         0.0634],
        [0.0627, 0.1631, 0.0627, 0.0704, 0.1318, 0.0627, 0.0627, 0.0627, 0.1703,
         0.1509],
        [0.0586, 0.1587, 0.0623, 0.1592, 0.1587, 0.0586, 0.0586, 0.0586, 0.1592,
         0.0676],
        [0.0960, 0.0676, 0.0631, 0.1716, 0.1701, 0.0632, 0.0682, 0.

In pytorch it is a convention to assign criterion = nn.loss() class. <br/>
Eg: - criterion = nn.CrossEntropyLoss()<br/>
So the expected input to these loss function is the logits or the scores and not the softmax probablities.
Eg: given below

In [0]:
# New way of creating a sequential model
import torch.nn as nn
import torch.nn.functional as F
model = nn.Sequential(nn.Linear(784 , 128),
                   nn.ReLU(),
                   nn.Linear(128 , 64),
                   nn.ReLU(),
                   nn.Linear(64,10))

images , labels = next(iter(trainloader))
images = images.reshape(images.shape[0] , -1)
criterion = nn.CrossEntropyLoss()
logits = model(images)
# so we see we are passing logits i.e. original values rather than the softmax probabilities
loss = criterion(logits , labels)
print(loss)

tensor(2.3073, grad_fn=<NllLossBackward>)


Pytorch has this really great class named autograd which keeps track of the tensor operations performed by us and when you tell it to do a backwards pass it will go backwards through each of these operations and calculate gradients wrt the input parameters.<br/>
In general we need to tell pytorch that we want to use auto grad on a specific tensor.
Eg: - 

In [0]:
a = torch.tensor([1,2,3] , requires_grad=True , dtype = torch.float64)
print(a)
# this will tell pytorch to track the operations of this tensor and it can compute its gradient whenever needed.
# you can also do it using
with torch.no_grad():
    b = torch.tensor([1,2,3] , dtype = torch.float64)
print(b.requires_grad)

# you can also do it globally for all the variables using
# torch.set_grad_enabled(True)

tensor([1., 2., 3.], dtype=torch.float64, requires_grad=True)
False


In [0]:
# Using autograd to compute the gradients
# we just do a variable.backward() if we want to compute its graident
a = torch.tensor([1,2,3] , requires_grad=True , dtype = torch.float64)
y = (a ** 2).sum(dim = 0)
# we have to do sum because we can perform backward pass only on a scalar value
print(y)
print("Gradient without performing the backward pass {}".format(a.grad))
y.backward()
print("Gradient after performing the backward pass {}".format(a.grad.numpy()))

tensor(14., dtype=torch.float64, grad_fn=<SumBackward2>)
Gradient without performing the backward pass None
Gradient after performing the backward pass [2. 4. 6.]


Once we have our gradients we need optimizers to update the weights by using the gradients.<br/>
We need to clear the gradients because pytorch accumulates gradients and we do it using
**optimizer.zero_grad() before every training process**.<br/>
A step with the optimizer updates the weights.

In [0]:
# incase of neural networks pytorch automatically computes the gradient of weights by using autograd to note their computations
print("Gradients of the weights before backward pass {}".format(model[0].weight.grad))
loss.backward()
print("Gradients of the weights after backward pass {}".format(model[0].weight.grad))

Gradients of the weights before backward pass None
Gradients of the weights after backward pass tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [0]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters() , lr = 1e-3)
optimizer.zero_grad()
print("weights before stepping" , model[0].weight)
optimizer.step()
print("Weights after stepping" , model[0].weight)
# not much of a difference since our graident was different

weights before stepping Parameter containing:
tensor([[-0.0308, -0.0258,  0.0248,  ..., -0.0216,  0.0269, -0.0173],
        [ 0.0106, -0.0249, -0.0019,  ..., -0.0135, -0.0078, -0.0038],
        [-0.0121, -0.0326,  0.0062,  ...,  0.0217, -0.0090,  0.0092],
        ...,
        [-0.0030, -0.0177, -0.0318,  ...,  0.0315, -0.0259,  0.0139],
        [-0.0119,  0.0120,  0.0171,  ..., -0.0354, -0.0159, -0.0330],
        [ 0.0293,  0.0183, -0.0165,  ..., -0.0191,  0.0130,  0.0084]],
       requires_grad=True)
Weights after stepping Parameter containing:
tensor([[-0.0308, -0.0258,  0.0248,  ..., -0.0216,  0.0269, -0.0173],
        [ 0.0106, -0.0249, -0.0019,  ..., -0.0135, -0.0078, -0.0038],
        [-0.0121, -0.0326,  0.0062,  ...,  0.0217, -0.0090,  0.0092],
        ...,
        [-0.0030, -0.0177, -0.0318,  ...,  0.0315, -0.0259,  0.0139],
        [-0.0119,  0.0120,  0.0171,  ..., -0.0354, -0.0159, -0.0330],
        [ 0.0293,  0.0183, -0.0165,  ..., -0.0191,  0.0130,  0.0084]],
       require

Since for validation we dont need to train there is no need of having autograd track all the variables. So we do <br/>
with torch.no_grad(): <br/>
for images, labels in testloader <br/>
 We only need enumerate if we want to keep track of the number of epochs for verbosity.<br/>
Put the validation loop inside the with segment. It saves us some computation.<br/>
The general idea is after each forward pass of the epoch we want to calculate our validation accuracy. Eg: 

Using dropouts<br/>
Dont use dropout in the last layer.<br/>
We want to do use dropout only for training and not for testing hence we have to use something known as model.eval().<br/>
It turns of dropouts when we are doing validation,testing or even predictions.<br/>
Then again to set our model back to training mode we use model.train(). This is particularly important when we are calculating validation accuracy since we will be training first then calculating the accuracy for that epoch and again doing the training for the next epoch so if we dont do model.train() our model wont consider dropouts while training.