<a href="https://colab.research.google.com/github/SarthakNarayan/DL-and-ML/blob/master/googlecolab/Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
def sigmoid(x):
    activation = 1/(1 + torch.exp(-x))
    return activation

In [3]:
torch.manual_seed(23)
weights = torch.randn((5,1))
print(weights)
print(weights.shape)
weights = torch.reshape(weights , (1,5))
print("sum of all the weights {}".format(torch.sum(weights)))
print("sum of all the weights {}".format(weights.sum()))
features = torch.randn((5,1))
bias = torch.randn((1,1))
# More advisable to use mm since it is strict about the shapes whereas matmul supports broadcasting
y_hat = torch.add(torch.mm(weights,features),bias)
print("y_hat before activation",y_hat)
y_hat = sigmoid(y_hat)
print("y_hat after activation",y_hat)
print("so we see sigmoid converts any value to between 1 and 0")

tensor([[-0.8733],
        [ 0.4376],
        [-0.4866],
        [-0.7840],
        [-0.2983]])
torch.Size([5, 1])
sum of all the weights -2.004563570022583
sum of all the weights -2.004563570022583
y_hat before activation tensor([[1.2492]])
y_hat after activation tensor([[0.7772]])
so we see sigmoid converts any value to between 1 and 0


In [4]:
# normal * does element wise multiplication
print(torch.tensor([1,2,3])*torch.tensor([1,2,3]))
# torch.dot performs element wise multiplication and sums them
print(torch.dot(torch.tensor([1,2,3]),torch.tensor([1,2,3])))
a = torch.tensor([[1,2,3] , [1,2,3]])
print("Sum along the columns {}".format(torch.sum(a , dim = 0).numpy()))
print("Sum along the rows {}".format(torch.sum(a , dim = 1).numpy()))

tensor([1, 4, 9])
tensor(14)
Sum along the columns [2 4 6]
Sum along the rows [6 6]


In [5]:
a = torch.tensor([[1,2,3],[1,2,3]])
print(a)
print(a[0][0].item())
# .item() only works for a scalar value and not any array
# so a[0].item() will also not work

tensor([[1, 2, 3],
        [1, 2, 3]])
1


In [6]:
# multilayer perceptron
torch.manual_seed(23)
# w*x + b
def multilayer_perceptron(no_of_features,no_of_hidden_units,no_output_nodes):
    x = torch.randn((no_of_features , 1))
    weights = torch.randn((no_of_hidden_units , no_of_features))
    bias = torch.randn((no_of_hidden_units , 1))
    h12 = torch.add(torch.mm(weights,x),bias)
    print(h12)
    h12 = sigmoid(h12)
    print(h12)
    weights = torch.randn((no_output_nodes , h12.shape[0]))
    bias = torch.randn((no_output_nodes , 1))
    h = torch.add(torch.mm(weights,h12),bias)
    print(h)
    y_hat = sigmoid(h)
    print(y_hat)

multilayer_perceptron(2,3,1)

tensor([[ 0.2699],
        [-1.5522],
        [ 2.2264]])
tensor([[0.5671],
        [0.1748],
        [0.9026]])
tensor([[-1.2120]])
tensor([[0.2293]])


In [7]:
# bridging numpy array with pytorch
import numpy as np
a = np.random.randn(1,2)
print(a)
b = torch.from_numpy(a)
print(b)
print("Back to numpy")
print(b.numpy())

[[ 0.30194012 -1.34358464]]
tensor([[ 0.3019, -1.3436]], dtype=torch.float64)
Back to numpy
[[ 0.30194012 -1.34358464]]


In [9]:
# Using mnist data on my perceptron network
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor()])

# to get the test set you set train = False
trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.MNIST(root='./data', train=False,
                                       download=True, transform=transform)
print(len(testset))

10000


In [10]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=True)
print("Total number of batches {}".format(len(testloader)))

Total number of batches 157


In [11]:
images , labels = next(iter(testloader))
print(images.shape)
images = images.reshape((64,-1))
print(images.shape)
torch.manual_seed(23)

# x*w + b
def multilayer_for_mnist(features,no_of_hidden_units,no_of_outputs):
    x = features
    w1 = torch.randn((features.shape[1] , no_of_hidden_units))
    b1 = torch.randn((features.shape[0] , no_of_hidden_units))
    h12 = torch.add(torch.matmul(x,w1) , b1)
    h12 = sigmoid(h12)
    w2 = torch.randn((h12.shape[1] , no_of_outputs))
    b2 = torch.randn((h12.shape[0] , no_of_outputs))
    h = torch.add(torch.matmul(h12,w2) , b2)
    y_hat = sigmoid(h)
    print(y_hat)
    print(y_hat.shape)
    return y_hat

prediction = multilayer_for_mnist(images , 256 , 10)

torch.Size([64, 1, 28, 28])
torch.Size([64, 784])
tensor([[1.3193e-05, 1.2243e-06, 1.8061e-08, 9.9994e-01, 9.9997e-01, 2.6585e-04,
         1.0177e-02, 1.2834e-01, 9.9295e-01, 1.6512e-01],
        [1.5857e-07, 1.2658e-04, 4.4665e-05, 9.9988e-01, 9.9998e-01, 5.1002e-05,
         1.0000e+00, 1.2840e-09, 9.9731e-01, 6.9557e-07],
        [6.2602e-05, 1.0784e-05, 1.3269e-09, 9.9971e-01, 9.9987e-01, 9.8132e-01,
         8.1723e-03, 4.1878e-04, 9.8537e-01, 8.7561e-01],
        [5.3138e-04, 1.1886e-01, 4.4868e-02, 9.9583e-01, 1.5463e-07, 2.6593e-04,
         7.6947e-01, 2.3076e-05, 1.0000e+00, 5.7209e-06],
        [2.5221e-08, 2.8505e-05, 1.6951e-01, 9.1777e-01, 1.0000e+00, 3.4459e-04,
         9.8505e-01, 9.6499e-01, 9.9554e-01, 6.2495e-09],
        [7.9909e-07, 3.1230e-02, 3.6103e-12, 5.9542e-01, 1.0000e+00, 2.1398e-01,
         3.1596e-06, 1.4038e-07, 1.0000e+00, 1.9913e-02],
        [7.2242e-04, 6.4985e-04, 6.4096e-06, 8.0812e-01, 1.8243e-04, 3.6842e-03,
         9.8795e-01, 1.9260e-05, 1.

In [12]:
def softmax(x):
    x = torch.exp(x)
    values = []
    print(x.shape)
    for i in range(x.shape[0]):
        x[i] = x[i]/torch.sum(x[i] , dim = 0)
    print(x.shape)
    return x

pred = softmax(prediction)
print(pred)
print(pred[0].sum(dim = 0))

torch.Size([64, 10])
torch.Size([64, 10])
tensor([[0.0647, 0.0647, 0.0647, 0.1758, 0.1758, 0.0647, 0.0653, 0.0735, 0.1746,
         0.0763],
        [0.0593, 0.0593, 0.0593, 0.1612, 0.1612, 0.0593, 0.1612, 0.0593, 0.1607,
         0.0593],
        [0.0550, 0.0550, 0.0550, 0.1494, 0.1494, 0.1467, 0.0554, 0.0550, 0.1473,
         0.1320],
        [0.0678, 0.0763, 0.0709, 0.1834, 0.0678, 0.0678, 0.1463, 0.0678, 0.1842,
         0.0678],
        [0.0543, 0.0543, 0.0643, 0.1360, 0.1476, 0.0543, 0.1454, 0.1425, 0.1469,
         0.0543],
        [0.0688, 0.0710, 0.0688, 0.1247, 0.1869, 0.0852, 0.0688, 0.0688, 0.1869,
         0.0702],
        [0.0683, 0.0683, 0.0682, 0.1531, 0.0683, 0.0685, 0.1833, 0.0682, 0.1855,
         0.0682],
        [0.0653, 0.0694, 0.0713, 0.1775, 0.0653, 0.0653, 0.1776, 0.0653, 0.1776,
         0.0653],
        [0.0601, 0.0744, 0.0612, 0.1625, 0.1494, 0.0601, 0.1488, 0.0601, 0.1633,
         0.0601],
        [0.0618, 0.1674, 0.1429, 0.1510, 0.0617, 0.0624, 0.0618, 0.

In pytorch it is a convention to assign criterion = nn.loss() class. <br/>
Eg: - criterion = nn.CrossEntropyLoss()<br/>
So the expected input to these loss function is the logits or the scores and not the softmax probablities.
Eg: given below

In [13]:
# New way of creating a sequential model
import torch.nn as nn
import torch.nn.functional as F
model = nn.Sequential(nn.Linear(784 , 128),
                   nn.ReLU(),
                   nn.Linear(128 , 64),
                   nn.ReLU(),
                   nn.Linear(64,10))

images , labels = next(iter(trainloader))
images = images.reshape(images.shape[0] , -1)
criterion = nn.CrossEntropyLoss()
logits = model(images)
# so we see we are passing logits i.e. original values rather than the softmax probabilities
loss = criterion(logits , labels)
print(loss)

tensor(2.3045, grad_fn=<NllLossBackward>)


Pytorch has this really great class named autograd which keeps track of the tensor operations performed by us and when you tell it to do a backwards pass it will go backwards through each of these operations and calculate gradients wrt the input parameters.<br/>
In general we need to tell pytorch that we want to use auto grad on a specific tensor.
Eg: - 

In [14]:
a = torch.tensor([1,2,3] , requires_grad=True , dtype = torch.float64)
print(a)
# this will tell pytorch to track the operations of this tensor and it can compute its gradient whenever needed.
# you can also do it using
with torch.no_grad():
    b = torch.tensor([1,2,3] , dtype = torch.float64)
print(b.requires_grad)

# you can also do it globally for all the variables using
# torch.set_grad_enabled(True)

tensor([1., 2., 3.], dtype=torch.float64, requires_grad=True)
False


In [15]:
# Using autograd to compute the gradients
# we just do a variable.backward() if we want to compute its graident
a = torch.tensor([1,2,3] , requires_grad=True , dtype = torch.float64)
y = (a ** 2).sum(dim = 0)
# we have to do sum because we can perform backward pass only on a scalar value and not any vector
print(y)
print("Gradient without performing the backward pass {}".format(a.grad))
y.backward()
print("Gradient after performing the backward pass {}".format(a.grad.numpy()))

tensor(14., dtype=torch.float64, grad_fn=<SumBackward2>)
Gradient without performing the backward pass None
Gradient after performing the backward pass [2. 4. 6.]


Once we have our gradients we need optimizers to update the weights by using the gradients.<br/>
We need to clear the gradients because pytorch accumulates gradients and we do it using
**optimizer.zero_grad() before every training process**.<br/>
A step with the optimizer updates the weights.

In [16]:
# incase of neural networks pytorch automatically computes the gradient of weights by using autograd to note their computations
print("Gradients of the weights before backward pass {}".format(model[0].weight.grad))
loss.backward()
print("Gradients of the weights after backward pass {}".format(model[0].weight.grad))

Gradients of the weights before backward pass None
Gradients of the weights after backward pass tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [17]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters() , lr = 1e-3)
optimizer.zero_grad()
print("weights before stepping" , model[0].weight)
optimizer.step()
print("Weights after stepping" , model[0].weight)
# not much of a difference since our graident was different

weights before stepping Parameter containing:
tensor([[ 0.0122, -0.0171,  0.0267,  ...,  0.0208, -0.0134, -0.0082],
        [ 0.0259, -0.0189, -0.0171,  ..., -0.0160, -0.0106,  0.0283],
        [ 0.0339, -0.0078, -0.0324,  ...,  0.0244,  0.0031,  0.0294],
        ...,
        [-0.0109,  0.0095,  0.0027,  ...,  0.0306, -0.0105,  0.0120],
        [-0.0180, -0.0165,  0.0315,  ..., -0.0054,  0.0217,  0.0048],
        [ 0.0267,  0.0102,  0.0338,  ...,  0.0135,  0.0035, -0.0196]],
       requires_grad=True)
Weights after stepping Parameter containing:
tensor([[ 0.0122, -0.0171,  0.0267,  ...,  0.0208, -0.0134, -0.0082],
        [ 0.0259, -0.0189, -0.0171,  ..., -0.0160, -0.0106,  0.0283],
        [ 0.0339, -0.0078, -0.0324,  ...,  0.0244,  0.0031,  0.0294],
        ...,
        [-0.0109,  0.0095,  0.0027,  ...,  0.0306, -0.0105,  0.0120],
        [-0.0180, -0.0165,  0.0315,  ..., -0.0054,  0.0217,  0.0048],
        [ 0.0267,  0.0102,  0.0338,  ...,  0.0135,  0.0035, -0.0196]],
       require

Since for validation we dont need to train there is no need of having autograd track all the variables. So we do <br/>
with torch.no_grad(): <br/>
for images, labels in testloader <br/>
 We only need enumerate if we want to keep track of the number of epochs for verbosity.<br/>
Put the validation loop inside the with segment. It saves us some computation.<br/>
The general idea is after each forward pass of the epoch we want to calculate our validation accuracy. Eg: 

In [0]:
# Using dropouts in a model
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self):
        super(Network , self).__init__()
        self.fc1 = nn.Linear(784 , 256)
        self.fc2 = nn.Linear(256 , 128)
        self.fc3 = nn.Linear(128 , 64)
        self.fc4 = nn.Linear(64 , 10)
        
        self.dropout = nn.Dropout(p=0.2)
    def forward(self , x):
        x = x.reshape(x.shape[0] , -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.fc4(x)
        return x
# Better of using GPU
net = Network().cuda()

Using dropouts<br/>
Dont use dropout in the last layer.<br/>
We want to do use dropout only for training and not for testing hence we have to use something known as model.eval().<br/>
It turns of dropouts when we are doing validation,testing or even predictions.<br/>
Then again to set our model back to training mode we use model.train(). This is particularly important when we are calculating validation accuracy since we will be training first then calculating the accuracy for that epoch and again doing the training for the next epoch so if we dont do model.train() our model wont consider dropouts while training.

In [23]:
# Calculating validation accuracy along with training
num_epochs = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters() , lr = 1e-3)
for i in range(num_epochs):
    running_loss = 0
    counter = 0
    for images , labels in trainloader:
        images = images.cuda()
        labels = labels.cuda()
        optimizer.zero_grad()
        logits = net(images)
        loss = criterion(logits , labels)
        running_loss += loss
        loss.backward()
        optimizer.step()
    # since you are calculating loss for the whole batch and not each image
    print("Training Loss after {} epoch is {}".format(i , (running_loss/len(trainloader))))
    
    net.eval()
    correct = 0
    accuracy = 0
    for images_test , labels_test in testloader:
        with torch.no_grad():
            images_test = images_test.cuda()
            labels_test = labels_test.cuda()
            pred = net(images_test)
            values , indices = torch.max(pred , 1)
            for j in range(len(indices)):
                if(indices[j] == labels_test[j]):
                    correct = correct + 1
    accuracy = (correct/len(testset))*100
    # we can print validation loss if we want
    print("Validation accuracy after {} epoch is {}".format(i , accuracy))
    net.train()

Training Loss after 0 epoch is 0.3807568848133087
Validation accuracy after 0 epoch is 95.94
Training Loss after 1 epoch is 0.1532614380121231
Validation accuracy after 1 epoch is 97.28
Training Loss after 2 epoch is 0.11407080292701721
Validation accuracy after 2 epoch is 97.38


In [28]:
# using torch.max()
a = torch.tensor([[1,2,3],[1,2,3]])
value , index = torch.max(a , 1)
print(value,index)

tensor([3, 3]) tensor([2, 2])


In [82]:
print(net)
# print(net.state_dict())
print(net.state_dict().keys())
# to see the weights and gradients of any layer
print(net.fc1.weight)
print(net.fc1.weight.grad)

Network(
  (fc1): Linear(in_features=784, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2)
)
odict_keys(['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'fc3.weight', 'fc3.bias', 'fc4.weight', 'fc4.bias'])
Parameter containing:
tensor([[ 0.0117,  0.0168, -0.0059,  ..., -0.0282,  0.0282, -0.0257],
        [ 0.0303,  0.0050, -0.0117,  ...,  0.0045,  0.0092,  0.0134],
        [ 0.0160,  0.0058,  0.0004,  ..., -0.0165, -0.0272,  0.0135],
        ...,
        [ 0.0278, -0.0309,  0.0178,  ...,  0.0137, -0.0182,  0.0210],
        [ 0.0250, -0.0041, -0.0332,  ...,  0.0018,  0.0316, -0.0334],
        [ 0.0076, -0.0048,  0.0079,  ...,  0.0194, -0.0271,  0.0053]],
       requires_grad=True)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0

For loading datasets we use torchvision<br/>
trainset = dataset.ImageFolder('path' , transform = transforms)<br/>
It expects that different classes should be in different folders<br/>
Dont play too much with the transforms of test data set.<br/>
Most common transforms for both training and testing are random crop , resize , totensor 
and rotation,horizontal flip for training.

**Transfer Learning**<br/>
Most of the models are pretrained on input images of 224*224 also we will need to match the normalization. The means are [0.485 , 0.456 , 0.406] and std is [0.229 , 0.224 , 0.225]<br/>


In [83]:
from torchvision import models,datasets,transforms
transfer_model = models.densenet121(pretrained=True)
print(transfer_model)

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/checkpoints/densenet121-a639ec97.pth
100%|██████████| 32342954/32342954 [00:00<00:00, 112757313.38it/s]


DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplac

In [0]:
# We need to retrain the classifier part and keep the feature part static
for param in transfer_model.parameters():
    param.requires_grad = False
# This will make sure all the parameters are frozen and we dont compute their gradients hence making the execution faster
