<a href="https://colab.research.google.com/github/SarthakNarayan/DL-and-ML/blob/master/googlecolab/Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^-x) element-wise to tensor ``x``."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator

In [0]:
# Single neuron: y_hat = sigmoid(weights . features + bias)
torch.manual_seed(23)
weights = torch.randn((5, 1))
print(weights)
print(weights.shape)
# turn the column vector into a row vector so that (1,5) x (5,1) gives (1,1)
weights = weights.reshape(1, 5)
print("sum of all the weights {}".format(torch.sum(weights)))
print("sum of all the weights {}".format(weights.sum()))
features = torch.randn((5, 1))
bias = torch.randn((1, 1))
# torch.mm is the safer choice here: it is strict about the shapes,
# whereas matmul supports broadcasting
y_hat = torch.mm(weights, features) + bias
print("y_hat before activation", y_hat)
y_hat = sigmoid(y_hat)
print("y_hat after activation", y_hat)
print("so we see sigmoid converts any value to between 1 and 0")

In [0]:
# `*` between two tensors multiplies them element by element
print(torch.tensor([1, 2, 3]) * torch.tensor([1, 2, 3]))
# torch.dot multiplies element by element and then sums the products
print(torch.dot(torch.tensor([1, 2, 3]), torch.tensor([1, 2, 3])))
a = torch.tensor([[1, 2, 3], [1, 2, 3]])
# dim=0 collapses the rows (one total per column); dim=1 collapses the columns
print("Sum along the columns {}".format(a.sum(dim=0).numpy()))
print("Sum along the rows {}".format(a.sum(dim=1).numpy()))

In [0]:
a = torch.tensor([[1, 2, 3], [1, 2, 3]])
print(a)
# .item() extracts a plain Python number and only works on a one-element
# tensor, so a[0].item() (a whole row) would raise an error
print(a[0, 0].item())

In [0]:
# multilayer perceptron
torch.manual_seed(23)


# every layer computes sigmoid(w*x + b) on column vectors
def multilayer_perceptron(no_of_features, no_of_hidden_units, no_output_nodes):
    """Run one forward pass of a randomly initialised two-layer perceptron.

    The input column vector, both weight matrices and both bias vectors are
    drawn from a standard normal distribution, so the result only illustrates
    the shapes involved.  The intermediate values are printed along the way.

    Args:
        no_of_features: length of the random input column vector.
        no_of_hidden_units: number of units in the hidden layer.
        no_output_nodes: number of output units.

    Returns:
        Tensor of shape (no_output_nodes, 1) holding the sigmoid activations
        of the output layer.
    """
    x = torch.randn((no_of_features, 1))

    # hidden layer: (hidden, features) x (features, 1) + (hidden, 1)
    weights = torch.randn((no_of_hidden_units, no_of_features))
    bias = torch.randn((no_of_hidden_units, 1))
    h12 = torch.add(torch.mm(weights, x), bias)
    print(h12)
    h12 = sigmoid(h12)
    print(h12)

    # output layer: (outputs, hidden) x (hidden, 1) + (outputs, 1)
    weights = torch.randn((no_output_nodes, h12.shape[0]))
    bias = torch.randn((no_output_nodes, 1))
    h = torch.add(torch.mm(weights, h12), bias)
    print(h)
    y_hat = sigmoid(h)
    print(y_hat)
    # hand the prediction back to the caller instead of only printing it
    return y_hat


# assigned so that the notebook does not echo the returned tensor a second time
mlp_output = multilayer_perceptron(2, 3, 1)

In [0]:
# bridging numpy array with pytorch
import numpy as np

a = np.random.randn(1, 2)
print(a)
# from_numpy wraps the array without copying it: tensor and array share memory
b = torch.from_numpy(a)
print(b)
# .numpy() goes the other way, again without copying
print("Back to numpy", b.numpy(), sep="\n")

In [0]:
# Using mnist data on my perceptron network
import torchvision
import torchvision.transforms as transforms

# ToTensor converts each PIL image into a float tensor scaled to [0, 1]
transform = transforms.Compose([transforms.ToTensor()])

# train=True selects the training split, train=False the test split
trainset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
testset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform
)
print(len(testset))

In [0]:
# Wrap both splits in loaders that yield shuffled mini-batches of 64 samples
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)
# len() of a DataLoader is the number of batches, not the number of samples
print("Total number of batches {}".format(len(testloader)))

In [0]:
images, labels = next(iter(testloader))
print(images.shape)
# flatten every image into a vector; use the real batch size instead of a
# hard-coded 64 so that a smaller batch works as well
images = images.reshape((images.shape[0], -1))
print(images.shape)
torch.manual_seed(23)


# x*w + b
def multilayer_for_mnist(features, no_of_hidden_units, no_of_outputs):
    """Forward a batch through a randomly initialised two-layer perceptron.

    Args:
        features: 2-D tensor with one flattened sample per row.
        no_of_hidden_units: number of units in the hidden layer.
        no_of_outputs: number of output units.

    Returns:
        Tensor of shape (features.shape[0], no_of_outputs) with the sigmoid
        activations of the output layer.  The result is also printed.
    """
    x = features
    w1 = torch.randn((features.shape[1], no_of_hidden_units))
    # one bias per unit, broadcast over the batch; a (batch, units) bias would
    # give every sample its own bias and tie the layer to one batch size
    b1 = torch.randn((1, no_of_hidden_units))
    h12 = sigmoid(torch.matmul(x, w1) + b1)

    w2 = torch.randn((h12.shape[1], no_of_outputs))
    b2 = torch.randn((1, no_of_outputs))
    h = torch.matmul(h12, w2) + b2
    y_hat = sigmoid(h)
    print(y_hat)
    print(y_hat.shape)
    return y_hat


prediction = multilayer_for_mnist(images, 256, 10)

In [0]:
def softmax(x):
    """Return the softmax of ``x`` computed independently for every row.

    Args:
        x: tensor of scores, normally 2-D with one sample per row.  A 1-D
           tensor is treated as a single sample.

    Returns:
        A new tensor of the same shape whose rows are non-negative and sum
        to 1.  ``x`` itself is not modified.  The shape is printed before and
        after the computation.
    """
    # normalise along dim 1 (each row); a 1-D input only has dim 0
    axis = 1 if x.dim() > 1 else 0
    print(x.shape)
    # subtracting the row maximum leaves the result unchanged mathematically
    # but keeps exp() from overflowing to inf (which would produce nan)
    shifted = x - x.max(dim=axis, keepdim=True)[0]
    exps = torch.exp(shifted)
    probabilities = exps / exps.sum(dim=axis, keepdim=True)
    print(probabilities.shape)
    return probabilities

# normalise the outputs of multilayer_for_mnist row by row
pred = softmax(prediction)
print(pred)
# sanity check: the entries of the first row should add up to 1
print(pred[0].sum(dim = 0))

In PyTorch it is a convention to store the loss function in a variable called `criterion`, which is an instance of one of the `nn` loss classes. <br/>
Eg: - criterion = nn.CrossEntropyLoss()<br/>
`nn.CrossEntropyLoss` expects the raw logits (scores) as input and not the softmax probabilities, because it applies log-softmax internally.<br/>
Eg: given below

In [0]:
# New way of creating a sequential model
import torch.nn as nn
import torch.nn.functional as F

# 784 -> 128 -> 64 -> 10 fully connected network with ReLU between the layers;
# the last layer has no activation, so the model outputs raw logits
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
)

images, labels = next(iter(trainloader))
# flatten every image of the batch into one row
images = images.flatten(start_dim=1)
criterion = nn.CrossEntropyLoss()
logits = model(images)
# the criterion receives the logits, i.e. the raw scores, not softmax probabilities
loss = criterion(logits, labels)
print(loss)

PyTorch has a really useful module named autograd which keeps track of the operations performed on tensors; when you ask it for a backward pass it walks backwards through each of these operations and calculates the gradients with respect to the tensors that require them.<br/>
In general we need to tell PyTorch which tensors we want autograd to track.
Eg: - 

In [0]:
a = torch.tensor([1,2,3] , requires_grad=True , dtype = torch.float64)
print(a)
# requires_grad=True tells pytorch to track the operations on this tensor so that it can compute its gradient whenever needed.
# torch.no_grad() does the opposite: operations executed inside the block are not tracked.
# NOTE: b is created without requires_grad, so the print below shows False with or without the no_grad block.
with torch.no_grad():
    b = torch.tensor([1,2,3] , dtype = torch.float64)
print(b.requires_grad)

# tracking can also be switched on (True) or off (False) globally using
# torch.set_grad_enabled(True)

In [0]:
# Using autograd to compute the gradients:
# calling .backward() on a result fills in .grad of the tensors it was computed from
a = torch.tensor([1, 2, 3], requires_grad=True, dtype=torch.float64)
# reduce to a scalar first: backward() without arguments only works on a
# scalar value and not on a vector
y = a.pow(2).sum(dim=0)
print(y)
# .grad is still None at this point
print("Gradient without performing the backward pass {}".format(a.grad))
y.backward()
# d(sum(a^2))/da = 2a
print("Gradient after performing the backward pass {}".format(a.grad.numpy()))

Once we have our gradients we need an optimizer to update the weights using them.<br/>
PyTorch accumulates gradients across backward passes, so we need to clear them using
**optimizer.zero_grad() before the backward pass of every training step**.<br/>
A step with the optimizer updates the weights.

In [0]:
# In case of neural networks autograd records the computations that involve the weights,
# so loss.backward() fills in the .grad of the model's parameters.
# Before any backward pass has been run the .grad of a parameter is None.
print("Gradients of the weights before backward pass {}".format(model[0].weight.grad))
loss.backward()
print("Gradients of the weights after backward pass {}".format(model[0].weight.grad))

In [0]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
# One training step in the correct order: clear the old gradients, run the
# forward pass, run the backward pass, then step.  Calling zero_grad() right
# before step() would wipe the gradients, and the step would leave the
# weights unchanged.
optimizer.zero_grad()
loss = criterion(model(images), labels)
loss.backward()
print("weights before stepping", model[0].weight)
optimizer.step()
print("Weights after stepping", model[0].weight)
# the difference is small because each weight only moves by lr * gradient

Since we don't train during validation there is no need to have autograd track all the variables. So we do <br/>
with torch.no_grad(): <br/>
for images, labels in testloader: <br/>
 We only need enumerate if we want to keep track of the number of batches processed, for verbosity.<br/>
Put the validation loop inside the with block. It saves us some computation.<br/>
The general idea is that after each training epoch we want to calculate our validation accuracy. Eg: 

In [0]:
# Using dropouts in a model
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    """Fully connected classifier (784 -> 256 -> 128 -> 64 -> 10) with dropout."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 10)

        # one dropout module (p=0.2) is reused after every hidden layer
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Flatten each sample, apply the hidden layers and return the logits."""
        out = x.reshape(len(x), -1)
        for hidden in (self.fc1, self.fc2, self.fc3):
            out = self.dropout(F.relu(hidden(out)))
        # the output layer gets neither activation nor dropout
        return self.fc4(out)
# Better off using the GPU: use it when one is available, otherwise stay on
# the CPU instead of crashing in .cuda()
# To move the model and images back to CPU do .cpu() for the model and the images
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Network().to(device)

Using dropouts<br/>
Don't use dropout in the last layer.<br/>
We want to use dropout only for training and not for testing, hence we have to use something known as model.eval().<br/>
It turns off dropout when we are doing validation, testing or even predictions.<br/>
Then to set our model back to training mode we use model.train(). This is particularly important when we are calculating validation accuracy, since we train first, then calculate the accuracy for that epoch, and then train again for the next epoch; if we don't call model.train() our model won't apply dropout while training.

In [0]:
# Calculating validation accuracy along with training
num_epochs = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3)
# run on whatever device the model lives on instead of assuming CUDA
device = next(net.parameters()).device
for i in range(num_epochs):
    # make sure dropout is active, whatever mode the model was left in
    net.train()
    running_loss = 0.0
    for images, labels in trainloader:
        images = images.to(device)
        labels = labels.to(device)
        # gradients accumulate, so clear them before every batch
        optimizer.zero_grad()
        logits = net(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # .item() yields a plain float; adding the tensor itself would keep
        # autograd history attached to the running total
        running_loss += loss.item()
    # the loss is computed per batch and not per image, so average over the batches
    print("Training Loss after {} epoch is {}".format(i, running_loss / len(trainloader)))

    # evaluation mode switches dropout off; no_grad skips gradient tracking
    net.eval()
    correct = 0
    with torch.no_grad():
        for images_test, labels_test in testloader:
            images_test = images_test.to(device)
            labels_test = labels_test.to(device)
            pred = net(images_test)
            # the index of the largest logit is the predicted class
            values, indices = torch.max(pred, 1)
            correct += (indices == labels_test).sum().item()
    accuracy = (correct / len(testset)) * 100
    # we can print validation loss if we want
    print("Validation accuracy after {} epoch is {}".format(i, accuracy))
    # leave the model in training mode, as before
    net.train()

In [0]:
# using torch.max(): with a dim argument it returns both the largest value
# along that dim and the position where it was found
a = torch.tensor([[1, 2, 3], [1, 2, 3]])
value, index = a.max(dim=1)
print(value, index)

In [0]:
# printing a module lists its sub-modules
print(net)
# print(net.state_dict())
# the keys of the state dict are the names of the stored tensors
print(net.state_dict().keys())
# to see the weights and gradients of any layer
print(net.fc1.weight)
print(net.fc1.weight.grad)

For loading datasets we use torchvision<br/>
trainset = datasets.ImageFolder('path' , transform = transforms)<br/>
It expects the images of each class to be in their own folder.<br/>
Don't play too much with the transforms of the test data set.<br/>
The most common transforms for both training and testing are random crop, resize and ToTensor,
with rotation and horizontal flip added for training.

**Transfer Learning**<br/>
Most of the models are pretrained on input images of 224*224, so our images have to be resized to that size; we also need to match the normalization used during pretraining. The per-channel means are [0.485 , 0.456 , 0.406] and the stds are [0.229 , 0.224 , 0.225]<br/>


In [0]:
from torchvision import models,datasets,transforms
# load DenseNet-121 together with its pretrained weights
# NOTE: newer torchvision releases prefer the `weights=` argument over `pretrained=True`
transfer_model = models.densenet121(pretrained=True)
print(transfer_model.state_dict().keys())
# odict_keys means ordered dictionary keys
print("\n Classifier before changing \n" ,transfer_model.classifier)

# we need to change the classifier with our own classifier
from collections import OrderedDict
# nn.Sequential accepts an OrderedDict so that every layer gets a name; the
# layers are applied in the order in which the keys were inserted.
# (Since Python 3.7 a regular dict preserves insertion order as well, but
# nn.Sequential expects an OrderedDict for named layers.)
# you give sequential a list of operations and it will pass the tensor through it sequentially
classifier = nn.Sequential(OrderedDict([
    # 1024 has to match the in_features of the classifier printed above
    ('fc1' , nn.Linear(1024,500)),
    ('relu' , nn.ReLU()),
    # 10 is if u have 10 classes
    ('output' , nn.Linear(500,10))
]))

transfer_model.classifier = classifier
print("\n Classifier after changing \n" ,transfer_model.classifier)

# another way of adding a classifier
class AnotherClassifier(nn.Module):
    """Classifier head (1024 -> 256 -> 128 -> 64 -> 10) with dropout."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 10)
        # one dropout module (p=0.2) is reused after every hidden layer
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Apply the three hidden layers with ReLU and dropout, then return the logits."""
        out = x
        for hidden in (self.fc1, self.fc2, self.fc3):
            out = self.dropout(F.relu(hidden(out)))
        # the output layer gets neither activation nor dropout
        return self.fc4(out)
    
# swap the classifier once more, this time for an instance of the nn.Module subclass
print("\n Another way of adding the classifier \n")
classifier = AnotherClassifier()
transfer_model.classifier = classifier
print("\n Classifier after changing \n" ,transfer_model.classifier)

In [0]:
# We need to retrain the classifier part and keep the feature part static
# here param refers to the weights and biases since parameter class has the weights and biases
for param in transfer_model.parameters():
    param.requires_grad = False
# This makes sure all the parameters are frozen and we don't compute their gradients, making the execution faster.
# The loop above also froze the replacement classifier, because it is already
# part of transfer_model; switch its gradients back on so that it can be trained.
for param in transfer_model.classifier.parameters():
    param.requires_grad = True
# Since we only want to update the parameters of the classifier we will do
optimizer = optim.Adam(transfer_model.classifier.parameters(), lr=0.001)
# this will leave the weights of the feature detector static and update weights and biases of the classifier

# Tips
## Watch those shapes
In general, you'll want to check that the tensors going through your model and other code are the correct shapes. Make use of the .shape attribute during debugging and development.

## A few things to check if your network isn't training appropriately
Make sure you're clearing the gradients in the training loop with optimizer.zero_grad(). If you're doing a validation loop, be sure to set the network to evaluation mode with model.eval(), then back to training mode with model.train().

## CUDA errors
Sometimes you'll see this error:

RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #1 ‘mat1’

You'll notice the second type is torch.cuda.FloatTensor, this means it's a tensor that has been moved to the GPU. It's expecting a tensor with type torch.FloatTensor, no .cuda there, which means the tensor should be on the CPU. PyTorch can only perform operations on tensors that are on the same device, so either both CPU or both GPU. If you're trying to run your network on the GPU, check to make sure you've moved the model and all necessary tensors to the GPU with .to(device) where device is either "cuda" or "cpu".

## Data Normalization
Data normalization makes our model train and reach a minimum error faster!

Data normalization is typically done by subtracting the mean (the average of all pixel values) from each pixel, and then dividing the result by the standard deviation of all the pixel values. Sometimes you'll see an approximation here, where we use a mean and standard deviation of 0.5 to center the pixel values. 

## Using Validation loss
Check the validation loss in each epoch and see whether it is lower than the minimum seen so far. If yes, save the weights of the model. Hence you will always have the best model weights. Initialise the minimum to infinity by using np.inf.<br/>
Before testing the model, load the saved weights. Otherwise you will be testing the weights from the last epoch, which may not be the ones with the best validation loss.

## Image augmentation
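transforms.Compose chains several transforms into one pipeline; the random transforms placed in it (e.g. random crops, flips and rotations) are what perform the data augmentation.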

In [0]:
# Reload the full MNIST training split, then carve a validation set out of it
trainset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform
)
print("Length of training dataset before splitting {}".format(len(trainset)))
# the two lengths passed to random_split have to add up to len(trainset)
trainset, validationset = torch.utils.data.random_split(trainset, [55000, 5000])
print(
    "Length of training set {} and validation set {} after splitting".format(
        len(trainset), len(validationset)
    )
)

In [0]:

trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True)
validationloader = torch.utils.data.DataLoader(validationset, batch_size=64,
                                          shuffle=True)
# to see the labels have been split 
images , labels = next(iter(validationloader))
import matplotlib.pyplot as plt
%matplotlib inline
print(labels[5])
print(images.shape)
# vary the first parameter to view the 64 images
plt.imshow(images[5][0] , cmap = 'gray')
plt.plot()