- RNN is an extension of FNN

In [43]:
import torch            ## for creating tensors
import torch.nn as nn   ## for modeling
import torchvision.transforms as transforms    ## make data iterable
import torchvision.datasets as dsets           ## make data iterable


from torch.autograd import Variable  ## make variable for enable gradients
import pandas as pd


###### Steps:
(Exactly the same as for Logistic Regression)
- Step 1: Load dataset
- Step 2: Make dataset iterable
- Step 3: Create model class
- Step 4: Instantiate model class
- Step 5: Instantiate loss class
- Step 6: Instantiate optimizer class
- Step 7: Train model

## Step 1: Loading Dataset

In [45]:
# Training split of MNIST; the files are fetched into ./data if they are not already there.
# ToTensor() turns each image into a torch tensor so it can be fed to the model.
train_df = dsets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Test split (train=False), prepared exactly the same way as the training split.
test_df = dsets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)


## Step 2: Make Dataset Iterable

In [46]:
# Mini-batch size and the total number of parameter updates (iterations) to run.
batch_size = 100
n_iters = 3000

# Convert the iteration budget into whole epochs:
#   epochs = iterations / (batches per epoch), truncated to an int.
n_epochs = int(n_iters / (len(train_df) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_df,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order; shuffling is not needed for evaluation.
test_loader = torch.utils.data.DataLoader(
    test_df,
    batch_size=batch_size,
    shuffle=False,
)


## Step 3: Create Model Class

In [47]:
class FeedForwardNeuralNetModel(nn.Module):
    """Feedforward network with three ReLU hidden layers and a linear readout.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear -> ReLU -> Linear.
    The readout returns raw logits; no softmax is applied inside the model.

    Args:
        input_size: Number of features per input sample
            (784 for a flattened 28x28 image).
        hidden_size: Width of each hidden layer. This is the extra
            hyperparameter compared with logistic regression: it sets how many
            non-linear units each hidden layer has.
        num_classes: Number of output logits (10 for the digits 0-9).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(FeedForwardNeuralNetModel, self).__init__()

        # Layer sizes come from the constructor arguments, not from notebook
        # globals, so the class works on a fresh kernel and with any sizes.

        # Linear function 1 (input to hidden) followed by its non-linearity.
        # Alternative activations for experimentation: nn.Sigmoid() (Model A)
        # or nn.Tanh() (Model B) in place of nn.ReLU() (Model C).
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()

        # Linear function 2 (hidden to hidden) -- Model D: 2 hidden layers.
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()

        # Linear function 3 (hidden to hidden) -- Model E: 3 hidden layers.
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()

        # Linear function 4 (readout): hidden to class logits.
        self.fc4 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Run the network on a batch and return the class logits.

        Args:
            x: Input tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor of logits whose last dimension equals ``num_classes``.
        """
        # Hidden layer 1: linear -> non-linear
        out = self.relu1(self.fc1(x))

        # Hidden layer 2: linear -> non-linear
        out = self.relu2(self.fc2(out))

        # Hidden layer 3: linear -> non-linear
        out = self.relu3(self.fc3(out))

        # Readout: linear only, the loss function handles the softmax
        return self.fc4(out)
    
    
    
    
    
        
        

## Step 4: Instantiate Model Class

- **Input** dimension: 784
    - Size of image
    - 28*28=784
- **Output** dimension: 10
    - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
- **Hidden** dimension: 100
    - Can be any number
    - Similar terms
        - Number of neurons
        - Number of non-linear activation functions

In [48]:
# Flattened 28x28 image -> 784 inputs; 100 units per hidden layer (the extra
# hyperparameter compared with logistic regression); 10 digit classes.
input_dim, hidden_dim, output_dim = 28 * 28, 100, 10

model = FeedForwardNeuralNetModel(
    input_size=input_dim,
    hidden_size=hidden_dim,
    num_classes=output_dim,
)

## Step 5: Instantiate Loss Class
- Feedforward Neural Network
    - Cross Entropy Function
- Logistic Regression
    - Cross Entropy Function
- Linear Regression
    - Mean Squared Error (MSE) Function

In [49]:
# Loss for multi-class classification: takes the model's raw logits and the
# integer class labels, applying softmax and cross entropy in one step.
criterion = nn.CrossEntropyLoss()

###### What happens in nn.CrossEntropyLoss()
- Computes softmax (logistic/softmax function)
- Computes cross entropy

## Step 6: Instantiate Optimizer Class
- parameters = parameters - learning_rate * parameters_gradients
- **At every iteration, we update our model's parameters**


###### What is the purpose of the optimizer class
- Update the model's parameter at every iteration
- So that, we can get a better model to do the predictions

In [50]:
# Step size of each update: parameters = parameters - learning_rate * gradients
# NOTE(review): the recorded training run in this notebook ends at a loss of
# about 2.28 and 20% accuracy after 3000 iterations, i.e. the model has barely
# learned; a larger learning rate is probably needed -- confirm by experiment.
learning_rate = 0.001

# Plain stochastic gradient descent over all trainable tensors of the model
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

### Parameters In-Depth
- For more clarification watch the video no 29 (from 13min)

In [51]:
# model.parameters() is lazy: printing it shows only the generator object.
print(model.parameters())

# Materialise the generator to count the tensors:
# 4 Linear layers x (weight, bias).
print(len(list(model.parameters())))

# Sizes of the first four parameter tensors, one per line, in this order:
#   FC 1 weights (A1), FC 1 bias (B1), FC 2 weights (A2), FC 2 bias (B2)
print(*(tensor.size() for tensor in list(model.parameters())[:4]), sep='\n')



<generator object Module.parameters at 0x122c584f8>
8
torch.Size([100, 784])
torch.Size([100])
torch.Size([100, 100])
torch.Size([100])


## Step 7: Train Model

In [52]:
# Counts parameter updates across all epochs. Deliberately not called `iter`,
# which would shadow Python's built-in iter() for the rest of the session.
iteration = 0

for epoch in range(n_epochs):  # n_epochs full passes over the training set (5 with the settings above)
    for images, labels in train_loader:  # one pass over every training batch
        # Flatten each 28x28 image into a 784-vector to match the input layer.
        # (Wrapping in Variable is unnecessary: plain tensors track gradients.)
        images = images.view(-1, 28*28)

        # Clear gradients left over from the previous iteration
        optimizer.zero_grad()

        # Forward pass to get the logits
        outputs = model(images)

        # Calculate loss: softmax --> cross entropy
        loss = criterion(outputs, labels)

        # Calculate gradients w.r.t. the parameters
        loss.backward()

        # Update the parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            # Measure accuracy on the whole test set
            correct = 0
            total = 0

            # No gradients are needed for evaluation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                # Separate names keep the training batch above intact.
                for test_images, test_labels in test_loader:
                    # Forward pass only, to get the logits
                    test_outputs = model(test_images.view(-1, 28*28))

                    # Predicted class = index of the largest logit
                    predicted = torch.max(test_outputs, 1)[1]

                    # Total number of labels seen so far
                    total += test_labels.size(0)

                    # Accumulate as a Python int so the division below is a
                    # true float division rather than a truncating tensor op.
                    correct += (predicted == test_labels).sum().item()

            accuracy = 100.0 * correct / total

            # Print the loss of the latest training batch and the test accuracy
            print('Iteration: {}. Loss: {}. Accuracy: {}.'.format(iteration, loss.item(), accuracy))
                
                
        
        

Iteration: 500. Loss: 2.2923882007598877. Accuracy: 9.
Iteration: 1000. Loss: 2.298668622970581. Accuracy: 8.
Iteration: 1500. Loss: 2.288738965988159. Accuracy: 10.
Iteration: 2000. Loss: 2.2903330326080322. Accuracy: 16.
Iteration: 2500. Loss: 2.28289794921875. Accuracy: 18.
Iteration: 3000. Loss: 2.2831785678863525. Accuracy: 20.


#### Deep Learning
- 2 ways to expand a neural network
    - More non-linear activation units/neurons/hidden dimension
        - Here we used 100 hidden units per layer
        - We can make it larger
    - More hidden layers
        - Here we went up to 3 hidden layers
- Cons
    - Need a larger dataset
        - Curse of dimensionality
    - Does not necessarily mean higher accuracy