# Feed Forward Neural Network on MNIST dataset -  a simple reference

In [4]:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [5]:
# Layer sizes and training hyperparameters shared by the cells below
input_size = 28 * 28          # every 28x28 image is flattened into 784 features
hidden_size = 400             # number of neurons in each hidden layer
output_size = 10              # one output neuron per digit class (0-9)
epochs = 10                   # number of full passes over the training data
batch_size = 100              # samples processed per iteration
learning_rate = 1e-3          # step size used by the optimizer

In [6]:
# MNIST training split; download=True asks torchvision to fetch it into ./data
training_set = datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
# MNIST test split, read from the same root directory
test_set = datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

In [7]:
# Training batches are reshuffled every epoch so the optimizer sees the samples in a new order.
train_loader = torch.utils.data.DataLoader(dataset = training_set, batch_size = batch_size, shuffle = True)
# Evaluation only counts correct predictions, so ordering is irrelevant; keeping the
# test set unshuffled avoids needless work and makes the batch order deterministic.
test_loader = torch.utils.data.DataLoader(dataset = test_set, batch_size = batch_size, shuffle = False)

![nn mnist](https://user-images.githubusercontent.com/30661597/61593615-5eb8bf00-ac14-11e9-8087-f880971b3543.png)


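Our network architecture is a very simple one with just three fully connected (linear) layers: the first maps the 784 input features to 400 hidden neurons, the second maps those 400 neurons to another 400, and the output layer maps them to 10 neurons, one for each digit class. We use the ReLU function as the activation after the first two layers; the output layer returns raw scores, which the cross-entropy loss converts to class probabilities internally.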



### Building the Feed Forward Neural Network

In [10]:
class Model(nn.Module):
    """Feed-forward classifier: two ReLU-activated linear layers followed by a linear output layer.

    Args:
        input_size: number of input features per sample.
        hidden_size: number of neurons in each of the two hidden layers.
        output_size: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the three linear layers and initialise the hidden-layer weights."""
        super(Model, self).__init__()
        # First hidden layer: input features -> hidden neurons
        self.fc1 = nn.Linear(input_size, hidden_size)
        # Second hidden layer: hidden neurons -> hidden neurons
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # Output layer: hidden neurons -> class scores
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        """Apply Kaiming-normal initialisation to the weights of fc1 and fc2.

        The output layer (fc3) keeps the default nn.Linear initialisation.
        """
        # kaiming_normal_ (trailing underscore) is the in-place initialiser; the
        # un-suffixed name is a deprecated alias that emits a warning on every call.
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.kaiming_normal_(self.fc2.weight)

    def forward(self, x):
        """Run a batch through the network and return the raw class scores.

        Args:
            x: tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor of un-normalised scores with last dimension ``output_size``;
            no activation is applied after the output layer.
        """
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [13]:
# Instantiate the network with the sizes chosen in the configuration cell
net = Model(input_size = input_size, hidden_size = hidden_size, output_size = output_size)
CUDA = torch.cuda.is_available()
if CUDA:
    # .cuda() has to be *called*: assigning the bare attribute would replace the model
    # with a bound method, and net.parameters() / net(images) below would then fail.
    net = net.cuda()
# Cross-entropy loss for multi-class classification. It applies log-softmax internally,
# so the model returns raw scores and no explicit Softmax layer is needed.
criterion = nn.CrossEntropyLoss()
# Adam optimizer over all parameters of the network
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

  nn.init.kaiming_normal(self.fc1.weight)
  nn.init.kaiming_normal(self.fc2.weight)


### Training the network

In [18]:
for epoch in range(epochs):
    correct_train = 0                                      # correctly classified training samples this epoch
    running_loss = 0                                       # sum of the per-batch losses this epoch
    for i , (images, labels) in enumerate(train_loader):
        # Flatten each batch from (batch, 1, 28, 28) -- 1 is the number of channels
        # (grayscale) -- to (batch, 784) so it matches the first linear layer.
        images = images.view(-1, 28*28)

        if CUDA:
            images = images.cuda()
            labels = labels.cuda()

        outputs = net(images)
        # Predicted class = index of the largest score. detach() keeps this bookkeeping
        # out of the autograd graph; .item() accumulates a plain Python number, matching
        # how the evaluation cell counts correct predictions.
        _, predicted = torch.max(outputs.detach(), 1)
        correct_train += (predicted == labels).sum().item()
        loss = criterion(outputs, labels)                  # Difference between actual and predicted is the loss
        running_loss += loss.item()

        optimizer.zero_grad()                              # Flush existing gradients to avoid accumulation
        loss.backward()                                    # Compute gradients
        optimizer.step()                                   # Update weights

    # Report the mean batch loss and the accuracy over the whole training set for this epoch
    print('Epoch [{}/{}], Training Loss: {:.3f}, Training Accuracy: {:.3f}%'.format(
        epoch+1, epochs, running_loss/len(train_loader), 100.0*correct_train/len(training_set)))
print(">>>>>NETWORK FINISHED TRAINING!<<<<<<<<<")
        

Epoch [1/10], Training Loss: 0.086, Training Accuracy: 97.352%
Epoch [2/10], Training Loss: 0.055, Training Accuracy: 98.243%
Epoch [3/10], Training Loss: 0.040, Training Accuracy: 98.670%
Epoch [4/10], Training Loss: 0.028, Training Accuracy: 99.082%
Epoch [5/10], Training Loss: 0.024, Training Accuracy: 99.228%
Epoch [6/10], Training Loss: 0.020, Training Accuracy: 99.320%
Epoch [7/10], Training Loss: 0.015, Training Accuracy: 99.478%
Epoch [8/10], Training Loss: 0.016, Training Accuracy: 99.470%
Epoch [9/10], Training Loss: 0.014, Training Accuracy: 99.538%
Epoch [10/10], Training Loss: 0.012, Training Accuracy: 99.602%
>>>>>NETWORK FINISHED TRAINING!<<<<<<<<<


In [21]:
# Measure accuracy on the test set; no_grad() disables gradient tracking for the forward passes
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        # Flatten to (batch, 784) first, then move the batch to the GPU if one is in use
        images = images.view(-1, 28*28)
        if CUDA:
            images, labels = images.cuda(), labels.cuda()
        outputs = net(images)
        # Element [1] of torch.max(..., 1) holds the index of the highest score per sample
        predicted = torch.max(outputs.data, 1)[1]
        matches = (predicted == labels).sum().item()
        correct += matches

    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / len(test_set)))

Accuracy of the network on the 10000 test images: 98.07 %


###                                                                           THE END