This notebook illustrates how to train a fully connected network with standard backpropagation (BP). The model is trained on the MNIST training set and evaluated on the MNIST test set after every epoch.

#### Import libraries

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from torch.autograd import Variable

import copy

import matplotlib.pyplot as plt
import numpy as np
import psutil
import os
import multiprocessing as mp
import time

#### Define Network architecture

In [30]:
# models with Dropout
class NetFC1x1024DOcust(nn.Module):
    """Fully connected MNIST classifier: 784 inputs -> 128 ReLU units -> 10 class scores.

    Despite the "1x1024" in the name, the hidden layer built here is 128 units wide.
    Dropout is not sampled inside the module; the caller may pass a pre-computed
    mask so that the very same mask can be reused across several forward passes.
    """

    def __init__(self):
        super().__init__()
        # MNIST images are 28x28 with a single channel.
        self.fc1 = nn.Linear(28**2, 128, bias=True)
        self.fc2 = nn.Linear(128, 10, bias=True)

        # He-uniform initialisation of the weights: U(-limit, +limit) with
        # limit = sqrt(6 / fan_in). Biases keep the nn.Linear default init.
        # fc1 is initialised before fc2 so the RNG is consumed in a fixed order.
        for layer in (self.fc1, self.fc2):
            limit = float(np.sqrt(6.0 / layer.in_features))
            torch.nn.init.uniform_(layer.weight, a=-limit, b=limit)

    def forward(self, x, do_masks=None):
        """Compute the class scores (raw logits) for a batch of inputs.

        Args:
            x: batch of inputs; everything after the first (batch) dimension is
                flattened and must contain 28*28 values per sample.
            do_masks: optional sequence whose first element is multiplied
                element-wise with the hidden activations (custom dropout mask).
                ``None`` disables masking.

        Returns:
            Tensor of shape (batch, 10) holding unnormalised logits.

        The logits are intentionally NOT passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax internally, so a softmax here
        would be applied twice during training, squashing the loss and the
        gradients. Use ``F.softmax(out, dim=1)`` on the result when probabilities
        are needed; ``argmax`` predictions are identical either way.
        """
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        # Custom dropout: the mask is supplied by the caller instead of being sampled here.
        if do_masks is not None:
            x = x * do_masks[0]
        return self.fc2(x)

In [34]:
def _evaluate_accuracy(net, loader):
    """Return the classification accuracy of `net` on `loader`, in percent."""
    net.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, labels in loader:
            # The class with the highest score is the prediction.
            predicted = net(images, do_masks=None).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


def _plot_training_curves(train_losses, train_accs, test_accs):
    """Plot per-epoch training loss, test accuracy and train accuracy."""
    if not test_accs:
        # Nothing was trained (zero epochs); min()/max() below would fail.
        return

    # Figure 1: training loss (left) and test accuracy (right).
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss', color="green")
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(test_accs, label='Test Accuracy', color="green")
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.yticks(np.arange(min(test_accs) - 1, max(test_accs) + 2, 1))
    plt.legend()
    plt.show()

    # Figure 2: train vs. test accuracy.
    plt.figure(figsize=(5, 5))
    plt.plot(test_accs, label='Test Accuracy', color="green")
    plt.plot(train_accs, label='Train Accuracy', color="red")
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.show()

    # Figure 3: train accuracy only.
    plt.figure(figsize=(5, 5))
    plt.plot(train_accs, label='Train Accuracy', color="red")
    plt.xlabel('Epoch')
    plt.ylabel('Train accuracy (%)')
    plt.legend()
    plt.show()


def train_model(train_epochs=100, learning_rate=0.1, batch_size=64,
                optim_choice='mom', lr_milestones=(30, 60, 90), lr_decay=0.1,
                seed=None):
    """Train NetFC1x1024DOcust on MNIST with backpropagation and plot the results.

    MNIST is downloaded to ``./data`` if it is not already there. After every
    epoch the average training loss, the training accuracy and the test accuracy
    are printed; once training has finished the curves are plotted.

    Args:
        train_epochs: number of passes over the training set.
        learning_rate: initial learning rate for 'SGD' and 'mom'
            ('adam' uses a fixed initial rate of 0.001).
        batch_size: mini-batch size for both the train and the test loader.
        optim_choice: 'SGD' (plain SGD), 'mom' (SGD with momentum 0.9) or 'adam'.
        lr_milestones: epochs at whose start the learning rate is multiplied
            by `lr_decay`.
        lr_decay: multiplicative learning-rate decay factor.
        seed: if not None, seeds torch's RNG (weight init and data shuffling)
            for reproducible runs.

    Raises:
        ValueError: if `optim_choice` is not one of the supported options.
    """
    print('in train_model')
    print('Learning rate:', learning_rate)
    print('Batch size:', batch_size)

    # Fail early instead of hitting an undefined `optimizer` later on.
    if optim_choice not in ('SGD', 'mom', 'adam'):
        raise ValueError(
            "optim_choice must be 'SGD', 'mom' or 'adam', got {!r}".format(optim_choice))

    if seed is not None:
        torch.manual_seed(seed)

    # nn.CrossEntropyLoss combines log-softmax and negative log-likelihood,
    # i.e. it expects the network to output raw (unnormalised) class scores.
    criterion = nn.CrossEntropyLoss()

    # initialize the network
    net = NetFC1x1024DOcust()

    # load the dataset
    transform = transforms.Compose([transforms.ToTensor()])  # scales pixels to [0, 1]
    trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                          download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=2)
    testset = torchvision.datasets.MNIST(root='./data', train=False,
                                         download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                             shuffle=False, num_workers=2)

    if optim_choice == 'SGD':
        optimizer = optim.SGD(net.parameters(), lr=learning_rate)
    elif optim_choice == 'mom':
        optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
    else:  # 'adam'
        optimizer = optim.Adam(net.parameters(), lr=0.001)

    # Per-epoch history used for the plots.
    test_accs = []
    train_losses = []
    train_accs = []

    for epoch in range(train_epochs):  # loop over the dataset multiple times
        net.train()

        # Learning-rate decay. The new rate has to be written into the
        # optimizer's parameter groups; rescaling a local variable would leave
        # the optimizer running at the initial rate forever.
        if epoch in lr_milestones:
            for group in optimizer.param_groups:
                group['lr'] *= lr_decay

        running_loss = 0.0   # sum of the batch losses in this epoch
        batch_count = 0
        total_train = 0
        correct_train = 0
        for inputs, target in trainloader:
            optimizer.zero_grad()

            outputs = net(inputs, do_masks=None)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            # Statistics: reuse the loss that was just back-propagated.
            running_loss += loss.item()
            # The class with the highest score is the prediction.
            predicted = outputs.detach().argmax(dim=1)
            total_train += target.size(0)
            correct_train += (predicted == target).sum().item()
            batch_count += 1

        curr_loss = running_loss / batch_count  # average loss per batch in this epoch
        print('[%d, %5d] loss: %.3f' %
              (epoch, batch_count, curr_loss))
        train_losses.append(curr_loss)
        train_acc = 100 * correct_train / total_train
        print('Train accuracy epoch {}: {} %'.format(epoch, train_acc))
        train_accs.append(train_acc)

        print('Testing...')
        test_acc = _evaluate_accuracy(net, testloader)
        print('Test accuracy epoch {}: {} %'.format(epoch, test_acc))
        test_accs.append(test_acc)

    print('Finished Training')

    _plot_training_curves(train_losses, train_accs, test_accs)

In [None]:
# Entry point: run the full training/evaluation experiment only when this code
# is executed as the main module.
# NOTE(review): the guard presumably also keeps the DataLoader worker processes
# (num_workers=2 inside train_model) from re-triggering training if the main
# module is re-imported by a 'spawn'-style multiprocessing start method — confirm.
if __name__ == '__main__':
    train_model()