# Homework 2

In [1]:
%matplotlib inline

import torch
import torchvision
from torch import nn

import numpy 
import matplotlib.pyplot as plt
import time

In [2]:
#get the data

training_data = torchvision.datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

test_data = torchvision.datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor()
)


In [3]:
#functions

def train_one_epoch(dataloader, model, loss_fn, optimizer):
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # forward pass
        pred = model(X)
        loss = loss_fn(pred, y) # X is input (images), y is label (0-9)
        
        # backward pass calculates gradients
        loss.backward()
        
        # take one step with these gradients
        optimizer.step()
        
        # resets the gradients 
        optimizer.zero_grad()

def evaluate(dataloader, model, loss_fn):
    # Set the model to evaluation mode - some NN pieces behave differently during training
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss, correct = 0, 0

    # We can save computation and memory by not calculating gradients here - we aren't optimizing 
    with torch.no_grad():
        # loop over all of the batches
        for X, y in dataloader:
            pred = model(X)
            loss += loss_fn(pred, y).item()
            # how many are correct in this batch? Tracking for accuracy 
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    loss /= num_batches
    correct /= size
    
    accuracy = 100*correct
    return accuracy, loss

def show_failures(model, dataloader, maxtoshow=10):
    model.eval()
    batch = next(iter(dataloader))
    predictions = model(batch[0])
    
    rounded = predictions.argmax(1) #dimensions=1
    errors = rounded!=batch[1] #X, y so y = label
    print('Showing max', maxtoshow, 'first failures. '
          'The predicted class is shown first and the correct class in parentheses.')
    ii = 0
    plt.figure(figsize=(maxtoshow, 1))
    for i in range(batch[0].shape[0]):
        if ii>=maxtoshow:
            break
        if errors[i]:
            plt.subplot(1, maxtoshow, ii+1)
            plt.axis('off')
            plt.imshow(batch[0][i,0,:,:], cmap="gray")
            plt.title("%d (%d)" % (rounded[i], batch[1][i]))
            ii = ii + 1

class NonlinearClassifier(nn.Module):

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers_stack = nn.Sequential(
            nn.Linear(28*28, 50),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(50, 50),
            nn.ReLU(),
           # nn.Dropout(0.2),
            nn.Linear(50, 50),
            nn.ReLU(),
           # nn.Dropout(0.2),
            nn.Linear(50, 10)
        )


        
    def forward(self, x):
        x = self.flatten(x)
        x = self.layers_stack(x)

        return x

class LinearClassifier(nn.Module):

    def __init__(self):
        super().__init__()
        
        # First, we need to convert the input image to a vector by using 
        # nn.Flatten(). For MNIST, it means the second dimension 28*28 becomes 784.
        self.flatten = nn.Flatten()
        
        # Here, we add a fully connected ("dense") layer that has 28 x 28 = 784 input nodes 
        #(one for each pixel in the input image) and 10 output nodes (for probabilities of each class).
        self.layer_1 = nn.Linear(28*28, 10)
        
    def forward(self, x):

        x = self.flatten(x)
        x = self.layer_1(x)

        return x

## How does *batch size* effect the quality of the model?

In [4]:
%%time
batch_sizes = [32, 64, 128, 256, 512]
    # batch_size = 128
for batch_size in batch_sizes: 

    train_size = int(0.8 * len(training_data))  # 80% for training
    val_size = len(training_data) - train_size  # Remaining 20% for validation
    training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))
    

    print(f'*************** Batch size = {batch_size} ********************') 
    nonlinear_model = NonlinearClassifier()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)
    
    # The dataloader makes our dataset iterable 
    train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
    val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)
    
    
    
    epochs = 5
    train_acc_all = []
    val_acc_all = []
    for j in range(epochs):
        train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
        
        # checking on the training loss and accuracy once per epoch
        acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
        train_acc_all.append(acc)
        print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
        
        # checking on the validation loss and accuracy once per epoch
        val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
        val_acc_all.append(val_acc)
        print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

    #finally, evaluate how it performs against the test data: 
    batch_size_test = 256
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
    acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
    print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

*************** Batch size = 32 ********************
Epoch 0: training loss: 0.3008524134270847, accuracy: 90.67708333333333
Epoch 0: val. loss: 0.3002712804178397, val. accuracy: 90.61666666666667
Epoch 1: training loss: 0.16908632882746558, accuracy: 94.91875
Epoch 1: val. loss: 0.17410429557661217, val. accuracy: 94.69999999999999
Epoch 2: training loss: 0.13618672937620432, accuracy: 95.93958333333333
Epoch 2: val. loss: 0.14839355806509655, val. accuracy: 95.5
Epoch 3: training loss: 0.11744997482653707, accuracy: 96.4125
Epoch 3: val. loss: 0.13087551322206856, val. accuracy: 95.89166666666667
Epoch 4: training loss: 0.0946422436911768, accuracy: 97.1875
Epoch 4: val. loss: 0.11517379763908685, val. accuracy: 96.575
Test loss: 0.1177, test accuracy: 96.43%
*************** Batch size = 64 ********************
Epoch 0: training loss: 0.4371504230300585, accuracy: 87.20052083333333
Epoch 0: val. loss: 0.4506137156486511, val. accuracy: 86.92708333333333
Epoch 1: training loss: 0.283

## Answer

It looks like the smaller the batch size, the higher the final test accuracy. As batch size increased, test accuracy decreased. However, the smaller the batch size, the more computationally intensive it is to train the model. So opting for a small-to-medium batch size (i.e. 64, 128) can be a good "sweet" spot for acheiving high accuracy but also maintaining manageable wall-time. 

## How does *learning rates* effect the quality of the model?

In [5]:
%%time
learning_rates = [0.01, 0.05, 0.1, 0.2]
    # batch_size = 128
for lr_ in learning_rates: 
    
    train_size = int(0.8 * len(training_data))  # 80% for training
    val_size = len(training_data) - train_size  # Remaining 20% for validation
    # print(f'train_size: {train_size}')
    # print(f'val_size: {val_size}')
    training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))
    
    batch_size = 128 #keep constant
    print(f'*************** Learning Rate = {lr_} ********************') 
    nonlinear_model = NonlinearClassifier()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=lr_)
    
    # The dataloader makes our dataset iterable 
    train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
    val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)
    
    
    
    epochs = 5
    train_acc_all = []
    val_acc_all = []
    for j in range(epochs):
        train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
        
        # checking on the training loss and accuracy once per epoch
        acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
        train_acc_all.append(acc)
        print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
        
        # checking on the validation loss and accuracy once per epoch
        val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
        val_acc_all.append(val_acc)
        print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

    #finally, evaluate how it performs against the test data: 
    batch_size_test = 256
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
    acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
    print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

*************** Learning Rate = 0.01 ********************
Epoch 0: training loss: 2.2963698045994207, accuracy: 13.19938962360122
Epoch 0: val. loss: 2.2967900691493863, val. accuracy: 12.487283825025433
Epoch 1: training loss: 2.285515523538357, accuracy: 15.895218718209565
Epoch 1: val. loss: 2.2859163822666293, val. accuracy: 15.106815869786368
Epoch 2: training loss: 2.2712665146928495, accuracy: 25.406917599186162
Epoch 2: val. loss: 2.2716645348456597, val. accuracy: 24.974567650050865
Epoch 3: training loss: 2.2512357913381686, accuracy: 23.963631739572737
Epoch 3: val. loss: 2.251570301671182, val. accuracy: 23.957273652085455
Epoch 4: training loss: 2.219492443208772, accuracy: 28.560528992878943
Epoch 4: val. loss: 2.2196949297381985, val. accuracy: 28.713123092573756
Test loss: 2.2184, test accuracy: 28.77%
*************** Learning Rate = 0.05 ********************
Epoch 0: training loss: 2.276081251375603, accuracy: 10.610395803528851
Epoch 0: val. loss: 2.2755908870697024, 

## Answer

In this case, a larger learning rate increased test accuracy by the final epoch. Not sure if there is an error in my code, since each time my model starts its first epoch, it has an accuracy that's better than the first epoch of the last learning rate test. 

## How does *activation function* effect the quality of the model?

In [6]:
%%time

########### Nonlinear Model

train_size = int(0.8 * len(training_data))  # 80% for training
val_size = len(training_data) - train_size  # Remaining 20% for validation
# print(f'train_size: {train_size}')
# print(f'val_size: {val_size}')
training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))

batch_size = 128 #keep constant
print(f'*************** Nonlinear Model ********************') 
nonlinear_model = NonlinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)

# The dataloader makes our dataset iterable 
train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)



epochs = 5
train_acc_all = []
val_acc_all = []
for j in range(epochs):
    train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
    
    # checking on the training loss and accuracy once per epoch
    acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
    train_acc_all.append(acc)
    print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
    
    # checking on the validation loss and accuracy once per epoch
    val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
    val_acc_all.append(val_acc)
    print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

#finally, evaluate how it performs against the test data: 
batch_size_test = 256
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

############ Linear Model


train_size = int(0.8 * len(training_data))  # 80% for training
val_size = len(training_data) - train_size  # Remaining 20% for validation
# print(f'train_size: {train_size}')
# print(f'val_size: {val_size}')
training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))

batch_size = 128 #keep constant
print(f'*************** Linear Model ********************') 
linear_model = LinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)

# The dataloader makes our dataset iterable 
train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)

epochs = 5
train_acc_all = []
val_acc_all = []
for j in range(epochs):
    train_one_epoch(train_dataloader, linear_model, loss_fn, optimizer)
    
    # checking on the training loss and accuracy once per epoch
    acc, loss = evaluate(train_dataloader, linear_model, loss_fn)
    train_acc_all.append(acc)
    print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
    
    # checking on the validation loss and accuracy once per epoch
    val_acc, val_loss = evaluate(val_dataloader, linear_model, loss_fn)
    val_acc_all.append(val_acc)
    print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")
    
#finally, evaluate how it performs against the test data: 
batch_size_test = 256
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
acc_test, loss_test = evaluate(test_dataloader, linear_model, loss_fn)
print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))




*************** Nonlinear Model ********************
Epoch 0: training loss: 2.2846088876911237, accuracy: 10.852352119236143
Epoch 0: val. loss: 2.288375597733718, val. accuracy: 10.800744878957168
Epoch 1: training loss: 2.2394960300595153, accuracy: 31.936034777208512
Epoch 1: val. loss: 2.2445620206686168, val. accuracy: 30.54003724394786
Epoch 2: training loss: 2.085206672257068, accuracy: 34.29591678310822
Epoch 2: val. loss: 2.0966731218191295, val. accuracy: 33.14711359404097
Epoch 3: training loss: 1.713658886797288, accuracy: 46.685297314081666
Epoch 3: val. loss: 1.7454931735992432, val. accuracy: 43.637492240844196
Epoch 4: training loss: 1.209678187089808, accuracy: 65.11411271541687
Epoch 4: val. loss: 1.263515564111563, val. accuracy: 62.321539416511484
Test loss: 1.2112, test accuracy: 64.08%
*************** Linear Model ********************
Epoch 0: training loss: 2.308105433859476, accuracy: 9.452639751552795
Epoch 0: val. loss: 2.301988861777566, val. accuracy: 11.01

## Answer

A purely linear activation function produces a model with low accuracy. Creating a neural network that has layers of both linear and nonlinear activation functions increases accuracy. 

# Bonus: What is a learning rate scheduler?

## Answer

A learning rate scheduler is a scheduled framework that changes the learning rate between epochs or iterations during training. It is often used to describe a framework that gradually decays the learning rate during training. 

resource: https://neptune.ai/blog/how-to-choose-a-learning-rate-scheduler