In [None]:
import torch
from torchvision import transforms
from torchvision.datasets import MNIST 
from torch.autograd import gradcheck
import numpy as np
import matplotlib.pyplot as plt

# **Classification**
In this part we will explore how different activation units perform when training on a subset of the MNIST dataset.

Although we will be using PyTorch to train the model,
you don't have to know PyTorch's function definitions and classes.

After doing this coding assignment, you will see how we can explore different activation functions when training a fully-connected neural network.
You will also see how we can visualize and compare different activation units by their learning rates and prediction accuracies.

First, we load the dataset and turn it into numpy arrays

In [None]:
# Fetch (downloading if necessary) the MNIST train and test splits into the "Data" directory.
mnist_train_set = MNIST("Data", train=True, download=True)
mnist_test_set = MNIST("Data", train=False, download=True)

# Training split: copy images and labels into numpy arrays, then flatten every
# image into one row so the image array is (number of samples, features).
mnist_trainX = np.array(mnist_train_set.data.numpy())
mnist_trainX = mnist_trainX.reshape((len(mnist_trainX), -1))
mnist_trainY = np.array(mnist_train_set.targets.numpy())

# Test split: identical conversion.
mnist_testX = np.array(mnist_test_set.data.numpy())
mnist_testX = mnist_testX.reshape((len(mnist_testX), -1))
mnist_testY = np.array(mnist_test_set.targets.numpy())


### (a) Visualization

Visualize a few MNIST samples together with their labels.

In [None]:
#############################################################################################################
# TODO: randomly select 3 indices for training data and 3 indices for test data,
#       and use matplotlib "imshow" function to plot the data as a figure.
#       The training images are in the numpy array "mnist_trainX", the training labels are in the numpy array "mnist_trainY"
#       The test     images are in the numpy array "mnist_testX" , the test     labels are in the numpy array "mnist_testY"
# Hint: we have flattened the images, so you may have to convert them back to plot the images.
#############################################################################################################

############################################################### TODO A

############################################################### TODO A

In the cell below, split off the validation set using a fraction of 0.2.

### (b) Split training set and the validation set

Split training set and the validation set.

First, you should get 20\% of the "mnist_trainX" as our data, because we just want to use a subset of the original training set.
Then, split 80\% of the data to be our training data, and the remaining 20\% as our validation data.

In [None]:
#############################################################################################################
# Please fill in the cell to implement X_train, Y_train, X_valid, Y_valid as numpy arrays
# Also provide the number of data in n_train and n_valid
# The raw data is loaded in the numpy arrays "mnist_trainX" and "mnist_trainY"
#     Definition of the required variables:
#          X_train: the training images
#          Y_train: the training labels
#          X_valid: the validation images
#          Y_valid: the validation labels
#          n_train: the number of training data
#          n_valid: the number of validation data
#############################################################################################################

# the split fractions
# NOTE(review): train_frac is presumably the share of the raw training set kept as the
#               working subset (the instructions ask for 20%) - confirm
train_frac = 0.2
# share of the working subset that is held out as validation data
valid_frac = 0.2

############################################# TODO B
# Placeholders: replace the zeros / empty arrays below with the real split.
n_train = 0
n_valid = 0

X_train = np.array([])
Y_train = np.array([])

X_valid = np.array([])
Y_valid = np.array([])
#############################################
# renaming the test data
X_test = mnist_testX
Y_test = mnist_testY
n_test = mnist_testX.shape[0]
# Turn the numpy arrays into pytorch tensors:
# images go through torch.Tensor(...), labels through torch.from_numpy(...)
X_train = torch.Tensor(X_train)
Y_train = torch.from_numpy(Y_train)

X_valid = torch.Tensor(X_valid)
Y_valid = torch.from_numpy(Y_valid)

X_test = torch.Tensor(X_test)
Y_test = torch.from_numpy(Y_test)

# Report the size of each split
print("Number of training data: {}".format(n_train))
print("Number of validation data: {}".format(n_valid))
print("Number of test data: {}".format(n_test))

### **(c) Implementing activation functions: Mish and Swish**
Pytorch provides many activation units under torch.nn.Module.

We can just use torch.nn.{Module Name} to instantiate the module. [Here is the reference of the official documentation on the types of activation functions and how to use them](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity)

Moreover, we can also define new activation units.

In this problem, we will implement Mish and Swish activation function

The original paper for Mish : https://arxiv.org/abs/1908.08681

The original paper for Swish: https://arxiv.org/abs/1710.05941

The Mish function is defined as:

$f_{Mish}(x) = x(tanh(ln(1+e^x)))$

And the Swish function is defined as: 

$f_{Swish}(x) = x \cdot sigmoid(\beta x) = \frac{x}{1 + e^{-\beta x}}$, where $\beta$ is a learnable weight

Implement the two functions by completing their forward and backward (gradient) methods.
You can use torch.autograd.gradcheck to check whether your gradients are computed correctly. gradcheck compares the analytical gradients to numerical gradients: the analytical gradients are obtained from the backward methods, and the numerical gradients are obtained by applying small perturbations to the input.

If your implementation is correct, the gradcheck function will return True.
For more implementation details, see the official PyTorch note: https://pytorch.org/docs/stable/notes/extending.html

In [None]:
# Worked example: implementing a custom autograd function in pytorch.
#   - forward() computes the output of the function.
#   - backward() computes the gradient w.r.t. the input (and w.r.t. other
#     parameters such as weights, when the function has any).
# For an activation function the backward pass is a direct use of the chain rule:
# the gradient w.r.t. the input equals the gradient w.r.t. the output multiplied
# by the derivative of the output w.r.t. the input.
# ctx.save_for_backward(...) stores the input and/or temporary results in forward();
# ctx.saved_tensors returns them again inside backward().
class SigmoidFunction(torch.autograd.Function):
    """Sigmoid with a hand-written backward pass: f(x) = 1 / (1 + exp(-x))."""

    @staticmethod
    def forward(ctx, x):
        """Return sigmoid(x) and keep the result for the backward pass."""
        denominator = 1 + torch.exp(-x)
        sig = 1.0 / denominator
        # The derivative only needs the output, so the input itself is not saved.
        ctx.save_for_backward(sig)
        return sig

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. x given the gradient w.r.t. the output."""
        (sig,) = ctx.saved_tensors
        # d sigmoid / dx = sigmoid * (1 - sigmoid)
        local_grad = sig * (1 - sig)
        return local_grad * grad_output

class MishFunction(torch.autograd.Function):
    """Skeleton of the Mish autograd function, f(x) = x * tanh(ln(1 + e^x)).

    Both forward() and backward() contain placeholders that must be replaced.
    """

    @staticmethod
    def forward(ctx, x):
        ##########################################################################################
        # TODO: implement the forward function for mish
        #       You should compute the intermediate result of the tanh function and the output:
        #       Input:
        #           x: the input variable to the function
        #       Variables to implement:
        #           tanh_ : compute tanh( ln (1+e^x) )
        #           output: the output of the mish function, using the intermediate result tanh_
        # Hint: use functions provided by pytorch: torch.tanh, torch.log, and torch.exp
        #       The intermediate result is reused to reduce the computation in the backward function
        ##########################################################################################
        ################################################### TODO C_1
        tanh_ = 0
        output = 0 
        ###################################################
        # keep the input and the intermediate tanh result for backward()
        ctx.save_for_backward(x, tanh_)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # values stored by forward(): the original input and tanh( ln (1+e^x) )
        x, tanh_ = ctx.saved_tensors
        ##########################################################################################
        # TODO: implement the backward function for mish
        #       The original input x and the intermediate result tanh_ are available.
        #       You should compute the gradient w.r.t. the input of mish:
        #       Input:
        #           grad_output : the gradient of the loss function w.r.t. the "output" of the mish function
        #       Variable to implement:
        #           grad_input  : the gradient of the loss function w.r.t. the "input" of the mish function
        # Hint: use functions provided by pytorch: torch.exp, and use the intermediate result tanh_
        ##########################################################################################
        ################################################### TODO C_2
        grad_input = 0
        ###################################################
        return grad_input    
    
class SwishFunction(torch.autograd.Function):
    """Skeleton of the Swish autograd function, f(x) = x * sigmoid(beta * x).

    beta is a learnable parameter, so backward() returns two gradients
    (one for x, one for beta). Both methods contain placeholders to replace.
    """

    @staticmethod
    def forward(ctx, x, beta):
        ##########################################################################################
        # TODO: implement the forward function for swish
        #       You should compute the intermediate result of the swish function and the output:
        #       Input:
        #           x: the input variable to the function
        #           beta: the learnable parameter of the swish
        #       Variables to implement:
        #           beta_sigmoid : compute sigmoid(beta*x)
        #           output: the output of the swish function, using the intermediate result beta_sigmoid
        # Hint: use functions provided by pytorch: torch.exp or torch.sigmoid
        #       The intermediate result is reused to reduce the computation in the backward function
        ##########################################################################################
        ################################################### TODO C_3
        beta_sigmoid = 0
        output = 0
        ################################################### 
        # keep the input, the parameter and the intermediate sigmoid result for backward()
        ctx.save_for_backward(x, beta, beta_sigmoid)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # The beta in the swish function is learnable. Therefore you also have to compute the gradient w.r.t. beta
        x, beta, beta_sigmoid = ctx.saved_tensors
        ##########################################################################################
        # TODO: implement the backward function for swish
        #       The original input x, learnable parameter beta and the intermediate result beta_sigmoid are available.
        #       You should compute the gradient w.r.t. the input and the learnable parameter of the swish:
        #       Input:
        #           grad_output : the gradient of the loss function w.r.t. the "output" of the swish function
        #       Variables to implement:
        #           grad_input  : the gradient of the loss function w.r.t. the "input" of the swish function
        #           grad_beta   : the gradient of the loss function w.r.t. the parameter beta
        ##########################################################################################
        ################################################### TODO C_4
        grad_input = 0
        grad_beta  = 0
        ################################################### TODO
        # one gradient per forward() argument, in the same order (x, beta)
        return grad_input, grad_beta
    
# Function-style entry points: calling these runs the corresponding
# torch.autograd.Function through its `apply` method.
sigmoid = SigmoidFunction.apply
mish = MishFunction.apply
swish = SwishFunction.apply


class Sigmoid(torch.nn.Module):
    """Module wrapper around the custom `sigmoid` autograd function.

    Has no parameters; it exists so the activation can be placed inside
    containers such as torch.nn.Sequential.
    """

    def __init__(self):
        # Zero-argument super(): the previous call named a non-existent class
        # (`MySigmoid`), which raised NameError as soon as the module was built.
        super().__init__()

    def forward(self, input):
        """Apply the custom sigmoid element-wise to `input`."""
        return sigmoid(input)

class Mish(torch.nn.Module):
    """Parameter-free module that forwards its input through the `mish` function."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return mish(input)."""
        activated = mish(input)
        return activated
    
class Swish(torch.nn.Module):
    """Module form of the `swish` function with its own learnable beta."""

    def __init__(self):
        super().__init__()
        # beta is a single-element parameter drawn uniformly from [-0.1, 0.1)
        beta_init = torch.Tensor(1)
        beta_init.uniform_(-0.1, 0.1)
        self.beta = torch.nn.Parameter(beta_init)

    def forward(self, input):
        """Return swish(input, beta) using this module's beta."""
        return swish(input, self.beta)

# Compare each hand-written backward() against numerical (finite-difference) gradients.
# The inputs are double precision and require gradients, as gradcheck expects.
# The variable is deliberately not called `input`, so the builtin input() is not
# shadowed for the rest of the notebook session.
grad_inputs = torch.randn(50, 50, dtype=torch.double, requires_grad=True)
test_result = gradcheck(sigmoid, grad_inputs, eps=1e-6, atol=1e-4)
print("Gradient check for sigmoid function: ", test_result)

grad_inputs = torch.randn(50, 50, dtype=torch.double, requires_grad=True)
test_result = gradcheck(mish, grad_inputs, eps=1e-6, atol=1e-4)
print("Gradient check for mish function: ", test_result)

# swish takes two arguments. Note that beta here has the same shape as x,
# whereas the Swish module holds a single-element beta.
grad_inputs = (
    torch.randn(50, 50, dtype=torch.double, requires_grad=True),  # x
    torch.randn(50, 50, dtype=torch.double, requires_grad=True),  # beta
)
test_result = gradcheck(swish, grad_inputs, eps=1e-6, atol=1e-4)
print("Gradient check for swish function: ", test_result)

### (d) Training neural networks using different activation function



In [None]:
# Input width of the first linear layer: number of columns of the flattened image array
D_in = mnist_trainX.shape[1]
# Output width of the last linear layer built in training()
D_out = 10

def training(activation_module, D_hidden_in, D_hidden_out, x, y, learning_rate, epochs):
    """Train a three-layer fully-connected classifier with full-batch SGD.

    The network is Linear(D_in, D_hidden_in) -> activation ->
    Linear(D_hidden_in, D_hidden_out) -> activation -> Linear(D_hidden_out, D_out),
    where D_in and D_out are module-level globals. Each epoch performs one
    optimizer step on the whole of (x, y). On every epoch index i with
    i % 10 == 9 the model is evaluated (before that epoch's update) on the
    module-level X_valid / Y_valid and on (x, y) through the module-level
    `test` function, and the results are printed and recorded.

    Args:
        activation_module: callable returning a fresh activation module; it is
            called twice, once per hidden layer.
        D_hidden_in: width of the first hidden layer.
        D_hidden_out: width of the second hidden layer.
        x: training inputs passed to the model.
        y: training targets passed to CrossEntropyLoss.
        learning_rate: learning rate of the SGD optimizer.
        epochs: number of optimizer steps.

    Returns:
        (model, train losses, validation losses, train accuracies,
        validation accuracies); the four histories are lists with one entry
        per evaluation.
    """
    # Built in network order so the layers are initialized in the same sequence.
    layers = [
        torch.nn.Linear(D_in, D_hidden_in),
        activation_module(),
        torch.nn.Linear(D_hidden_in, D_hidden_out),
        activation_module(),
        torch.nn.Linear(D_hidden_out, D_out),
    ]
    model = torch.nn.Sequential(*layers)

    train_losses, valid_losses = [], []
    train_accs, valid_accs = [], []

    criterion = torch.nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # Periodic evaluation: no gradients needed, model in eval mode.
        if epoch % 10 == 9:
            with torch.no_grad():
                model.eval()
                valid_loss, valid_acc = test(model, X_valid, Y_valid)
                valid_losses.append(valid_loss)
                valid_accs.append(valid_acc)
                train_loss, train_acc = test(model, x, y)
                train_losses.append(train_loss)
                train_accs.append(train_acc)
                print(epoch, "loss = ", train_loss, "acc = ", train_acc, "valid: ", (valid_loss, valid_acc))

        # One full-batch gradient step.
        model.train()
        batch_loss = criterion(model(x), y)
        sgd.zero_grad()
        batch_loss.backward()
        sgd.step()

    print("Training complete")
    return model, train_losses, valid_losses, train_accs, valid_accs

def test(model, X_test, Y_test):
    
    y_pred = model(X_test)
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(y_pred, Y_test)
    _, label = torch.max(y_pred, 1)
    return loss.item(), (torch.count_nonzero(label == Y_test)/ Y_test.shape[0]).item()


Using the above training function, train the classifier with different activation functions: ReLU, Sigmoid, LeakyReLU, Tanh, Mish, and Swish.

Record the returned losses and accuracies in the arrays losses and accs.

First, use the same learning rate 0.005 for all activation functions.

In [None]:
epochs = 1000
activation_units = [torch.nn.ReLU, torch.nn.Sigmoid, torch.nn.LeakyReLU, torch.nn.Tanh, Mish, Swish]
activation_strs = ["ReLU", "Sigmoid", "LeakyReLU", "Tanh",  "Mish", "Swish"]
# One learning rate per activation unit (all identical in this part).
learning_rate = [5e-3, 5e-3, 5e-3, 5e-3, 5e-3, 5e-3]
models = []
# training() records metrics once every 10 epochs, hence epochs // 10 entries.
# Axis 1 of both arrays: 0 = training set, 1 = validation set.
losses = np.zeros((len(activation_units), 2, epochs // 10))
accs = np.zeros((len(activation_units), 2, epochs // 10))


for i, act in enumerate(activation_units):
    print("Training with {} activation units".format(activation_strs[i]))
    # Call the training function to get the model, loss and accuracy
    model, loss_train_his, loss_valid_his, acc_train_his, acc_valid_his = training(
        act, 200, 64, X_train, Y_train, learning_rate[i], epochs)
    models.append(model)
    losses[i, 0, :] = np.array(loss_train_his)
    losses[i, 1, :] = np.array(loss_valid_his)
    accs[i, 0, :] = np.array(acc_train_his)
    accs[i, 1, :] = np.array(acc_valid_his)

Visualize the change of loss and accuracy versus epochs.

You should have four plots (1) training loss (2) validation loss (3) training accuracy (4) validation accuracy

Observe the loss and accuracy.

**_Your Observation:_**


In [None]:
def _plot_metric_by_activation(epoch_axis, history, ylabel, log_scale=False):
    """Draw one figure with one curve per activation unit.

    Args:
        epoch_axis: x values shared by all curves.
        history: sequence of curves, one per entry of `activation_strs`.
        ylabel: label of the y axis.
        log_scale: use a logarithmic y axis when True.
    """
    fig, ax = plt.subplots()
    for name, curve in zip(activation_strs, history):
        ax.plot(epoch_axis, curve, label=name)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(ylabel)
    if log_scale:
        ax.set_yscale("log")
    ax.legend()
    plt.show()


# training() evaluates at epochs 9, 19, 29, ...
x_range = range(9, epochs, 10)

# Losses are shown on a log axis; accuracies on a linear axis, as in part (f).
_plot_metric_by_activation(x_range, losses[:, 0], "Training Losses", log_scale=True)
_plot_metric_by_activation(x_range, losses[:, 1], "Validation Losses", log_scale=True)
_plot_metric_by_activation(x_range, accs[:, 0], "Training Accuracy")
_plot_metric_by_activation(x_range, accs[:, 1], "Validation Accuracy")

### **(e) Grid search of learning rate for different activation functions**
You should have noticed that some activation units did not converge.

This means that the learning rate may be too low for those activation units.

Adjust the learning rates for those activation units, and try to optimize the learning rate for each activation unit.

In [None]:
# Maximum number of learning rates that can be recorded per activation unit.
num_lr = 8
epochs = 1000
# Axis 1: 0 = training set, 1 = validation set; axis 2: index of the learning rate.
losses = np.zeros((len(activation_units), 2, num_lr, epochs // 10))
accs = np.zeros((len(activation_units), 2, num_lr, epochs // 10))
#############################################################################################################
# TODO: set the learning_rates list so that we can search through different learning rates for each activation function
#       learning_rates must hold ONE LIST of learning rates PER activation unit (at most num_lr entries each).
# Example: by setting learning_rates = [[0.1, 0.2]]*len(activation_units), the code below will train
#          a network with learning rates 0.1 and 0.2 for each activation function.
#############################################################################################################

###################################################### TODO E
# Placeholder: one empty list per activation unit, so nothing is trained until it is filled in.
learning_rates = [[] for _ in activation_units]
######################################################

# Fail early with a clear message instead of an IndexError in the middle of the search.
if len(learning_rates) != len(activation_units):
    raise ValueError("learning_rates needs one list of learning rates per activation unit")
if any(len(lrs) > num_lr for lrs in learning_rates):
    raise ValueError("at most num_lr = {} learning rates per activation unit are supported".format(num_lr))

for i, act in enumerate(activation_units):
    for j, lr in enumerate(learning_rates[i]):
        print("Training with {} activation units at learning rate {}".format(activation_strs[i], lr))

        # Call the training function to get the model, loss and accuracy
        model, loss_train_his, loss_valid_his, acc_train_his, acc_valid_his = training(
            act, 200, 64, X_train, Y_train, lr, epochs)
        models.append(model)
        losses[i, 0, j, :] = np.array(loss_train_his)
        losses[i, 1, j, :] = np.array(loss_valid_his)
        accs[i, 0, j, :] = np.array(acc_train_his)
        accs[i, 1, j, :] = np.array(acc_valid_his)

Observe which learning rate is best for each activation function.
Then compare the results of these activation functions.

This may take about one hour to train all models if you use CPU training.

In [None]:
def _plot_lr_sweep(epoch_axis, history, ylabel, log_scale=False, clip_max=None):
    """Draw one figure per activation unit with one curve per learning rate.

    Args:
        epoch_axis: x values shared by all curves.
        history: array indexed as history[activation index, learning-rate index].
        ylabel: label of the y axis.
        log_scale: use a logarithmic y axis when True.
        clip_max: when given, values above it are clipped before plotting.
    """
    for i, name in enumerate(activation_strs):
        fig, ax = plt.subplots()
        for j, lr in enumerate(learning_rates[i]):
            curve = history[i, j]
            if clip_max is not None:
                curve = np.clip(curve, a_min=None, a_max=clip_max)
            ax.plot(epoch_axis, curve, label="lr = {:1.4f}".format(lr))
        ax.set_xlabel("Epochs")
        ax.set_ylabel(ylabel)
        if log_scale:
            ax.set_yscale("log")
        ax.legend()
        ax.set_title(name)
        plt.show()


# training() evaluates at epochs 9, 19, 29, ...
x_range = range(9, epochs, 10)

# Losses: log axis, clipped at 100 so diverged runs do not dominate the figure.
_plot_lr_sweep(x_range, losses[:, 0], "Training Losses", log_scale=True, clip_max=100)
_plot_lr_sweep(x_range, losses[:, 1], "Validation Losses", log_scale=True, clip_max=100)
# Accuracies: linear axis, as in part (f).
_plot_lr_sweep(x_range, accs[:, 0], "Training Accuracy")
_plot_lr_sweep(x_range, accs[:, 1], "Validation Accuracy")

### **(f) Training using the best learning rate**
Using the best learning rate to train on the complete training set.

Compare the loss and accuracy during training.

Then report the test accuracy for each activation. 

In [None]:
# The complete training data is the training part plus the validation part of the
# subset split in part (b). It is rebuilt from X_train / X_valid (already tensors)
# so this cell does not rely on variables that no earlier cell defines.
X_trainall = torch.cat((X_train, X_valid))
Y_trainall = torch.cat((Y_train, Y_valid))

epochs = 1000

models = []
# Axis 1: 0 = training set, 1 = validation set; one entry per 10 epochs.
losses = np.zeros((len(activation_units), 2, epochs // 10))
accs = np.zeros((len(activation_units), 2, epochs // 10))
#############################################################################################################
# TODO: set the below variable "learning_rate" to a list that contains the best learning rate for each activation function.
#############################################################################################################

###################################################### TODO F
learning_rate = [0, 0, 0, 0, 0, 0]
######################################################
######################################################



In [None]:
# Train one model per activation unit on the complete training data,
# each with its own entry of learning_rate.
for i in range(len(activation_units)):
    act = activation_units[i]
    print("Training with {} activation units".format(activation_strs[i]))
    outcome = training(act, 200, 64, X_trainall, Y_trainall, learning_rate[i], epochs)
    model, loss_train_his, loss_valid_his, acc_train_his, acc_valid_his = outcome
    models.append(model)
    # slot 0 holds the training history, slot 1 the validation history
    losses[i, 0, :], losses[i, 1, :] = np.array(loss_train_his), np.array(loss_valid_his)
    accs[i, 0, :], accs[i, 1, :] = np.array(acc_train_his), np.array(acc_valid_his)

In [None]:
# training() evaluates at epochs 9, 19, 29, ...
x_range = range(9, epochs, 10)

# (curves per activation unit, y-axis label, logarithmic y axis?)
final_plots = (
    (losses[:, 0], "Training Losses", True),
    (accs[:, 0], "Training Accuracy", False),
)
for curves, y_label, use_log in final_plots:
    for i in range(len(activation_units)):
        plt.plot(x_range, curves[i], label=activation_strs[i])
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    if use_log:
        plt.yscale("log")
    plt.legend()
    plt.show()


In [None]:
#############################################################################################################
# TODO: Report the accuracy on the test set, using the test function defined in part (d) to
#       get the test accuracy of each model.
#       The function test returns the loss and the accuracy
#       Save the accuracy in the variable "acc"
#############################################################################################################
# models[i] was trained with activation_strs[i] and learning_rate[i]
for i, model in enumerate(models):
    
    ###################################################### TODO
    # Placeholder: replace 0 with the accuracy of `model` on the test set
    acc = 0
    ###################################################### TODO
    # activation name is right-aligned to 9 characters so the printed lines line up
    print("Accuracy of using {} activation units at learning rate {:1.4f}: {:1.4f}".format(activation_strs[i].rjust(9), learning_rate[i], acc))


### **(g) Visualize the learned swish activation unit**
Get the learned parameters of the swish activation unit.
Visualize the activation unit.

Fill the missing code to use matplotlib to plot the learned swish activation.
We have computed the function input x and the output y (stored in "swish_data")

In [None]:
# Locate the Swish-trained model by name instead of a hard-coded position, and read
# beta from the Swish layers themselves instead of positional parameter indices.
swish_model = models[activation_strs.index("Swish")]
swish_layers = [layer for layer in swish_model if isinstance(layer, Swish)]
beta1 = swish_layers[0].beta.data  # beta of the first activation layer
beta2 = swish_layers[1].beta.data  # beta of the second activation layer

In [None]:
# 100 evenly spaced inputs between -100 and 100 at which the activation is evaluated
x = np.linspace(-100, 100, 100)

# output of the swish function at x, using the first learned beta
swish_data = swish(torch.Tensor(x), beta1).detach().numpy()
#############################################################################################################
# TODO: Fill the missing code to plot the swish activation function using "x" and "swish_data"
#############################################################################################################
##################################################TODO G
# plot the first swish function learned



##################################################

# output of the swish function at x, using the second learned beta
swish_data = swish(torch.Tensor(x), beta2).detach().numpy()

##################################################TODO G
# plot the second swish function learned



##################################################