## Measuring Catastrophic Forgetting in Multi-layer Perceptron

**References:** \\
[1] Lopez-Paz, D., et al. Gradient episodic memory for continual learning. In Advances in Neural Information Processing Systems (2017), pp. 6470–6479. \\
[2] Ororbia, A., Mali, A., Kifer, D., and Giles, C. L. Lifelong neural predictive coding: Sparsity yields less forgetting when learning cumulatively. CoRR abs/1905.10696 (2019)

In [None]:
import numpy as np
import os
import sys
import tensorflow as tf

import time
# Switch TensorFlow to eager execution before any tensors or variables are created.
tf.enable_eager_execution()
# Returns whether eager mode is active; the result is discarded here
# (in a notebook cell it would only be echoed if it were the last expression).
tf.executing_eagerly()


#Unique Seed Definition
# Seed both TensorFlow and NumPy so that weight initialisation and the pixel
# permutations drawn with np.random are repeatable from run to run.
tf.set_random_seed(11)
np.random.seed(11)


# Define class to build mlp model
class MLP(object):
    """Multi-layer perceptron with one ReLU hidden layer, written for eager-mode TensorFlow.

    The active configuration is the 2-layer network (input -> hidden -> output).
    The 3- and 4-layer variants are kept as commented-out alternatives in
    ``__init__`` and ``compute_output``; enable the matching lines in both
    places when switching depth.
    """

    def __init__(self, size_input, size_hidden, size_output, device=None):
        """
        size_input: int, size of input layer
        size_hidden: int, size of hidden layer
        size_output: int, size of output layer
        device: str or None, either 'cpu' or 'gpu' or None. If None, the device to be used will be decided automatically during Eager Execution
        """
        self.size_input, self.size_hidden, self.size_output, self.device =\
        size_input, size_hidden, size_output, device

        # 2 layers
        # Initialize weights between input layer and hidden layer
        self.W1 = tf.Variable(tf.random_normal([self.size_input, self.size_hidden],stddev=0.1),name="W1")
        # Initialize biases for hidden layer
        self.b1 = tf.Variable(tf.zeros([1, self.size_hidden]), name = "b1")
        # Initialize weights between hidden layer and output layer
        self.W2 = tf.Variable(tf.random_normal([self.size_hidden, self.size_output],stddev=0.1),name="W2")
        # Initialize biases for output layer
        self.b2 = tf.Variable(tf.random_normal([1, self.size_output]),name="b2")

        # Define variables to be updated during backpropagation
        self.variables = [self.W1, self.b1,self.W2, self.b2]

        # 3 layers
        #self.W1 = tf.Variable(tf.random_normal([self.size_input, self.size_hidden],stddev=0.1),name="W1")
        #self.b1 = tf.Variable(tf.zeros([1, self.size_hidden]), name = "b1")
        #self.W2 = tf.Variable(tf.random_normal([self.size_hidden, self.size_hidden],stddev=0.1),name="W2")
        #self.b2 = tf.Variable(tf.random_normal([1, self.size_hidden]),name="b2")
        #self.W3 = tf.Variable(tf.random_normal([self.size_hidden, self.size_output],stddev=0.1),name="W3")
        #self.b3 = tf.Variable(tf.random_normal([1, self.size_output]),name="b3")

        #self.variables = [self.W1, self.b1,self.W2, self.b2, self.W3, self.b3]

        # 4 layers
        #self.W1 = tf.Variable(tf.random_normal([self.size_input, self.size_hidden],stddev=0.1),name="W1")
        #self.b1 = tf.Variable(tf.zeros([1, self.size_hidden]), name = "b1")
        #self.W2 = tf.Variable(tf.random_normal([self.size_hidden, self.size_hidden],stddev=0.1),name="W2")
        #self.b2 = tf.Variable(tf.random_normal([1, self.size_hidden]),name="b2")
        #self.W3 = tf.Variable(tf.random_normal([self.size_hidden, self.size_hidden],stddev=0.1),name="W3")
        #self.b3 = tf.Variable(tf.random_normal([1, self.size_hidden]),name="b3")
        #self.W4 = tf.Variable(tf.random_normal([self.size_hidden, self.size_output],stddev=0.1),name="W4")
        #self.b4 = tf.Variable(tf.random_normal([1, self.size_output]),name="b4")

        #self.variables = [self.W1, self.b1,self.W2, self.b2, self.W3, self.b3, self.W4, self.b4]

    # prediction
    def forward(self, X):
        """
        forward pass
        X: Tensor, inputs

        Runs ``compute_output`` on the requested device (when one was given),
        stores the result on ``self.y`` and returns it.
        """
        if self.device is not None:
            # Any device string other than 'gpu' falls back to the CPU.
            with tf.device('gpu:0' if self.device=='gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)

        return self.y

    ## loss function
    def loss(self, y_pred, y_true):
        '''
        Mean softmax cross-entropy between logits and labels.

        y_pred - Tensor of shape (batch_size, size_output), raw logits
        y_true - Tensor of shape (batch_size, size_output); reshaped to
                 (-1, size_output) and cast to float32 before use
        Returns a scalar tensor (mean over the batch).
        '''
        y_true_tf = tf.cast(tf.reshape(y_true, (-1, self.size_output)), dtype=tf.float32)
        y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
        # Alternatives that were tried: mean squared error, or MSE + cross-entropy.
        #loss1 = tf.losses.mean_squared_error(y_true_tf,y_pred_tf)
        #loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_pred_tf, labels=y_true_tf))
        #return tf.losses.mean_squared_error(y_true_tf,y_pred_tf)
        #return loss1 + loss2
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_pred_tf, labels=y_true_tf))

    def backward(self, X_train, y_train):
        """
        backward pass

        Performs one optimisation step on a mini-batch: runs the forward pass
        under a gradient tape, computes the loss and applies the gradients to
        ``self.variables``. Returns nothing.

        The step size is read from the module-level name ``learning_rate`` at
        call time, so that name must be defined before the first call.
        """
        # optimizer
        # Test with SGD,Adam, RMSProp
        # A fresh optimizer is built on every call.
        # NOTE(review): that is presumably harmless for plain gradient descent,
        # but a stateful optimizer (Adam, RMSProp) swapped in here would likely
        # lose its accumulators between steps - confirm before switching.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        #optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        #optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        with tf.GradientTape() as tape:
            predicted = self.forward(X_train)
            current_loss = self.loss(predicted, y_train)
        grads = tape.gradient(current_loss, self.variables)
        optimizer.apply_gradients(zip(grads, self.variables),
                              global_step=tf.train.get_or_create_global_step())

    def compute_output(self, X):
        """
        Custom method to obtain output tensor during forward pass

        X: inputs with ``size_input`` features per row; cast to float32 here.
        Returns the raw output-layer logits of shape (batch, size_output).
        No softmax is applied; ``loss`` feeds these logits to a softmax
        cross-entropy.
        """
        # Cast X to float32
        X_tf = tf.cast(X, dtype=tf.float32)
        #Remember to normalize your dataset before moving forward
        # Compute values in hidden layer
        what = tf.matmul(X_tf, self.W1) + self.b1
        hhat = tf.nn.relu(what)
        # Extra hidden layers for the 3-/4-layer variants. They are disabled in
        # the 2-layer configuration, where evaluating them would only add an
        # unused matmul + ReLU to every forward pass.
        #what1 = tf.matmul(hhat, self.W2) + self.b2
        #hhat1 = tf.nn.relu(what1)
        #what2 = tf.matmul(hhat1, self.W3) + self.b3
        #hhat2 = tf.nn.relu(what2)
        #Dropout
        #hhat_tilda = tf.compat.v1.nn.dropout(hhat,rate=0.2)
        # Compute output
        output = tf.matmul(hhat, self.W2) + self.b2
        #output = tf.matmul(hhat1, self.W3) + self.b3
        #output = tf.matmul(hhat2, self.W4) + self.b4
        #output = tf.matmul(hhat_tilda, self.W2) + self.b2
        #Now consider two things , First look at inbuild loss functions if they work with softmax or not and then change this
        #Second add tf.Softmax(output) and then return this variable
        return output
        
def accuracy_function(yhat,true_y):
  """Fraction of rows whose arg-max in `yhat` equals the arg-max in `true_y`.

  Both arguments are reduced with arg-max along axis 1; the result is a
  float32 scalar tensor.
  """
  predicted_labels = tf.argmax(yhat, 1)
  target_labels = tf.argmax(true_y, 1)
  hits = tf.cast(tf.equal(predicted_labels, target_labels), tf.float32)
  return tf.reduce_mean(hits)
  
# Load MNIST Dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/", one_hot = True)

# Network dimensions: flattened image pixels in, one logit per class out.
size_input = 784
size_hidden = 256
size_output = 10

X_train = mnist.train.images
y_train = mnist.train.labels
X_test = mnist.test.images
y_test = mnist.test.labels

## Permuted MNIST

num_tasks_to_run = 10
num_epochs_per_task = 20

# Generate the tasks specifications as a list of random permutations of the input pixels.
# One permutation is drawn per task, in task order, so the sequence of
# np.random draws is the same as drawing them one by one in a loop.
task_permutation = [np.random.permutation(size_input) for _ in range(num_tasks_to_run)]

minibatch_size = 32
learning_rate = 0.001

# Resulting task Matrix
# One row per training task and one column per test task. It is sized from
# num_tasks_to_run so that changing the number of tasks cannot index past a
# fixed 10x10 array.
R = np.zeros([num_tasks_to_run, num_tasks_to_run])

#train_ds = tf.data.Dataset.from_tensor_slices((X_train,y_train)).map(lambda x, y: (x, tf.cast(y, tf.float32))).batch(20)
#test_ds = tf.data.Dataset.from_tensor_slices((X_test,y_test)).map(lambda x, y: (x, tf.cast(y, tf.float32))).batch(20)

# Define metrics
# Zero-based index of the final task, i.e. the last row/column of R.
T = num_tasks_to_run - 1

def acc(R):
    """Average accuracy (ACC): mean test accuracy over all tasks after the final task.

    R: square task matrix (array-like, num_tasks x num_tasks) where R[i][j]
       is the test accuracy on task j after training on task i.
    Returns the mean of the last row of R, covering every task including the
    final one.
    Raises ValueError if R is not a non-empty square 2-D matrix.
    """
    R = np.asarray(R, dtype=float)
    if R.ndim != 2 or R.shape[0] != R.shape[1] or R.shape[0] < 1:
        raise ValueError("R must be a non-empty square 2-D matrix")
    # The number of tasks comes from R itself so the average spans the whole
    # final row rather than stopping one task short.
    return R[-1].mean()

def bwt(R):
    """Backward transfer (BWT): mean change in accuracy on earlier tasks after the final task.

    R: square task matrix (array-like, num_tasks x num_tasks) where R[i][j]
       is the test accuracy on task j after training on task i.
    Returns the mean over every task i except the last of
    R[last][i] - R[i][i]; negative values indicate forgetting.
    Raises ValueError if R is not a square 2-D matrix with at least two tasks.
    """
    R = np.asarray(R, dtype=float)
    if R.ndim != 2 or R.shape[0] != R.shape[1] or R.shape[0] < 2:
        raise ValueError("R must be a square 2-D matrix with at least two tasks")
    # Accuracy right after learning each task (diagonal) versus accuracy on
    # that task once all tasks are learned (last row). The last task is left
    # out because its difference is zero by definition.
    after_own_task = np.diagonal(R)[:-1]
    after_all_tasks = R[-1, :-1]
    return (after_all_tasks - after_own_task).mean()

def cbwt(R):
    """Cumulative backward transfer (CBWT), averaged over tasks.

    R: square task matrix (array-like, num_tasks x num_tasks) where R[i][j]
       is the test accuracy on task j after training on task i.

    For each task t that has later tasks, the per-task value is
        CBWT(t) = mean over i > t of (R[i][t] - R[t][t]),
    i.e. how accuracy on task t drifts across all subsequent training stages.
    The per-task values are summed and divided by the total number of tasks;
    the final task has no successors and contributes zero.

    NOTE(review): the per-task formula follows the CBWT definition as I read
    it in the cited lifelong-learning paper; the normalisation by the task
    count generalises the original hard-coded divisor of 10. Confirm both
    against the reference before reporting numbers.

    Raises ValueError if R is not a square 2-D matrix with at least two tasks.
    """
    R = np.asarray(R, dtype=float)
    if R.ndim != 2 or R.shape[0] != R.shape[1] or R.shape[0] < 2:
        raise ValueError("R must be a square 2-D matrix with at least two tasks")
    num_tasks = R.shape[0]
    total = 0.0
    for t in range(num_tasks - 1):
        # Column t below the diagonal: accuracy on task t after each later task.
        later_accuracy = R[t + 1:, t]
        total += (later_accuracy - R[t, t]).mean()
    return total / num_tasks
  
# Reference matrix passed to tbwt, which reads only its diagonal G[i][i].
# NOTE(review): G is filled with standard-normal random numbers here, not
# measured accuracies - presumably a placeholder for the accuracy of an
# independent model trained on task i alone; confirm before trusting TBWT.
G = np.random.randn(10,10)
def tbwt(R,G):
    """True backward transfer (TBWT): final accuracy on earlier tasks relative to a reference.

    R: square task matrix (array-like, num_tasks x num_tasks) where R[i][j]
       is the test accuracy on task j after training on task i.
    G: 2-D array-like with at least num_tasks rows and columns; only the
       diagonal G[i][i] is read, as the reference accuracy for task i.
    Returns the mean over every task i except the last of
    R[last][i] - G[i][i].
    Raises ValueError if R is not a square 2-D matrix with at least two
    tasks, or if G is too small to provide a diagonal entry per task.
    """
    R = np.asarray(R, dtype=float)
    G = np.asarray(G, dtype=float)
    if R.ndim != 2 or R.shape[0] != R.shape[1] or R.shape[0] < 2:
        raise ValueError("R must be a square 2-D matrix with at least two tasks")
    num_tasks = R.shape[0]
    if G.ndim != 2 or min(G.shape) < num_tasks:
        raise ValueError("G must be 2-D with at least as many rows and columns as R")
    # Include the first task and every task up to (but not including) the
    # last, so the number of terms matches the num_tasks - 1 divisor.
    reference = np.diagonal(G)[:num_tasks - 1]
    after_all_tasks = R[-1, :-1]
    return (after_all_tasks - reference).mean()

# Training and Testing
# Train one model sequentially on every permuted-MNIST task. After each task,
# evaluate on the test set of every task and record the accuracies in row
# `train_task` of R.
model = MLP(size_input, size_hidden, size_output, device='gpu')
for train_task in range(num_tasks_to_run):
    print("Training Task: {}".format(train_task + 1))
    permutation = task_permutation[train_task]
    pmnist_train = X_train[:, permutation]
    # The validation images must go through the same pixel permutation as the
    # training images; otherwise the reported accuracy is measured on inputs
    # the current task never presents to the model.
    pmnist_val = mnist.validation.images[:, permutation]
    for epoch in range(num_epochs_per_task):
        # The dataset pipeline is rebuilt every epoch, as in the original code.
        train_ds = tf.data.Dataset.from_tensor_slices((pmnist_train, y_train)).map(lambda x, y: (x, tf.cast(y, tf.float32)))\
               .shuffle(buffer_size=1000)\
               .batch(batch_size=minibatch_size)
        for inputs, outputs in train_ds:
            # backward() runs its own forward pass under the gradient tape, so
            # no separate forward/loss evaluation is needed here.
            model.backward(inputs, outputs)
        preds_val = model.compute_output(pmnist_val)
        accuracy_val = accuracy_function(preds_val, mnist.validation.labels)
        accuracy_val = accuracy_val * 100
        print ("Validation Accuracy = {}".format(accuracy_val.numpy()))
    for test_task in range(num_tasks_to_run):
        pmnist_test = X_test[:, task_permutation[test_task]]
        preds_test = model.compute_output(pmnist_test)
        accuracy_test = accuracy_function(preds_test, y_test)
        # Store a plain number rather than an eager tensor in the NumPy matrix.
        R[train_task, test_task] = accuracy_test.numpy()
        print('Test Accuracy on Task:{} is {:.4f}'.format(test_task+1, accuracy_test.numpy()))
    

In [None]:
# Show the full accuracy matrix first, then each summary metric to four decimals.
task_matrix_line = 'Resulting Task Matrix R {}'.format(R)
print(task_matrix_line)

average_accuracy = acc(R)
print('Average Accuracy ACC {:.4f}'.format(average_accuracy))
backward_transfer = bwt(R)
print('Backward Transfer BWT {:.4f}'.format(backward_transfer))
cumulative_backward_transfer = cbwt(R)
print('Cumulative Backward Transfer CBWT {:.4f}'.format(cumulative_backward_transfer))
true_backward_transfer = tbwt(R,G)
print('True Backward Transfer TBWT {:.4f}'.format(true_backward_transfer))