In [None]:
import numpy as np
import wandb
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , accuracy_score
from scipy.special import log_softmax,softmax
import seaborn as sns

In [None]:
class FFNN:
    def __init__(self,net_size,layer_act,init_wb='he',lr=1e-3,opt='nadam',lamda=0,batch_size=64,\
                 n_epochs=10,beta_1=0.9,beta_2=0.999,seed=None,loss='cross_ent',relu_param=0):
        
        self.net_size = net_size
        self.layer_acts = layer_act
        self.init_wb = init_wb
        self.lr = lr
        self.optim = opt
        self.lamda = lamda
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.loss = loss
        self.seed = seed
        self.relu_param=relu_param

    def onehot_encode(self,y, n_labels):
        mat = np.zeros((len(y), n_labels))
        for i, val in enumerate(y):
            mat[i, val] = 1
        return mat.T
    
    def nn_init(self, network_size, wb_init='random'):

        if self.seed is not None:
            np.random.seed(self.seed)

        params = {}

        num_layers = len(network_size)

        if wb_init == 'random':
            for layer in range(1, num_layers):
                params['weights' + str(layer)] = np.random.random((network_size[layer], network_size[layer - 1]))
                params['biases' + str(layer)] = np.random.random((network_size[layer], 1))
                
        elif wb_init == 'xavier_uniform':
            for layer in range(1, num_layers):
                r = np.sqrt(6.0 / (network_size[layer] + network_size[layer - 1]))
                params['weights' + str(layer)] = np.random.uniform(-r, r, (network_size[layer], network_size[layer - 1]))
                params['biases' + str(layer)] = np.random.uniform(-r, r, (network_size[layer], 1))
        
        else:
            raise ValueError('Invalid Activation function ...')
        return params
    
    def Linear(self,input_data,diff=False):
        input_data = np.array(input_data, dtype=np.float64)
        if diff == False:
            return input_data
        else:
            return np.ones_like(input_data)

    def ReLU(self, input_data, diff=False):
        alpha = self.relu_param
        input_data = np.array(input_data, dtype=np.float64)

        if diff == False:
            return np.where(input_data < 0, alpha * input_data, input_data)

        elif diff == True:
            output_data = np.ones_like(input_data, dtype=np.float64)
            output_data[input_data < 0] = alpha
            return output_data
    
    def ELU(self,input_data,diff=False):
        alpha = self.relu_param
        if diff == False:
            return np.where(input_data < 0, alpha * (np.exp(input_data)-1), input_data)
        else:
            output_data = np.ones_like(input_data, dtype=np.float64)
            output_data[input_data < 0] = alpha*np.exp(input_data)
            return output_data    
    
    def sigmoid(self, input_data, diff=False):
        input_data = np.where(input_data<-700,-700,input_data)
        if not diff:
            output_data = 1 / (1 + np.exp(-np.array(input_data)))
        else:
            s = 1 / (1 + np.exp(-np.array(input_data)))
            output_data = s * (1 - s)
        return output_data

    def Tanh(self, input_data, diff=False):
        # Compute the hyperbolic tangent function for input_data
        input_data = np.array(input_data)
        if not diff:
            output_data = np.tanh(input_data)
        # Compute the derivative of the hyperbolic tangent function for input_data
        else:
            output_data = 1 - np.tanh(input_data) ** 2
        return output_data
    
    def softmax(self,X):
        return log_softmax(X,axis=0)

    def forward(self,data,acts,params):
        if self.seed is not None:
            np.random.seed(self.seed)
        param_list = []
        act_out = data
        for idx, act in enumerate(acts,start=1):
            data_prev = act_out
            Wb = np.dot(params['weights'+str(idx)],data_prev)+params['biases'+str(idx)]

            if act == 'sigmoid':
                act_out = self.sigmoid(Wb)
            elif act == 'tanh':
                act_out = self.Tanh(Wb)
            elif act == 'relu':
                act_out = self.ReLU(Wb)
            elif act == 'softmax':
                act_out = self.softmax(Wb)
            elif act == 'identity':
                act_out = self.Linear(Wb)
            elif act == 'elu':
                act_out == self.ELU(Wb)
            else:
                raise ValueError('Invalid activation function ...')
            
            pl = ((data_prev,params['weights'+str(idx)],params['biases'+str(idx)]),Wb)
            param_list.append(pl)
        return act_out,param_list
    
    def grad(self,pred,target,params,lamda=0,loss='cross_ent'):
        n_class = target.shape[1]
        if loss == 'cross_ent':
            loss = -np.mean(np.multiply(pred,target),axis=1).sum()
        elif loss=='mse':
            loss = -np.mean(np.multiply(pred-target,pred-target),axis=1).sum()
        else:
            raise ValueError('Error function invalid. Please choose either "cross_ent" or "mse" ')
        param_len = len(params)//2

        sum_w = 0
        for idx in range(1,param_len):
            sum_w += np.square(params['weights'+str(idx)]).sum()
        loss += sum_w*(lamda/(2*n_class))
        return loss

    def backward(self,pred,target,param_list,acts,lamda=0,loss='cross_ent'):
        grad_tape = {}
        lpl = len(param_list)
        m,n = pred.shape
        target = target.reshape(pred.shape)
        if loss == 'cross_ent':
            dOut = np.exp(pred) - target
        elif loss == 'mse':
            dOut = 2*(np.exp(pred) - target)
        else:
            raise ValueError('Error function invalid. Please choose either "cross_ent" or "mse" ')

        pred,weight,_ = param_list[-1][0]
        grad_tape['d_weights'+str(lpl)] = np.dot(dOut,pred.T)/m
        grad_tape['d_biases'+str(lpl)] = dOut.sum(axis=1,keepdims=True)/m
        grad_tape['d_pred'+str(lpl-1)] = np.dot(weight.T,dOut)

        for idx in reversed(range(lpl-1)):
            linear_pred,out = param_list[idx]
            out_prev,weight,b = linear_pred

            m,n = out_prev.shape
            dOut_prev = grad_tape['d_pred'+str(idx+1)]

            if acts[idx] == 'relu':
                dOut = dOut_prev*self.ReLU(out,True)
            elif acts[idx] == 'sigmoid':
                dOut = dOut_prev*self.sigmoid(out,True)
            elif acts[idx] == 'tanh':
                dOut = dOut_prev*self.Tanh(out,True)
            elif acts[idx] == 'elu':
                dOut = dOut_prev*self.ELU(out,True)
            elif acts[idx] == 'identity':
                dOut = dOut_prev*self.Linear(out,True)            
            
            grad_tape['d_pred'+str(idx)] = np.dot(weight.T,dOut)
            grad_tape['d_weights'+str(idx+1)] = (np.dot(dOut,out_prev.T)+ lamda*weight)/m
            grad_tape['d_biases'+str(idx+1)] = dOut.sum(axis=1,keepdims=True)/m
        return grad_tape
        

    def optim_step(self,params,grad_tape,lr,t_step,algo='adam',opt_params=None):
        len_param = len(params)//2

        if algo == 'sgd':
            for idx in range(len_param):
                params['weights'+str(idx+1)] -= lr*grad_tape['d_weights'+str(idx+1)]
                params['biases'+str(idx+1)] -= lr*grad_tape['d_biases'+str(idx+1)]
                opt_params=None
        elif algo == 'sgdm':
            for idx in range(len_param):
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_weights'+str(idx+1)]
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_biases'+str(idx+1)]

                params['weights'+str(idx+1)] -= lr*opt_params['v_w'+str(idx+1)]
                params['biases'+str(idx+1)] -= lr*opt_params['v_w'+str(idx+1)]
        elif algo == 'nag':
            for idx in range(len_param):
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] - lr*grad_tape['d_weights'+str(idx+1)]
                opt_params['vb'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] - lr*grad_tape['d_biases'+str(idx+1)]

                params['weights'+str(idx+1)] -= self.beta_1*(opt_params['v_w'+str(idx+1)] - opt_params['v_w_prev'+str(idx+1)])
                params['biases'+str(idx+1)] -= self.beta_1*(opt_params['v_w'+str(idx+1)] - opt_params['v_w_prev'+str(idx+1)])

                opt_params['v_w_prev'+str(idx+1)] = opt_params['v_w'+str(idx+1)]
                opt_params['v_w_prev'+str(idx+1)] = opt_params['v_w'+str(idx+1)]

        elif algo == 'rmsprop':
            for idx in range(len_param):
                opt_params['m_b'+str(idx+1)] = self.beta_2*opt_params['m_b'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_biases'+str(idx+1)]**2)
                opt_params['m_w'+str(idx+1)] = self.beta_2*opt_params['m_w'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_weights'+str(idx+1)]**2)

                params['weights'+str(idx+1)] -= lr*grad_tape['d_weights'+str(idx+1)]/(np.sqrt(opt_params['m_w'+str(idx+1)])+1e-8)
                params['biases'+str(idx+1)] -= lr*grad_tape['d_biases'+str(idx+1)]/(np.sqrt(opt_params['m_b'+str(idx+1)])+1e-8)

        elif algo == 'adam':
            for idx in range(len_param):
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_biases'+str(idx+1)]
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_weights'+str(idx+1)]

                opt_params['m_b'+str(idx+1)] = self.beta_2*opt_params['m_b'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_biases'+str(idx+1)]**2)
                opt_params['m_w'+str(idx+1)] = self.beta_2*opt_params['m_w'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_weights'+str(idx+1)]**2)

                mod_lr = lr*np.sqrt((1-self.beta_2**t_step)/(1-self.beta_1**t_step+1e-8))
                params['weights'+str(idx+1)] -= mod_lr*(opt_params['v_w'+str(idx+1)]/(np.sqrt(opt_params['m_w'+str(idx+1)])+1e-8))
                params['biases'+str(idx+1)] -= mod_lr*(opt_params['v_w'+str(idx+1)]/(np.sqrt(opt_params['m_b'+str(idx+1)])+1e-8))
        elif algo =='nadam':
            for idx in range(len_param):
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_biases'+str(idx+1)]
                opt_params['v_w'+str(idx+1)] = self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_weights'+str(idx+1)]

                opt_params['m_b'+str(idx+1)] = self.beta_2*opt_params['m_b'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_biases'+str(idx+1)]**2)
                opt_params['m_w'+str(idx+1)] = self.beta_2*opt_params['m_w'+str(idx+1)] + (1-self.beta_2)*(grad_tape['d_weights'+str(idx+1)]**2)

                mod_lr = lr*np.sqrt((1-self.beta_2**t_step)/(1-self.beta_1**t_step+1e-8))
                params['weights'+str(idx+1)] -= (mod_lr/(np.sqrt(opt_params['m_w'+str(idx+1)])+1e-8))*(self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_weights'+str(idx+1)])
                params['biases'+str(idx+1)] -= (mod_lr/(np.sqrt(opt_params['m_b'+str(idx+1)])+1e-8))*(self.beta_1*opt_params['v_w'+str(idx+1)] + (1-self.beta_1)*grad_tape['d_biases'+str(idx+1)])
        return params,opt_params
    
    def predict(self,data):
        out = self.forward(data,self.layer_acts,self.params)[0]
        return np.argmax(out,axis=0),out.T
    
    def train(self,X_train,Y_train,X_val,Y_val,n_classes=10,wb_log=True):
        self.losses=[]
        opt_params = {}
        m = X_train.shape[1]
        y_train = self.onehot_encode(Y_train,n_classes)
        self.params = self.nn_init(self.net_size,self.init_wb)
        self.t_step = 1
        idx = np.arange(m)

        if self.optim != 'sgd':
            for ii in range(1,len(self.net_size)):
                opt_params['v_w'+str(ii)] = np.zeros((self.net_size[ii],self.net_size[ii-1]))
                opt_params['v_w'+str(ii)] = np.zeros((self.net_size[ii],1))

                opt_params['v_w_prev'+str(ii)] = np.zeros((self.net_size[ii],self.net_size[ii-1]))
                opt_params['v_w_prev'+str(ii)] = np.zeros((self.net_size[ii],1))

                opt_params['m_w'+str(ii)] = np.zeros((self.net_size[ii],self.net_size[ii-1]))
                opt_params['m_b'+str(ii)] = np.zeros((self.net_size[ii],1))
        
        for _ in range(self.n_epochs):
            np.random.shuffle(idx)
            X_shuffled = X_train[:,idx]
            Y_shuffled = y_train[:,idx]
            for ii in range(0,m,self.batch_size):
                # self.t_step+=1
                X_batched = X_shuffled[:,ii:ii+self.batch_size]
                Y_batched = Y_shuffled[:,ii:ii+self.batch_size]

                out,param_list = self.forward(X_batched,self.layer_acts,self.params)
                loss = self.grad(out,Y_batched,self.params,self.lamda,self.loss)
                self.losses.append(loss)
                grads = self.backward(out,Y_batched,param_list,self.layer_acts,self.lamda,self.loss)
                self.params,opt_params = self.optim_step(self.params,grads,self.lr,\
                                                            self.t_step,self.optim,opt_params)
                self.t_step+=1
            y_pred_train,_ = self.predict(X_train)
            y_pred_valid,_ = self.predict(X_val)

            train_acc = accuracy_score(Y_train,y_pred_train)
            val_acc = accuracy_score(Y_val,y_pred_valid)
            log = {'train_acc':train_acc, 'val_acc':val_acc,'train_loss':loss}
            if wb_log:
                wandb.log(log)
            else:
                print(log)

In [None]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress',
               'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [None]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

class_num = 10
num_row = 2
num_col = 5# plot images
fig, axes = plt.subplots(num_row, num_col, figsize=(1.5*num_col,2*num_row))
img_list=[]
for i in range(class_num):
  ax = axes[i//num_col, i%num_col]
  a = np.argmax(y_train == i)
  ax.imshow(x_train[a], cmap='gray')
  ax.set_title(class_names[i])
plt.tight_layout()

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=0, stratify=y_train)

x_train = x_train.reshape((len(x_train), 28*28))
x_train = x_train.astype('float32') / 255

x_valid = x_valid.reshape((len(x_valid), 28*28))
x_valid = x_valid.astype('float32') / 255

# Preprocessing test data
x_test = x_test.reshape((len(x_test), 28 * 28))
x_test = x_test.astype('float32') / 225

In [None]:
# X = x_train.T
# X_valid = x_valid.T
# Y_valid = y_valid
# Y = y_train
# n_class= 10
# n_hidden = 3
# layers = []
# hidden_size=32
# for i in range(n_hidden+2):
#     if i == 0:
#         layers.append(X.shape[0])
#     elif i == n_hidden+1:
#         layers.append(n_class)
#     else:
#         layers.append(hidden_size)
#     i = i+1
# act = []
# o_act = 'softmax'
# actvn_fn = 'tanh'
# for i in range(n_hidden+1):
#     if i == n_hidden:
#         act.append(o_act)
#     else:
#         act.append(actvn_fn)
#     i = i+1
# model = FFNN(layers,act)

In [None]:
# model.train(X,Y,X_valid,Y_valid,wb_log=False)
# model.params = model.nn_init(layers)
# y_pred,_ = model.predict(x_test.T)

In [None]:
sweep_config = {
    'method':'bayes',
    'metric':{
    'name':'val_acc',
    'goal':'maximize'
    },
    'parameters':{
    'n_epochs':{
    'values':[5,10]
    },
    'n_hidden':{
    'values':[3,4,5]
    },
    'n_hidden_units':{
    'values':[32,64,128]
    },
    'l2_coeff':{
    'values':[0,5e-4,5e-1]
    },
    'lr':{
    'values':[1e-3,1e-4]
    },
    'optim_algo':{
    'values':['sgd','sgdm','rmsprop','adam','nadam','nag']
    },
    'batch_size':{
    'values':[16,32,64]
    },
    'weight_init':{
    'values':['random','xavier_uniform']
    },
    'act_func':{
    'values':['relu','sigmoid','tanh','elu','identity']
    },
    'loss_func':{
    'values':['cross_ent','mse']
    },
    'relu_param':{
    'values':[0,1e-1,1e-2,1e-3]
    }
    }
}


In [None]:
sweep_id = wandb.sweep(sweep_config, entity="viswa_ee", project="CS6910")

In [None]:
def learn():
    config_defaults={
        'n_epochs':10,
        'n_hidden':3,
        'n_hidden_units':10,
        'l2_coeff':0,
        'lr':1e-3,
        'optim_algo':'sgd',
        'batch_size':16,
        'weights_init':'random',
        'act_func':'relu',
        'loss_func':'cross_ent',
        'relu_param':0
    }
    wandb.init(config=config_defaults)
    config = wandb.config
    X = x_train.T
    X_valid = x_valid.T
    Y_valid = y_valid
    Y = y_train
    n_class= 10
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress',
               'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    layers = []
    for i in range(config.n_hidden+2):
        if i == 0:
            layers.append(X.shape[0])
        elif i == config.n_hidden+1:
            layers.append(n_class)
        else:
            layers.append(config.n_hidden_units)
        i = i+1
    
    output_act = 'softmax'
    act_fn = config.act_func

    acts = []
    for i in range(config.n_hidden+1):
        if i == config.n_hidden:
            acts.append(output_act)
        else:
            acts.append(act_fn)
        i = i+1
    
    # wandb.run.name = str(config.optim_algo) +'_'+ str(config.act_func) + '_bs_' + str(config.batch_size)
    model = FFNN(net_size=layers,layer_act=acts,init_wb=config.weights_init,lr=config.lr,opt=config.optim_algo,\
                 lamda=config.l2_coeff,batch_size=config.batch_size,n_epochs=config.n_epochs,loss=config.loss_func,relu_param=config.relu_param)
    model.train(X,Y,X_valid,Y_valid)
    y_test_pred,_ = model.predict(x_test.T)
    cm = confusion_matrix(y_test, y_test_pred)

    # # Define class names (optional)

    # Normalize the confusion matrix (optional)
    cm_norm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    # Create a heatmap plot of the confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm_norm, annot=True, cmap="Blues", square=True, xticklabels=class_names, yticklabels=class_names
    )
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Log the confusion matrix plot to wandb
    wandb.log({"Confusion Matrix":wandb.Image(fig)})
    # log = {'conf_matrix':wandb.plot.confusion_matrix(y_true=y_test,preds=y_test_pred,class_names=class_names)}
    # wandb.log(log)
    # wandb.log({'conf_mat_'+wandb.run.name:wandb.plot.confusion_matrix(y_true=y_test,preds=y_test_pred,class_names=class_names)})
    

In [None]:
wandb.agent(sweep_id,learn,count=60)

In [1]:
import numpy as np

In [4]:
def sigmoid(input_data, diff=False):
    if not diff:
        output_data = 1 / (1 + np.exp(-np.array(input_data)))
    else:
        s = 1 / (1 + np.exp(-np.array(input_data)))
        output_data = s * (1 - s)
    return output_data

In [27]:
a = -1000*np.random.rand(10)
sigmoid(a)

  output_data = 1 / (1 + np.exp(-np.array(input_data)))


array([1.84036862e-244, 0.00000000e+000, 1.76203358e-272, 0.00000000e+000,
       1.97979520e-006, 0.00000000e+000, 2.14285538e-218, 5.90386593e-105,
       3.86717302e-065, 7.16624766e-190])

(array([-561.2207968 , -777.5664387 , -625.73667671, -949.93111217,
         -13.13251517, -872.82178577, -501.20141104, -239.99582739,
        -148.31550729, -435.52178549]),
 array([-561.2207968 , -700.        , -625.73667671, -700.        ,
         -13.13251517, -700.        , -501.20141104, -239.99582739,
        -148.31550729, -435.52178549]))