# In [1]:
import numpy as np
from collections import OrderedDict

class DNN:
    """Fully connected feed-forward classifier trained by batch gradient descent.

    Data layout: inputs are ``(n_features, m_examples)`` arrays (one example
    per column) and labels are a vector of ``m`` class labels.

    Hidden layers use the configured activation (``'relu'`` or ``'sigmoid'``).
    The output layer uses softmax when there are more than two classes and an
    element-wise sigmoid otherwise.
    """

    def __init__(self, hidden_layers=1, hidden_units=[3], activation='relu'):
        """Store the architecture; parameters are created lazily by ``train``.

        Args:
            hidden_layers: Number of hidden layers.
            hidden_units: Units per hidden layer; its length must equal
                ``hidden_layers`` (checked when training starts).
            activation: Hidden-layer activation, ``'relu'`` or ``'sigmoid'``.
        """
        self.weights = []
        self.biases = []
        # Per-layer caches of activations (As) and pre-activations (Zs),
        # filled by forward_prop and read by back_prop.
        self.As = [0 for _ in range(hidden_layers + 1)]
        self.Zs = [0 for _ in range(hidden_layers + 1)]
        self.hidden_layers = hidden_layers
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.hidden_units = list(hidden_units)
        self.activation = activation
        self.n = None          # number of input features (set by train)
        self.m = None          # number of training examples (set by train)
        self.label_num = None  # number of distinct classes (set by train)
        self.classes = None    # sorted distinct training labels (set by train)

    def one_hot_encoding(self, data_y):
        """Return a ``(n_classes, m)`` one-hot matrix for the label vector.

        Row ``i`` corresponds to the ``i``-th smallest distinct label in
        ``data_y``. The input is flattened, so both ``(m,)`` and ``(1, m)``
        label arrays are accepted.
        """
        labels = np.asarray(data_y).ravel()
        # np.unique returns the sorted distinct labels; the inverse gives each
        # example's row index directly.
        classes, class_index = np.unique(labels, return_inverse=True)
        m = labels.shape[0]
        one_hot = np.zeros((classes.shape[0], m))
        one_hot[class_index, np.arange(m)] = 1

        return one_hot

    def _weight_biases_init(self, label_num):
        """(Re)create small random weights and zero biases for every layer.

        Seeds NumPy's global RNG with 1 so runs are reproducible.

        Raises:
            ValueError: If ``hidden_layers`` and ``len(hidden_units)`` differ.
        """
        np.random.seed(1)
        if self.hidden_layers != len(self.hidden_units):
            raise ValueError("Number of Hidden Layers and hidden units don't match")

        # Start from scratch so calling train() again does not stack layers.
        self.weights = []
        self.biases = []

        layer_sizes = [self.n] + list(self.hidden_units) + [label_num]
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Same zero-mean initialisation for every layer, output included.
            self.weights.append(np.random.randn(fan_out, fan_in) * .01)
            self.biases.append(np.zeros((fan_out, 1)))

    def sigmoid(self, Z):
        """Element-wise logistic function."""
        return 1 / (1 + np.exp(-Z))

    def relu(self, Z):
        """Element-wise rectified linear unit."""
        return np.maximum(0, Z)

    def activation_function(self, Z):
        """Apply the configured hidden-layer activation to pre-activations Z.

        Raises:
            ValueError: If ``self.activation`` is not ``'sigmoid'`` or ``'relu'``.
        """
        if self.activation == 'sigmoid':
            return self.sigmoid(Z)
        elif self.activation == 'relu':
            return self.relu(Z)
        else:
            raise ValueError("Not a valid activation function")

    def activation_derivative(self, Z):
        """Derivative of the hidden activation, evaluated at pre-activations Z.

        Raises:
            ValueError: If ``self.activation`` is not ``'sigmoid'`` or ``'relu'``.
        """
        if self.activation == 'sigmoid':
            # d/dZ sigmoid(Z) = s * (1 - s) with s = sigmoid(Z); back_prop
            # passes pre-activations, so the sigmoid must be applied here.
            s = self.sigmoid(Z)
            return np.multiply(s, 1 - s)
        elif self.activation == 'relu':
            # 1 where Z > 0, otherwise 0 (the sub-gradient at 0 is taken as 0).
            return (np.asarray(Z) > 0).astype(float)
        else:
            raise ValueError("Not a valid activation function")

    def softmax(self, Z):
        """Column-wise softmax: each example's class scores sum to 1."""
        # Subtract the per-column max for overflow protection; this does not
        # change the result.
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_a = np.exp(shifted)
        return exp_a / np.sum(exp_a, axis=0, keepdims=True)

    def cost_function(self, A, one_hot_Y):
        """Mean cross-entropy ``-sum(Y * log(A)) / m`` over the m examples.

        Only the true-class term is counted, so for the two-class sigmoid
        output this omits the ``(1 - Y) * log(1 - A)`` part.
        """
        delta = 1e-7  # keeps log() finite when a probability is exactly 0
        m = one_hot_Y.shape[1]
        cost = -np.sum(np.multiply(one_hot_Y, np.log(A + delta))) / m
        return cost

    def forward_prop(self, A_1, layer):
        """Run one layer forward, cache its Z and A, and return A.

        Args:
            A_1: Activations of the previous layer (the input data for layer 0).
            layer: Index of the layer to evaluate.
        """
        Z = np.dot(self.weights[layer], A_1) + self.biases[layer]
        self.Zs[layer] = Z

        is_output_layer = layer == len(self.weights) - 1
        if is_output_layer:
            # More than two classes -> softmax, otherwise element-wise sigmoid.
            A = self.softmax(Z) if self.label_num > 2 else self.sigmoid(Z)
        else:
            A = self.activation_function(Z)

        self.As[layer] = A
        return A

    def back_prop(self, dZ, layer, data_x):
        """Compute gradients for one layer.

        Args:
            dZ: Gradient of the cost w.r.t. this layer's pre-activations.
            layer: Index of the layer.
            data_x: Network input, used as the "previous activation" of layer 0.

        Returns:
            ``(dZ_prev, dW, dB)`` where ``dZ_prev`` is the gradient for the
            previous layer's pre-activations (``None`` for layer 0).
        """
        m = dZ.shape[1]
        A_prev = data_x if layer == 0 else self.As[layer - 1]

        dW = np.dot(dZ, np.transpose(A_prev)) / m
        dB = np.sum(dZ, axis=1, keepdims=True) / m

        dZ_1 = None
        if layer != 0:
            # Uses this layer's weights as they were during the forward pass;
            # train() applies the update only after this call returns.
            dZ_1 = (np.dot(np.transpose(self.weights[layer]), dZ)
                    * self.activation_derivative(self.Zs[layer - 1]))

        return dZ_1, dW, dB

    def train(self, data_x, data_y, learning_rate=.01, epoch=100):
        """Fit the network with full-batch gradient descent.

        Parameters are re-initialised on every call. The step number and cost
        are printed once per epoch.

        Args:
            data_x: Inputs of shape ``(n_features, m_examples)``.
            data_y: Vector of ``m`` class labels.
            learning_rate: Gradient-descent step size.
            epoch: Number of full passes over the data.
        """
        self.n = data_x.shape[0]   # number of input features
        self.m = data_x.shape[1]   # number of input examples

        one_hot_Y = self.one_hot_encoding(data_y)
        # Remember which label each output row stands for so test() can map
        # argmax indices back to labels.
        self.classes = np.unique(np.asarray(data_y).ravel())
        self.label_num = one_hot_Y.shape[0]
        self._weight_biases_init(self.label_num)

        for step in range(1, epoch + 1):
            # feed forward through every layer
            A = data_x
            for layer in range(len(self.weights)):
                A = self.forward_prop(A, layer)

            cost = self.cost_function(A, one_hot_Y)
            print(step, cost)

            # For softmax/sigmoid outputs with cross-entropy, the gradient at
            # the output pre-activations is A - Y.
            dZ = A - one_hot_Y
            for layer in range(len(self.weights) - 1, -1, -1):
                dZ, dW, dB = self.back_prop(dZ, layer, data_x)

                self.weights[layer] = self.weights[layer] - learning_rate * dW
                self.biases[layer] = self.biases[layer] - learning_rate * dB

    def test(self, test_x, test_y):
        """Evaluate on labelled data; print and return the accuracy.

        Must be called after ``train``. Predictions are mapped back to the
        labels seen during training, so labels need not be ``1..K``.

        Args:
            test_x: Inputs of shape ``(n_features, m_examples)``.
            test_y: Vector of ``m`` true labels.

        Returns:
            Fraction of examples classified correctly.
        """
        A = test_x
        for layer in range(self.hidden_layers):
            Z = np.dot(self.weights[layer], A) + self.biases[layer]
            A = self.activation_function(Z)

        Z = np.dot(self.weights[-1], A) + self.biases[-1]
        # The output activation is monotonic per example, so argmax over the
        # raw scores gives the same prediction as argmax over probabilities.
        print(self.sigmoid(Z))

        predicted = self.classes[np.argmax(Z, axis=0)]
        accuracy = np.mean(np.equal(predicted, np.asarray(test_y).ravel()))
        print(accuracy)
        return accuracy
        
