In [1]:
import numpy as np
import sys

### activation functions

In [2]:
def ReLU(z):
    return np.maximum(0, z)
def d_ReLU(z):
    return np.where(z < 0, 0.0, 1.0)

def sigmoid(z):
    return 1/(1 + np.exp(-z))
def d_sigmoid(z):
    return sigmoid(z)*(1 - sigmoid(z))

def tanh(z):
    # x = (np.exp(z) + np.exp(-z))/(np.exp(z) - np.exp(-z))
    return np.tanh(z)
def d_tanh(z):
    return 1 - tanh(z)**2

### trainning examples 

In [3]:
inputs = 5

data = 50
m = data

X = (np.random.random((inputs, data)) - 0.5) * 2
some_rates1 = np.random.randint(-1, 2, size = (1, inputs))
Y = np.round(sigmoid(np.dot(some_rates1, X))) # shape: (1, data)


### init

In [4]:
def cost(Y_hat):
    epsilon = 0
    if(np.any(Y_hat == 1) or np.any(Y_hat == 0)):
        epsilon = 1e-5
    return np.sum(-(Y*np.log(Y_hat + epsilon) + (1 - Y)*np.log(1 - Y_hat + epsilon))) / m
def d_cost(Y_hat):
    epsilon = 0
    if(np.any(Y_hat == 1) or np.any(Y_hat == 0)):
        epsilon = 1e-5
    return -(Y/(Y_hat + epsilon)) + ((1-Y)/(1-Y_hat + epsilon))

layers = [inputs, 5, 5, 5, 5, 1]
layer = len(layers) - 1

A = [X]
dA = [None]
W = [None]
dW = [None]
B = [None]
dB = [None]
Z = [None]
dZ = [None]
g = [None]
dg = [None]

for i in range(1, layer):
    g.append(tanh)
    dg.append(d_tanh)
        
g.append(sigmoid)
dg.append(d_sigmoid)

Y_hat = np.zeros((1, data))
for l in range(1, layer + 1):
    A.append(np.zeros((layers[l], data)))
    dA.append(np.zeros((layers[l], data)))
    
    Z.append(np.zeros((layers[l], data)))
    dZ.append(np.zeros((layers[l], data)))

### Normalizing

In [5]:
# subtract mean
mu = np.sum(A[0]) / m
A[0] = A[0] - mu

# Normalize Variance
sigma = np.sum(A[0]**2)/m
A[0] /= sigma

for l in range(1, layer + 1):
    # weight = weight * Var(W)
    # ReLU: Var(W) = sqrt(2 / n[l-1])
    # tanh: Var(W) = sqrt(1 / n[l-1]) 
    #             or sqrt(2 / (n[l-1] + n[l]))
    W.append(np.random.random((layers[l], layers[l-1]))*np.sqrt(2/layers[l-1]))
    dW.append(np.zeros((layers[l], layers[l-1])))
    
    B.append(np.zeros((layers[l], 1)))
    dB.append(np.zeros((layers[l], 1)))

### learning

![](https://i.imgur.com/wq7rA2W.png)

In [6]:
iteration = 1000
rate = 0.01
lambd = 0.5
L = cost(Y_hat)

for i in range(1, iteration + 1):
    for l in range(1, layer + 1):
        Z[l] = np.dot(W[l], A[l - 1]) + B[l]
        A[l] = g[l](Z[l])
    Y_hat = A[layer]
    L = cost(Y_hat)
    # Regularized cost
    J = L
    for l in range(1, layer + 1):
        J += lambd / (2*m) * np.sum(W[l]**2)
    
    dA[layer] = d_cost(Y_hat)
    for l in range(layer, 0, -1):
        dZ[l] = dA[l] * dg[l](Z[l])
        dW[l] = np.dot(dZ[l], A[l-1].T) / data
        #Regularized derivative
        dW[l] += lambd / m * W[l]
        dB[l] = np.sum(dZ[l], axis=1, keepdims=True) / data
        dA[l-1] = np.dot(W[l].T, dZ[l])
        
        W[l] = W[l] - dW[l] * rate
        B[l] = B[l] - dB[l] * rate
    
    if((int(i / iteration * 100)) % 20 == 0 and int(i * 100 / iteration) == i * 100 / iteration):
        sys.stdout.write(str(int(i / iteration * 100)) + "%\nloss: " + str(L) + ", " + str(J) + "\n\n")

20%
loss: 0.5713616197425313, 0.6392865155464086

40%
loss: 0.4817888215464378, 0.5495284435698535

60%
loss: 0.31843754126873586, 0.3912156466607578

80%
loss: 0.1845802064873525, 0.26716059024586225

100%
loss: 0.11816275312876444, 0.20969365048035182

