In [1]:
import numpy as np
import sys

# Trainning Examples 

In [2]:
inputs = 5
data = 2000

m = data

X = (np.random.random((inputs, data)) - 0.5) * 2
some_rates1 = np.random.randint(-1, 2, size = (1, inputs))
Y = np.round(1/(1 + np.exp(-(np.dot(some_rates1, X))))) # shape: (1, data)

---

# Machine

### hyper parameters

In [3]:
epoch = 1000
rate = 0.01
beta = 0.9
lambd = 0.5
batch = 64 # batch size, power of 2
layers = [None, 5, 5, 5, 5, 1]
epsilon = 1e-5

---

# Init

### Activation Functions

In [4]:
def ReLU(z):
    return np.maximum(0, z)
def d_ReLU(z):
    return np.where(z < 0, 0.0, 1.0)

def sigmoid(z):
    return 1/(1 + np.exp(-z))
def d_sigmoid(z):
    return sigmoid(z)*(1 - sigmoid(z))

def tanh(z):
    # x = (np.exp(z) + np.exp(-z))/(np.exp(z) - np.exp(-z))
    return np.tanh(z)
def d_tanh(z):
    return 1 - tanh(z)**2

### Cost Function

In [5]:
def cost(Y_hat, Y):
    return np.sum(-(Y*np.log(Y_hat + epsilon) + (1 - Y)*np.log(1 - Y_hat + epsilon))) / Y.shape[1]
def d_cost(Y_hat, Y):
    return -(Y/(Y_hat + epsilon)) + ((1-Y)/(1-Y_hat + epsilon))

### Normalization

In [6]:
# subtract mean
mu = np.sum(X) / m
X = X - mu

# Normalize Variance
sigma = np.sum(X**2)/m
X /= sigma

### Mini-Batch

In [7]:
layers[0] = inputs
layer = len(layers) - 1
batch_num = int(m / batch + 1)
X = np.array_split(X, batch_num, 1)
Y = np.array_split(Y, batch_num, 1)

### Cache

In [8]:
A = [X] + [None] * layer
Z = [None] + [None] * layer
g = [None] + [ReLU] * (layer - 1) + [sigmoid]
dg = [None] + [d_ReLU] * (layer - 1) + [d_sigmoid]
W = [None]
VdW = [None]
B = [None]
VdB = [None]

# Weight Initialization
for l in range(1, layer + 1):
    # weight = weight * Var(W)
    # ReLU: Var(W) = sqrt(2 / n[l-1])
    # tanh: Var(W) = sqrt(1 / n[l-1]) 
    #             or sqrt(2 / (n[l-1] + n[l]))
    W.append(np.random.random((layers[l], layers[l-1]))*np.sqrt(2/layers[l-1]))
    VdW.append(np.zeros((layers[l], layers[l-1])))
    
    B.append(np.zeros((layers[l], 1)))
    VdB.append(np.zeros((layers[l], 1)))

---

# learning

![](https://i.imgur.com/XCW8a71.png)

In [9]:
for i in range(1, epoch + 1):
    L = 0
    for t in range(batch_num):
        A[0] = X[t]
        for l in range(1, layer + 1):
            Z[l] = np.dot(W[l], A[l - 1]) + B[l]
            A[l] = g[l](Z[l])
        Y_hat = A[layer]
        L += cost(Y_hat, Y[t])

        dA = d_cost(Y_hat, Y[t])
        for l in range(layer, 0, -1):
            dZ = dA * dg[l](Z[l])
            dW = np.dot(dZ, A[l-1].T) / data
            # Regularized derivative
            dW += lambd / m * W[l]
            dB = np.sum(dZ, axis=1, keepdims=True) / data
            
            dA = np.dot(W[l].T, dZ)
            
            # Momentum
            VdW[l] = beta * VdW[l] + (1-beta)*dW
            VdB[l] = beta * VdB[l] + (1-beta)*dB

            # usually: without bias correction
            W[l] = W[l] - VdW[l] * rate
            B[l] = B[l] - VdB[l] * rate
            
            # optional: bias correction
            '''
            W[l] = W[l] - VdW[l] / (1 - beta**(i*batch_num+t)) * rate
            B[l] = B[l] - VdB[l] / (1 - beta**(i*batch_num+t)) * rate
            '''
    
    # Regularized cost
    J = L / m
    for l in range(1, layer + 1):
        J += lambd / (2*m) * np.sum(W[l]**2)
    
    if((int(i / epoch * 100)) % 20 == 0 and int(i * 100 / epoch) == i * 100 / epoch):
        sys.stdout.write(str(int(i / epoch * 100)) + "%\nloss: " + str(J) + "\n\n")

20%
loss: 0.012252024139580375

40%
loss: 0.011270409033861731

60%
loss: 0.009796866400186881

80%
loss: 0.008212569059919449

100%
loss: 0.006999486350739348

