In [1]:
import numpy as np
import sys

### activation functions

In [2]:
def ReLU(z):
    """Rectified linear unit: element-wise max(0, z).

    z may be a scalar or a NumPy array; the result has the shape NumPy's
    broadcasting of ``0`` against ``z`` produces.
    """
    rectified = np.maximum(0, z)
    return rectified
def d_ReLU(z):
    """Derivative of ReLU: 0.0 where z < 0, otherwise 1.0.

    The value at exactly z == 0 is taken to be 1.0.
    """
    is_negative = z < 0
    return np.where(is_negative, 0.0, 1.0)

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    The textbook form calls exp(-z), which overflows (and emits a NumPy
    RuntimeWarning) for large negative z. Here only exp(-|z|) is evaluated;
    it lies in [0, 1] and therefore cannot overflow.

    z may be a scalar or an array. A scalar input yields a NumPy scalar, an
    array input yields an array of the same shape.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z))
    # z <  0: exp(z) / (1 + exp(z)), the same value with a bounded exponent
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a NumPy scalar and
    # leaves arrays with one or more dimensions unchanged.
    return result[()]
def d_sigmoid(z):
    """Derivative of the logistic function at z: s * (1 - s) with s = sigmoid(z)."""
    s = sigmoid(z)
    return s * (1 - s)

def tanh(z):
    """Hyperbolic tangent, delegated to NumPy.

    Mathematically tanh(z) = (exp(z) - exp(-z)) / (exp(z) + exp(-z)).
    """
    activated = np.tanh(z)
    return activated
def d_tanh(z):
    """Derivative of tanh at z: 1 - tanh(z)**2."""
    t = tanh(z)
    return 1 - t**2

### training examples

In [3]:
inputs = 5  # number of input features per example
data = 50   # number of training examples
SEED = 0    # fixed so that Restart & Run All reproduces the same data set

# Seed NumPy's global generator before the first random draw. Later cells
# that draw from np.random also become repeatable when the notebook is run
# top to bottom.
np.random.seed(SEED)

# Features drawn uniformly from [-1, 1); shape: (inputs, data)
X = (np.random.random((inputs, data)) - 0.5) * 2
# Hidden integer weights in {-1, 0, 1} that generate the labels; shape: (1, inputs)
some_rates1 = np.random.randint(-1, 2, size = (1, inputs))
# Binary labels: the rounded sigmoid of the hidden linear score; shape: (1, data)
Y = np.round(sigmoid(np.dot(some_rates1, X)))


### init

In [4]:
def cost(Y_hat, Y_true=None, epsilon=1e-12):
    """Mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    Y_hat : array of predicted probabilities.
    Y_true : array of 0/1 labels broadcastable against Y_hat. When omitted,
        the notebook-level ``Y`` is used, so existing ``cost(Y_hat)`` calls
        keep working.
    epsilon : predictions are clipped to [epsilon, 1 - epsilon] before the
        logarithm is taken.

    Returns
    -------
    The summed per-element loss divided by the size of the labels' last axis
    (the number of examples for labels of shape (1, n_examples)).

    Clipping replaces the earlier "add 1e-5 to everything if any prediction
    is exactly 0 or 1" guard. That guard evaluated log(1 + 1e-5) > 0 for
    correctly saturated predictions, which made the loss slightly negative,
    and it shifted every element whenever a single one saturated. With
    clipping the loss is always finite and non-negative, and predictions
    strictly inside (epsilon, 1 - epsilon) are left untouched.
    """
    labels = Y if Y_true is None else np.asarray(Y_true, dtype=float)
    probs = np.clip(Y_hat, epsilon, 1 - epsilon)
    per_element = -(labels * np.log(probs) + (1 - labels) * np.log(1 - probs))
    return np.sum(per_element) / labels.shape[-1]
def d_cost(Y_hat, Y_true=None):
    """Element-wise derivative of the binary cross-entropy with respect to Y_hat.

    Parameters
    ----------
    Y_hat : array of predicted probabilities.
    Y_true : array of 0/1 labels broadcastable against Y_hat. When omitted,
        the notebook-level ``Y`` is used, so existing ``d_cost(Y_hat)`` calls
        keep working.

    Returns
    -------
    Array of -(y / y_hat) + (1 - y) / (1 - y_hat), not divided by the number
    of examples.

    If any prediction is exactly 0 or 1, 1e-5 is added to both denominators
    so that no division by zero occurs; otherwise the exact expression is
    evaluated.
    """
    labels = Y if Y_true is None else np.asarray(Y_true, dtype=float)
    Y_hat = np.asarray(Y_hat, dtype=float)
    saturated = np.any((Y_hat == 1) | (Y_hat == 0))
    epsilon = 1e-5 if saturated else 0
    return -(labels / (Y_hat + epsilon)) + ((1 - labels) / (1 - Y_hat + epsilon))

# Layer widths, input layer first. `layer` is the index of the output layer,
# which is also the number of weight matrices.
layers = [inputs, 5, 5, 5, 5, 1]
layer = len(layers) - 1

# (units in layer l, units in layer l-1) for l = 1 .. layer
layer_dims = list(zip(layers[1:], layers[:-1]))

# Every per-layer list is indexed by layer number. Slot 0 is a placeholder
# (X for the activations, None elsewhere) so that index l refers to layer l.
A = [X] + [np.zeros((units, data)) for units, _ in layer_dims]
dA = [None] + [np.zeros((units, data)) for units, _ in layer_dims]
# Weights start uniform in [-1, 1); this is the only random draw in the cell,
# made once per layer in increasing layer order.
W = [None] + [
    (np.random.random((units, prev_units)) - 0.5) * 2
    for units, prev_units in layer_dims
]
dW = [None] + [np.zeros((units, prev_units)) for units, prev_units in layer_dims]
B = [None] + [np.zeros((units, 1)) for units, _ in layer_dims]
dB = [None] + [np.zeros((units, 1)) for units, _ in layer_dims]
Z = [None] + [np.zeros((units, data)) for units, _ in layer_dims]
dZ = [None] + [np.zeros((units, data)) for units, _ in layer_dims]

# Activations: hidden layers alternate sigmoid (odd index) and tanh (even
# index); the output layer is always sigmoid.
g = [None] + [tanh if k % 2 == 0 else sigmoid for k in range(1, layer)] + [sigmoid]
dg = [None] + [d_tanh if k % 2 == 0 else d_sigmoid for k in range(1, layer)] + [d_sigmoid]

# Placeholder prediction until the first forward pass overwrites it.
Y_hat = np.zeros((1, data))

### learning

In [5]:
from IPython.display import display, update_display

iteration = 100000          # number of full-batch gradient-descent steps
rate = 0.01                 # learning rate
REPORT_EVERY_PERCENT = 20   # print the loss each time this much progress is made
L = cost(Y_hat)

for i in range(1, iteration + 1):
    # Forward pass: propagate the inputs A[0] through every layer.
    for l in range(1, layer + 1):
        Z[l] = np.dot(W[l], A[l - 1]) + B[l]
        A[l] = g[l](Z[l])
    Y_hat = A[layer]
    L = cost(Y_hat)

    # Backward pass: start from the loss gradient at the output and walk
    # back towards the input, updating each layer as soon as its gradients
    # are known.
    dA[layer] = d_cost(Y_hat)
    for l in range(layer, 0, -1):
        dZ[l] = dA[l] * dg[l](Z[l])  # chain rule through the activation
        dW[l] = np.dot(dZ[l], A[l-1].T) / data  # shape: (layers[l], layers[l-1])
        dB[l] = np.sum(dZ[l], axis=1, keepdims=True) / data  # shape: (layers[l], 1)
        # Must use W[l] before it is updated below.
        dA[l-1] = np.dot(W[l].T, dZ[l])

        W[l] = W[l] - dW[l] * rate
        B[l] = B[l] - dB[l] * rate

    # Report at exact multiples of REPORT_EVERY_PERCENT. Integer arithmetic
    # keeps the test exact; the float form `int(i / iteration * 100)` only
    # works as long as truncation happens to land on the intended percentage.
    scaled_step = i * 100
    if scaled_step % (iteration * REPORT_EVERY_PERCENT) == 0:
        sys.stdout.write(str(scaled_step // iteration) + "%\nloss: " + str(L) + "\n\n")

20%
loss: 0.0018853569761551714

40%
loss: 0.0007322729560100648

60%
loss: 0.00044379726652914354

80%
loss: 0.00031602572995915584

100%
loss: 0.0002445703587029345



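[https://github.com/kuk/log-progress](https://github.com/kuk/log-progress) <br>
[https://www.v7labs.com/blog/neural-networks-activation-functions](https://www.v7labs.com/blog/neural-networks-activation-functions) <br>
[https://paperswithcode.com/method/gelu](https://paperswithcode.com/method/gelu)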