In [1]:
import numpy as np
import math

# Activation Functions


![test](https://www.i2tutorials.com/wp-content/uploads/2019/09/Deep-learning-25-i2tutorials.png)



In [2]:
def relu(z, deriv=False, negative_slope=-0.2):
    """Piecewise-linear activation used by the network, or its derivative.

    Forward pass: ``z`` where ``z > 0``, otherwise ``negative_slope * z``.
    Derivative:   ``1`` where ``z >= 0``, otherwise ``negative_slope``.

    The default slope is *negative* (-0.2), so negative inputs are mapped to
    small positive outputs. That differs from the textbook ReLU / leaky ReLU;
    the default is kept so existing callers get the same values as before.

    Args:
        z: Array (or array-like) of pre-activation values, any shape.
        deriv: If True, return the element-wise derivative instead of the
            activation.
        negative_slope: Slope applied on the non-positive branch.

    Returns:
        A float ndarray with the same shape as ``z``.
    """
    # Work in float so the result dtype does not depend on which branch the
    # individual elements happen to take.
    z = np.asarray(z, dtype=float)
    if deriv:
        # The derivative branch treats z == 0 as part of the positive side.
        return np.where(z >= 0, 1.0, negative_slope)
    return np.where(z > 0, z, negative_slope * z)

# Weight & Bias Initialization

Bias Values ($b$) are initialized with $0$.  
Weight Values ($w$) are initialized with random values drawn from a standard normal distribution (mean $0$, standard deviation $1$).

In [18]:
def initialize():
    """(Re)create the global parameter lists ``w`` and ``b``.

    ``w`` gets one matrix per layer with shapes (2, 3), (3, 3), (3, 2) and
    (2, 1), filled from ``np.random.randn``. ``b`` gets one all-zero vector
    per layer whose length equals that layer's output width.
    """
    global w, b
    # (inputs, outputs) of every layer, first layer first.
    layer_dims = [(2, 3), (3, 3), (3, 2), (2, 1)]
    w = [np.random.randn(n_in, n_out) for n_in, n_out in layer_dims]
    b = [np.zeros(n_out) for _n_in, n_out in layer_dims]

# Global parameter lists; initialize() rebinds both of them.
w, b = [], []

initialize()

# Show every layer's freshly drawn weights and its bias vector.
layer_count = len(b)
for i in range(layer_count):
    summary = f'Layer {i}:\nWeights:\n {w[i]}\nBias: \n{b[i]}\n'
    print(summary)

Layer 0:
Weights:
 [[ 0.03027841  0.10858745 -1.85944285]
 [ 0.4870347  -0.28455876  0.83378872]]
Bias: 
[0. 0. 0.]

Layer 1:
Weights:
 [[-1.05117768  0.59218873 -1.30918413]
 [ 1.01323524 -0.05778658 -1.15917059]
 [ 0.08245821 -1.0977488  -1.00886037]]
Bias: 
[0. 0. 0.]

Layer 2:
Weights:
 [[ 0.38640244  0.49794018]
 [ 1.13288453  1.18692924]
 [-1.09148398  2.46201963]]
Bias: 
[0. 0.]

Layer 3:
Weights:
 [[2.16197737]
 [0.07538075]]
Bias: 
[0.]



# Forward Propagation
$a$ holds each layer's activation vector.  
$z$ holds each layer's pre-nonlinearity vector.

## Algorithm

For each layer $L$, starting with $L_0$, we multiply the $h$ vector with the weight matrix $w$, add the bias $b$, and apply the activation function.

$$
w = \left[ \begin{array}{rr}
1.3 & 0.2 \\
0.1 & 1.4 \\
1.2 & 0 \\
\end{array}\right], \quad
h = \left( \begin{array}{r}
1.3 \\
0.1 \\
1.2 \\
\end{array}\right)
$$

In [12]:
# Per-layer caches that forward_prop fills: activations (a) and pre-activations (z).
a, z = [], []

# Re-draw the global weights and biases so this cell starts from fresh parameters.
initialize()

def forward_prop(X):
    """Feed ``X`` through every layer and return the last layer's activation.

    Side effect: rebinds the globals ``z`` and ``a`` to fresh lists that hold,
    for each layer in order, the pre-activation and the activation.
    """
    global a, z
    a, z = [], []
    signal = X
    for layer, weights in enumerate(w):
        # Affine step: input times weights, then add the layer's bias.
        pre_activation = signal @ weights
        pre_activation = pre_activation + b[layer]
        z.append(pre_activation)
        # Non-linearity.
        signal = relu(pre_activation)
        a.append(signal)
    return signal

# Smoke test: one forward pass on the input [0, 1]; the cell displays the returned output.
forward_prop(np.array([0,1]))


array([0.04201663])

# Back Prop

for each layer

$g = loss'(X,y)$

## Step 1 ($a$ to $z$)

$g \leftarrow g \odot relu'(z)$



## Step 2 ($z$ to $W$)

$\frac{\partial loss}{\partial W_L} = a_{L-1}^{T} \, g$

# Dimensions

$g = [1\times2]$

## Step 1 Activation Function Derivative

$g = [1\times2]$


In [6]:
# Step size that back_prop applies to every weight and bias update.
learning_rate = 0.2

# Start this cell from freshly drawn parameters.
initialize()

def back_prop(X, y, print_loss=False, net_input=None):
    """Run one gradient-descent step on the global ``w``/``b`` and return the loss.

    Reads the per-layer caches ``z`` (pre-activations) and ``a`` (activations)
    that the most recent ``forward_prop`` call stored, so it has to be called
    right after the forward pass that produced ``X``. Both ``w`` and ``b`` are
    rebound to new lists; each updated bias has shape ``(1, n)`` because the
    gradient is carried as a row vector.

    Args:
        X: Network output (prediction) from the forward pass.
        y: Target value(s) for that prediction.
        print_loss: If True, print the squared error ``(y - X)**2``.
        net_input: The vector that was fed into the network. The gradient of
            the first layer's weights is ``net_input^T @ g``. When omitted,
            the function keeps its historical behaviour of using ``y`` in
            that position, which is not the true gradient; pass ``net_input``
            to get a correct first-layer update.

    Returns:
        The half squared error ``0.5 * (y - X)**2``.
    """
    global w, b

    # dLoss/dPrediction as a row vector [1 x n_out]. (A column vector here
    # would only line up with the layer shapes for a single output unit.)
    g = (X - y).reshape(1, -1)
    loss = 0.5 * (y - X) ** 2

    if print_loss:
        print("Loss: ", (y - X)**2)

    # What the first layer saw as its input. Falling back to y reproduces the
    # old update for callers that do not supply the real input.
    first_layer_input = y if net_input is None else net_input

    n_weights, n_bias = [], []

    # Walk the layers from the output back to the input.
    for i in reversed(range(len(w))):
        # Through the activation function: [1 x n]
        g = g * relu(z[i], True)

        # Input of this layer as a column vector: [n_prev x 1]
        layer_input = a[i - 1] if i > 0 else first_layer_input
        w_der = np.asarray(layer_input).reshape(-1, 1)

        # Gradient step for this layer's weights and bias.
        n_weights.append(w[i] - learning_rate * (w_der @ g))
        n_bias.append(b[i] - learning_rate * g)

        # Propagate to the previous layer using the not-yet-updated weights.
        g = g @ w[i].T

    # The updates were collected last layer first; restore layer order.
    w = n_weights[::-1]
    b = n_bias[::-1]

    return loss
    

# One forward pass on [0, 1]; X is the network's prediction.
X = forward_prop(np.array([0,1]))
print(X)
# One gradient step towards the target 1; the cell displays the returned loss.
back_prop(X, np.array([1]))

[1.50936386]


array([0.12972577])

In [11]:
from random import randint

def train(epochs, size=100):
    """Train the network to add two random bits.

    Each of the ``epochs`` rounds runs ``size`` forward/backward passes. Every
    pass draws two values with ``randint(0, 1)``, uses them as the input and
    their sum as the target, and the round ends by printing the accumulated
    first loss component divided by ``size``.
    """
    for _epoch in range(epochs):
        running_loss = 0.0
        for _step in range(size):
            first = randint(0, 1)
            second = randint(0, 1)
            prediction = forward_prop(np.array([first, second]))
            target = np.array([first + second])
            running_loss += back_prop(prediction, target)[0]
        print("Loss: ", running_loss / size)

# Fresh parameters, then 20 epochs of 100 random samples each.
initialize()
train(epochs=20, size=100)

# Probe the trained network, including two inputs that are not 0/1 pairs.
for probe in ([0.2, 0.3], [0.7, 0.2], [1, 0], [1, 1]):
    print(forward_prop(np.array(probe)))

Loss:  [0.0421284]
Loss:  [0.01402559]
Loss:  [0.01092947]
Loss:  [0.00788807]
Loss:  [0.00434759]
Loss:  [0.00404377]
Loss:  [0.00295679]
Loss:  [0.00194217]
Loss:  [0.00129463]
Loss:  [0.00123925]
Loss:  [0.00078409]
Loss:  [0.00030952]
Loss:  [0.00025254]
Loss:  [0.0001971]
Loss:  [0.00011784]
Loss:  [0.00010268]
Loss:  [5.01842126e-05]
Loss:  [1.40237842e-05]
Loss:  [1.33957498e-05]
Loss:  [1.87633662e-05]
[[0.49987784]]
[[0.89843738]]
[[0.99728911]]
[[1.99931765]]


## Learning XOR

In [8]:
# XOR truth table: each row of X is an input pair and y holds the matching target.
# NOTE: this rebinds the global X that an earlier cell used for a network output.
X = [
    [0,1],
    [0,0],
    [1,0],
    [1,1]
]

y = [1,0,1,0]

# Start XOR training from freshly drawn parameters.
initialize()

def train(epochs, size=100):
    """Train on rows of the global ``X``/``y`` tables.

    Each of the ``epochs`` rounds runs ``size`` forward/backward passes. Every
    pass picks a row index with ``randint(0, 3)``, and the round ends by
    printing the accumulated first loss component divided by ``size``.
    This definition replaces any earlier ``train`` in the notebook.
    """
    for _epoch in range(epochs):
        running_loss = 0.0
        for _step in range(size):
            pick = randint(0, 3)
            sample = np.array(X[pick])
            target = np.array(y[pick])
            running_loss += back_prop(forward_prop(sample), target)[0]
        print("Loss: ", running_loss / size)
      

# 20 epochs of 100 randomly picked XOR rows each.
train(epochs=20, size=100)

# Evaluate the network on all four XOR inputs.
for probe in ([1, 0], [0, 1], [0, 0], [1, 1]):
    print(forward_prop(np.array(probe)))

# Dump the learned parameters: weights first, then biases.
for params in (w, b):
    print(params)

Loss:  [0.11587483]
Loss:  [0.0589459]
Loss:  [0.05006279]
Loss:  [0.05050994]
Loss:  [0.06573703]
Loss:  [0.04487583]
Loss:  [0.05829067]
Loss:  [0.14054139]
Loss:  [0.12994083]
Loss:  [0.10930429]
Loss:  [0.12711978]
Loss:  [0.10554828]
Loss:  [0.10320527]
Loss:  [0.06369206]
Loss:  [0.05432755]
Loss:  [0.05866766]
Loss:  [0.04947216]
Loss:  [0.04313845]
Loss:  [0.05004426]
Loss:  [0.03349359]
[[0.68057903]]
[[1.3514689]]
[[0.2796302]]
[[0.37055563]]
[array([[-1.05005165, -1.25493323, -2.17923395],
       [-1.89874884, -0.29496835, -0.53262792]]), array([[ 0.59964667, -1.32177852, -0.368207  ],
       [-1.02396719,  0.81586807, -0.68092178],
       [-1.22883445, -1.07504155, -3.85775008]]), array([[-0.8679731 ,  0.33597774],
       [ 1.63709093,  0.08803059],
       [ 0.94935286, -3.12693656]]), array([[ 1.01462831],
       [-3.42823431]])]
[array([[-0.9538185 ,  0.24041208,  0.57860038]]), array([[ 0.28061212, -0.11642197, -0.08085342]]), array([[-0.19291753,  1.89210103]]), array([

In [9]:
# Keep a handle on the current parameters. The dict refers to the same list
# objects as w and b; nothing is copied.
working_w_b = dict(w=w, b=b)

print(working_w_b)

{'w': [array([[-1.05005165, -1.25493323, -2.17923395],
       [-1.89874884, -0.29496835, -0.53262792]]), array([[ 0.59964667, -1.32177852, -0.368207  ],
       [-1.02396719,  0.81586807, -0.68092178],
       [-1.22883445, -1.07504155, -3.85775008]]), array([[-0.8679731 ,  0.33597774],
       [ 1.63709093,  0.08803059],
       [ 0.94935286, -3.12693656]]), array([[ 1.01462831],
       [-3.42823431]])], 'b': [array([[-0.9538185 ,  0.24041208,  0.57860038]]), array([[ 0.28061212, -0.11642197, -0.08085342]]), array([[-0.19291753,  1.89210103]]), array([[-0.65441656]])]}


In [10]:
# Point the global parameters back at the saved snapshot.
w, b = working_w_b["w"], working_w_b["b"]

# Re-run the four XOR inputs with the restored parameters.
for probe in ([1, 0], [0, 1], [0, 0], [1, 1]):
    print(forward_prop(np.array(probe)))

# Dump the restored parameters: weights first, then biases.
for params in (w, b):
    print(params)

[[0.68057903]]
[[1.3514689]]
[[0.2796302]]
[[0.37055563]]
[array([[-1.05005165, -1.25493323, -2.17923395],
       [-1.89874884, -0.29496835, -0.53262792]]), array([[ 0.59964667, -1.32177852, -0.368207  ],
       [-1.02396719,  0.81586807, -0.68092178],
       [-1.22883445, -1.07504155, -3.85775008]]), array([[-0.8679731 ,  0.33597774],
       [ 1.63709093,  0.08803059],
       [ 0.94935286, -3.12693656]]), array([[ 1.01462831],
       [-3.42823431]])]
[array([[-0.9538185 ,  0.24041208,  0.57860038]]), array([[ 0.28061212, -0.11642197, -0.08085342]]), array([[-0.19291753,  1.89210103]]), array([[-0.65441656]])]
