# GD with momentum

### $v_t = \gamma \, v_{t-1} + \eta \frac{dJ}{dW}, \qquad W \leftarrow W - v_t$

In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Training hyper-parameters: step size and number of passes over the data.
lr, epochs = 0.01, 5000

# XOR truth table: one sample per row of X, matching target in the same row of y.
X = np.array([
    [0, 1],
    [1, 0],
    [1, 1],
    [0, 0],
])  # shape (4, 2)
y = np.array([1, 1, 0, 0]).reshape(-1, 1)  # shape (4, 1)

# Layer widths: input -> hidden -> output.
num_inputs, num_hidden, num_output = 2, 5, 1

In [3]:
def compute_gradient_mmnt(X, y, lr, epochs, gamma=0.9, hidden_units=None):
    """Train a one-hidden-layer sigmoid network with momentum gradient descent.

    The network is ``y_hat = sigmoid(sigmoid(X @ Wxh) @ Why)`` and the cost is
    the half sum of squared errors, ``J = 0.5 * sum((y - y_hat) ** 2)``.
    Each epoch is one full-batch step::

        v = gamma * v + lr * dJ/dW
        W = W - v

    Both bias rows are created as zeros and are never updated, so the
    returned weight matrices fully describe the trained model.

    Args:
        X: Inputs, one sample per row, shape ``(n_samples, n_inputs)``.
        y: Targets, shape ``(n_samples, n_outputs)``. A 1-D array is treated
            as a single output column.
        lr: Learning rate (eta in the update rule).
        epochs: Number of full-batch updates to perform.
        gamma: Momentum coefficient applied to the previous velocity.
        hidden_units: Width of the hidden layer. When omitted, the
            module-level ``num_hidden`` setting is used.

    Returns:
        A tuple ``(Wxh, Why, cost)`` with the input->hidden weights, the
        hidden->output weights, and a list holding the cost measured at the
        start of every epoch (one array of shape ``(n_outputs,)`` per epoch).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    if y.ndim == 1:
        # A flat target vector would broadcast against the (n, 1) prediction
        # into an (n, n) error matrix; make it an explicit column instead.
        y = y.reshape(-1, 1)

    # Layer sizes follow the data, so the function cannot disagree with it.
    n_inputs = X.shape[1]
    n_outputs = y.shape[1]
    if hidden_units is None:
        hidden_units = num_hidden

    # Draw order (Why first, then Wxh) is kept so that seeded runs start from
    # the same initial weights as before.
    Why = np.random.randn(hidden_units, n_outputs)
    Wxh = np.random.randn(n_inputs, hidden_units)

    # Biases stay at zero for the whole run (they are not trained).
    bx = np.zeros((1, hidden_units))
    by = np.zeros((1, n_outputs))

    def sigmoid(z):
        # tanh form of 1 / (1 + exp(-z)): same function, but it never
        # overflows, however large |z| gets.
        return 0.5 * (1.0 + np.tanh(0.5 * z))

    vxh = np.zeros_like(Wxh)
    vhy = np.zeros_like(Why)
    cost = []
    for _ in range(epochs):
        # Forward pass.
        a1 = sigmoid(np.dot(X, Wxh) + bx)
        y_hat = sigmoid(np.dot(a1, Why) + by)

        # Backward pass. sigmoid'(z) is written as s * (1 - s) using the
        # activations already computed; the exp(z) / (1 + exp(z))**2 form
        # evaluates to inf / inf = nan once z is large.
        error = y_hat - y

        # dJ/dWhy = dJ/dy_hat . dy_hat/dz2 . dz2/dWhy
        delta2 = error * y_hat * (1.0 - y_hat)
        dJ_dWhy = np.dot(a1.T, delta2)

        # dJ/dWxh = dJ/dy_hat . dy_hat/dz2 . dz2/da1 . da1/dz1 . dz1/dWxh
        delta1 = np.dot(delta2, Why.T) * a1 * (1.0 - a1)
        dJ_dWxh = np.dot(X.T, delta1)

        # Momentum update: blend the previous velocity with the new gradient.
        vxh = gamma * vxh + lr * dJ_dWxh
        vhy = gamma * vhy + lr * dJ_dWhy
        Wxh -= vxh
        Why -= vhy

        # Half sum of squared errors per output column, for the prediction
        # made before this epoch's update.
        cost.append(0.5 * np.sum(error ** 2, axis=0))

    return Wxh, Why, cost
        



In [4]:
# Train on the XOR data with the default momentum coefficient (gamma=0.9);
# keeps both weight matrices and the per-epoch cost history.
Wxh,Why,cost = compute_gradient_mmnt(X,y,lr,epochs)

In [5]:
# Show the configured number of training epochs.
print(epochs)

5000


In [6]:
# Dump the recorded cost history (one entry appended per epoch).
print(cost)

[array([0.66705021]), array([0.66644544]), array([0.66529626]), array([0.66365694]), array([0.66157647]), array([0.65909925]), array([0.65626571]), array([0.65311289]), array([0.64967494]), array([0.6459835]), array([0.64206812]), array([0.63795653]), array([0.63367495]), array([0.62924822]), array([0.62470009]), array([0.62005325]), array([0.61532949]), array([0.61054979]), array([0.6057343]), array([0.60090242]), array([0.59607278]), array([0.59126323]), array([0.58649079]), array([0.58177162]), array([0.57712098]), array([0.57255315]), array([0.56808137]), array([0.56371782]), array([0.55947352]), array([0.55535832]), array([0.55138083]), array([0.54754842]), array([0.54386722]), array([0.54034208]), array([0.53697661]), array([0.53377322]), array([0.53073314]), array([0.5278565]), array([0.52514236]), array([0.52258885]), array([0.52019317]), array([0.51795176]), array([0.51586036]), array([0.5139141]), array([0.51210761]), array([0.5104351]), array([0.50889047]), array([0.50746739

In [8]:
# Plot the training curve. The x-axis is taken from the recorded history
# rather than from `epochs`, so the plot still works if `epochs` was changed
# after training. Labels are set before plotting so that the plot call stays
# the last expression of the cell.
plt.grid()
plt.xlabel("epoch")
plt.ylabel("cost J")
plt.plot(range(len(cost)), cost)

[<matplotlib.lines.Line2D at 0x7f2893ae1460>]

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=748135b5-3703-429b-8a4d-909df1b7e750' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>