# All Optimizers From Scratch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [72]:
# plt.rcParams["animation.html"]='jshtml'
%matplotlib notebook
# %matplotlib inline

# All Optimizers

## Moving Average

### Creating Simple data using Cos function

In [73]:
X=np.array(np.linspace(3.9,8.7,100))
Cos=np.cos(X)

In [74]:
plt.scatter(X,Cos)
plt.plot(X,Cos,c="r")

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x13e69a57d90>]

### Adding some Noise to the data 

In [75]:
Noise=Cos+np.random.rand(100)
Cos=Cos+0.5

In [77]:
plt.scatter(X,Cos,c='g')
plt.plot(X,Cos,c="r")
plt.scatter(X,Noise)
plt.plot(X,Noise,c="y")

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x13e69a5ef70>]

### As we can see the fluctuation is very high in the above dataset (" Considering it as gradient")

### We can tackle this fluctuation by taking moving average of the gradient

In [78]:
# Function to calculate moving average

def mov(x,beta):   # Beta should be between 0 and 1.(0.9 us prefered)
    new=[]
    old=0
    for i in range(len(x)):
        val=(beta*old)+(1-beta)*(x[i])
        new.append(val)
        old=val
    return np.array(new)

In [79]:
mov_avg=mov(Noise,0.7)

In [80]:
# Plotting the moving average

plt.scatter(X,Cos)
plt.plot(X,Cos,c="r")
plt.scatter(X,Noise)
plt.plot(X,mov_avg,c='y')


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x13e6a6eb5e0>]

### After taking moving average the fluctuation has been reduced

# 1. SGD-Optimizer

In [112]:
def fx2(x):
    return 2*x + 3
  
x = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [113]:
def SGD(X,y,lr,iteration):
    
    start_time= timeit.default_timer()
    
    weight=0
    bias=0
    n=len(y)
    np.random.seed(20)
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    for i in range(iteration):
        for j in range(len(X)):
            random_data=np.random.randint(len(X))
            YP=np.dot(X[random_data],weight)+bias
            cost=(y[random_data]-YP)**2
            dw=-2*(np.dot((y[random_data]-YP),(X[random_data])))
            dc=-2*(y[random_data]-YP)
            weight=weight-(lr*dw)
            bias=bias-(lr*dc)
            W_list.append(weight)
            b_list.append(bias)
            cost_list.append(cost)
#         ax.scatter(W_list,cost_list)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost

In [83]:
def predict(X,W,b):
    return X*W+b

In [114]:
SGD(x,y,0.01,30)

<IPython.core.display.Javascript object>

Time taken to execute:  3.9560821000000033  Sec


(1.9999999978696628, 3.000000000147214, 2.2538129379070827e-18)

# 2.Mini-Batch SGD

In [86]:
def fx2(x):
    return 2*x + 3
  
X = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [87]:
def create_Mini_batch(X,y,batch_size):
    
    if len(X.shape)<2:
        data=np.hstack((X.reshape(-1,1),y.reshape(-1,1)))
    else:
        data=np.hstack((X,y))
    random_data=np.random.shuffle(data)
    mini_batch=[]
    for i in range(0,data.shape[0],batch_size):
        mini_x=data[i:i+batch_size,:-1]
        mini_y=data[i:i+batch_size,-1]
        mini_batch.append([mini_x.flatten(),mini_y])
    return mini_batch

In [88]:
import time
def minibatch_SGD(X,y,lr,iteration,batch_size):
    
    start_time= timeit.default_timer()
    
    weight=0
    bias=0
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    mini_batch=create_Mini_batch(X,y,batch_size)
    for i in range(iteration):
        
        for minibatch in mini_batch:
            n=len(minibatch[0])
            YP=np.dot(minibatch[0],weight)+bias
            cost=sum((minibatch[1]-YP)**2)
            dw=(-2/n)*(np.dot((minibatch[1]-YP),minibatch[0]))
            dc=(-2/n)*(sum((minibatch[1]-YP)))
            weight=weight-(lr*dw)
            bias=bias-(lr*dc)
            W_list.append(weight)
            b_list.append(bias)
            cost_list.append(cost)
#         ax.scatter(W_list,cost_list)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost
def predict(X,W,b):
    return np.dot(X,W)+b

In [111]:
minibatch_SGD(X,y,0.01,30,5)

<IPython.core.display.Javascript object>

Time taken to execute:  3.695099700000128  Sec


(1.9672768605335273, 2.999764103899676, 0.0017681413846275297)

# 3. SGD with Momentum

y_pred=predict(x,w,b)

In [90]:
def fx2(x):
    return 2*x + 3
  
x = np.linspace(-1, 1, 100)
y = [fx2(x) for x in x]

In [91]:
def SGD_Momentum(X,y,lr,iteration,beta):
    
    start_time= timeit.default_timer()
    
    weight=0
    bias=0
    n=len(y)
    np.random.seed(20)
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    V_W,V_b=0,0
    for i in range(iteration):
        for j in range(n):
            random_data=np.random.randint(len(X))
            YP=(X[random_data]*weight)+bias
            cost=(y[random_data]-YP)**2
            dw=-2*((y[random_data]-YP)*(X[random_data]))
            db=-2*(y[random_data]-YP)
            V_W=(beta*V_W)+((1-beta)*dw)
            V_b=(beta*V_b)+((1-beta)*db)
            weight=weight-lr*V_W
            bias=bias-lr*V_b
            W_list.append(weight.item())
            b_list.append(bias.item())
            cost_list.append(cost)
#         ax.scatter(W_list,cost_list)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return  weight,bias,cost

def predict(X,W,b):
    return X*W+b


In [110]:
SGD_Momentum(x,y,0.01,30,0.9)

<IPython.core.display.Javascript object>

Time taken to execute:  3.5620283000000654  Sec


(1.999999999487017, 3.0000000000636513, 1.467472335390134e-19)

# 4.Mini_Batch With Momantum

In [93]:
def fx2(x):
    return 2*x + 3
  
x = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [94]:
import time

def create_Mini_batch(X,y,batch_size):
    
    if len(X.shape)<2:
        data=np.hstack((X.reshape(-1,1),y.reshape(-1,1)))
    else:
        data=np.hstack((X,y))
    random_data=np.random.shuffle(data)
    mini_batch=[]
    for i in range(0,data.shape[0],batch_size):
        mini_x=data[i:i+batch_size,:-1]
        mini_y=data[i:i+batch_size,-1]
        mini_batch.append([mini_x.flatten(),mini_y])
    return mini_batch

def minibatch_SGD_withM(X,y,lr,iteration,batch_size,beta=0.9):
    
    start_time= timeit.default_timer()
    
    if len(X.shape)<2:
        weight=0
        bias=0
    else:
        weight=np.zeros(X.shape[1])
        bias=np.zeros(0)
    
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    mini_batch=create_Mini_batch(X,y,batch_size)
    V_W,V_b=0,0
    for i in range(iteration):                  
        for minibatch in mini_batch:
            
            n=len(minibatch[0])
            
            YP=minibatch[0]*weight+bias
            
            cost=sum((minibatch[1]-YP)**2)
            
            dw=(-2/n)*(sum((minibatch[1]-YP)*minibatch[0]))
            db=(-2/n)*(sum((minibatch[1]-YP)))
            
            V_W=(beta*V_W)+(1-beta)*dw
            V_b=(beta*V_b)+(1-beta)*db
            
            weight=weight-lr*V_W
            bias=bias-lr*V_b
            
            cost_list.append(cost)
            W_list.append(weight)
            b_list.append(bias)
#         ax.scatter(W_list,cost_list)
#         ax.plot(cost)
        ax.plot(b_list)
        ax.plot(W_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost

def predict(X,W,b):
    return X*W+b

In [109]:
minibatch_SGD_withM(x,y,0.01,30,10)

<IPython.core.display.Javascript object>

Time taken to execute:  3.3691555000000335  Sec


(1.76091158434418, 2.9985694878217557, 0.15816285407734465)

# 5. Adagrad Optimizer

In [96]:
def fx2(x):
    return 2*x + 3
  
x = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [107]:
import time

def create_Mini_batch(X,y,batch_size):
    
    if len(X.shape)<2:
        data=np.hstack((X.reshape(-1,1),y.reshape(-1,1)))
    else:
        data=np.hstack((X,y))
    random_data=np.random.shuffle(data)
    mini_batch=[]
    for i in range(0,data.shape[0],batch_size):
        mini_x=data[i:i+batch_size,:-1]
        mini_y=data[i:i+batch_size,-1]
        mini_batch.append([mini_x.flatten(),mini_y])
    return mini_batch

def Adagrad(X,y,lr,iteration,batch_size,eps=0.000000000000001):

    start_time= timeit.default_timer()
    
    if len(X.shape)<2:
        weight=0
        bias=0
    else:
        weight=np.zeros(X.shape[1])
        bias=np.zeros(0)
    
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    mini_batch=create_Mini_batch(X,y,batch_size)
    G_W,G_b=0,0
    for i in range(iteration):
        for minibatch in mini_batch:
            n=len(minibatch[0])
            YP=minibatch[0]*weight+bias
            cost=sum((minibatch[1]-YP)**2)
            dw=(-2/n)*(sum((minibatch[1]-YP)*minibatch[0]))
            db=(-2/n)*(sum((minibatch[1]-YP)))
            G_W+=dw*dw
            G_b+=db*dw
            weight=weight-((lr/np.sqrt(G_W + eps))*dw)
            bias=bias-((lr/np.sqrt(G_b + eps))*db)
            cost_list.append(cost)
            W_list.append(weight)
            b_list.append(bias)
#         ax.scatter(W_list,cost_list)

# #         ax.plot(cost)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost
def predict(X,W,b):
    return X*W+b

In [108]:
Adagrad(x,y,0.5,30,30)

<IPython.core.display.Javascript object>

Time taken to execute:  3.261312000000089  Sec


(1.9999999973329539, 3.0000000001004183, 2.510429392566534e-17)

# 6. RMSProb Optimizer

In [99]:
def fx2(x):
    return 2*x + 3
  
x = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [100]:
import time

def create_Mini_batch(X,y,batch_size):
    
    if len(X.shape)<2:
        data=np.hstack((X.reshape(-1,1),y.reshape(-1,1)))
    else:
        data=np.hstack((X,y))
    random_data=np.random.shuffle(data)
    mini_batch=[]
    for i in range(0,data.shape[0],batch_size):
        mini_x=data[i:i+batch_size,:-1]
        mini_y=data[i:i+batch_size,-1]
        mini_batch.append([mini_x.flatten(),mini_y])
    return mini_batch

def RMSPROB(X,y,lr,iteration,batch_size,beta=0.95,eps=0.01):
    
    start_time= timeit.default_timer()
    if len(X.shape)<2:
        weight=0
        bias=0
    else:
        weight=np.zeros(X.shape[1])
        bias=np.zeros(0)
    
    W_list,b_list,cost_list=[],[],[]
    fig=plt.figure()
    ax=fig.add_subplot(111)
    mini_batch=create_Mini_batch(X,y,batch_size)
    G_W,G_b=0,0
    for i in range(iteration):
        for minibatch in mini_batch:
            n=len(minibatch[0])
            YP=minibatch[0]*weight+bias
            cost=sum((minibatch[1]-YP)**2)
            dw=(-2/n)*(sum((minibatch[1]-YP)*minibatch[0]))
            db=(-2/n)*(sum((minibatch[1]-YP)))
            G_W=beta*G_W + (1-beta)*(dw**2)
            G_b=beta*G_b + (1-beta)*(db**2)
            weight=weight-((lr/np.sqrt(G_W + eps))*dw)
            bias=bias-((lr/np.sqrt(G_b + eps))*db)
            cost_list.append(cost)
            W_list.append(weight)
            b_list.append(bias)
#         ax.scatter(W_list,cost_list)
#         ax.plot(cost)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost
def predict(X,W,b):
    return X*W+b

In [106]:
RMSPROB(x,y,0.1,30,30)

<IPython.core.display.Javascript object>

Time taken to execute:  3.4384788000002118  Sec


(2.0, 3.0, 0.0)

# 7. Adam Optimizer

In [102]:
def fx2(x):
    return 2*x + 3
  
x = np.array(np.linspace(-1, 1, 100))
y = np.array([fx2(x) for x in x])

In [103]:
import timeit
def create_Mini_batch(X,y,batch_size):
    
    if len(X.shape)<2:
        data=np.hstack((X.reshape(-1,1),y.reshape(-1,1)))
    else:
        data=np.hstack((X,y))
    random_data=np.random.shuffle(data)
    mini_batch=[]
    for i in range(0,data.shape[0],batch_size):
        mini_x=data[i:i+batch_size,:-1]
        mini_y=data[i:i+batch_size,-1]
        mini_batch.append([mini_x.flatten(),mini_y])
    return mini_batch

def Adam(X,y,lr,iteration,batch_size,beta1=0.90,beta2=0.95,eps=0.01):
    
    start_time= timeit.default_timer()
    if len(X.shape)<2:
        weight=0
        bias=0
    else:
        weight=np.zeros(X.shape[1])
        bias=np.zeros(0)
    
    W_list,b_list,cost_list=[],[],[]
    
    fig=plt.figure()
    ax=fig.add_subplot(111)
# Creating batch
    mini_batch=create_Mini_batch(X,y,batch_size)
    
# Initializing Velocity
    G_W,G_b=0,0
    V_W,V_b=0,0
    
    for i in range(iteration):
        for minibatch in mini_batch:
            n=len(minibatch[0])
            YP=minibatch[0]*weight+bias
            cost=sum((minibatch[1]-YP)**2)
            dw=(-2/n)*(sum((minibatch[1]-YP)*minibatch[0]))
            db=(-2/n)*(sum((minibatch[1]-YP)))
            
# Calculating Velocity
            V_W=(beta1*V_W)+(1-beta1)*dw
            V_b=(beta1*V_b)+(1-beta1)*db
# Bias Correction

#             V_b=V_b/(1-beta1**i+1)
#             V_W=V_W/(1-beta1**i+1)
        
# Calculating Mov avg for RMSprob
            G_W=beta2*G_W + (1-beta2)*(dw**2)
            G_b=beta2*G_b + (1-beta2)*(db**2)
        
# Bias Correction
#             G_W=G_W/(1-beta2**i+1)
#             G_b=G_b/(1-beta2**i+1)
            
            weight=weight-((lr/np.sqrt(G_W + eps))*V_W)
            bias=bias-((lr/np.sqrt(G_b + eps))*V_b)
            cost_list.append(cost)
            W_list.append(weight)
            b_list.append(bias)
#         ax.scatter(W_list,cost_list)

#         ax.plot(cost)
        ax.plot(W_list)
        ax.plot(b_list)
        fig.canvas.draw()
    End_time=timeit.default_timer()
    total=End_time-start_time
    print("Time taken to execute: ",str(total)," Sec")
    return weight,bias,cost

def predict(X,W,b):
    return X*W+b

In [105]:
Adam(x,y,0.1,30,30)

<IPython.core.display.Javascript object>

Time taken to execute:  3.880620800000088  Sec


(1.9991263597318727, 3.0090851734582436, 0.0005713016517957925)