## Adam Optimizer

In [29]:
# Add custom gradient calculation
# Make it object oriented
# amsgrad -> Done
# maximize -> Done
# Jitter -> Done
import typing
import numpy as np
def AdamOptim(theta0,
              objective_function_grad, 
              lr : float = 0.001,
              m0 : float = 0.0, 
              v0 : float = 0.0,  
              betta1 : float = 0.9, 
              betta2 : float = 0.999,  
              epochs : int = 1000,
              amsgrad = True,
              vtHatmax : float = 0, 
              maximize = False):
    """
    Requirement to start the algorithm:
    lr: Step size (Learning rate)
    Betta1 and betta2 -> Exponential decay rates for the moment estimates (usually in [0,1))
    f(theta): Stochastic objective function with parameter theta
    # Initializations variables:
    theta0 -> Initial parameter vector
    m0 -> Initialize first moment vector -> 0
    v0 -> Initialize second moment vector -> 0
    t -> Initialize timestep -> 0
    epochs -> number of iterations
    """
    # Start Algorithm
    # Goal --> This is one dimentional (I have to apply it to higher dimentions)
    jitter = 1e-8
    theta = np.zeros((epochs, 1))   # The init vector of theta
    gt = np.zeros((epochs, 1))      # The init vector of gradient
    mt = np.zeros((epochs, 1))      # The init vector of first moment
    vt = np.zeros((epochs, 1))      # The init vector of second moment
    mtHat = np.zeros((epochs, 1))   # The init vector of modified first moment
    vtHat = np.zeros((epochs, 1))   # The init vector of modified second moment
    # Initialize the model
    theta[0] = theta0
    gt[0] = -objective_function_grad(theta[0]) if maximize else objective_function_grad(theta[0])
    mt[0] = m0
    vt[0] = v0
    for t in range(1, epochs):
        mt[t] = betta1 * mt[t-1] + (1 - betta1) * gt[t-1] # Update biased first moment estimate
        vt[t] = betta2 * vt[t-1] + (1 - betta2) * np.dot(gt[t-1], gt[t-1]) # Update biased second raw moment estimate
        mtHat[t] = mt[t] / (1 - betta1) # Compute bias-corrected first moment estimate
        vtHat[t] = vt[t] / (1 - betta2) # Compute bias-corrected second raw moment estimate
        # Start to update the input vector
        if amsgrad:
            vtHatmax = max(vtHat[t], vtHatmax)
            theta[t] = theta[t-1] - lr * (mtHat[t]/(np.sqrt(vtHatmax) + jitter))
        else:
            theta[t] = theta[t-1] - lr * (mtHat[t]/(np.sqrt(vtHat[t]) + jitter))
        # Update the gradient function
        gt[t] = -objective_function_grad(theta[t]) if maximize else objective_function_grad(theta[t])
    return theta

In [30]:
def objective(x):
    return x**2

def gradobjective(x):
    return 2*x

theta = AdamOptim(3, gradobjective)
# Here, I can show it on 

In [31]:
theta

array([[3.        ],
       [2.999     ],
       [2.99765617],
       [2.99609081],
       [2.99437012],
       [2.99253709],
       [2.9906221 ],
       [2.98864773],
       [2.98663121],
       [2.984586  ],
       [2.98252273],
       [2.9804499 ],
       [2.97837431],
       [2.97630143],
       [2.97423568],
       [2.97218059],
       [2.97013902],
       [2.96811321],
       [2.96610496],
       [2.96411565],
       [2.96214633],
       [2.96019781],
       [2.95827064],
       [2.95636518],
       [2.95448167],
       [2.95262018],
       [2.95078071],
       [2.94896313],
       [2.94716727],
       [2.94539289],
       [2.94363972],
       [2.94190741],
       [2.94019562],
       [2.93850398],
       [2.93683209],
       [2.93517955],
       [2.93354594],
       [2.93193085],
       [2.93033385],
       [2.92875453],
       [2.92719247],
       [2.92564727],
       [2.9241185 ],
       [2.92260578],
       [2.92110871],
       [2.91962691],
       [2.91816   ],
       [2.916

## AdaGrad

In [51]:
# Etta
# Delta
# Initial Point
# Grad
# Epochs
# This is AdaGrad with diagonal matrices
def AdaGrad(theta0,
            objective_function_grad, 
            lr : float = 0.001,
            etta : float = 0.0, 
            lanbda : float = 0.0,  
            epochs : int = 1000,
            jitter : float = 1e-8,
            maximize = False):
    theta = np.zeros((epochs, 1))   # The init vector of theta
    gt = np.zeros((epochs, 1))      # The init vector of gradient
    state_sum = np.zeros((epochs, 1))
    # Initialize the model
    theta[0] = theta0
    gt[0] = -objective_function_grad(theta[0]) if maximize else objective_function_grad(theta[0])
    lrHat = 0
    for t in range(1, epochs):
        # Modify learning rate to capture the decay
        lrHat += lr / (1 + (t-1) * etta)
        # Find the grad
        
        # Modify the gradient
        if lanbda != 0:
            gt[t] = gt[t] + lanbda * theta[t]

        state_sum[t] += np.dot(gt[t-1], gt[t-1])
        theta[t] = theta[t-1] - lrHat * gt[t-1] / (np.sqrt(state_sum[t]) + jitter)
        gt[t] = -objective_function_grad(theta[t]) if maximize else objective_function_grad(theta[t])
    return theta

In [52]:
theta = AdaGrad(3, gradobjective, epochs=10)
print(theta)

corrector=  [1.e-08]
corrector=  [6.00000001]
corrector=  [5.99800001]
corrector=  [5.99400001]
corrector=  [5.98800001]
corrector=  [5.98000001]
corrector=  [5.97000001]
corrector=  [5.95800001]
corrector=  [5.94400001]
[[3.   ]
 [2.999]
 [2.997]
 [2.994]
 [2.99 ]
 [2.985]
 [2.979]
 [2.972]
 [2.964]
 [2.955]]


## RMSProp

## A basic example

## Develope a DNN model

## Results