## Disclaimer
Implementation of the gradient descent classes, built on top of the classes previously written for sampling.
I kept the two base classes Drawing and DrawingMethod and used a structure similar to that of the SamplingMethods.

In [None]:
#<api>
import numpy as np
from jupyter_cms.loader import load_notebook

%matplotlib inline

In [None]:
# <api>
import os
# Derive the parent of the current working directory by dropping the last
# '/'-separated component of the path.
path = os.getcwd()
s = '/'
pardir = s.join(path.split(s)[:-1])

# Load the sibling notebooks (expected under <pardir>/widgets/) as modules.
widget_targets = load_notebook(pardir + '/widgets/Widget_targets.ipynb')
widget_methods = load_notebook(pardir + '/widgets/Widget_methods.ipynb')

In [None]:
#<api>
class Target_GD(widget_targets.Target):
    '''
    Thin adapter around an existing target object for use in gradient descent.

    The wrapped target's gradient is negated (the targets are written for
    ascent), and an optional random perturbation is added so that stochastic
    gradient descent can be simulated.

    target         : the wrapped object; only its grad(theta) is used here
    stochastic     : multiplier applied to the random perturbation
                     (0 switches the perturbation off)
    stochastic_std : standard deviation of the normal perturbation
    '''
    def __init__(self, target, stochastic=0, stochastic_std=5):
        # The parent initialiser is deliberately not invoked here; only the
        # three attributes below are set.
        self.stochastic_std = stochastic_std
        self.stochastic = stochastic
        self.target = target

    def grad(self, theta):
        '''Return the negated gradient of the wrapped target at theta, plus scaled noise.'''
        flipped = -1 * self.target.grad(theta)
        # A single scalar sample is drawn on every call (even when
        # stochastic == 0), so the same offset is added to every component.
        # NOTE(review): per-component noise may have been intended - confirm.
        noise = np.random.normal(loc=0., scale=self.stochastic_std)
        return flipped + self.stochastic * noise

# GradientDescent

In [None]:
#<api>
class GradientDescent(widget_methods.Drawing):
    '''
    Base class for all gradient descent algorithms.

    Subclasses implement comp_update(theta), which returns the step that is
    subtracted from the current parameters. perform_gradient_descent() drives
    the iteration and records the visited parameters.

    Note: perform_gradient_descent() resets the recorded trajectory and the
    warning flag on every call, but it does not touch any additional state a
    subclass keeps between steps.
    '''
    def __init__(self, target=widget_targets.MultNorm(), learning_rate=0.1,
                 num_epochs=20, theta_start=None, stochastic=0):
        '''
        target        : target object, wrapped in Target_GD (negated gradient,
                        optional noise with a fixed std of 5)
        learning_rate : step size used by comp_update implementations
        num_epochs    : number of iterations (and recorded points) per run
        theta_start   : start point; a copy of default_start if None
        stochastic    : noise multiplier forwarded to Target_GD
        '''
        self.target_GD = Target_GD(target, stochastic, stochastic_std=5)

        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        # Set the start point from the arguments or by default
        self.default_start = np.array([1.5, -0.05])
        if theta_start is not None:
            self.theta_start = theta_start
        else:
            self.theta_start = np.copy(self.default_start)

        self.epochID = 0
        # list to store updated parameters
        self.thetas = []
        self.num_samples = 0
        self.accepted = 0
        # warning boolean, sent to widget
        self.warning = False

    def reset_start(self, x=None):
        '''
        reset parameter values to default values or to that
        specified by user
        '''
        if x is None:
            # Copy, as in __init__, so the stored default can never be
            # modified through theta_start.
            self.theta_start = np.copy(self.default_start)
        else:
            self.theta_start = x

    def __str__(self):
        '''
        Generic summary; subclasses override this with their own wording.
        (Returning None from __str__ would make str(obj) raise TypeError.)
        '''
        return "\n%s has performed %d steps with %d lying within the depicted boundaries." % (
            type(self).__name__, self.num_samples, self.accepted)

    def finalize_gradient_descent(self):
        '''
        remove those parameters outside of boundaries,
        set count values, store numpy array of thetas in self.thetas
        '''
        thetas = np.array(self.thetas)

        # number of computed steps
        self.num_samples = thetas.shape[0]

        # An empty run (num_epochs == 0) yields a 1-D empty array that has no
        # axis 1 to reduce over, so there is nothing to filter in that case.
        if thetas.size > 0:
            size = self.target_GD.target.get_size()
            # only keep those steps that lie within the boundaries of the target distribution
            inside = (np.max(thetas, axis=1) < size) & (np.min(thetas, axis=1) > -size)
            thetas = thetas[inside]

        # number of steps to be plotted
        self.accepted = thetas.shape[0]

        self.thetas = thetas

    def perform_gradient_descent(self):
        '''
        Run num_epochs steps starting at theta_start.

        Returns (thetas, warning): the numpy array of recorded parameters
        that lie inside the target boundaries, and a flag that is True if
        any recorded parameter was infinite.
        '''
        theta = self.theta_start

        # Start every run from clean bookkeeping. After a previous run
        # self.thetas is a numpy array (see finalize_gradient_descent), which
        # has no append(), and self.warning may still be set.
        self.thetas = []
        self.warning = False
        self.epochID = 0

        # perform updates; the loop variable is stored on self because
        # subclasses may read self.epochID inside comp_update
        for self.epochID in np.arange(self.num_epochs):

            # the point is recorded before it is updated, so the trajectory
            # starts at theta_start and holds num_epochs entries
            self.thetas.append(theta)

            # update parameters
            theta = theta - self.comp_update(theta)

        # catch for inf thetas due to inf in gradient
        if np.any(np.isinf(self.thetas)):
            self.warning = True

        self.finalize_gradient_descent()

        return self.thetas, self.warning

    def comp_update(self, theta):
        '''
        compute individual update for one step,
        must be implemented by each subclass
        '''
        raise NotImplementedError(
            "%s must implement comp_update(theta)" % type(self).__name__)

In [None]:
#<api>
class VanillaGD(GradientDescent):
    '''Plain gradient descent: each step is the learning rate times the gradient.'''

    def __str__(self):
        '''Summarise how many steps were computed and how many are kept for display.'''
        headline = "\nVanilla gradient descent has performed "
        counts = "%d steps with %d lieing within the depicted boundaries." % (
            self.num_samples, self.accepted)
        return headline + counts

    def comp_update(self, theta):
        '''Return learning_rate * gradient at theta (the caller subtracts it).'''
        return self.learning_rate * self.target_GD.grad(theta)

In [None]:
#<api>
class MomentumGD(GradientDescent):
    '''
    Gradient Descent with momentum.

    Each update is gamma times the previous update plus the learning rate
    times the current gradient; the update is remembered as the velocity.
    '''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, learning_rate=0.1,
                num_epochs=20, theta_start=None, stochastic=0):
        '''
        gamma : weight of the previous update (momentum term)
        All other arguments are forwarded to GradientDescent.
        '''
        self.gamma = gamma

        super(MomentumGD, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)

        # Size the velocity from the resolved start point rather than from
        # the raw argument: theta_start may be None, in which case
        # np.zeros_like would yield a 0-d array that only works by broadcasting.
        self.velocity = np.zeros_like(self.theta_start, dtype='float')

    def __str__(self):
        return "\nMomentum gradient descent has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)

    def comp_update(self, theta):
        '''Return the momentum update for theta and store it as the new velocity.'''
        grad = self.target_GD.grad(theta)

        update = self.gamma * self.velocity + self.learning_rate * grad

        self.velocity = update

        return update

In [None]:
#<api>
class NesterovGD(GradientDescent):
    '''
    Nesterov accelerated gradient descent.

    Like momentum, but the gradient is evaluated at the look-ahead position
    theta - gamma * velocity instead of at theta itself.
    '''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, learning_rate=0.1,
                num_epochs=20, theta_start=None, stochastic=0):
        '''
        gamma : weight of the previous update (momentum term)
        All other arguments are forwarded to GradientDescent.
        '''
        self.gamma = gamma

        super(NesterovGD, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)

        # Size the velocity from the resolved start point rather than from
        # the raw argument: theta_start may be None, in which case
        # np.zeros_like would yield a 0-d array that only works by broadcasting.
        self.velocity = np.zeros_like(self.theta_start, dtype='float')

    def __str__(self):
        return "\nNesterov gradient descent has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)

    def comp_update(self, theta):
        '''Return the Nesterov update for theta and store it as the new velocity.'''
        # gradient at the position the momentum term alone would lead to
        lookahead = theta - self.gamma * self.velocity

        update = self.gamma * self.velocity + \
                    self.learning_rate * self.target_GD.grad(lookahead)

        self.velocity = update

        return update

In [None]:
#<api>
class ADAGRAD(GradientDescent):
    '''
    Adagrad.

    Keeps a running sum of squared gradients per component and divides the
    learning rate by its square root (plus epsilon) for each step.
    '''

    def __init__(self, target=widget_targets.MultNorm(), epsilon=1E-8, learning_rate=0.01,
                num_epochs=20, theta_start=None, stochastic=0):
        '''
        epsilon : small constant added to the denominator to avoid division by zero
        All other arguments are forwarded to GradientDescent.
        '''
        self.epsilon = epsilon

        super(ADAGRAD, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)

        # Size the accumulator from the resolved start point rather than from
        # the raw argument: with theta_start=None np.zeros_like yields a 0-d
        # array, which cannot hold a per-component sum.
        self.past_sq_grad = np.zeros_like(self.theta_start, dtype='float')

    def __str__(self):
        # "\n" prefix as in the sibling classes (was the invalid escape "\A")
        return "\nADAGRAD has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)

    def comp_update(self, theta):
        '''Return the Adagrad update for theta and extend the squared-gradient sum.'''
        grad = self.target_GD.grad(theta)
        # rebinding (not +=) lets the sum broadcast to the gradient's shape
        self.past_sq_grad = self.past_sq_grad + np.power(grad, 2)

        update = (self.learning_rate/(np.sqrt(self.past_sq_grad) + self.epsilon)) * grad

        return update

In [None]:
#<api>
class RMSProp(GradientDescent):
    '''
    Root Mean Square propagation.

    Keeps an exponentially decaying average of squared gradients and divides
    the learning rate by its square root (plus epsilon) for each step.
    '''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, epsilon=1E-8, learning_rate=0.1,
                num_epochs=20, theta_start=None, stochastic=0):
        '''
        gamma   : decay factor of the squared-gradient average
        epsilon : small constant added to the denominator to avoid division by zero
        All other arguments are forwarded to GradientDescent.
        '''
        self.gamma = gamma
        self.epsilon = epsilon

        super(RMSProp, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)

        # Size the average from the resolved start point rather than from the
        # raw argument: theta_start may be None, in which case np.zeros_like
        # would yield a 0-d array that only works by broadcasting.
        self.avg_sq_grad = np.zeros_like(self.theta_start, dtype='float')

    def __str__(self):
        return "\nRMSProp has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)

    def comp_update(self, theta):
        '''Return the RMSProp update for theta and refresh the squared-gradient average.'''
        grad = self.target_GD.grad(theta)
        self.avg_sq_grad = self.gamma * self.avg_sq_grad + (1-self.gamma) * np.power(grad, 2)

        update = (self.learning_rate/(np.sqrt(self.avg_sq_grad) + self.epsilon)) * grad

        return update

In [None]:
#<api>
class ADAM(GradientDescent):
    '''
    Adaptive Moment Estimation.

    Keeps decaying averages of the gradient (first moment) and of the squared
    gradient (second moment), corrects both for their zero initialisation and
    steps along the corrected first moment scaled by the corrected second.
    '''

    def __init__(self, target=widget_targets.MultNorm(), beta1=0.9, beta2=0.999, epsilon=1E-8,
                 learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        '''
        beta1   : decay factor of the first-moment estimate
        beta2   : decay factor of the second-moment estimate
        epsilon : small constant added to the denominator to avoid division by zero
        All other arguments are forwarded to GradientDescent.
        '''
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        super(ADAM, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)

        # initialize moment estimates from the resolved start point rather
        # than from the raw argument: theta_start may be None, in which case
        # np.zeros_like would yield a 0-d array that only works by broadcasting
        self.est_mom_1 = np.zeros_like(self.theta_start, dtype='float')
        self.est_mom_2 = np.zeros_like(self.theta_start, dtype='float')

    def __str__(self):
        return "\nADAM has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)

    def comp_update(self, theta):
        '''Return the ADAM update for theta and refresh both moment estimates.'''
        grad = self.target_GD.grad(theta)

        self.est_mom_1 = self.beta1 * self.est_mom_1 + (1-self.beta1) * grad
        self.est_mom_2 = self.beta2 * self.est_mom_2 + (1-self.beta2) * np.power(grad, 2)

        # bias corrected decaying averages; self.epochID is the zero-based
        # index of the current step, hence the +1 in the exponent
        step = self.epochID + 1
        unbiased_est_mom_1 = self.est_mom_1/(1 - np.power(self.beta1, step))
        unbiased_est_mom_2 = self.est_mom_2/(1 - np.power(self.beta2, step))

        update = self.learning_rate/(np.sqrt(unbiased_est_mom_2) + self.epsilon) \
                                * unbiased_est_mom_1

        return update

# GradientDescentMethod

In [None]:
#<api>
class GradientDescentMethod(widget_methods.DrawingMethod):
    '''
    A parent class for different gradient methods, used
    specifically by animation widget. This class is required as
    a wrapper around the gradient descent classes to make
    resetting in the animation widget easier.

    The wrapper stores the settings; reset_gradient_descent() (implemented
    by each child class) builds the actual optimiser from them and stores it
    in self.gradient_descent.
    '''

    def __init__(self, target=widget_targets.MultNorm(), learning_rate=0.1,
                 num_epochs=20, theta_start=None, stochastic=0):
        '''
        target        : target object handed to the optimiser
        learning_rate : step size handed to the optimiser
        num_epochs    : number of iterations handed to the optimiser
        theta_start   : start point; [1.5, -0.05] if None
        stochastic    : noise multiplier handed to the optimiser
        '''
        self.target = target
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        if theta_start is None:
            self.theta_start = np.array([1.5, -0.05])
        else:
            # Keep a private float copy: set_param() writes single
            # coordinates in place, which must neither modify the caller's
            # array nor be truncated by an integer dtype.
            self.theta_start = np.array(theta_start, dtype=float)

        self.stochastic = stochastic

        self.reset_gradient_descent()

    def reset_gradient_descent(self):
        '''
        Resets the gradient descent object with given parameters.
        Must be implemented by every child class.
        '''
        pass

    def draw(self, start_point=None):
        '''
        Performs the actual iteration of gradient descent with
        the object that was specified in reset_gradient_descent().

        start_point is passed on to the optimiser's reset_start().
        Returns a dict with the keys 'accepted points', 'warning',
        'epochs' and 'learning_rate'.
        '''
        self.gradient_descent.reset_start(x=start_point)
        self.data, self.warning = self.gradient_descent.perform_gradient_descent()
        return {'accepted points' : self.data, 'warning' : self.warning,
                'epochs' : self.num_epochs, 'learning_rate': self.learning_rate}

    def __str__(self):
        return self.gradient_descent.__str__()

    def set_param(self, param_dict):
        '''
        Allows to set additional gradient descent parameters, given from extra_widget.

        Recognised keys: 'learning_rate', 'epochs', 'target', 'x', 'y',
        'gamma', 'beta1', 'beta2', 'stochastic'. Unknown keys are ignored.
        Only the stored settings change; the optimiser object itself is not
        rebuilt here.
        '''
        # widget key -> attribute that stores the value
        attribute_for = {
            'learning_rate': 'learning_rate',
            'epochs': 'num_epochs',
            'target': 'target',
            'gamma': 'gamma',
            'beta1': 'beta1',
            'beta2': 'beta2',
            'stochastic': 'stochastic',
        }
        # widget key -> coordinate of the start point
        coordinate_for = {'x': 0, 'y': 1}

        for key in param_dict:
            if key in attribute_for:
                setattr(self, attribute_for[key], param_dict[key])
            elif key in coordinate_for:
                self.theta_start[coordinate_for[key]] = param_dict[key]

In [None]:
#<api>
class VanillaGDM(GradientDescentMethod):
    '''Animation-widget wrapper that drives a VanillaGD optimiser.'''

    def reset_gradient_descent(self):
        '''Build a fresh VanillaGD from the currently stored settings.'''
        self.gradient_descent = VanillaGD(
            target=self.target,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )

In [None]:
#<api>
class MomentumGDM(GradientDescentMethod):
    '''Animation-widget wrapper that drives a MomentumGD optimiser.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9,
                 learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        '''Store gamma, then let the parent store the rest and build the optimiser.'''
        # gamma has to exist before the parent initialiser runs, because that
        # initialiser calls reset_gradient_descent(), which reads it
        self.gamma = gamma
        super(MomentumGDM, self).__init__(
            target=target,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            theta_start=theta_start,
            stochastic=stochastic
        )

    def reset_gradient_descent(self):
        '''Build a fresh MomentumGD from the currently stored settings.'''
        self.gradient_descent = MomentumGD(
            target=self.target,
            gamma=self.gamma,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )

In [None]:
#<api>
class NesterovGDM(GradientDescentMethod):
    '''Animation-widget wrapper that drives a NesterovGD optimiser.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9,
                        learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        '''Store gamma, then let the parent store the rest and build the optimiser.'''
        # gamma has to exist before the parent initialiser runs, because that
        # initialiser calls reset_gradient_descent(), which reads it
        self.gamma = gamma
        super(NesterovGDM, self).__init__(
            target=target,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            theta_start=theta_start,
            stochastic=stochastic
        )

    def reset_gradient_descent(self):
        '''Build a fresh NesterovGD from the currently stored settings.'''
        self.gradient_descent = NesterovGD(
            target=self.target,
            gamma=self.gamma,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )

In [None]:
#<api>
class ADAGRADM(GradientDescentMethod):
    '''Animation-widget wrapper that drives an ADAGRAD optimiser.'''

    def __init__(self, target=widget_targets.MultNorm(), epsilon=1E-8,
                         learning_rate=0.01, num_epochs=20, theta_start=None, stochastic=0):
        '''Store epsilon, then let the parent store the rest and build the optimiser.'''
        # epsilon has to exist before the parent initialiser runs, because
        # that initialiser calls reset_gradient_descent(), which reads it
        self.epsilon = epsilon
        super(ADAGRADM, self).__init__(
            target=target,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            theta_start=theta_start,
            stochastic=stochastic
        )

    def reset_gradient_descent(self):
        '''Build a fresh ADAGRAD from the currently stored settings.'''
        self.gradient_descent = ADAGRAD(
            target=self.target,
            epsilon=self.epsilon,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )

In [None]:
#<api>
class RMSPropM(GradientDescentMethod):
    '''Animation-widget wrapper that drives an RMSProp optimiser.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, epsilon=1E-8,
                         learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        '''Store gamma and epsilon, then let the parent store the rest and build the optimiser.'''
        # both values have to exist before the parent initialiser runs,
        # because that initialiser calls reset_gradient_descent(), which reads them
        self.gamma = gamma
        self.epsilon = epsilon
        super(RMSPropM, self).__init__(
            target=target,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            theta_start=theta_start,
            stochastic=stochastic
        )

    def reset_gradient_descent(self):
        '''Build a fresh RMSProp from the currently stored settings.'''
        self.gradient_descent = RMSProp(
            target=self.target,
            gamma=self.gamma,
            epsilon=self.epsilon,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )

In [None]:
#<api>
class ADAMM(GradientDescentMethod):
    '''Animation-widget wrapper that drives an ADAM optimiser.'''

    def __init__(self, target=widget_targets.MultNorm(), beta1=0.9, beta2=0.999, epsilon=1E-8,
                         learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        '''Store beta1, beta2 and epsilon, then let the parent store the rest and build the optimiser.'''
        # these values have to exist before the parent initialiser runs,
        # because that initialiser calls reset_gradient_descent(), which reads them
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        super(ADAMM, self).__init__(
            target=target,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            theta_start=theta_start,
            stochastic=stochastic
        )

    def reset_gradient_descent(self):
        '''Build a fresh ADAM from the currently stored settings.'''
        self.gradient_descent = ADAM(
            target=self.target,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            learning_rate=self.learning_rate,
            num_epochs=self.num_epochs,
            theta_start=self.theta_start,
            stochastic=self.stochastic
        )