## Disclaimer
Implementation of the gradient descent classes, reusing the classes previously written for sampling.
I kept the two base classes Drawing and DrawingMethod and used a structure similar to that of the SamplingMethods.

In [1]:
#<api>
import numpy as np
from jupyter_cms.loader import load_notebook

%matplotlib inline

In [2]:
# <api>
import os
# Build the path of the parent of the current working directory; the widget
# notebooks are looked up under <parent>/widgets/.
path = os.getcwd()
s = '/'
# Drop the last path component.
# NOTE(review): splits on '/' explicitly, so this assumes POSIX-style paths -
# confirm whether other platforms must be supported.
pardir = s.join(path.split(s)[:-1])

# Load source notebooks
# (widget_targets and widget_methods are used below as the namespaces that
# provide MultNorm, Drawing and DrawingMethod)
widget_targets = load_notebook(str(pardir + '/widgets/Widget_targets.ipynb'))
widget_methods = load_notebook(str(pardir + '/widgets/Widget_methods.ipynb'))

In [2]:
#<api>
class GradientDescent(widget_methods.Drawing):
    '''
    Base class for all gradient descent algorithms.

    The class drives the iteration, records every visited parameter vector
    and filters the trajectory for plotting. Child classes only provide the
    update rule by implementing ``comp_update``.

    Parameters:
        target: object providing ``grad(theta)`` and ``get_size()``.
        learning_rate: step size, used by the update rules of the children.
        num_epochs: number of update steps to perform.
        theta_start: starting parameters (list or array). It is never
            mutated by this class.
        stochastic: factor multiplying the Gaussian noise that ``grad`` adds
            to the gradient (0 switches the noise off).
    '''
    def __init__(self, target=widget_targets.MultNorm(), learning_rate=0.1, 
                 num_epochs=20, theta_start=[0,0], stochastic=0):
        self.target = target
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.theta_start = theta_start
        self.epochID = 0
        # initialize update vector
        self.update = np.zeros_like(theta_start, dtype='float')
        # list to store updated parameters
        self.thetas = []
        self.num_samples = 0
        self.accepted = 0
        # warning boolean, sent to widget
        self.warning = False
        self.stochastic = stochastic
        # standard deviation of the noise used for stochastic gradient descent
        self.stochastic_std = 5

    def __str__(self):
        # __str__ has to return a string; returning None makes str(obj) raise
        # a TypeError. Child classes override this with their own name.
        return "\nGradient descent has performed "+\
            "%d steps with %d lying within the depicted boundaries." % (self.num_samples, self.accepted)

    def finalize_gradient_descent(self):
        '''remove those parameters outside of boundaries, 
        set count values, store numpy array of thetas in self.thetas'''

        thetas = np.array(self.thetas)

        # without any recorded step the array is 1-d and the axis=1
        # reductions below would fail, so give it an explicit (0, dim) shape
        if thetas.size == 0:
            thetas = thetas.reshape(0, np.size(self.theta_start))

        # number of computed steps
        self.num_samples = thetas.shape[0]

        # only keep those steps that lie within the boundaries of the target distribution
        # (rows containing nan fail both comparisons and are dropped as well)
        size = self.target.get_size()
        inside = (np.max(thetas, axis=1) < size) & (np.min(thetas, axis=1) > -size)
        thetas = thetas[inside]

        # number of steps to be plotted
        self.accepted = thetas.shape[0]

        self.thetas = thetas

    def perform_gradient_descent(self):
        '''iterate through the epochs, starting at theta_start.

        Returns the tuple (thetas, warning): the array of recorded parameters
        that lie within the target boundaries, and a boolean that is True if
        any computed parameter was not finite.
        '''

        # Start every run from a clean state. finalize_gradient_descent turns
        # self.thetas into an array, so without this a second call would fail
        # on .append. Accumulators owned by child classes are not reset here.
        self.thetas = []
        self.update = np.zeros_like(self.theta_start, dtype='float')
        self.warning = False

        theta = self.theta_start

        # perform updates
        for self.epochID in np.arange(self.num_epochs):

            # the position is recorded before the step is taken
            self.thetas.append(theta)

            # compute update
            self.update = self.comp_update(theta)

            # update parameters
            theta = theta - self.update

        # catch inf thetas due to inf in gradient, and the nan values that
        # follow from them (e.g. inf - inf)
        if not np.all(np.isfinite(self.thetas)):
            self.warning = True

        self.finalize_gradient_descent()

        return self.thetas, self.warning

    def comp_update(self, theta):
        '''compute individual update for one step, must be implemented by child class'''
        raise NotImplementedError("comp_update must be implemented by the child class")

    def grad(self, theta):
        '''wrapper around target gradient method, specific for gradient descent objects
        as all target classes defined for ascent. Also adds Gaussian noise for 
        stochastic GD if self.stochastic = 1.'''
        # NOTE(review): the noise is one scalar draw, so every component of
        # the gradient receives the same value, and a number is drawn even
        # when self.stochastic is 0 - confirm that both are intended.
        return -1 * self.target.grad(theta) + self.stochastic * np.random.normal(loc=0., scale=self.stochastic_std)

In [3]:
#<api>
class VanillaGD(GradientDescent):
    '''Plain (batch) gradient descent: every step is the scaled gradient.'''

    def __str__(self):
        '''Summary of the last run: computed steps and steps inside the plot.'''
        counts = (self.num_samples, self.accepted)
        tail = "%d steps with %d lieing within the depicted boundaries." % counts
        return "\nVanilla (aka batch) gradient descent has performed " + tail

    def comp_update(self, theta):
        '''Return the step for theta: learning_rate times the gradient.'''
        return self.learning_rate * self.grad(theta)

In [None]:
#<api>
class MomentumGD(GradientDescent):
    '''Gradient descent with momentum: a fraction gamma of the previous
    update is added to the current gradient step.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, learning_rate=0.1,
                num_epochs=20, theta_start=[0,0], stochastic=0):
        # fraction of the previous update carried over into the next one
        self.gamma = gamma

        super(MomentumGD, self).__init__(target=target,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         theta_start=theta_start,
                                         stochastic=stochastic)

    def __str__(self):
        '''Summary of the last run: computed steps and steps inside the plot.'''
        counts = (self.num_samples, self.accepted)
        tail = "%d steps with %d lieing within the depicted boundaries." % counts
        return "\nMomentum gradient descent has performed " + tail

    def comp_update(self, theta):
        '''Return gamma * previous update + learning_rate * gradient at theta.'''
        carried_over = self.gamma * self.update
        gradient_step = self.learning_rate * self.grad(theta)
        return carried_over + gradient_step

In [None]:
#<api>
class NesterovGD(GradientDescent):
    '''Nesterov accelerated gradient: like momentum, but the gradient is
    evaluated at the position reached after applying the carried-over update.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, learning_rate=0.1,
                num_epochs=20, theta_start=[0,0], stochastic=0):
        # fraction of the previous update carried over into the next one
        self.gamma = gamma

        super(NesterovGD, self).__init__(target=target,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         theta_start=theta_start,
                                         stochastic=stochastic)

    def __str__(self):
        '''Summary of the last run: computed steps and steps inside the plot.'''
        counts = (self.num_samples, self.accepted)
        tail = "%d steps with %d lieing within the depicted boundaries." % counts
        return "\nNesterov gradient descent has performed " + tail

    def comp_update(self, theta):
        '''Return gamma * previous update + learning_rate * gradient taken at
        the look-ahead position theta - gamma * previous update.'''
        carried_over = self.gamma * self.update
        look_ahead = theta - carried_over
        return carried_over + self.learning_rate * self.grad(look_ahead)

In [None]:
#<api>
class ADAGRAD(GradientDescent):
    '''Adagrad: the learning rate of every parameter is divided by the square
    root of the sum of all its past squared gradients.'''
    
    def __init__(self, target=widget_targets.MultNorm(), epsilon=1E-8, learning_rate=0.01,
                num_epochs=20, theta_start=[0,0], stochastic=0):

        # small constant that keeps the denominator away from zero
        self.epsilon = epsilon
        # running sum of squared gradients, one entry per parameter
        self.past_sq_grad = np.zeros_like(theta_start, dtype='float')
        
        super(ADAGRAD, self).__init__(target, learning_rate, num_epochs, theta_start, stochastic)
    
    def __str__(self):
        # "\n", as in the sibling classes; the former "\A" is not a valid
        # escape sequence and printed a literal backslash
        return "\nADAGRAD has performed "+\
            "%d steps with %d lieing within the depicted boundaries." % (self.num_samples, self.accepted)
            
    def comp_update(self, theta):
        '''Accumulate the squared gradient at theta and return the update
        learning_rate / (sqrt(accumulated squares) + epsilon) * gradient.'''
            
        grad = self.grad(theta)
        
        # the sum only grows, so the effective step size shrinks over time
        self.past_sq_grad += np.power(grad, 2)
        
        update = (self.learning_rate/(np.sqrt(self.past_sq_grad) + self.epsilon)) * grad
            
        return update

In [None]:
#<api>
class RMSProp(GradientDescent):
    '''Root Mean Square propagation: the learning rate of every parameter is
    divided by the root of a decaying average of its squared gradients.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, epsilon=1E-8, learning_rate=0.1,
                num_epochs=20, theta_start=[0,0], stochastic=0):  
        # decaying average of the squared gradients, one entry per parameter
        self.avg_sq_grad = np.zeros_like(theta_start, dtype='float')
        # decay factor of that average
        self.gamma = gamma
        # small constant added to the denominator of the update
        self.epsilon = epsilon

        super(RMSProp, self).__init__(target=target,
                                      learning_rate=learning_rate,
                                      num_epochs=num_epochs,
                                      theta_start=theta_start,
                                      stochastic=stochastic)

    def __str__(self):
        '''Summary of the last run: computed steps and steps inside the plot.'''
        counts = (self.num_samples, self.accepted)
        tail = "%d steps with %d lieing within the depicted boundaries." % counts
        return "\nRMSProp has performed " + tail

    def comp_update(self, theta):
        '''Refresh the decaying average with the gradient at theta and return
        learning_rate / (sqrt(average) + epsilon) * gradient.'''
        gradient = self.grad(theta)

        decayed_part = self.gamma * self.avg_sq_grad
        fresh_part = (1-self.gamma) * np.power(gradient, 2)
        self.avg_sq_grad = decayed_part + fresh_part

        scaled_rate = self.learning_rate/(np.sqrt(self.avg_sq_grad) + self.epsilon)
        return scaled_rate * gradient

In [None]:
#<api>
class ADAM(GradientDescent):
    '''Adaptive Moment Estimation: keeps decaying averages of the gradient
    and of the squared gradient and uses their bias corrected values.'''

    def __init__(self, target=widget_targets.MultNorm(), beta1=0.9, beta2=0.999, epsilon=1E-8,
                 learning_rate=0.1, num_epochs=20, theta_start=[0,0], stochastic=0):
        # decaying averages of the gradient and of the squared gradient
        self.est_mom_1 = np.zeros_like(theta_start, dtype='float')
        self.est_mom_2 = np.zeros_like(theta_start, dtype='float')

        # decay factors of the two averages
        self.beta1 = beta1
        self.beta2 = beta2
        # small constant added to the denominator of the update
        self.epsilon = epsilon

        super(ADAM, self).__init__(target=target,
                                   learning_rate=learning_rate,
                                   num_epochs=num_epochs,
                                   theta_start=theta_start,
                                   stochastic=stochastic)

    def __str__(self):
        '''Summary of the last run: computed steps and steps inside the plot.'''
        counts = (self.num_samples, self.accepted)
        tail = "%d steps with %d lieing within the depicted boundaries." % counts
        return "\nADAM has performed " + tail

    def comp_update(self, theta):
        '''Refresh both moment estimates with the gradient at theta and return
        learning_rate / (sqrt(corrected 2nd moment) + epsilon) * corrected 1st moment.'''
        gradient = self.grad(theta)

        self.est_mom_1 = self.beta1 * self.est_mom_1 + (1-self.beta1) * gradient
        self.est_mom_2 = self.beta2 * self.est_mom_2 + (1-self.beta2) * np.power(gradient, 2)

        # epochID counts from 0, the bias correction uses the 1-based step
        step = self.epochID+1
        corrected_mom_1 = self.est_mom_1/(1 - np.power(self.beta1, step))
        corrected_mom_2 = self.est_mom_2/(1 - np.power(self.beta2, step))

        scaled_rate = self.learning_rate/(np.sqrt(corrected_mom_2) + self.epsilon)
        return scaled_rate * corrected_mom_1

In [4]:
#<api>
class GradientDescentMethod(widget_methods.DrawingMethod):
    '''A class of different gradient methods for animation widget.

    Holds the parameters of a run and the gradient descent object built from
    them. Child classes override ``reset_gradient_descent`` to choose the
    algorithm.

    Parameters:
        target: target object handed to the gradient descent object.
        learning_rate: step size.
        num_epochs: number of update steps.
        theta_start: starting parameters; defaults to [1.5, -0.05]. A private
            float copy is stored, so the object passed in is never modified.
        stochastic: noise factor handed to the gradient descent object.
    '''  
          
    def __init__(self, target=widget_targets.MultNorm(), learning_rate=0.1, 
                 num_epochs=20, theta_start=None, stochastic=0):
        self.target = target
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

        if theta_start is None:
            theta_start = [1.5, -0.05]
        # set_param writes into this array in place. Storing a float copy
        # keeps the caller's object untouched, avoids truncating float
        # coordinates in an integer array and also accepts tuples.
        self.theta_start = np.array(theta_start, dtype='float')
        
        self.stochastic = stochastic
        
        # must come last: it reads the attributes set above
        self.reset_gradient_descent()    
    
    def reset_gradient_descent(self):
        '''Build a fresh gradient descent object from the current parameters.'''
        self.gradient_descent = VanillaGD(self.target, self.learning_rate, self.num_epochs, self.theta_start,
                                          self.stochastic)
    
    def draw(self):        
        '''Run the gradient descent and return its result as a dictionary
        with the keys 'accepted points' and 'warning'.'''
        self.data, self.warning = self.gradient_descent.perform_gradient_descent()
        return {'accepted points' : self.data, 'warning' : self.warning}

    def __str__(self):
        return str(self.gradient_descent)
    
    def set_param(self, param_dict):
        '''Allows to set additional gradient descent parameters, given from extra_widget.

        Unknown keys are ignored. Apart from 'x' and 'y' the new values are
        only stored on this object; they reach the gradient descent object
        once reset_gradient_descent() builds a new one.
        '''
        # widget key -> name of the attribute that stores the value
        attribute_of = {'learning_rate': 'learning_rate',
                        'epochs': 'num_epochs',
                        'target': 'target',
                        'gamma': 'gamma',
                        'beta1': 'beta1',
                        'beta2': 'beta2',
                        'stochastic': 'stochastic'}
        # widget key -> component of the start point
        component_of = {'x': 0, 'y': 1}

        for key, value in param_dict.items():
            if key in component_of:
                # Written in place: the current gradient descent object was
                # given this same array, so it sees the new start point too.
                self.theta_start[component_of[key]] = value
            elif key in attribute_of:
                setattr(self, attribute_of[key], value)

In [5]:
#<api>
class VanillaGDM(GradientDescentMethod):
    '''Widget method running vanilla (batch) gradient descent.'''

    def reset_gradient_descent(self): 
        '''Build a fresh VanillaGD object from the current parameters.'''
        self.gradient_descent = VanillaGD(target=self.target,
                                          learning_rate=self.learning_rate,
                                          num_epochs=self.num_epochs,
                                          theta_start=self.theta_start,
                                          stochastic=self.stochastic)

In [None]:
#<api>
class MomentumGDM(GradientDescentMethod):
    '''Widget method running gradient descent with momentum.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9,
                 learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        # set before the parent constructor runs: it calls
        # reset_gradient_descent, which reads self.gamma
        self.gamma = gamma

        super(MomentumGDM, self).__init__(target=target,
                                          learning_rate=learning_rate,
                                          num_epochs=num_epochs,
                                          theta_start=theta_start,
                                          stochastic=stochastic)

    def reset_gradient_descent(self): 
        '''Build a fresh MomentumGD object from the current parameters.'''
        self.gradient_descent = MomentumGD(target=self.target,
                                           gamma=self.gamma,
                                           learning_rate=self.learning_rate,
                                           num_epochs=self.num_epochs,
                                           theta_start=self.theta_start,
                                           stochastic=self.stochastic)

In [None]:
#<api>
class NesterovGDM(GradientDescentMethod):
    '''Widget method running Nesterov accelerated gradient descent.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9,
                        learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        # set before the parent constructor runs: it calls
        # reset_gradient_descent, which reads self.gamma
        self.gamma = gamma

        super(NesterovGDM, self).__init__(target=target,
                                          learning_rate=learning_rate,
                                          num_epochs=num_epochs,
                                          theta_start=theta_start,
                                          stochastic=stochastic)

    def reset_gradient_descent(self): 
        '''Build a fresh NesterovGD object from the current parameters.'''
        self.gradient_descent = NesterovGD(target=self.target,
                                           gamma=self.gamma,
                                           learning_rate=self.learning_rate,
                                           num_epochs=self.num_epochs,
                                           theta_start=self.theta_start,
                                           stochastic=self.stochastic)

In [None]:
#<api>
class ADAGRADM(GradientDescentMethod):
    '''Widget method running Adagrad.'''

    def __init__(self, target=widget_targets.MultNorm(), epsilon=1E-8, 
                         learning_rate=0.01, num_epochs=20, theta_start=None, stochastic=0):
        # set before the parent constructor runs: it calls
        # reset_gradient_descent, which reads self.epsilon
        self.epsilon = epsilon

        super(ADAGRADM, self).__init__(target=target,
                                       learning_rate=learning_rate,
                                       num_epochs=num_epochs,
                                       theta_start=theta_start,
                                       stochastic=stochastic)

    def reset_gradient_descent(self): 
        '''Build a fresh ADAGRAD object from the current parameters.'''
        self.gradient_descent = ADAGRAD(target=self.target,
                                        epsilon=self.epsilon,
                                        learning_rate=self.learning_rate,
                                        num_epochs=self.num_epochs,
                                        theta_start=self.theta_start,
                                        stochastic=self.stochastic)

In [None]:
#<api>
class RMSPropM(GradientDescentMethod):
    '''Widget method running Root Mean Square propagation.'''

    def __init__(self, target=widget_targets.MultNorm(), gamma=0.9, epsilon=1E-8, 
                         learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        # set before the parent constructor runs: it calls
        # reset_gradient_descent, which reads both attributes
        self.epsilon = epsilon
        self.gamma = gamma

        super(RMSPropM, self).__init__(target=target,
                                       learning_rate=learning_rate,
                                       num_epochs=num_epochs,
                                       theta_start=theta_start,
                                       stochastic=stochastic)

    def reset_gradient_descent(self): 
        '''Build a fresh RMSProp object from the current parameters.'''
        self.gradient_descent = RMSProp(target=self.target,
                                        gamma=self.gamma,
                                        epsilon=self.epsilon,
                                        learning_rate=self.learning_rate,
                                        num_epochs=self.num_epochs,
                                        theta_start=self.theta_start,
                                        stochastic=self.stochastic)

In [None]:
#<api>
class ADAMM(GradientDescentMethod):
    '''Widget method running Adaptive Moment Estimation (ADAM).'''

    def __init__(self, target=widget_targets.MultNorm(), beta1=0.9, beta2=0.999, epsilon=1E-8, 
                         learning_rate=0.1, num_epochs=20, theta_start=None, stochastic=0):
        # set before the parent constructor runs: it calls
        # reset_gradient_descent, which reads all three attributes
        self.epsilon = epsilon
        self.beta1 = beta1
        self.beta2 = beta2

        super(ADAMM, self).__init__(target=target,
                                    learning_rate=learning_rate,
                                    num_epochs=num_epochs,
                                    theta_start=theta_start,
                                    stochastic=stochastic)

    def reset_gradient_descent(self): 
        '''Build a fresh ADAM object from the current parameters.'''
        self.gradient_descent = ADAM(target=self.target,
                                     beta1=self.beta1,
                                     beta2=self.beta2,
                                     epsilon=self.epsilon,
                                     learning_rate=self.learning_rate,
                                     num_epochs=self.num_epochs,
                                     theta_start=self.theta_start,
                                     stochastic=self.stochastic)