In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Optimizers:
    """Collection of stateless first-order optimizer update rules.

    Each method computes a single update step and returns the new parameters
    (plus any updated optimizer state). The only thing stored on the instance
    is the learning rate; running state such as velocity, cache and moments is
    passed in and handed back by the caller on every step.

    Inputs are combined with plain arithmetic operators and numpy ufuncs, so
    scalars and numpy arrays of mutually broadcastable shapes are expected to
    work -- NOTE(review): confirm the shapes used by callers.
    """

    def __init__(self, learning_rate=0.01):
        """Store the step size shared by every update rule.

        Args:
            learning_rate: Multiplier applied to the gradient-based update.
        """
        self.learning_rate = learning_rate

    def gradient_descent(self, params, gradients):
        """Apply one step of plain gradient descent.

        Args:
            params: Current parameter values.
            gradients: Gradient of the objective with respect to ``params``.

        Returns:
            ``params - learning_rate * gradients``.
        """
        return params - self.learning_rate * gradients

    def momentum(self, params, gradients, velocity, momentum=0.9):
        """Apply one step of gradient descent with momentum.

        The velocity is a decayed running sum of past (learning-rate-scaled)
        gradients, so consistent gradient directions build up larger steps.

        Args:
            params: Current parameter values.
            gradients: Gradient of the objective with respect to ``params``.
            velocity: Velocity returned by the previous call (the caller
                supplies the initial value, e.g. zeros).
            momentum: Decay factor applied to the previous velocity.

        Returns:
            Tuple ``(new_params, new_velocity)``.
        """
        velocity = momentum * velocity + self.learning_rate * gradients
        return params - velocity, velocity

    def rmsprop(self, params, gradients, cache, decay_rate=0.9, epsilon=1e-8):
        """Apply one RMSprop step.

        The gradient is divided by the square root of an exponential moving
        average of squared gradients.

        Args:
            params: Current parameter values.
            gradients: Gradient of the objective with respect to ``params``.
            cache: Moving average of squared gradients returned by the
                previous call (the caller supplies the initial value).
            decay_rate: Weight given to the previous ``cache``.
            epsilon: Small constant added to the denominator to avoid
                division by zero.

        Returns:
            Tuple ``(new_params, new_cache)``.
        """
        cache = decay_rate * cache + (1 - decay_rate) * np.square(gradients)
        update = self.learning_rate * gradients / (np.sqrt(cache) + epsilon)
        return params - update, cache

    def adam(self, params, gradients, moment, velocity, t,
            beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Apply one Adam step.

        Maintains exponential moving averages of the gradient (``moment``)
        and of the squared gradient (``velocity``), corrects both for their
        bias towards the initial value, and uses their ratio as the update.

        Args:
            params: Current parameter values.
            gradients: Gradient of the objective with respect to ``params``.
            moment: First-moment estimate returned by the previous call.
            velocity: Second-moment estimate returned by the previous call.
            t: 1-based step counter. Must be >= 1.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate of the second-moment estimate.
            epsilon: Small constant added to the denominator to avoid
                division by zero.

        Returns:
            Tuple ``(new_params, new_moment, new_velocity)``. The returned
            moments are the raw (not bias-corrected) estimates, ready to be
            passed to the next call.

        Raises:
            ValueError: If ``t`` is less than 1.
        """
        # With t == 0 both bias-correction denominators (1 - beta**0) are
        # exactly zero, which would raise ZeroDivisionError for Python floats
        # or silently yield NaN/inf parameters for numpy arrays.
        if t < 1:
            raise ValueError(
                f"t must be >= 1 for Adam bias correction, got {t}"
            )

        moment = beta1 * moment + (1 - beta1) * gradients
        velocity = beta2 * velocity + (1 - beta2) * np.square(gradients)

        # Bias correction: early estimates are pulled towards their initial
        # value, so rescale by the total weight accumulated after t steps.
        moment_corrected = moment / (1 - beta1**t)
        velocity_corrected = velocity / (1 - beta2**t)

        update = self.learning_rate * moment_corrected / (np.sqrt(velocity_corrected) + epsilon)
        return params - update, moment, velocity