In [1]:
import numpy as np

$$ \theta_{t+1} = \theta_{t} - \eta\frac{\partial{L}}{\partial{\theta_{t}}} $$

In [2]:
class SGD:
    """Stochastic gradient descent (SGD).

    Every parameter is moved against its gradient, scaled by the learning rate.
    """

    def __init__(self, lr=0.01):
        # Learning rate (eta in the update rule).
        self.lr = lr

    def update(self, params, grads):
        """Apply one SGD step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

$$ v_{t+1} = \alpha v_{t} - \eta\frac{\partial{L}}{\partial{\theta_{t}}} $$
$$ \theta_{t+1} = \theta_{t} + v_{t+1} $$

In [3]:
class Momentum:
    """Momentum SGD.

    Keeps one velocity array per parameter and applies
    ``v <- momentum * v - lr * grad`` followed by ``param <- param + v``.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr              # learning rate (eta)
        self.momentum = momentum  # velocity decay coefficient (alpha)
        self.v = None             # velocities; created on the first update

    def update(self, params, grads):
        """Apply one momentum step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        # Fixed: the original called the non-existent ``params.key()``,
        # which raised AttributeError on every update.
        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]

更新式は以下の通り
$$ v_{t+1} = \alpha v_{t} - \eta\frac{\partial{L}}{\partial{(\theta_{t} + \alpha v_{t})}} $$
$$ \theta_{t+1} = \theta_{t} + v_{t+1} $$

実装時は以下の通り
$$ \Theta_{t+1} = \theta_{t+1} + \alpha v_{t+1} $$
$$ v_{t+1} = \alpha v_{t} - \eta\frac{\partial{L}}{\partial{\Theta_{t}}} $$
$$ \Theta_{t+1} = \Theta_{t} + \alpha^2 v_{t} - (1 + \alpha) \eta\frac{\partial{L}}{\partial{\Theta_{t}}}  $$

In [4]:
class Nesterov:
    """Nesterov momentum.

    Per parameter, one step applies
    ``param += momentum**2 * v - (1 + momentum) * lr * grad`` using the
    current velocity, and then ``v <- momentum * v - lr * grad``.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr              # learning rate (eta)
        self.momentum = momentum  # velocity decay coefficient (alpha)
        self.v = None             # velocities; created on the first update

    def update(self, params, grads):
        """Apply one Nesterov step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        alpha = self.momentum
        eta = self.lr
        for name in params:
            grad = grads[name]
            # The parameter step must read the velocity *before* it is refreshed.
            params[name] += alpha * alpha * self.v[name]
            params[name] -= (1 + alpha) * eta * grad
            # Refresh the velocity for the next step.
            self.v[name] *= alpha
            self.v[name] -= eta * grad

$$ h_{t+1} = h_{t} + \boldsymbol{\frac{\partial{L}}{\partial{\theta_{t}}}}\circ\boldsymbol{\frac{\partial{L}}{\partial{\theta_{t}}}} $$
$$ \theta_{t+1} = \theta_{t} - \eta \boldsymbol{\frac{1}{ϵ + \sqrt{h_{t+1}}}}\circ\boldsymbol{\frac{\partial{L}}{\partial{}\theta_{t}}} $$

In [5]:
class AdaGrad:
    """AdaGrad.

    Accumulates the element-wise squared gradients in ``h`` and divides each
    step by ``sqrt(h) + 1e-7``.
    """

    def __init__(self, lr=0.01):
        self.lr = lr   # learning rate (eta)
        self.h = None  # running sums of squared gradients; created on first update

    def update(self, params, grads):
        """Apply one AdaGrad step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.h is None:
            # Start every accumulator at zero, shaped like its parameter.
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The small constant keeps the division finite while h is still zero.
            scale = np.sqrt(self.h[name]) + 1e-7
            params[name] -= self.lr * grad / scale

$$ h_{t+1} = \rho h_{t} + (1 - \rho) \boldsymbol{\frac{\partial{L}}{\partial{\theta_{t}}}}\circ\boldsymbol{\frac{\partial{L}}{\partial{\theta_{t}}}} $$
$$ \theta_{t+1} = \theta_{t} - \eta \boldsymbol{\frac{1}{ϵ + \sqrt{h_{t+1}}}}\circ\boldsymbol{\frac{\partial{L}}{\partial{}\theta_{t}}} $$

In [6]:
class RMSProp:
    """RMSProp.

    Keeps an exponential moving average ``h`` of the element-wise squared
    gradients and divides each step by ``sqrt(h) + 1e-7``.
    """

    def __init__(self, lr=0.01, decay_rate=0.99):
        self.lr = lr                  # learning rate (eta)
        self.decay_rate = decay_rate  # moving-average decay (rho)
        self.h = None                 # moving averages; created on first update

    def update(self, params, grads):
        """Apply one RMSProp step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.h is None:
            # Start every moving average at zero, shaped like its parameter.
            self.h = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            # Fixed: epsilon is added to sqrt(h), not to h inside the root.
            # Inside the root it behaved like an epsilon of about 3e-4 and
            # disagreed with the formula eta / (eps + sqrt(h)) and with AdaGrad.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

Adamにおけるパラメータの更新式は以下の通り
$$ m_{t+1} = \rho_{1} m_{t} + (1 - \rho_{1}) \frac{\partial{L}}{\partial{\theta_{t}}} $$
$$ v_{t+1} = \rho_{2} v_{t} + (1 - \rho_{2}) \boldsymbol{\frac{\partial{L}}{\partial{}\theta_{t}}}\circ\boldsymbol{\frac{\partial{L}}{\partial{}\theta_{t}}} $$

およびバイアス補正
$$  \hat{m_{t+1}} = \frac{m_{t+1}}{1 - \rho^{t+1}_{1}} $$
$$  \hat{v_{t+1}} = \frac{v_{t+1}}{1 - \rho^{t+1}_{2}} $$

を用いて、
$$ \theta_{t+1} = \theta_{t} - \eta \frac{1}{\sqrt{\hat{v_{t+1}}} + \epsilon} \circ \hat{m_{t+1}} $$

In [7]:
class Adam:
    """Adam.

    Tracks exponential moving averages of the gradient (``m``) and of the
    element-wise squared gradient (``v``), corrects both for their zero
    initialisation, and steps by ``lr * m_hat / (sqrt(v_hat) + epsilon)``.
    """

    def __init__(self, lr=0.001, rho1=0.9, rho2=0.999):
        self.lr = lr          # learning rate (eta)
        self.rho1 = rho1      # decay rate of the first-moment average
        self.rho2 = rho2      # decay rate of the second-moment average
        self.iter = 0         # number of update() calls made so far
        self.m = None         # first-moment averages; created on first update
        self.v = None         # second-moment averages; created on first update
        self.epsilon = 1e-8   # keeps the denominator away from zero

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params`` in place.

        Args:
            params: dict mapping a parameter name to its value.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.m is None:
            # Start both moment estimates at zero, shaped like each parameter.
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        # Incremented first so the bias correction never divides by zero.
        self.iter += 1

        for key in params.keys():
            self.m[key] = self.rho1 * self.m[key] + (1 - self.rho1) * grads[key]
            self.v[key] = self.rho2 * self.v[key] + (1 - self.rho2) * (grads[key] ** 2)

            # Bias-corrected moment estimates.
            m = self.m[key] / (1 - self.rho1 ** self.iter)
            v = self.v[key] / (1 - self.rho2 ** self.iter)

            # Fixed: epsilon is added to the denominator. The original
            # multiplied by it, inflating every step by about 1e8 and
            # producing 0/0 = NaN for zero gradients.
            params[key] -= self.lr * m / (np.sqrt(v) + self.epsilon)