# Optimizer
深度学习包括模型、优化器、损失函数<br/>
实现以下函数：
- _create_slots         定义内部变量
- _prepare_local        计算每一步共用的系数（如 $\beta_1^t$、$\beta_2^t$）
- _resource_apply_dense 使用已经计算好的（稠密）梯度更新变量

目标：实现Adam算法

In [None]:
class Adam(Optimizer):
    """Adam optimizer implemented through the Keras optimizer hook methods.

    Two slot variables are kept per trainable variable: ``m1`` (moving average
    of the gradient) and ``v1`` (moving average of the squared gradient). Each
    dense step applies the bias-corrected update in its rearranged form::

        m_t  = beta1 * m + (1 - beta1) * g
        v_t  = beta2 * v + (1 - beta2) * g**2
        var -= learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t)
               * m_t / (sqrt(v_t) + eps)

    NOTE(review): assumes ``Optimizer`` is the TF2 Keras ``OptimizerV2``-style
    base class (the one that passes ``apply_state`` around) -- confirm against
    the import cell of the notebook.

    Args:
        learning_rate: Step size. NOTE(review): the default of 0.1 is kept for
            backward compatibility; the Adam paper suggests 0.001.
        beta1: Decay rate of the first-moment (gradient) average.
        beta2: Decay rate of the second-moment (squared gradient) average.
        eps: Small constant added to ``sqrt(v_t)`` to avoid division by zero.
        name: Name passed to the base optimizer.
    """

    def __init__(self, learning_rate=0.1, beta1=0.9, beta2=0.999, eps=1e-7, name="Adam"):
        super().__init__(name)
        # Hyperparameters are stored as plain Python attributes.
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

    def _create_slots(self, var_list):
        """Create the ``m1`` and ``v1`` slots for every variable in `var_list`."""
        # Two separate passes: all 'm1' slots are created before any 'v1' slot,
        # which preserves the original slot creation order.
        for var in var_list:
            self.add_slot(var, 'm1')
        for var in var_list:
            self.add_slot(var, 'v1')

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Store ``beta1**t`` and ``beta2**t`` for this (device, dtype) pair.

        `apply_state[(var_device, var_dtype)]` must already exist as a dict;
        the two powers are added to it under the keys ``beta1_power`` and
        ``beta2_power``.
        """
        super()._prepare_local(var_device, var_dtype, apply_state)

        # t is 1-based: `iterations` still holds the count of completed steps.
        local_step = tf.cast(self.iterations + 1, var_dtype)
        apply_state[(var_device, var_dtype)].update(
            beta1_power=tf.pow(self.beta1, local_step),
            beta2_power=tf.pow(self.beta2, local_step),
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply one Adam step to `var` using the dense gradient `grad`.

        Args:
            grad: Gradient of the loss with respect to `var`.
            var: Variable to update in place.
            apply_state: Optional dict keyed by ``(device, dtype)`` holding the
                coefficients written by `_prepare_local`. When it is ``None``
                or lacks the entry for `var`, the coefficients are computed
                on the spot.

        Returns:
            An op grouping the variable update and both slot updates.
        """
        var_device, var_dtype = var.device, var.dtype.base_dtype
        key = (var_device, var_dtype)
        coefficients = (apply_state or {}).get(key)
        if coefficients is None:
            # The signature allows apply_state=None; build the coefficients
            # locally instead of failing on a missing dict or entry.
            fallback_state = {key: {}}
            self._prepare_local(var_device, var_dtype, fallback_state)
            coefficients = fallback_state[key]
        beta1_power = coefficients['beta1_power']
        beta2_power = coefficients['beta2_power']

        # Update the biased second- and first-moment estimates in place.
        v = self.get_slot(var, "v1")
        v_t = v.assign(self.beta2 * v + (1. - self.beta2) * grad**2)
        m = self.get_slot(var, "m1")
        m_t = m.assign(self.beta1 * m + (1. - self.beta1) * grad)

        # Both bias corrections are folded into a single scalar factor so the
        # moment tensors do not have to be rescaled separately.
        alpha_t = tf.sqrt(1 - beta2_power) / (1 - beta1_power)
        g_t = (m_t * alpha_t) / (tf.sqrt(v_t) + self.eps)
        var_update = var.assign_sub(self.learning_rate * g_t)
        return tf.group(var_update, v_t, m_t)

    def _resource_apply_sparse(self, grad, var, indices=None, apply_state=None):
        """Reject sparse gradients; only dense updates are implemented.

        `indices` and `apply_state` are accepted so that a caller passing them
        gets the explicit ``NotImplementedError`` below rather than a
        ``TypeError`` from a signature mismatch.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Sparse gradient updates are not supported.")