In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.keras.optimizers import Optimizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile
import os

In [49]:
# Reference usage pattern (kept as comments only; this part is not executed):
#   # Create an optimizer with the desired parameters.
#   opt = Adam()
#   var1 = tf.Variable(2)
#   var2 = tf.Variable(3)
#   # `loss` is a callable that takes no argument and returns the value
#   # to minimize.
#   loss = lambda: 3 * var1 * var1 + 2 * var2 * var2
#   # In eager mode, simply call minimize to update the list of variables.
#   opt.minimize(loss, var_list=[var1, var2])

# NOTE: `AdamW` is defined in a later cell of this notebook; that cell must be
# executed before this one (a top-to-bottom "Run All" reaches this cell first).
opt = AdamW()
var1 = tf.Variable(10.0)
loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
step_count = opt.minimize(loss, [var1]).numpy()
# On the first step the bias-corrected Adam term is ~sign(grad) == 1, and AdamW
# adds weight_decay * var == 0.025 * 10 on top of it, so with the defaults
# var1 becomes 10 - 0.1 * (1 + 0.25) == 9.875.
var1.numpy()

9.875001

In [None]:
# 探索代码


# Code Repo.

### Adam (2014)
http://arxiv.org/abs/1412.6980<br/>
[NOTES] TensorFlow Optimizer 框架内部，`lr` 被保留为 `learning_rate` 的别名（对 `lr` 的读写会被重定向），因此这里统一使用 `learning_rate` 作为属性名.<br/>
[NOTES] tf.group 是计算图相关函数：把多个操作合并为一个操作，该操作完成时其所有输入操作均已完成，效果类似于 forkJoin.

In [40]:
class Adam(Optimizer):
    """Adam optimizer (http://arxiv.org/abs/1412.6980) on top of the Keras ``Optimizer`` base.

    For every variable, with gradient ``g`` and step ``t = iterations + 1``::

        m_t     = beta1 * m + (1 - beta1) * g
        v_t     = beta2 * v + (1 - beta2) * g**2
        alpha_t = sqrt(1 - beta2**t) / (1 - beta1**t)
        var    -= learning_rate * alpha_t * m_t / (sqrt(v_t) + eps)

    Only dense gradients are supported.

    Args:
        learning_rate: Step size multiplying the update.
        beta1: Decay rate of the first-moment slot ``m1``.
        beta2: Decay rate of the second-moment slot ``v1``.
        eps: Constant added to ``sqrt(v_t)`` in the denominator.
        name: Name forwarded to the base ``Optimizer`` constructor.
    """

    def __init__(self, learning_rate=0.1, beta1=0.9, beta2=0.999, eps=1e-7, name="Adam"):
        super().__init__(name)
        self.learning_rate = learning_rate
        self.beta1         = beta1
        self.beta2         = beta2
        self.eps           = eps

    def _create_slots(self, var_list):
        """Create the ``m1`` and ``v1`` slots for every variable."""
        # Two separate passes on purpose: all `m1` slots are created before any
        # `v1` slot, which keeps the order of the optimizer's weights unchanged.
        for var in var_list:
            self.add_slot(var, 'm1')
        for var in var_list:
            self.add_slot(var, 'v1')

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Add ``beta1**t`` and ``beta2**t`` to the per-(device, dtype) state."""
        super()._prepare_local(var_device, var_dtype, apply_state)

        # `iterations` counts completed steps, so the current step is +1.
        local_step = tf.cast(self.iterations + 1, var_dtype)
        # With {\beta_1^t} and {\beta_2^t} we denote {\beta_1} and {\beta_2} to the power t.
        beta1_power = tf.pow(self.beta1, local_step)
        beta2_power = tf.pow(self.beta2, local_step)
        apply_state[(var_device, var_dtype)].update(
            dict(
                beta1_power=beta1_power,
                beta2_power=beta2_power,
            )
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply one Adam step to ``var`` for the dense gradient ``grad``.

        Returns:
            A grouped op covering the variable update and both slot updates.
        """
        var_device, var_dtype = var.device, var.dtype.base_dtype
        # `apply_state` defaults to None; rebuild the coefficients in that case
        # instead of failing on `None.get`.
        coefficients = (apply_state or {}).get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
        beta1_power = coefficients['beta1_power']
        beta2_power = coefficients['beta2_power']

        v = self.get_slot(var, "v1")
        v_t = v.assign(self.beta2 * v + (1. - self.beta2) * grad**2)
        m = self.get_slot(var, "m1")
        m_t = m.assign(self.beta1 * m + (1. - self.beta1) * grad)
        # Both bias corrections are folded into one scalar factor (the
        # reordered, more efficient form mentioned after Algorithm 1).
        alpha_t = tf.sqrt(1 - beta2_power) / (1 - beta1_power)
        g_t = (m_t * alpha_t) / (tf.sqrt(v_t) + self.eps)
        var_update = state_ops.assign_sub(var, self.learning_rate * g_t)
        return tf.group(*[var_update, v_t, m_t])

    def _resource_apply_sparse(self, grad, var, indices=None, apply_state=None):
        """Reject sparse gradients.

        The extra parameters mirror how the base class invokes this hook (the
        gradient indices are passed as a third argument); without them the
        call fails with a ``TypeError`` before this error can be raised.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Sparse gradient updates are not supported.")

    def get_config(self):
        """Return the base config extended with this optimizer's constructor arguments."""
        config = super().get_config()
        config.update(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            eps=self.eps,
        )
        return config

### AdaMax (2014)

In [45]:
class AdaMax(Optimizer):
    """AdaMax, the infinity-norm variant of Adam, on top of the Keras ``Optimizer`` base.

    For every variable, with gradient ``g`` and step ``t = iterations + 1``::

        m_t  = beta1 * m + (1 - beta1) * g
        v_t  = maximum(beta2 * v, |g|)          # element-wise
        var -= learning_rate / (1 - beta1**t) * m_t / (v_t + eps)

    Only dense gradients are supported.

    Args:
        learning_rate: Step size multiplying the update.
        beta1: Decay rate of the first-moment slot ``m1``.
        beta2: Decay rate applied to the infinity-norm slot ``v1``.
        eps: Constant added to ``v_t`` so the division stays finite when
            ``v_t`` is zero (e.g. a zero gradient on the first step).
        name: Name forwarded to the base ``Optimizer`` constructor.
    """

    # NOTE(review): the default name is "Adam", apparently copied from the Adam
    # class. It is left unchanged so existing variable names keep matching.
    def __init__(self, learning_rate=0.1, beta1=0.9, beta2=0.999, eps=1e-7, name="Adam"):
        super().__init__(name)
        self.learning_rate = learning_rate
        self.beta1         = beta1
        self.beta2         = beta2
        self.eps           = eps

    def _create_slots(self, var_list):
        """Create the ``m1`` and ``v1`` slots for every variable."""
        # Two separate passes on purpose: all `m1` slots are created before any
        # `v1` slot, which keeps the order of the optimizer's weights unchanged.
        for var in var_list:
            self.add_slot(var, 'm1')
        for var in var_list:
            self.add_slot(var, 'v1')

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Add ``beta1**t`` and ``beta2**t`` to the per-(device, dtype) state."""
        super()._prepare_local(var_device, var_dtype, apply_state)

        # `iterations` counts completed steps, so the current step is +1.
        local_step = tf.cast(self.iterations + 1, var_dtype)
        # With {\beta_1^t} and {\beta_2^t} we denote {\beta_1} and {\beta_2} to the power t.
        beta1_power = tf.pow(self.beta1, local_step)
        beta2_power = tf.pow(self.beta2, local_step)
        apply_state[(var_device, var_dtype)].update(
            dict(
                beta1_power=beta1_power,
                beta2_power=beta2_power,
            )
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply one AdaMax step to ``var`` for the dense gradient ``grad``.

        Returns:
            A grouped op covering the variable update and both slot updates.
        """
        var_device, var_dtype = var.device, var.dtype.base_dtype
        # `apply_state` defaults to None; rebuild the coefficients in that case
        # instead of failing on `None.get`.
        coefficients = (apply_state or {}).get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
        beta1_power = coefficients['beta1_power']

        v = self.get_slot(var, "v1")
        # We can generalize the L2 norm based update rule to a Lp norm based update rule.
        # However, in the special case where we let p -> ∞, a surprisingly simple and
        # stable algorithm emerges.
        # tf.maximum is element-wise; Python's builtin max() would instead try to
        # turn a tensor comparison into a single bool, which fails for any
        # non-scalar variable.
        v_t = v.assign(tf.maximum(self.beta2 * v, tf.abs(grad)))
        m = self.get_slot(var, "m1")
        m_t = m.assign(self.beta1 * m + (1. - self.beta1) * grad)
        # Only the first moment needs a bias correction here.
        alpha_t = 1 / (1 - beta1_power)
        # eps keeps the quotient finite when v_t == 0.
        g_t = (m_t * alpha_t) / (v_t + self.eps)
        var_update = state_ops.assign_sub(var, self.learning_rate * g_t)
        return tf.group(*[var_update, v_t, m_t])

    def _resource_apply_sparse(self, grad, var, indices=None, apply_state=None):
        """Reject sparse gradients.

        The extra parameters mirror how the base class invokes this hook (the
        gradient indices are passed as a third argument); without them the
        call fails with a ``TypeError`` before this error can be raised.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Sparse gradient updates are not supported.")

    def get_config(self):
        """Return the base config extended with this optimizer's constructor arguments."""
        config = super().get_config()
        config.update(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            eps=self.eps,
        )
        return config

### AdamW (2019)
https://arxiv.org/abs/1711.05101v3

In [48]:
class AdamW(Optimizer):
    """Adam with decoupled weight decay (https://arxiv.org/abs/1711.05101v3).

    For every variable, with gradient ``g`` and step ``t = iterations + 1``::

        m_t     = beta1 * m + (1 - beta1) * g
        v_t     = beta2 * v + (1 - beta2) * g**2
        alpha_t = sqrt(1 - beta2**t) / (1 - beta1**t)
        var    -= learning_rate * (alpha_t * m_t / (sqrt(v_t) + eps)
                                   + weight_decay * var)

    The decay term is added to the step directly rather than to the gradient,
    and it is scaled by ``learning_rate`` together with the Adam term.
    Only dense gradients are supported.

    Args:
        learning_rate: Step size multiplying the whole update.
        beta1: Decay rate of the first-moment slot ``m1``.
        beta2: Decay rate of the second-moment slot ``v1``.
        eps: Constant added to ``sqrt(v_t)`` in the denominator.
        weight_decay: Coefficient of the ``var`` term added to the step.
        name: Name forwarded to the base ``Optimizer`` constructor.
    """

    # NOTE(review): the default name is "Adam", apparently copied from the Adam
    # class. It is left unchanged so existing variable names keep matching.
    def __init__(self, learning_rate=0.1, beta1=0.9, beta2=0.999, eps=1e-7, weight_decay=0.025, name="Adam"):
        super().__init__(name)
        self.learning_rate = learning_rate
        self.beta1         = beta1
        self.beta2         = beta2
        self.eps           = eps
        self.weight_decay  = weight_decay

    def _create_slots(self, var_list):
        """Create the ``m1`` and ``v1`` slots for every variable."""
        # Two separate passes on purpose: all `m1` slots are created before any
        # `v1` slot, which keeps the order of the optimizer's weights unchanged.
        for var in var_list:
            self.add_slot(var, 'm1')
        for var in var_list:
            self.add_slot(var, 'v1')

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Add ``beta1**t`` and ``beta2**t`` to the per-(device, dtype) state."""
        super()._prepare_local(var_device, var_dtype, apply_state)

        # `iterations` counts completed steps, so the current step is +1.
        local_step = tf.cast(self.iterations + 1, var_dtype)
        # With {\beta_1^t} and {\beta_2^t} we denote {\beta_1} and {\beta_2} to the power t.
        beta1_power = tf.pow(self.beta1, local_step)
        beta2_power = tf.pow(self.beta2, local_step)
        apply_state[(var_device, var_dtype)].update(
            dict(
                beta1_power=beta1_power,
                beta2_power=beta2_power,
            )
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply one AdamW step to ``var`` for the dense gradient ``grad``.

        Returns:
            A grouped op covering the variable update and both slot updates.
        """
        var_device, var_dtype = var.device, var.dtype.base_dtype
        # `apply_state` defaults to None; rebuild the coefficients in that case
        # instead of failing on `None.get`.
        coefficients = (apply_state or {}).get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
        beta1_power = coefficients['beta1_power']
        beta2_power = coefficients['beta2_power']

        v = self.get_slot(var, "v1")
        v_t = v.assign(self.beta2 * v + (1. - self.beta2) * grad**2)
        m = self.get_slot(var, "m1")
        m_t = m.assign(self.beta1 * m + (1. - self.beta1) * grad)
        # Both bias corrections are folded into one scalar factor (the
        # reordered, more efficient form mentioned after Algorithm 1 of Adam).
        alpha_t = tf.sqrt(1 - beta2_power) / (1 - beta1_power)
        # According to the AdamW paper, learning rate can be fixed, decay, or
        # also be used for warm restarts (AdamWR to come).
        # The weight-decay term is added to the step itself, not to `grad`,
        # so it does not pass through the m/v moment estimates.
        g_t = (m_t * alpha_t) / (tf.sqrt(v_t) + self.eps) + self.weight_decay * var
        var_update = state_ops.assign_sub(var, self.learning_rate * g_t)
        return tf.group(*[var_update, v_t, m_t])

    def _resource_apply_sparse(self, grad, var, indices=None, apply_state=None):
        """Reject sparse gradients.

        The extra parameters mirror how the base class invokes this hook (the
        gradient indices are passed as a third argument); without them the
        call fails with a ``TypeError`` before this error can be raised.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Sparse gradient updates are not supported.")

    def get_config(self):
        """Return the base config extended with this optimizer's constructor arguments."""
        config = super().get_config()
        config.update(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            eps=self.eps,
            weight_decay=self.weight_decay,
        )
        return config