In [1]:
import numpy as np

In [36]:
import numpy as np

class AdamW:
    """Adam optimizer with decoupled weight decay (AdamW) for NumPy arrays.

    Parameters are modified in place. Per-parameter moment estimates are
    stored in ``self.state`` keyed by ``id(param)``, so the same array
    objects must be kept alive and reused across calls to ``step``.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0):
        """Store the hyperparameters and initialise empty optimizer state.

        Args:
            params: Iterable of NumPy arrays to optimise (updated in place).
            lr: Learning rate.
            betas: Pair ``(beta1, beta2)`` of exponential decay rates for the
                first- and second-moment estimates.
            eps: Constant added to the denominator of the update.
            weight_decay: Decoupled weight-decay coefficient; ``0`` disables it.
        """
        self.params = params
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        self.t = 0          # number of steps taken so far
        self.state = {}     # id(param) -> {'m': first moment, 'v': second moment}

    def step(self, grads):
        """Apply one AdamW update to every parameter, in place.

        Args:
            grads: Iterable of gradient arrays, paired positionally with
                ``self.params``.
        """
        self.t += 1

        # Bias-correction denominators depend only on the step count,
        # so compute them once per step rather than once per parameter.
        bias_correction1 = 1 - self.beta1 ** self.t
        bias_correction2 = 1 - self.beta2 ** self.t

        for param, grad in zip(self.params, grads):
            key = id(param)

            if key not in self.state:
                self.state[key] = {
                    'm': np.zeros_like(param),
                    'v': np.zeros_like(param),
                }

            m = self.state[key]['m']
            v = self.state[key]['v']

            # Decoupled weight decay: shrink the parameter toward zero using
            # its pre-step value. Only arrays with more than one dimension
            # are decayed; 1-D arrays (e.g. biases) are left alone.
            if self.weight_decay != 0 and param.ndim > 1:
                param *= 1 - self.lr * self.weight_decay

            # Exponential moving averages of the gradient and squared gradient,
            # written in place so the stored state is updated.
            m[:] = m * self.beta1 + grad * (1 - self.beta1)
            v[:] = v * self.beta2 + (grad ** 2) * (1 - self.beta2)

            m_hat = m / bias_correction1
            v_hat = v / bias_correction2

            param -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)


In [42]:
# Toy single-layer setup: a weight vector and a bias term.
W = np.array([1.0, -2.0])
b = np.array([0.5])

# Matching gradients, as backpropagation would supply them.
dW = np.array([0.3, -0.1])
db = np.array([0.2])

params = [W, b]
grads = [dW, db]

# Build the optimizer up front; constructing it does not touch the arrays.
opt = AdamW(params, lr=0.1, weight_decay=0.1)

print('Before AdamW update: ', params)
opt.step(grads)  # updates W and b in place
print('After AdamW update: ', params)

Before AdamW update:  [array([ 1., -2.]), array([0.5])]
After AdamW update:  [array([ 0.9       , -1.90000001]), array([0.4])]
