## L2正则化

In [1]:
import torch
import numpy as np

np.random.seed(123)
np.set_printoptions(8, suppress=True)

x_numpy = np.random.random((3, 4)).astype(np.double)
x_torch = torch.tensor(x_numpy, requires_grad=True)
x_torch2 = torch.tensor(x_numpy, requires_grad=True)

w_numpy = np.random.random((4, 5)).astype(np.double)
w_torch = torch.tensor(w_numpy, requires_grad=True)
w_torch2 = torch.tensor(w_numpy, requires_grad=True)

lr = 0.01
weight_decay = 0.9
sgd = torch.optim.SGD([w_torch], lr=lr, weight_decay=0)
sgd2 = torch.optim.SGD([w_torch2], lr=lr, weight_decay=weight_decay)

y_torch = torch.matmul(x_torch, w_torch)
y_torch2 = torch.matmul(x_torch2, w_torch2)

loss = y_torch.sum()
loss2 = y_torch2.sum()

sgd.zero_grad()
sgd2.zero_grad()

loss.backward()
loss2.backward()

sgd.step()
sgd2.step()

w_grad = w_torch.grad.data.numpy()
w_grad2 = w_torch2.grad.data.numpy()

print("check_grad")
print(w_grad + weight_decay * w_numpy)
print(w_grad2)

check_grad
[[2.29158508 1.95058016 2.25510989 2.56106592 2.06111261]
 [1.25926989 1.57975955 1.58000814 1.67232418 1.86585193]
 [2.20280346 2.10071483 2.20099271 1.84145669 1.87640346]
 [2.17063112 2.22953686 2.53307273 2.04808866 2.35552527]]
[[1.89687006 1.89687006 1.89687006 1.89687006 1.89687006]
 [1.10136331 1.10136331 1.10136331 1.10136331 1.10136331]
 [1.55079367 1.55079367 1.55079367 1.55079367 1.55079367]
 [1.96519422 1.96519422 1.96519422 1.96519422 1.96519422]]


## SGD

In [2]:
import numpy

class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr
    
    def __call__(self, params, grads):
        params -= self.lr * grads
    
    
class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None
        
    def __call__(self, params, grads):
        if self.v is None:
            self.v = np.zeros_like(params)
        
        self.v = self.momentum * self.v + grads
        params -= self.lr * self.v
        

class RMSProp:
    def __init__(self, lr=0.01, alpha=0.9, eps=1e-08):
        self.lr = lr
        self.alpha = alpha
        self.eps = eps
        self.v = None

    def __call__(self, params, grads):
        if self.v is None:
            self.v = np.zeros_like(params)

        self.v = self.alpha * self.v
        self.v += (1 - self.alpha) * np.square(grads)
        eta = self.lr / (np.sqrt(self.v) + self.eps)
        params -= eta * grads
        

class Adam:
    def __init__(self, lr=0.01, betas=(0.9, 0.999), eps=1e-08):
        self.lr = lr
        self.beta1 = betas[0]
        self.beta2 = betas[1]
        self.eps = eps
        self.m = None
        self.v = None
        self.n = 0
        
    def __call__(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
        if self.v is None:
            self.v = np.zeros_like(params)
            
        self.n += 1
        
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * grads
        
        Mt = self.m / (1 - np.power(self.beta1, self.n))
        Vt = self.v / (1 - np.power(self.beta2, self.n))
        
        eta = self.lr / (np.sqrt(Vt) + self.eps)
        params -= eta * Mt
        

In [3]:
import torch
import numpy as np

In [4]:
def check_optim(optim_numpy, optim_torch, p, p_torch):
    """
    check with y = p * x^2
    optim param p
    """
    x_size = 5
    x = np.random.random(x_size)
    x_torch = torch.tensor(x, requires_grad=True)
    
    dxi_numpy_list = []
    for i in range(x_size):
        yi_numpy = p * x[i] ** 2
        dxi_numpy = 2 * p * x[i]
        dxi_numpy_list.append(dxi_numpy)

        da = x[i] ** 2
        optim_numpy(p, da)
        
    for i in range(x_size):
        yi_torch = p_torch * x_torch[i] ** 2
        optim_torch.zero_grad()
        yi_torch.backward()
        optim_torch.step()

    print(np.array(dxi_numpy_list))
    print(x_torch.grad.data.numpy())

In [5]:
np.random.seed(123)
np.set_printoptions(precision=12, suppress=True, linewidth=80)

print("--- 检查SGD ---")
a_numpy = np.array(1.2)
a_torch = torch.tensor(a_numpy, requires_grad=True)
sgd_numpy = SGD(0.1)
sgd_torch = torch.optim.SGD([a_torch], lr=0.1)
check_optim(sgd_numpy, sgd_torch, a_numpy, a_torch)

print("--- 检查Momentum ---")
a_numpy = np.array(1.2)
a_torch = torch.tensor(a_numpy, requires_grad=True)
momentum_numpy = Momentum(0.1, 0.9)
momentum_torch = torch.optim.SGD([a_torch], lr=0.1, momentum=0.9)
check_optim(momentum_numpy, momentum_torch, a_numpy, a_torch)

print("--- 检查RMSProp ---")
a_numpy = np.array(1.2)
a_torch = torch.tensor(a_numpy, requires_grad=True)
rms_numpy = RMSProp(0.1, 0.9, eps=1e-08)
rms_torch = torch.optim.RMSprop([a_torch], lr=0.1, alpha=0.9)
check_optim(rms_numpy, rms_torch, a_numpy, a_torch)

print("--- 检查Adam ---")
a_numpy = np.array(1.2)
a_torch = torch.tensor(a_numpy, requires_grad=True)
adam_numpy = Adam(lr=0.1, betas=(0.9, 0.99), eps=0.001)
adam_torch = torch.optim.Adam([a_torch], lr=0.1, betas=(0.9, 0.99), eps=0.001)
check_optim(adam_numpy, adam_torch, a_numpy, a_torch)


--- 检查SGD ---
[1.671526045435 0.658974920984 0.518721027022 1.254968104394 1.594004424417]
[1.671526045435 0.658974920984 0.518721027022 1.254968104394 1.594004424417]
--- 检查Momentum ---
[1.015455504299 2.318718975892 1.46525696166  0.886671018481 0.600349866658]
[1.015455504299 2.318718975892 1.46525696166  0.886671018481 0.600349866658]
--- 检查RMSProp ---
[0.823627238762 1.288627900967 0.503750904131 0.055347017535 0.367439505846]
[0.823627238762 1.288627900967 0.503750904131 0.055347017535 0.367439505846]
--- 检查Adam ---
[1.771188973757 0.411080990314 0.377231396585 1.099067114457 1.051522639875]
[1.771188973757 0.40154869823  0.361003567239 1.031188611276 0.957820725513]


## Batch Normalization

In [6]:
import torch
import numpy as np


class BatchNorm1d:
    def __init__(self):
        self.eps = 1e-5
        self.weight = None
        self.bias = None

        self.num = None
        self.std = None
        self.dw = None
        self.db = None

    def __call__(self, x):
        self.num = np.shape(x)[0]
        mean = np.mean(x, axis=0, keepdims=True)
        var = np.var(x, axis=0, keepdims=True)
        self.sqrt = np.sqrt(var + self.eps)
        self.std = (x - mean) / self.sqrt
        out = self.std * self.weight + self.bias
        return out

    def backward(self, d_loss):
        std_t = self.std.T
        shape_t = np.shape(std_t)
        r = np.zeros([shape_t[0], shape_t[1], shape_t[1]])
        shift_eye = np.eye(shape_t[1]) * shape_t[1] - 1
        for i in range(shape_t[0]):
            r[i] = std_t[i][:, np.newaxis] * std_t[i][np.newaxis, :]
            r[i] = shift_eye - r[i]

        u = self.weight / shape_t[1] / self.sqrt
        u = u.T
        y = r * u[:, np.newaxis]

        dx = np.zeros(shape_t)
        for i in range(shape_t[0]):
            dx[i] = np.dot(d_loss.T[i], y[i])
        dx = dx.T

        self.dw = np.sum(self.std * d_loss, axis=0)
        self.db = np.sum(d_loss, axis=0)

        return dx


np.set_printoptions(precision=8, suppress=True, linewidth=120)
np.random.seed(123)
torch.random.manual_seed(123)

x_numpy = np.array(np.random.random((3, 5)), dtype=np.float64)
weight_numpy = np.array(np.random.random((5,)), dtype=np.float64)
bias_numpy = np.array(np.random.random((5,)), dtype=np.float64)
d_loss_numpy = np.array(np.random.random((3, 5)), dtype=np.float64)

x_tensor = torch.tensor(x_numpy, requires_grad=True)
weight_tensor = torch.tensor(weight_numpy, requires_grad=True)
bias_tensor = torch.tensor(bias_numpy, requires_grad=True)
d_loss_tensor = torch.tensor(d_loss_numpy, requires_grad=True)

batch_norm_numpy = BatchNorm1d()
batch_norm_numpy.weight = weight_numpy
batch_norm_numpy.bias = bias_numpy

batch_norm_tensor = torch.nn.BatchNorm1d(5).double()
batch_norm_tensor.weight = torch.nn.Parameter(weight_tensor, requires_grad=True)
batch_norm_tensor.bias = torch.nn.Parameter(bias_tensor, requires_grad=True)

output_numpy = batch_norm_numpy(x_numpy)
output_tensor = batch_norm_tensor(x_tensor)
output_tensor.backward(d_loss_tensor)

dx_numpy = batch_norm_numpy.backward(d_loss_numpy)
dx_tensor = x_tensor.grad

dw_numpy = batch_norm_numpy.dw
dw_tensor = batch_norm_tensor.weight.grad

db_numpy = batch_norm_numpy.db
db_tensor = batch_norm_tensor.bias.grad

print("output_numpy \n", output_numpy)
print("output_tensor \n", output_tensor.data.numpy())

print("dx_numpy \n", dx_numpy)
print("dx_tensor \n", dx_tensor.data.numpy())

print("dw_numpy \n", dw_numpy)
print("dw_tensor \n", dw_tensor.data.numpy())

print("db_numpy \n", db_numpy)
print("db_tensor \n", db_tensor.data.numpy())

output_numpy 
 [[ 1.65328862  0.60845792  0.51520137  1.06970231  1.47430633]
 [ 0.31989274  1.049903    0.94450061  0.89737849  0.33620922]
 [-0.06997848  0.88993446  0.713664   -0.13401027  0.3568146 ]]
output_tensor 
 [[ 1.65328862  0.60845792  0.51520137  1.06970231  1.47430633]
 [ 0.31989274  1.049903    0.94450061  0.89737849  0.33620922]
 [-0.06997848  0.88993446  0.713664   -0.13401027  0.3568146 ]]
dx_numpy 
 [[ 0.14897849 -0.00280487 -0.19168465 -0.12787269 -0.00214988]
 [-0.65806077 -0.00492716 -0.16475823  0.14902235 -0.12571213]
 [ 0.50908229  0.00773203  0.35644287 -0.02114966  0.127862  ]]
dx_tensor 
 [[ 0.14897849 -0.00280487 -0.19168465 -0.12787269 -0.00214988]
 [-0.65806077 -0.00492716 -0.16475823  0.14902235 -0.12571213]
 [ 0.50908229  0.00773203  0.35644287 -0.02114966  0.127862  ]]
dw_numpy 
 [ 0.10859242  0.09332669  0.21318384 -0.80395148  0.23776771]
dw_tensor 
 [ 0.10859242  0.09332669  0.21318384 -0.80395148  0.23776771]
db_numpy 
 [0.72732508 1.22184114 1.552

In [7]:
import numpy as np


class BatchNorm1d:
    def __init__(self, train=True, momentum=0.1, eps=1e-5):
        self.train = train
        self.momentum = momentum
        self.eps = eps

        self.weight = None
        self.bias = None

        self.std = None
        self.dw = None
        self.db = None

        self.sqrt = None
        self.std = None

        self.running_mean = None
        self.running_var = None

    def __call__(self, x):
        if self.train is True:
            mean = np.mean(x, axis=0, keepdims=True)
            var = np.var(x, axis=0, keepdims=True)
            sqrt = np.sqrt(var + self.eps)
            std = (x - mean) / sqrt
            self.sqrt = sqrt
            self.std = std

            if self.running_mean is None:
                self.running_mean = np.zeros_like(mean)
                self.running_var = np.ones_like(var)

            num = np.shape(x)[0]
            self.running_mean = (1 - self.momentum) * self.running_mean
            self.running_mean += self.momentum * mean
            self.running_var = (1 - self.momentum) * self.running_var
            self.running_var += self.momentum * var * num / (num - 1)
        else:
            mean = self.running_mean
            var = self.running_var
            sqrt = np.sqrt(var + self.eps)
            std = (x - mean) / sqrt

        out = std * self.weight + self.bias
        return out

    def backward(self, d_loss):
        std_t = self.std.T
        shape_t = np.shape(std_t)
        r = np.zeros([shape_t[0], shape_t[1], shape_t[1]])
        shift_eye = np.eye(shape_t[1]) * shape_t[1] - 1
        for i in range(shape_t[0]):
            r[i] = std_t[i][:, np.newaxis] * std_t[i][np.newaxis, :]
            r[i] = shift_eye - r[i]

        u = self.weight / shape_t[1] / self.sqrt
        u = u.T
        y = r * u[:, np.newaxis]

        dx = np.zeros(shape_t)
        for i in range(shape_t[0]):
            dx[i] = np.dot(d_loss.T[i], y[i])
        dx = dx.T

        self.dw = np.sum(self.std * d_loss, axis=0)
        self.db = np.sum(d_loss, axis=0)

        return dx


In [8]:
np.set_printoptions(precision=8, suppress=True, linewidth=120)
np.random.seed(123)
torch.random.manual_seed(123)
nums = 3

x_numpy = np.array(np.random.random((nums, 3, 5)), dtype=np.float64)
weight_numpy = np.array(np.random.random((5,)), dtype=np.float64)
bias_numpy = np.array(np.random.random((5,)), dtype=np.float64)

x_tensor = torch.tensor(x_numpy, requires_grad=True)
weight_tensor = torch.tensor(weight_numpy, requires_grad=True)
bias_tensor = torch.tensor(bias_numpy, requires_grad=True)

batch_norm_numpy = BatchNorm1d(momentum=0.2)
batch_norm_numpy.weight = weight_numpy
batch_norm_numpy.bias = bias_numpy

batch_norm_tensor = torch.nn.BatchNorm1d(5, momentum=0.2).double()
batch_norm_tensor.weight = torch.nn.Parameter(weight_tensor, requires_grad=True)
batch_norm_tensor.bias = torch.nn.Parameter(bias_tensor, requires_grad=True)

print("output_numpy")
for i in range(nums):
    output_numpy = batch_norm_numpy(x_numpy[i])
    print(output_numpy)

print("output_tensor")
for i in range(nums):
    output_tensor = batch_norm_tensor(x_tensor[i])
    print(output_tensor.data.numpy())

print("eval_numpy")
batch_norm_numpy.train = False
for i in range(nums):
    output_numpy = batch_norm_numpy(x_numpy[i])
    print(output_numpy)

print("eval_tensor")
batch_norm_tensor.eval()
for i in range(nums):
    output_tensor = batch_norm_tensor(x_tensor[i])
    print(output_tensor.data.numpy())

output_numpy
[[ 0.46641115  0.18851114 -0.57237571  0.99333476  1.2092339 ]
 [ 0.01389322  1.35696463  1.83911376  0.82492271 -0.10234434]
 [-0.11841836  0.93354663  0.54244234 -0.18305345 -0.07859806]]
[[ 0.36607296  0.34287603 -0.19635263  0.74842757 -0.41712586]
 [ 0.21899343  1.48596725  1.99143145  1.05458233  1.08250983]
 [-0.22318039  0.65017912  0.01410157 -0.16780589  0.36290753]]
[[-0.16707033  1.17972157  0.14300413  0.26912336 -0.20601642]
 [ 0.08560938  1.15577836  1.97239442  1.27262803  0.03626041]
 [ 0.44334695  0.14352248 -0.30621817  0.09345263  1.19804751]]
output_tensor
[[ 0.46641115  0.18851114 -0.57237571  0.99333476  1.2092339 ]
 [ 0.01389322  1.35696463  1.83911376  0.82492271 -0.10234434]
 [-0.11841836  0.93354663  0.54244234 -0.18305345 -0.07859806]]
[[ 0.36607296  0.34287603 -0.19635263  0.74842757 -0.41712586]
 [ 0.21899343  1.48596725  1.99143145  1.05458233  1.08250983]
 [-0.22318039  0.65017912  0.01410157 -0.16780589  0.36290753]]
[[-0.16707033  1.179721