In [24]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers, optimizers, Sequential, datasets, losses

### 前向传播

BN 层的输入记作 $x$, 输出记作 $\tilde{x}$

$$\tilde{x}_{train}=\frac{x_{train}-\mu_B}{\sqrt{\sigma^2_B + \epsilon}} . \gamma + \beta$$

对于全局训练数据的统计值为：

$$\mu_r=moment.\mu_r+(1-moment).\mu_B $$

$$\sigma^2_r=moment.\sigma^2_r+(1-moment).\sigma^2_B $$

TF 中 momenttum 为超参，默认为 0.99

测试阶段

**变量值均来自训练阶段统计或优化的结果**，在测试阶段直接使用，不用更新

$$\tilde{x}_{test}=\frac{x_{test}-\mu_r}{\sqrt{\sigma^2_r + \epsilon}} . \gamma + \beta$$

BN 层在测试与训练阶段的行为不同，需通过 training 标志位来区分

In [25]:
# 数据的前处理模块
def preprocess(x, y):
    x = tf.cast(x, dtype=tf.float32) / 255.
    y = tf.cast(y, dtype=tf.int32)
    return x, y

In [26]:
(x_train, y_train), (x_test, y_test) = datasets.mnist.load_data()
# 划分子集的个数
batchsz = 128
train_db = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_db = train_db.map(preprocess).shuffle(10000).batch(batchsz)
test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_db = test_db.map(preprocess).batch(batchsz)

In [27]:
# TF 中，可直接实现 BN 层
# layer = layers.BatchNormalization()
model = Sequential([
    # 个数 6 ，尺寸 3×3
    layers.Conv2D(6, kernel_size=3, strides=1),
    # 添加 BN 层
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.ReLU(),
    
    layers.Conv2D(16, kernel_size=3, strides=1),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.ReLU(),

    layers.Flatten(),
    layers.Dense(120, activation='relu'),
    layers.Dense(84, activation='relu'),
    layers.Dense(10)
])
model.build(input_shape=(None, 32, 32, 1))

In [28]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 30, 30, 6)         60        
_________________________________________________________________
batch_normalization_11 (Batc (None, 30, 30, 6)         24        
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 15, 15, 6)         0         
_________________________________________________________________
re_lu_10 (ReLU)              (None, 15, 15, 6)         0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 13, 13, 16)        880       
_________________________________________________________________
batch_normalization_12 (Batc (None, 13, 13, 16)        64        
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 6, 16)         

In [None]:
optimizer = optimizers.SGD(lr=1e-2)
for x,y in train_db:
    with tf.GradientTape() as tape:
        x = tf.expand_dims(x, axis=3)
        out = model(x, training=True)
        y_onehot = tf.one_hot(y, depth=10)
        loss = tf.reduce_mean(losses.categorical_crossentropy(y_onehot, out))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # for x,y in test_db:
        #     x = tf.expand_dims(x, axis=3)
        #     out = model(x, training=False) 