In [11]:
import tensorflow as tf

class LayerNormalization(tf.keras.layers.Layer):
    def __init__(self, eps=0.001 ,**kwargs):
        super().__init__(**kwargs)
        self.eps = eps

    def build(self, batch_input_shape):
        self.alpha = self.add_weight(name='alpha',shape=batch_input_shape[-1:], initializer='ones')
        self.beta = self.add_weight(name='beta',shape=batch_input_shape[-1:], initializer='zeors')
        return super().build(batch_input_shape)
    
    def call(self, X, *args, **kwargs):
        mu, variance = tf.nn.moments(X, axes=-1, keepdims=True)
        
        return self.alpha * (X - mu) / (tf.sqrt(variance + self.eps )) + self.beta
    
    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape
    
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "eps": self.eps}


In [12]:
import numpy as np

(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32) / 255.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32) / 255.

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])

In [14]:
def random_batch(X, y, batch_size=32):
    idx = np.random.randint(len(X), size=batch_size)
    return X[idx], y[idx]


In [15]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [16]:
def print_status_bar(step, total, loss, metrics=None):
    metrics = " - ".join([f"{m.name}: {m.result():.4f}"
                            for m in [loss] + (metrics or [])])
    
    end = "" if step < total else "\n"
    print(f"\r{step}/{total} - " + metrics, end=end)


In [17]:
for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))

    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)

        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        mean_loss(loss)

        for metric in metrics:
            metric(y_batch, y_pred)

        print_status_bar(step, n_steps, mean_loss, metrics)

    for metric in [mean_loss] + metrics:
        metric.reset_states()

Epoch 1/5
1718/1718 - mean: 0.4941 - sparse_categorical_accuracy: 0.8235
Epoch 2/5
1718/1718 - mean: 0.4125 - sparse_categorical_accuracy: 0.8506
Epoch 3/5
1718/1718 - mean: 0.3928 - sparse_categorical_accuracy: 0.8582
Epoch 4/5
1718/1718 - mean: 0.3793 - sparse_categorical_accuracy: 0.8638
Epoch 5/5
1718/1718 - mean: 0.3743 - sparse_categorical_accuracy: 0.8667


In [21]:
lower_layers = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
])
upper_layers = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="softmax"),
])
model = tf.keras.Sequential([
    lower_layers, upper_layers
])

In [22]:
lower_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4)
upper_optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-3)

n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [25]:
for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))

    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)

        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        for layers, optimizer in ((lower_layers, lower_optimizer),
                                          (upper_layers, upper_optimizer)):
            gradients = tape.gradient(loss, layers.trainable_variables)
            optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
        del tape

        for metric in metrics:
            metric(y_batch, y_pred)

        print_status_bar(step, n_steps, mean_loss, metrics)

    for metric in [mean_loss] + metrics:
        metric.reset_states()

Epoch 1/5
1718/1718 - mean: 0.0000 - sparse_categorical_accuracy: 0.6841
Epoch 2/5
1718/1718 - mean: 0.0000 - sparse_categorical_accuracy: 0.7804
Epoch 3/5
1718/1718 - mean: 0.0000 - sparse_categorical_accuracy: 0.7990
Epoch 4/5
1718/1718 - mean: 0.0000 - sparse_categorical_accuracy: 0.8103
Epoch 5/5
1718/1718 - mean: 0.0000 - sparse_categorical_accuracy: 0.8141
