In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from  tensorflow.keras import layers
import numpy as np


tf.random.set_seed(42)
np.random.seed(42)

model = keras.models.Sequential([
    layers.Dense(10, kernel_initializer="he_normal"),
    layers.LeakyReLU(alpha=0.2),   # leaky ReLU
    layers.PReLU(),   # PReLU
    layers.Dense(10, activation="selu", kernel_initializer="lecun_normal"),    # SELU

])

In [2]:
[name for name in dir(keras.initializers) if not name.startswith("_")]

['Constant',
 'GlorotNormal',
 'GlorotUniform',
 'HeNormal',
 'HeUniform',
 'Identity',
 'Initializer',
 'LecunNormal',
 'LecunUniform',
 'Ones',
 'Orthogonal',
 'RandomNormal',
 'RandomUniform',
 'TruncatedNormal',
 'VarianceScaling',
 'Zeros',
 'constant',
 'deserialize',
 'get',
 'glorot_normal',
 'glorot_uniform',
 'he_normal',
 'he_uniform',
 'identity',
 'lecun_normal',
 'lecun_uniform',
 'ones',
 'orthogonal',
 'random_normal',
 'random_uniform',
 'serialize',
 'truncated_normal',
 'variance_scaling',
 'zeros']

In [3]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

In [4]:
# BatchNormalization after activation
model = keras.models.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.BatchNormalization(),
    layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
    layers.BatchNormalization(),
    layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
    layers.BatchNormalization(),
    layers.Dense(10, activation="softmax")
])

In [5]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 batch_normalization (BatchN  (None, 784)              3136      
 ormalization)                                                   
                                                                 
 dense_2 (Dense)             (None, 300)               235500    
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_3 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)             

In [6]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"]
    )

In [7]:
history = model.fit(
    X_train, y_train, 
    epochs=10, 
    validation_split=0.05, 
    batch_size=64,
    callbacks=[keras.callbacks.EarlyStopping(patience=20)]
)

Epoch 1/10


2022-03-31 16:39:25.680207: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Batch Normalization before activation function 
model = keras.models.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.BatchNormalization(),
    layers.Dense(300, use_bias=False, kernel_initializer="he_normal"),
    layers.BatchNormalization(),
    layers.Activation("elu"),
    layers.Dense(100, use_bias=False, kernel_initializer="he_normal"),
    layers.BatchNormalization(),
    layers.Activation("elu"),
    layers.Dense(10, activation="softmax")
])

In [9]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"]
    )

In [10]:
# here works much worse

history = model.fit(
    X_train, y_train, 
    epochs=10, 
    validation_split=0.05, 
    batch_size=64,
    callbacks=[keras.callbacks.EarlyStopping(patience=20)]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# for Gradient Clipping with threshold 1.0
optimizer = keras.optimizers.SGD(clipvalue=1.0) 
# or clipnorm, with prevents changing gradient's direction
model.compile(loss="mse", optimizer=optimizer)

In [12]:
def split_dataset(X, y):
    y_5_or_6 = (y == 5) | (y == 6) # sandals or shirts
    y_A = y[~y_5_or_6]
    y_A[y_A > 6] -= 2 # class indices 7, 8, 9 should be moved to 5, 6, 7
    y_B = (y[y_5_or_6] == 6).astype(np.float32) # binary classification task: is it a shirt (class 6)?
    return ((X[~y_5_or_6], y_A),
            (X[y_5_or_6], y_B))

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

# A: all images of items except for sandals and shirts
# B: sandal or shirts, but only 200

In [13]:
keras.backend.clear_session()

In [14]:
# TRANSFER LEARNING
model_A = keras.models.Sequential()
model_A.add(layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_A.add(layers.Dense(n_hidden, activation="selu"))
model_A.add(layers.Dense(8, activation="softmax"))

In [15]:
model_A.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"]
)

In [16]:
history = model_A.fit(
    X_train_A, y_train_A,
    epochs=40,
    validation_split=0.05,
    callbacks=[keras.callbacks.EarlyStopping(patience=20)],
    batch_size=64,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [17]:
# binary classifier for sandal ans shirt
model_B = keras.models.Sequential()
model_B.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_B.add(keras.layers.Dense(n_hidden, activation="selu"))
model_B.add(keras.layers.Dense(1, activation="sigmoid"))

In [18]:
model_B.compile(loss="binary_crossentropy",
                optimizer=keras.optimizers.SGD(learning_rate=1e-3),
                metrics=["accuracy"])

In [22]:
history = model_B.fit(
    X_train_B, y_train_B,
    epochs=40,
    validation_split=0.05,
    callbacks=[keras.callbacks.EarlyStopping(patience=20)],
    batch_size=64,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [23]:
model_A.evaluate(X_test_A, y_test_A)



[0.2873453199863434, 0.9108750224113464]

In [24]:
model_B.evaluate(X_test_B, y_test_B)



[0.21834918856620789, 0.953000009059906]

In [74]:
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())
model_B_on_A = keras.models.Sequential(model_A_clone.layers[:-1])
model_B_on_A.add(layers.Dense(1, activation="sigmoid"))

In [75]:
# for layer in model_B_on_A.layers[:-2]:
#     layer.trainable = False

model_B_on_A.compile(
        loss="binary_crossentropy",
        optimizer=keras.optimizers.SGD(learning_rate=1e-3),
        metrics=["accuracy"]
    )

In [76]:
history = model_B_on_A.fit(
    X_train_B, y_train_B,
    epochs=40,
    validation_split=0.05,
    callbacks=[keras.callbacks.EarlyStopping(patience=20)],
    batch_size=64,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40


In [77]:
model_B_on_A.evaluate(X_test_B, y_test_B)

# in this case it haven't helped a lot



[0.10260029137134552, 0.9639999866485596]

In [116]:
keras.backend.clear_session()

In [117]:
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds

In [118]:
class OneCycleScheduler(keras.callbacks.Callback):
    def __init__(self, iterations, max_rate, start_rate=None,
                 last_iterations=None, last_rate=None):
        self.iterations = iterations
        self.max_rate = max_rate
        self.start_rate = start_rate or max_rate / 10
        self.last_iterations = last_iterations or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = last_rate or self.start_rate / 1000
        self.iteration = 0
    def _interpolate(self, iter1, iter2, rate1, rate2):
        return ((rate2 - rate1) * (self.iteration - iter1)
                / (iter2 - iter1) + rate1)
    def on_batch_begin(self, batch, logs):
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,
                                     self.max_rate, self.start_rate)
        else:
            rate = self._interpolate(2 * self.half_iteration, self.iterations,
                                     self.start_rate, self.last_rate)
        self.iteration += 1
        keras.backend.set_value(self.model.optimizer.learning_rate, rate)

In [119]:
tf.random.set_seed(42)
np.random.seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

In [120]:
import math

n_epochs = 25
batch_size = 64

onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs, max_rate=0.05)
history = model.fit(
    X_train_scaled, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[onecycle, keras.callbacks.EarlyStopping(patience=30)]
    )

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [121]:
model.evaluate(X_test_scaled, y_test)



[0.36447378993034363, 0.8895999789237976]

In [127]:
# instead of repeting the same argumets multiple times use partia

# keras.regularizers.l1(0.01)
# keras.regularizers.l2(0.1)
# keras.regularizers.l1_l2(0.01, 0.1)

from functools import partial

RegularizedDense = partial(
    layers.Dense,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_regularizer=keras.regularizers.l2(0.01)
)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    RegularizedDense(300),
    RegularizedDense(100),
    RegularizedDense(50),
    RegularizedDense(10, activation="softmax", kernel_initializer="glorot_uniform")
])

In [128]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [129]:
n_epochs = 25
batch_size = 64

history = model.fit(
    X_train_scaled, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    )

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [130]:
# instead of repeting the same argumets multiple times use partia

# keras.regularizers.l1(0.01)
# keras.regularizers.l2(0.1)
# keras.regularizers.l1_l2(0.01, 0.1)

from functools import partial

RegularizedDense = partial(
    layers.Dense,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_regularizer=keras.regularizers.l2(0.01)
)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    layers.Dropout(rate=0.2),
    RegularizedDense(300),
    layers.Dropout(rate=0.2),
    RegularizedDense(100),
    layers.Dropout(rate=0.2),
    RegularizedDense(50),
    layers.Dropout(rate=0.2),
    RegularizedDense(10, activation="softmax", kernel_initializer="glorot_uniform")
])

In [131]:
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [132]:
n_epochs = 25
batch_size = 64

history = model.fit(
    X_train_scaled, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    )

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [133]:
model.evaluate(X_test_scaled, y_test)



[0.82342529296875, 0.808899998664856]

In [134]:
class MCDropout(keras.layers.Dropout):
    def call(self, inputs):
        return super().call(inputs, training=True)

class MCAlphaDropout(keras.layers.AlphaDropout):
    def call(self, inputs):
        return super().call(inputs, training=True)

In [142]:
# replacing Dropout with MCdropout

mc_model = keras.models.Sequential([
    MCAlphaDropout(layer.rate) if isinstance(layer, keras.layers.Dropout) else layer
    for layer in model.layers
])

In [143]:
mc_model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam",
              metrics=["accuracy"])

In [144]:
mc_model.set_weights(model.get_weights())

In [146]:
# based on 100 samples - 100 different predictions
np.round(np.mean([mc_model.predict(X_test_scaled[:1]) for sample in range(100)], axis=0), 2)

array([[0.07, 0.03, 0.08, 0.03, 0.06, 0.24, 0.05, 0.11, 0.18, 0.14]],
      dtype=float32)

In [148]:
MaxNormDense = partial(
    layers.Dense,
    activation="selu",
    kernel_initializer="lecun_normal",
    kernel_constraint=keras.constraints.max_norm(1.)
)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    MaxNormDense(300),
    MaxNormDense(100),
    keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
n_epochs = 2
history = model.fit(X_train_scaled, y_train, epochs=n_epochs, validation_split=0.1)

Epoch 1/2
Epoch 2/2
