In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import io
from PIL import Image

from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler, ModelCheckpoint, CSVLogger, ReduceLROnPlateau
%load_ext tensorboard

import os
import matplotlib.pylab as plt
import numpy as np
import math
import datetime
import pandas as pd

print("Version: ", tf.__version__)
tf.get_logger().setLevel('INFO')

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Version:  2.8.2


## Download and prepare the horses or humans dataset


In [2]:
ds, info = tfds.load('horses_or_humans', as_supervised=True, with_info=True, split=['train[:80%]', 'train[80%:]', 'test'])

(train_ds, validation_ds, test_ds) = ds

num_examples = info.splits['train'].num_examples
num_classes = info.features['label'].num_classes

[1mDownloading and preparing dataset 153.59 MiB (download: 153.59 MiB, generated: Unknown size, total: 153.59 MiB) to ~/tensorflow_datasets/horses_or_humans/3.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1027 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/horses_or_humans/3.0.0.incomplete91JX64/horses_or_humans-train.tfrecord*...:  …

Generating test examples...:   0%|          | 0/256 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/horses_or_humans/3.0.0.incomplete91JX64/horses_or_humans-test.tfrecord*...:   …

[1mDataset horses_or_humans downloaded and prepared to ~/tensorflow_datasets/horses_or_humans/3.0.0. Subsequent calls will reuse this data.[0m


In [3]:
print(num_examples)
print(num_classes)

1027
2


In [4]:
IMAGE_SIZE = (150, 150)

In [5]:
def preprocess_image(image, label):
  image = tf.image.resize(image, IMAGE_SIZE) / 255.0
  return  image, label

In [6]:
BATCH_SIZE = 32
train_batches = train_ds.shuffle(num_examples // 4).map(preprocess_image).batch(BATCH_SIZE).prefetch(1)
validation_batches = validation_ds.map(preprocess_image).batch(BATCH_SIZE).prefetch(1)
test_batches = test_ds.map(preprocess_image).batch(1)

In [7]:
image_batch, label_batch = next(iter(train_batches.take(1)))


image_batch.shape, label_batch.shape

(TensorShape([32, 150, 150, 3]), TensorShape([32]))

## Create Model

In [8]:
def build_model(dense_units, input_shape=IMAGE_SIZE + (3,)):
  model = tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=input_shape),
      tf.keras.layers.MaxPooling2D(2, 2),
      tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
      tf.keras.layers.MaxPooling2D(2, 2),
      tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
      tf.keras.layers.MaxPooling2D(2, 2),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(dense_units, activation='relu'),
      tf.keras.layers.Dense(2, activation='softmax')
  ])
  return model

## TensorBoard Callback

In [9]:
!rm -rf logs

In [10]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir,
                                                      #histogram_freq=1,
                                                      #update_freq=1,
                                                      )

model.fit(train_batches, 
          epochs=10, 
          validation_data=validation_batches, 
          callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


KeyboardInterrupt: ignored

In [None]:
%tensorboard --logdir logs

## Saving Checkpoints

In [None]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
model.fit(train_batches, 
          epochs=5, 
          validation_data=validation_batches, 
          verbose=2,
          callbacks=[ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.h5', verbose=1),
          ])

In [None]:
!ls .

In [None]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
model.fit(train_batches, 
          epochs=1, 
          validation_data=validation_batches, 
          verbose=2,
          callbacks=[ModelCheckpoint('saved_model', verbose=1)
          ])

In [None]:
 ! ls .

In [None]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
model.fit(train_batches, 
          epochs=10, 
          validation_data=validation_batches, 
          verbose=2,
          callbacks=[ModelCheckpoint('model.h5', save_weights_only=True, save_best_only=True, monitor="val_loss", mode="min", verbose=1)
          ])

## Early Stopping

In [None]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
model.fit(train_batches, 
          epochs=50, 
          validation_data=validation_batches, 
          verbose=2,
          callbacks=[EarlyStopping(
              patience=3,
              min_delta=0.05,
              baseline=0.8,
              mode='min',
              monitor='val_loss',
              restore_best_weights=True,
              verbose=1)
          ])

## Learning Rate Scheduler

In [11]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
def step_decay(epoch):
	initial_lr = 0.01
	drop = 0.5
	epochs_drop = 1
	lr = initial_lr * math.pow(drop, math.floor((1+epoch)/epochs_drop))
	return lr

model.fit(train_batches, 
          epochs=5, 
          validation_data=validation_batches, 
          callbacks=[LearningRateScheduler(step_decay, verbose=1),
                    TensorBoard(log_dir='./log_dir')])


Epoch 1: LearningRateScheduler setting learning rate to 0.005.
Epoch 1/5

Epoch 2: LearningRateScheduler setting learning rate to 0.0025.
Epoch 2/5

Epoch 3: LearningRateScheduler setting learning rate to 0.00125.
Epoch 3/5

Epoch 4: LearningRateScheduler setting learning rate to 0.000625.
Epoch 4/5

Epoch 5: LearningRateScheduler setting learning rate to 0.0003125.
Epoch 5/5


<keras.callbacks.History at 0x7f59a3a55610>

In [12]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
  
def exp_decay_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

model.fit(train_batches, 
          epochs=15, 
          validation_data=validation_batches, 
          callbacks=[LearningRateScheduler(exp_decay_scheduler, verbose=1),
                    TensorBoard(log_dir='./log_dir')])


Epoch 1: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 1/15

Epoch 2: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 2/15

Epoch 3: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 3/15

Epoch 4: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 4/15

Epoch 5: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 5/15

Epoch 6: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 6/15

Epoch 7: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 7/15

Epoch 8: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 8/15

Epoch 9: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 9/15

Epoch 10: LearningRateScheduler setting learning rate to 0.009999999776482582.
Epoch 10/15

Epoch 11: LearningRateScheduler setting learning rate to 0.009048374369740486.
Epoch 11/15

Epoch

<keras.callbacks.History at 0x7f59a1605190>

## Custom Callbacks

Now, define a simple custom callback that logs:

- When `fit`/`evaluate`/`predict` starts & ends
- When each epoch starts & ends
- When each training batch starts & ends
- When each evaluation (test) batch starts & ends
- When each inference (prediction) batch starts & ends

In [13]:

class CustomCallback(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        keys = list(logs.keys())
        print("Starting training; got log keys: {}".format(keys))

    def on_train_end(self, logs=None):
        keys = list(logs.keys())
        print("Stop training; got log keys: {}".format(keys))

    def on_epoch_begin(self, epoch, logs=None):
        keys = list(logs.keys())
        print("Start epoch {} of training; got log keys: {}".format(epoch, keys))

    def on_epoch_end(self, epoch, logs=None):
        keys = list(logs.keys())
        print("End epoch {} of training; got log keys: {}".format(epoch, keys))

    def on_test_begin(self, logs=None):
        keys = list(logs.keys())
        print("Start testing; got log keys: {}".format(keys))

    def on_test_end(self, logs=None):
        keys = list(logs.keys())
        print("Stop testing; got log keys: {}".format(keys))

    def on_predict_begin(self, logs=None):
        keys = list(logs.keys())
        print("Start predicting; got log keys: {}".format(keys))

    def on_predict_end(self, logs=None):
        keys = list(logs.keys())
        print("Stop predicting; got log keys: {}".format(keys))

    def on_train_batch_begin(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Training: start of batch {}; got log keys: {}".format(batch, keys))

    def on_train_batch_end(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Training: end of batch {}; got log keys: {}".format(batch, keys))

    def on_test_batch_begin(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Evaluating: start of batch {}; got log keys: {}".format(batch, keys))

    def on_test_batch_end(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Evaluating: end of batch {}; got log keys: {}".format(batch, keys))

    def on_predict_batch_begin(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Predicting: start of batch {}; got log keys: {}".format(batch, keys))

    def on_predict_batch_end(self, batch, logs=None):
        keys = list(logs.keys())
        print("...Predicting: end of batch {}; got log keys: {}".format(batch, keys))


In [14]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
model.fit(train_batches, 
          epochs=2, 
          validation_data=validation_batches, 
          verbose=0,
          callbacks=[CustomCallback()])

Starting training; got log keys: []
Start epoch 0 of training; got log keys: []
...Training: start of batch 0; got log keys: []
...Training: end of batch 0; got log keys: ['loss', 'accuracy']
...Training: start of batch 1; got log keys: []
...Training: end of batch 1; got log keys: ['loss', 'accuracy']
...Training: start of batch 2; got log keys: []
...Training: end of batch 2; got log keys: ['loss', 'accuracy']
...Training: start of batch 3; got log keys: []
...Training: end of batch 3; got log keys: ['loss', 'accuracy']
...Training: start of batch 4; got log keys: []
...Training: end of batch 4; got log keys: ['loss', 'accuracy']
...Training: start of batch 5; got log keys: []
...Training: end of batch 5; got log keys: ['loss', 'accuracy']
...Training: start of batch 6; got log keys: []
...Training: end of batch 6; got log keys: ['loss', 'accuracy']
...Training: start of batch 7; got log keys: []
...Training: end of batch 7; got log keys: ['loss', 'accuracy']
...Training: start of ba

<keras.callbacks.History at 0x7f59a165c250>

In [15]:
res = model.evaluate(
    test_batches, verbose=0, callbacks=[CustomCallback()]
)

res = model.predict(test_batches, batch_size=128, callbacks=[CustomCallback()])

Start testing; got log keys: []
...Evaluating: start of batch 0; got log keys: []
...Evaluating: end of batch 0; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 1; got log keys: []
...Evaluating: end of batch 1; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 2; got log keys: []
...Evaluating: end of batch 2; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 3; got log keys: []
...Evaluating: end of batch 3; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 4; got log keys: []
...Evaluating: end of batch 4; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 5; got log keys: []
...Evaluating: end of batch 5; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 6; got log keys: []
...Evaluating: end of batch 6; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 7; got log keys: []
...Evaluating: end of batch 7; got log keys: ['loss', 'accuracy']
...Evaluating: start of batch 8; got log

### Usage of `logs` dict
The `logs` dict contains the loss value, and all the metrics at the end of a batch or
epoch. Example includes the loss and mean absolute error.

In [16]:
class LossAndErrorPrintingCallback(tf.keras.callbacks.Callback):
    def on_train_batch_end(self, batch, logs=None):
        print(
            "Up to batch {}, the average loss is {:7.2f}.".format(batch, logs["loss"])
        )

    def on_test_batch_end(self, batch, logs=None):
        print(
            "Up to batch {}, the average loss is {:7.2f}.".format(batch, logs["loss"])
        )

    def on_epoch_end(self, epoch, logs=None):
        print(
            "The average loss for epoch {} is {:7.2f} "
            "and accuracy is {:7.2f}.".format(
                epoch, logs["loss"], logs["accuracy"]
            )
        )

In [17]:
model.fit(
    train_batches,
    epochs=2,
    verbose=0,
    callbacks=[LossAndErrorPrintingCallback()],
)

Up to batch 0, the average loss is    0.68.
Up to batch 1, the average loss is    0.65.
Up to batch 2, the average loss is    0.64.
Up to batch 3, the average loss is    0.63.
Up to batch 4, the average loss is    0.63.
Up to batch 5, the average loss is    0.62.
Up to batch 6, the average loss is    0.62.
Up to batch 7, the average loss is    0.62.
Up to batch 8, the average loss is    0.61.
Up to batch 9, the average loss is    0.61.
Up to batch 10, the average loss is    0.61.
Up to batch 11, the average loss is    0.61.
Up to batch 12, the average loss is    0.61.
Up to batch 13, the average loss is    0.61.
Up to batch 14, the average loss is    0.61.
Up to batch 15, the average loss is    0.60.
Up to batch 16, the average loss is    0.60.
Up to batch 17, the average loss is    0.60.
Up to batch 18, the average loss is    0.60.
Up to batch 19, the average loss is    0.60.
Up to batch 20, the average loss is    0.60.
Up to batch 21, the average loss is    0.60.
Up to batch 22, the 

<keras.callbacks.History at 0x7f59a325b750>

In [18]:
res = model.evaluate(
    test_batches,
    verbose=0,
    callbacks=[LossAndErrorPrintingCallback()],
)

Up to batch 0, the average loss is    1.32.
Up to batch 1, the average loss is    1.05.
Up to batch 2, the average loss is    1.14.
Up to batch 3, the average loss is    0.90.
Up to batch 4, the average loss is    0.76.
Up to batch 5, the average loss is    0.85.
Up to batch 6, the average loss is    0.86.
Up to batch 7, the average loss is    0.78.
Up to batch 8, the average loss is    0.83.
Up to batch 9, the average loss is    0.77.
Up to batch 10, the average loss is    0.77.
Up to batch 11, the average loss is    0.72.
Up to batch 12, the average loss is    0.68.
Up to batch 13, the average loss is    0.68.
Up to batch 14, the average loss is    0.65.
Up to batch 15, the average loss is    0.70.
Up to batch 16, the average loss is    0.72.
Up to batch 17, the average loss is    0.69.
Up to batch 18, the average loss is    0.72.
Up to batch 19, the average loss is    0.69.
Up to batch 20, the average loss is    0.71.
Up to batch 21, the average loss is    0.69.
Up to batch 22, the 

## Usage of `self.model` attribute

In addition to receiving log information when one of their methods is called,
callbacks have access to the model associated with the current round of
training/evaluation/inference: `self.model`.

Here are a few of the things you can do with `self.model` in a callback:

- Set `self.model.stop_training = True` to immediately interrupt training.
- Mutate hyperparameters of the optimizer (available as `self.model.optimizer`),
such as `self.model.optimizer.learning_rate`.
- Save the model at period intervals.
- Record the output of `model.predict()` on a few test samples at the end of each
epoch, to use as a sanity check during training.
- Extract visualizations of intermediate features at the end of each epoch, to monitor
what the model is learning over time.
- etc.

Let's see this in action in a couple of examples.

### A Callback for Detecting Overfitting
Let's explore a call back where we measure the ratio between our validation loss and our training loss. If the ratio gets too high, we could have an over-fitting scenario because the validation loss may no longer be decreasing while the training loss continues to decrease, making the ratio of validation loss divided by training loss higher. We should in this case, stop training to avoid overfitting.

In [19]:
class DetectOverfittingCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold=0.7):
        super(DetectOverfittingCallback, self).__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        ratio = logs["val_loss"] / logs["loss"]
        print("Epoch: {}, Val/Train loss ratio: {:.2f}".format(epoch, ratio))

        if ratio > self.threshold:
            print("Stopping training...")
            self.model.stop_training = True

In [20]:
model = build_model(dense_units=256)
model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy'])
model.fit(train_batches, 
          epochs=10, 
          validation_data=validation_batches, 
          verbose=0,
          callbacks=[DetectOverfittingCallback()])

Epoch: 0, Val/Train loss ratio: 0.98
Stopping training...


<keras.callbacks.History at 0x7f59a2cce610>

See [this deeplearning.ai](https://www.coursera.org/learn/custom-models-layers-loss-functions-with-tensorflow) course for a great example on viusalizing results at the end of each epoch