In [1]:
import tensorflow as tf
import numpy as np
import cv2
from tensorflow.keras.callbacks import (
    ReduceLROnPlateau,
    EarlyStopping,
    ModelCheckpoint,
    TensorBoard
)
import datetime, os

In [2]:
%run 'yolov3_tf2.ipynb'

In [3]:
%run 'utils.ipynb'

In [4]:
dataset = './data/audi_train.tfrecord' #'path to dataset'
val_dataset = './data/audi_val.tfrecord' # 'path to validation dataset'
weights = './checkpoints/yolov3.tf' #'path to weights file'
classes = './data/classes.txt' #'path to classes file'
mode = 'fit' #['fit', 'eager_fit', 'eager_tf']
             #'fit: model.fit, '
             #'eager_fit: model.fit(run_eagerly=True), '
             #'eager_tf: custom GradientTape'
transfer = 'darknet'
             #['none', 'darknet', 'no_output', 'frozen', 'fine_tune'],
             #'none: Training from scratch, '
             #'darknet: Transfer darknet, '
             #'no_output: Transfer all but output, '
             #'frozen: Transfer and freeze all, '
             #'fine_tune: Transfer all and freeze darknet only'
size = 416 #'image size'
epochs = 100 #'number of epochs'
batch_size = 64 #'batch size'
learning_rate = 1e-3 #'learning rate'
num_classes = 14 #'number of classes in the model'
weights_num_classes = 80 #'specify num class for `weights` file if different, '
                         #'useful in transfer learning with different number of classes'


def train_model():  
    model = YoloV3(size, training=True, classes=num_classes)
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    train_dataset = load_fake_dataset()
    if dataset:
        train_dataset = load_tfrecord_dataset(dataset, classes, size)
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        transform_images(x, size),
        transform_targets(y, anchors, anchor_masks, size)))
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)
    
    v_dataset = load_fake_dataset()
    if val_dataset:
        v_dataset = load_tfrecord_dataset(val_dataset, classes, size)
    v_dataset = v_dataset.batch(batch_size)
    v_dataset = v_dataset.map(lambda x, y: (
        transform_images(x, size),
        transform_targets(y, anchors, anchor_masks, size)))
    
    # Configure the model for transfer learning
    if transfer == 'none':
        pass  # Nothing to do
    elif transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with incompatible number of classes

        # reset top layers
        model_pretrained = YoloV3(
            size, training=True, classes=weights_num_classes or num_classes)
        model_pretrained.load_weights(weights)

        if transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))

        elif transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(model_pretrained.get_layer(
                        l.name).get_weights())
                    freeze_all(l)

    else:
        # All other transfer require matching classes
        model.load_weights(weights)
        if transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    loss = [YoloLoss(anchors[mask], classes=num_classes)
            for mask in anchor_masks]

    if mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(v_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch,
                avg_loss.result().numpy(),
                avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights(
                'checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(monitor='loss',verbose=1),
            EarlyStopping(patience=3,monitor='loss', verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=epochs,
                            callbacks=callbacks,
                            validation_data=v_dataset)

In [5]:
logs_base_dir = "./logs"
os.makedirs(logs_base_dir, exist_ok=True)
%load_ext tensorboard
%tensorboard --logdir {logs_base_dir}

In [6]:
train_model()

Epoch 1/100
    129/Unknown - 8213s 64s/step - loss: 822.9018 - yolo_output_0_loss: 42.1515 - yolo_output_1_loss: 146.7310 - yolo_output_2_loss: 623.0392
Epoch 00001: saving model to checkpoints/yolov3_train_1.tf
Epoch 2/100
Epoch 00002: saving model to checkpoints/yolov3_train_2.tf
Epoch 3/100
Epoch 00003: saving model to checkpoints/yolov3_train_3.tf


NotFoundError:  ./data/audi_val.tfrecord; No such file or directory
	 [[node IteratorGetNext (defined at <ipython-input-4-c37fbf49a2c3>:150) ]] [Op:__inference_distributed_function_52279]

Function call stack:
distributed_function
