In [1]:
from glob import glob
import cv2
import numpy as np
import xml.etree.ElementTree as ET
import keras
from keras import backend as K
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from keras.callbacks import LearningRateScheduler
import matplotlib.pyplot as plt
import os
import tensorflow.keras.applications as app

# ---- Optimisation schedule ----
batch_size = 64
n_epochs = 12

# ---- Input pipeline ----
# Chooses between samplewise centering/std-normalisation and a 1/255 rescale
# when the ImageDataGenerator arguments are assembled.
per_sample_normalization = True
# Enables the rotation/shift/zoom arguments of the training generator.
data_augmentation = False
img_size = 224  # images are resized to img_size x img_size

# ---- Network ----
# [tensorflow.keras.applications submodule, class name]; edit the trailing
# index to pick another architecture.
net_name = [
    ['resnet50', 'ResNet50'],
    ['inception_v3', 'InceptionV3'],
    ['mobilenet_v2', 'MobileNetV2'],
][0]
# True: one training run over all layers; False: two-stage fine-tuning.
train_from_scratch = True
last_layer_activation = ['softmax', 'sigmoid', None][1]
loss = [
    'categorical_crossentropy',
    'binary_crossentropy',
    'mean_squared_error',
    'mean_absolute_error',
][1]

# ---- Labels ----
# Class name -> position in the multi-hot target vector.
voc_classes = {
    label: position
    for position, label in enumerate((
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ))
}
num_classes = 20

# When True, an ImageNet-pretrained ResNet50 is run on a few held-out images.
test_imagenet = False


In [None]:
# Download and untar voc dataset
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar xf VOCtrainval_06-Nov-2007.tar
print('VOCtrainval_06-Nov-2007.tar has been uncompressed successfully.')

In [None]:
# Read and format data
def read_content(xml_file: str):
    """Extract class indices and bounding boxes from one annotation XML file.

    Args:
        xml_file: Path of the annotation file, handed to ``ET.parse``.

    Returns:
        A pair ``(class_ids, bounding_boxes)`` with one entry per ``<object>``
        element, in document order. ``class_ids`` holds the ``voc_classes``
        index of each object's ``<name>``; ``bounding_boxes`` holds
        ``[xmin, ymin, xmax, ymax]`` integer lists.

    Raises:
        KeyError: If an object's name is not a key of ``voc_classes``.
    """
    annotation = ET.parse(xml_file).getroot()

    # Same lookup order as the XML tags are read: ymin, xmin, ymax, xmax.
    coord_paths = ("bndbox/ymin", "bndbox/xmin", "bndbox/ymax", "bndbox/xmax")

    class_ids = []
    bounding_boxes = []
    for obj in annotation.iter('object'):
        label = obj.find("name").text
        class_ids.append(voc_classes[label])

        ymin, xmin, ymax, xmax = (int(obj.find(path).text) for path in coord_paths)
        bounding_boxes.append([xmin, ymin, xmax, ymax])

    return class_ids, bounding_boxes


# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the seeded train/test split below identical from one machine to the next.
files = sorted(glob('VOCdevkit/VOC2007/JPEGImages/*.jpg'))
if not files:
    raise FileNotFoundError('No images found in VOCdevkit/VOC2007/JPEGImages; '
                            'run the download cell first.')

n_samples = len(files)
files = files[:n_samples]  # no-op unless n_samples is lowered to work on a subset
x_train, y_train, x_test, y_test = [], [], [], []

np.random.seed(0)
# Hold out 20% of the images for testing. Indices are drawn WITHOUT
# replacement: drawing with replacement yields duplicates, which shrinks the
# held-out set below the requested size.
ridx = np.random.choice(n_samples, int(n_samples*0.2), replace=False)
train_test_split = np.zeros(n_samples)  # 1 marks a test sample
train_test_split[ridx] = 1

for i, f in enumerate(files):
    img = cv2.imread(f)
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        raise IOError('Could not read image: ' + f)
    # OpenCV loads BGR; convert to RGB, switch to float32 and resize to the
    # square network input.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    img = cv2.resize(img, (img_size, img_size))

    # Multi-hot target: 1.0 for every class that appears in the annotation.
    classes = np.zeros(num_classes)
    # The annotation sits next to the image folder:
    # .../JPEGImages/<id>.jpg -> .../Annotations/<id>.xml
    root, name = f.split('JPEGImages', 1)
    cnames, _ = read_content(root+'Annotations'+name[:-3]+'xml')
    for c in cnames:
        classes[c] = 1.0

    if train_test_split[i]:
        x_test.append(img)
        y_test.append(classes)
    else:
        x_train.append(img)
        y_train.append(classes)

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

In [None]:
# Test ResNet50 pretrained on imagenet on the voc samples

if test_imagenet:
  from copy import copy
  from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions

  # ResNet50 with ImageNet weights, classification head included.
  model = ResNet50(weights='imagenet')

  for sample in x_test[:5]:
    # Display the image that is about to be classified.
    plt.figure()
    plt.imshow(sample.astype('uint8'))
    plt.show()

    # Prepend a batch axis, then preprocess a copy of the batch
    # (presumably so that x_test itself is not modified -- TODO confirm).
    batch = np.expand_dims(sample, axis=0)
    preds = model.predict(preprocess_input(copy(batch)))

    # decode_predictions returns, for each sample in the batch, a list of
    # (class, description, probability) tuples; show the top 3 of the
    # single sample.
    print('Predicted:', decode_predictions(preds, top=3)[0])

In [None]:
# Build the model

# Select the corresponding network class, e.g. app.resnet50.ResNet50
mynet = getattr(getattr(app, net_name[0]), net_name[1])

# Create the convolutional base without its classification head.
# `weights` is always passed explicitly: the Keras application constructors
# default to weights='imagenet', so leaving it out would load pretrained
# weights even though training from scratch was requested.
if train_from_scratch:
  base_model = mynet(weights=None, include_top=False)
else:
  base_model = mynet(weights='imagenet', include_top=False)

# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and the output layer: one unit per class, activation chosen in the config cell
predictions = Dense(num_classes, activation=last_layer_activation)(x)

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

In [None]:
#training

def recall_m(y_true, y_pred):
    """Recall computed over every element of the given tensors.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to [0, 1]
    and rounded before being summed, so each entry counts as 0 or 1.

    Returns:
        ``true positives / (actual positives + K.epsilon())``; the epsilon
        keeps the division defined when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed over every element of the given tensors.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to [0, 1]
    and rounded before being summed, so each entry counts as 0 or 1.

    Returns:
        ``true positives / (predicted positives + K.epsilon())``; the epsilon
        keeps the division defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_metric(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result stays defined
    when precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

#data augmentation
# Normalisation shared by both generators: either per-image centering and
# std-normalisation, or a plain 1/255 rescale.
val_data_gen_args = dict(rescale=None if per_sample_normalization else 1./255,
                         samplewise_center=bool(per_sample_normalization),
                         samplewise_std_normalization=bool(per_sample_normalization))
# Random geometric augmentation is added for the training stream only.
train_data_gen_args = dict(val_data_gen_args,
                           rotation_range=20,
                           width_shift_range=0.1,
                           height_shift_range=0.1,
                           zoom_range=0.2) if data_augmentation else val_data_gen_args

# The argument dicts have to be unpacked with **. Passed positionally, the
# whole dict is bound to the first parameter (featurewise_center) and every
# option in it is ignored, so the images would reach the network unnormalised.
training_datagen = ImageDataGenerator(**train_data_gen_args)
training_set = training_datagen.flow(x_train, y_train, batch_size=batch_size)
val_datagen = ImageDataGenerator(**val_data_gen_args)
val_set = val_datagen.flow(x_test, y_test, batch_size=batch_size)


def _compile_and_fit(optimizer, epochs):
    """Compile `model` with the configured loss/metrics and train it.

    Args:
        optimizer: Keras optimizer instance used for this training stage.
        epochs: Number of epochs to run.

    Returns:
        The History object returned by ``model.fit``.
    """
    # (Re)compiling is what makes changes to `layer.trainable` take effect.
    model.compile(loss=loss, optimizer=optimizer, metrics=['AUC', f1_metric])
    # Model.fit accepts generators directly; fit_generator is deprecated.
    return model.fit(training_set,
                     steps_per_epoch=x_train.shape[0] // batch_size,
                     epochs=epochs, verbose=1,
                     validation_data=val_set,
                     validation_steps=len(val_set))


if train_from_scratch:
  # Single stage: every layer is trained with RMSprop.
  opt_rms = optimizers.RMSprop(learning_rate=0.001, decay=1e-6)
  mdl_fit = _compile_and_fit(opt_rms, n_epochs)
else:
  # Stage 1: train only the newly added top layers (randomly initialized),
  # i.e. freeze all layers of the convolutional base.
  for layer in base_model.layers:
      layer.trainable = False

  warmup_epochs = 5  # short run so the new head settles before fine-tuning
  opt_rms = optimizers.RMSprop(learning_rate=0.001, decay=1e-6)
  mdl_fit = _compile_and_fit(opt_rms, warmup_epochs)

  # Stage 2: the top layers are now trained, so unfreeze the whole network
  # and fine-tune every layer.
  # To freeze part of the base instead, list the layers first:
  #for i, layer in enumerate(base_model.layers):
  #   print(i, layer.name)
  for layer in model.layers:
    layer.trainable = True

  # SGD with a low learning rate keeps the fine-tuning updates small.
  opt_sgd = optimizers.SGD(learning_rate=0.0001, momentum=0.9)
  mdl_fit2 = _compile_and_fit(opt_sgd, n_epochs)