# Petals to the Metal - DL project

Names : Géraud FAYE, Lila SAINERO, Arnaud LOUYS

This notebook is inspired by the notebook provided by Kaggle to set up a TPU environment.

# Setting up a TPU environment

In [None]:
import os
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

import tensorflow as tf
from kaggle_datasets import KaggleDatasets

print("Tensorflow version " + tf.__version__)

# Detect the hardware and pick the matching distribution strategy.
try:
    # No parameters are necessary if the TPU_NAME environment variable is set
    # (the original author notes this is always the case on Kaggle).
    # A ValueError here is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None
else:
    print('Running on TPU ', tpu.master())

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable name of the deprecated
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Getting Kaggle data paths

In [None]:
# Base path of the dataset; used below as the prefix of every TFRecord glob pattern.
# NOTE(review): presumably the GCS bucket of the dataset attached to this notebook — confirm against the kaggle_datasets docs.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Setting learning parameters

In [None]:
# Input resolution (height, width). At this size, a GPU will run out of memory; we use the TPU to solve this issue.
IMAGE_SIZE = [512, 512]
# Maximum number of training epochs.
# NOTE(review): EPOCHS is reassigned to 20 in the ensemble cell further down; that later value is the one passed to model.fit there.
EPOCHS = 200
# The batch size is a multiple (16x) of the number of replicas in the strategy, to balance the computing load across TPU cores.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

# Dataset characteristics
NUM_TRAINING_IMAGES = 12753
NUM_TEST_IMAGES = 7382
# Number of batches per epoch; the integer division ignores the final partial batch.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# Load my data

This data is loaded from Kaggle and automatically sharded to maximize parallelization.

In [None]:
# Short alias for the tf.data autotune sentinel, passed to num_parallel_calls / prefetch below.
AUTO = tf.data.experimental.AUTOTUNE

def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor scaled to [-1, 1].

    Args:
        image_data: string tensor holding the JPEG-encoded bytes of one image.

    Returns:
        A float32 tensor reshaped to [*IMAGE_SIZE, 3].
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # [0, 255] -> [0, 1] -> [-1, 1]. The original author notes that this is
    # the pixel range InceptionV3 requires.
    unit_range = tf.cast(pixels, tf.float32) / 255.0
    centered = unit_range * 2.0 - 1.0
    # Explicit size needed for TPU.
    return tf.reshape(centered, [*IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an (image, label) pair.

    Args:
        example: a serialized tf.train.Example with "image" and "class" features.

    Returns:
        (image, label): the decoded image tensor and the class id cast to int32.
    """
    feature_spec = {
        # tf.string means bytestring; shape [] means single element.
        "image": tf.io.FixedLenFeature([], tf.string),
        "class": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an (image, id) pair.

    The "class" feature is absent: predicting the flower class of these
    test records is the competition's challenge.

    Args:
        example: a serialized tf.train.Example with "image" and "id" features.

    Returns:
        (image, idnum): the decoded image tensor and the record id as a string tensor.
    """
    feature_spec = {
        # tf.string means bytestring; shape [] means single element.
        "image": tf.io.FixedLenFeature([], tf.string),
        "id": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

# Data augmentation pipeline, used to make generalization better.
# The original author notes that only the layers in
# tf.keras.layers.experimental.preprocessing are supported by TPUs.
_preprocessing = tf.keras.layers.experimental.preprocessing
augmentations = tf.keras.Sequential([
    # Horizontal flip with a 50% chance.
    _preprocessing.RandomFlip(mode='horizontal'),
    # Contrast randomly scaled within [1 - factor, 1 + factor].
    _preprocessing.RandomContrast(factor=0.2),
    # Rotation drawn from [-factor * 2*pi, +factor * 2*pi].
    _preprocessing.RandomRotation(factor=0.5, dtype=tf.float32),
    # Random zoom in or out. By default, a zoomed-out picture is reflected at its borders.
    _preprocessing.RandomZoom((-0.5, 0.5)),
])

def data_augment(image, label):
    """Run `image` through the augmentation pipeline; `label` passes through untouched.

    training=True is passed to the `augmentations` model explicitly on every call.
    """
    return augmentations(image, training=True), label

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed examples from a list of TFRecord files.

    Args:
        filenames: list of TFRecord file paths to read.
        labeled: if True, records are parsed with read_labeled_tfrecord and
            yield (image, label) pairs; otherwise read_unlabeled_tfrecord is
            used and they yield (image, id) pairs.
        ordered: if False, determinism is disabled so elements are used as
            soon as they stream in (faster; acceptable when the data is
            shuffled anyway). If True, the element order is reproducible from
            one iteration to the next.

    Returns:
        An unbatched tf.data.Dataset of parsed examples.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False  # disable order, increase speed

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    # num_parallel_reads is what actually makes TFRecordDataset interleave
    # reads from multiple files; without it the files are read one by one.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    dataset = dataset.with_options(read_options)
    # Parse records in parallel; the determinism option above still governs ordering.
    dataset = dataset.map(parse_record, num_parallel_calls=AUTO)
    return dataset

def get_training_dataset():
    """Return the repeating, shuffled, batched and augmented training dataset.

    Reads the 512x512 training TFRecords (the original author also tried the
    192x192 and 331x331 variants of the same dataset).
    """
    train_files = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-512x512/train/*.tfrec')
    return (
        load_dataset(train_files, labeled=True)
        .repeat()  # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .map(data_augment, num_parallel_calls=AUTO)  # augmentation is applied per batch
        .prefetch(AUTO)
    )

def get_validation_dataset():
    """Return the batched, cached validation dataset (no augmentation, no repeat).

    Reads the 512x512 validation TFRecords (the original author also tried
    the 192x192 and 331x331 variants of the same dataset).
    """
    val_files = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-512x512/val/*.tfrec')
    records = load_dataset(val_files, labeled=True, ordered=False)
    return records.batch(BATCH_SIZE).cache()

def get_test_dataset(ordered=False):
    """Return the batched test dataset of (image, id) pairs.

    Reads the 512x512 test TFRecords (the original author also tried the
    192x192 and 331x331 variants of the same dataset).

    Args:
        ordered: forwarded to load_dataset; pass True when images and ids are
            iterated separately and must line up.
    """
    test_files = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-512x512/test/*.tfrec')
    return load_dataset(test_files, labeled=False, ordered=ordered).batch(BATCH_SIZE)

# Build the input pipelines once; they are reused by the visualization and training cells below.
training_dataset = get_training_dataset()
validation_dataset = get_validation_dataset()

# Visualizing data

In [None]:
 CLASSES = [
     # Human-readable flower names; the position in the list is the integer class id (104 classes).
     'pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 'wild geranium',  # 00 - 04
     'tiger lily', 'moon orchid', 'bird of paradise', 'monkshood', 'globe thistle',  # 05 - 09
     'snapdragon', "colt's foot", 'king protea', 'spear thistle', 'yellow iris',  # 10 - 14
     'globe-flower', 'purple coneflower', 'peruvian lily', 'balloon flower', 'giant white arum lily',  # 15 - 19
     'fire lily', 'pincushion flower', 'fritillary', 'red ginger', 'grape hyacinth',  # 20 - 24
     'corn poppy', 'prince of wales feathers', 'stemless gentian', 'artichoke', 'sweet william',  # 25 - 29
     'carnation', 'garden phlox', 'love in the mist', 'cosmos', 'alpine sea holly',  # 30 - 34
     'ruby-lipped cattleya', 'cape flower', 'great masterwort', 'siam tulip', 'lenten rose',  # 35 - 39
     'barberton daisy', 'daffodil', 'sword lily', 'poinsettia', 'bolero deep blue',  # 40 - 44
     'wallflower', 'marigold', 'buttercup', 'daisy', 'common dandelion',  # 45 - 49
     'petunia', 'wild pansy', 'primula', 'sunflower', 'lilac hibiscus',  # 50 - 54
     'bishop of llandaff', 'gaura', 'geranium', 'orange dahlia', 'pink-yellow dahlia',  # 55 - 59
     'cautleya spicata', 'japanese anemone', 'black-eyed susan', 'silverbush', 'californian poppy',  # 60 - 64
     'osteospermum', 'spring crocus', 'iris', 'windflower', 'tree poppy',  # 65 - 69
     'gazania', 'azalea', 'water lily', 'rose', 'thorn apple',  # 70 - 74
     'morning glory', 'passion flower', 'lotus', 'toad lily', 'anthurium',  # 75 - 79
     'frangipani', 'clematis', 'hibiscus', 'columbine', 'desert-rose',  # 80 - 84
     'tree mallow', 'magnolia', 'cyclamen ', 'watercress', 'canna lily',  # 85 - 89
     'hippeastrum ', 'bee balm', 'pink quill', 'foxglove', 'bougainvillea',  # 90 - 94
     'camellia', 'mallow', 'mexican petunia', 'bromelia', 'blanket flower',  # 95 - 99
     'trumpet creeper', 'blackberry lily', 'common tulip', 'wild rose',  # 100 - 103
 ]

In [None]:
import math
import matplotlib.pyplot as plt

def batch_to_numpy_images_and_labels(data):
    """Convert one dataset batch into NumPy images and labels.

    Args:
        data: either an (images, labels) / (images, ids) pair whose members
            expose `.numpy()`, or a bare images object exposing `.numpy()`.

    Returns:
        (numpy_images, numpy_labels). numpy_labels is the labels array when
        real labels are present, and a list of None (one per image) when the
        batch carries only image IDs (object dtype, as for test data) or no
        labels at all.
    """
    # A bare images batch cannot be unpacked into two values, so only
    # tuples/lists are treated as (images, labels) pairs.
    if isinstance(data, (tuple, list)):
        images, labels = data
    else:
        images, labels = data, None

    numpy_images = images.numpy()
    if labels is None:
        return numpy_images, [None] * len(numpy_images)

    numpy_labels = labels.numpy()
    if numpy_labels.dtype == object:
        # Binary strings in this case: these are image ID strings, not labels.
        numpy_labels = [None] * len(numpy_images)
    return numpy_images, numpy_labels

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and report whether it is correct.

    Args:
        label: predicted class id (index into CLASSES).
        correct_label: true class id, or None when it is unknown.

    Returns:
        (title, correct). With no true label the title is the predicted class
        name and `correct` is True. Otherwise the title is
        "<predicted> [OK]" on a match, or "<predicted> [NO→<expected>]" on a mismatch.
    """
    predicted_name = CLASSES[label]
    if correct_label is None:
        return predicted_name, True

    is_match = (label == correct_label)
    if is_match:
        verdict, arrow, expected_name = 'OK', '', ''
    else:
        verdict, arrow, expected_name = 'NO', u"\u2192", CLASSES[correct_label]
    return "{} [{}{}{}]".format(predicted_name, verdict, arrow, expected_name), is_match

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    Args:
        image: image data passed to plt.imshow.
        title: title text; an empty title draws no title.
        subplot: (rows, cols, index) triple selecting the subplot.
        red: if True, the title is drawn in red with a slightly smaller font.
        titlesize: base font size of the title.

    Returns:
        The (rows, cols, index + 1) triple designating the following slot.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, font_color = int(titlesize/1.2), 'red'
        else:
            font_size, font_color = int(titlesize), 'black'
        plt.title(
            title,
            fontsize=font_size,
            color=font_color,
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )
    n_rows, n_cols, slot = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, slot + 1)

def display_batch_of_images(databatch, predictions=None, display_mismatches_only=False):
    """Display a batch of images in a square-ish grid.

    Typical calls:
    display_batch_of_images((images, labels))
    display_batch_of_images((images, labels), predictions)

    Args:
        databatch: a batch accepted by batch_to_numpy_images_and_labels
            (an (images, labels) or (images, ids) pair).
        predictions: optional sequence of predicted class ids, indexed like
            the images. When given, titles show the prediction and whether it
            matches the label, and wrong predictions are titled in red.
        display_mismatches_only: if True, only images whose prediction differs
            from their label are drawn; `predictions` must be provided.

    Returns:
        None. The figure is shown with plt.show(). An empty batch draws nothing.
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None] * len(images)
    if len(images) == 0:
        # Nothing to draw; this also avoids dividing by rows == 0 below.
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows
    shown_images = images[:rows * cols]
    shown_labels = labels[:rows * cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))
    # magic formula tested to work from 1x1 to 10x10 images (loop-invariant, so computed once)
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = '' if label is None else CLASSES[label]
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        if not display_mismatches_only or predictions[i] != label:
            subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: pack the images tightly only when no title at all is drawn
    plt.tight_layout()
    untitled = predictions is None and all(label is None for label in shown_labels)
    if untitled:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

We visualize a small batch of validation pictures.

In [None]:
# Re-batch the validation data into groups of 20 and display the first group.
ds_iter = iter(validation_dataset.unbatch().batch(20))
display_batch_of_images(next(ds_iter))

And then a batch of augmented data used for training.

In [None]:
# Re-batch the training data (already augmented by get_training_dataset) into groups of 20 and display the first group.
ds_iter = iter(training_dataset.unbatch().batch(20))
display_batch_of_images(next(ds_iter))

# Build and fit our model

Experiments show that we obtain the best accuracy when using a model pretrained on ImageNet and adding fully-connected layers after it.
We fine-tune the whole network rather than using it only as a feature extractor, because ImageNet is not a dataset specialized in flowers.

In [None]:
# # with strategy.scope() is necessary to build the model inside the TPU
# with strategy.scope():    
#     pretrained_model = tf.keras.applications.EfficientNetB7(
#                                                     weights='imagenet',
#                                                     include_top=False ,
#                                                     input_shape=(*IMAGE_SIZE, 3),
#                                                     pooling=None,
#                                                     classes=1000)
#     pretrained_model.trainable = True # transfer learning
    
#     model = tf.keras.Sequential([
#         pretrained_model,
#         tf.keras.layers.GlobalAveragePooling2D(),
#         tf.keras.layers.Dense(500, activation="relu"),
#         tf.keras.layers.Dense(104, activation='softmax')
#     ])
        
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(lr=0.0001),
#     loss = 'sparse_categorical_crossentropy',
#     metrics=['sparse_categorical_accuracy']
# )

# # We use early stopping to prevent overfitting
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# historical = model.fit(training_dataset, 
#           steps_per_epoch=STEPS_PER_EPOCH, 
#           epochs=EPOCHS, 
#           validation_data=validation_dataset,
#           callbacks=[early_stopping])

# Visualizing training results

In [None]:
# import matplotlib.pyplot as plt

# plt.plot(historical.history['sparse_categorical_accuracy'])
# plt.plot(historical.history['val_sparse_categorical_accuracy'])
# plt.title('model sparse categorical accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()
# plt.plot(historical.history['loss'])
# plt.plot(historical.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

# Compute your predictions on the test set!

This will create a file that can be submitted to the competition.

In [None]:
# import numpy as np

# test_ds = get_test_dataset(ordered=True) # since we are splitting the dataset and iterating separately on images and ids, order matters.

# print('Computing predictions...')
# test_images_ds = test_ds.map(lambda image, idnum: image)
# probabilities = model.predict(test_images_ds)
# predictions = np.argmax(probabilities, axis=-1)
# print(predictions)

# print('Generating submission.csv file...')
# test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch
# np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')
# np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')

# Ensemble learning to improve accuracy

We train several models in the same way as above.
For each model, we compute the probability that each picture belongs to each category.
We then sum these probabilities across models and choose the category with the highest sum.

This differs from majority voting, where each model makes its own decision and the most frequent decision is adopted.
Here, we choose the most probable category according to the whole ensemble of models.

In [None]:
import numpy as np

# Ensemble: fine-tune several ImageNet-pretrained backbones, sum their
# per-class probabilities on the test set and submit the arg-max class.
probabilities_k = []  # one (num_test_images, 104) probability array per trained model

NUMBER_OF_MODELS = 1  # number of passes over the whole list of backbones
EPOCHS = 20  # overrides the value set with the learning parameters at the top

# Backbone constructors; each one gets the same classification head.
# NOTE(review): decode_image scales pixels to [-1, 1]. The Keras documentation
# states that EfficientNet expects inputs in [0, 255] and that DenseNet has its
# own preprocess_input normalization — confirm whether per-backbone input
# preprocessing should be added.
BACKBONES = [
    tf.keras.applications.Xception,
    tf.keras.applications.DenseNet201,
    tf.keras.applications.InceptionResNetV2,
    tf.keras.applications.EfficientNetB7,
]

# Since we are splitting the dataset and iterating separately on images and
# ids, order matters. The dataset is built once and shared by every model.
test_ds = get_test_dataset(ordered=True)
test_images_ds = test_ds.map(lambda image, idnum: image)

for k in range(NUMBER_OF_MODELS):
    for backbone in BACKBONES:
        # The model and its compilation are created inside strategy.scope()
        # so that they are built for the selected strategy (the TPU when available).
        with strategy.scope():
            pretrained_model = backbone(
                weights='imagenet',
                include_top=False,
                input_shape=(*IMAGE_SIZE, 3),
                pooling=None,
            )
            pretrained_model.trainable = True  # transfer learning: fine-tune the whole backbone
            model = tf.keras.Sequential([
                pretrained_model,
                tf.keras.layers.GlobalAveragePooling2D(),
                tf.keras.layers.Dense(500, activation="relu"),
                tf.keras.layers.Dense(104, activation='softmax'),
            ])
            model.compile(
                # `learning_rate` replaces the deprecated `lr` argument.
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                loss='sparse_categorical_crossentropy',
                metrics=['sparse_categorical_accuracy'],
            )

        # We use early stopping to prevent overfitting.
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=5, restore_best_weights=True)
        historical = model.fit(
            training_dataset,
            steps_per_epoch=STEPS_PER_EPOCH,
            epochs=EPOCHS,
            validation_data=validation_dataset,
            callbacks=[early_stopping],
        )

        print(f'Computing predictions for model {k}')
        probabilities_k.append(model.predict(test_images_ds))

# Soft voting: sum the probabilities over all models, then take the best class.
probabilities = np.sum(probabilities_k, axis=0)
predictions = np.argmax(probabilities, axis=-1)
print(predictions)

print('Generating submission.csv file...')
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')  # all in one batch
np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')