# Idea

Use a Keras ResNet encoder followed by a custom decoder. Group the images into siamese triplets, stacking the three decoded feature vectors, then minimize the distance between the three.

In [None]:
!pip install --upgrade tensorflow-datasets
!pip install -U tensorboard_plugin_profile

import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorboard_plugin_profile

# report whether TensorFlow can see at least one GPU device
if len(tf.config.list_physical_devices('GPU')) > 0:
  print("HAS GPU")

Mount the google drive with uploaded input data

In [None]:
from google.colab import drive

# (re)mount Google Drive so the uploaded task data is reachable from the Colab VM
drive.mount('/content/drive', force_remount=True)

# base folder of the task; the images are read from its "food/" subfolder
path_to_folder = "/content/drive/MyDrive/IML_Task3/"
path_to_food = path_to_folder + "food/"

On a local runtime, run the following instead:

In [None]:
# local runtime: all inputs are looked up relative to the working directory
path_to_folder = "./"
path_to_food = path_to_folder + "food/"

Loading from google drive
Upload the txt files via the upload dialog on the left. The images take too long to upload this way, which is why we use the steps below:

https://www.youtube.com/watch?v=Mq8-WdcnzVo <-- source

--> Since accessing files from Google Drive is quite slow, download the zip folder and unzip it instead:
- add the zip folder to your google drive, 
- left click on it --> "Link abrufen" ("Get link") --> share it with everyone ("freigeben für alle"), then copy the link ("Link kopieren"):
- then just copy the id of the link, which is between the d/ and /view, 

for example: ...gle.com/file/d/**1RNc879PiOQaVDLhMmygpGA4rfscFrmWa**/view?usp=sha...

insert this id for the drive_id below:

In [None]:
! gdown --id 1LsF0_SND4REqTZWpxCZBAD9SInvFD3A5 # food.zip 
! unzip /content/food.zip -d /content/
path_to_food = "/content/food/"

# Data Loading (JONNY)
First create a dataset containing only the number triplets

In [65]:
# --- image geometry -----------------------------------------------------------
# every image is resized to IMG_HEIGHT x IMG_WIDTH
# (resnet input dimensions would be 224 x 224 x 3; 256 is used for efficient net)
IMG_HEIGHT = IMG_WIDTH = 256

# --- dataset sizes: placeholders, overwritten in load_dataset -----------------
TRAIN_DATASET_SIZE = VAL_DATASET_SIZE = TEST_DATASET_SIZE = 0

# --- batching / input pipeline ------------------------------------------------
BATCH_SIZE, TEST_BATCH_SIZE = 64, 128
AUTOTUNE = tf.data.AUTOTUNE

# fraction of the training triplets used for training; the rest is validation
TRAIN_VALID_SPLIT = 0.8

In [66]:
# TODO move these paths down
# text files listing the image-id triplets used for training and for testing
path_to_train = f"{path_to_folder}train_triplets.txt"
path_to_test = f"{path_to_folder}test_triplets.txt"

def load_image(img):
    """Decode encoded JPEG content into a float32 RGB image of the configured size.

    Args:
        img: encoded JPEG content, as accepted by ``tf.image.decode_jpeg``.

    Returns:
        The decoded 3-channel image, cast to ``tf.float32`` and resized to
        ``(IMG_HEIGHT, IMG_WIDTH)``. Pixel values are not rescaled here.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    as_float = tf.cast(decoded, tf.float32)
    return tf.image.resize(as_float, (IMG_HEIGHT, IMG_WIDTH))

def get_image_triplet(num_triplet, is_train):
  """Load the three images named by one triplet and stack them.

  Args:
    num_triplet: indexable holding three image ids (strings); each id is
      expanded to the file name ``path_to_food + id + '.jpg'``.
    is_train: if true, the constant label ``1`` is returned with the images.

  Returns:
    The three loaded images stacked along a new leading axis; in training
    mode a tuple ``(stacked_images, 1)``.
  """
  print(path_to_food)  # NOTE(review): looks like leftover debug output - confirm it is still wanted
  loaded = [
      load_image(tf.io.read_file(path_to_food + num_triplet[position] + '.jpg'))
      for position in range(3)
  ]
  stacked = tf.stack(loaded, axis=0)
  # in training append label = 1
  return (stacked, 1) if is_train else stacked

def load_dataset(path, is_train):
  """Build tf.data pipelines of image triplets from a triplet text file.

  Args:
    path: text file with one triplet of image ids per line.
    is_train: if true, the triplets are split into a training and a
      validation part according to TRAIN_VALID_SPLIT; otherwise one test
      dataset is built.

  Returns:
    ``(train_dataset, val_dataset)`` when ``is_train`` is true; both repeat
    indefinitely and yield ``(stacked_images, 1)``. Otherwise a single
    dataset yielding stacked images only. None of them is batched yet.

  Side effects:
    Overwrites the module-level TRAIN_DATASET_SIZE and VAL_DATASET_SIZE
    (training) or TEST_DATASET_SIZE (testing) with integer element counts.
  """
  global TRAIN_DATASET_SIZE, VAL_DATASET_SIZE, TEST_DATASET_SIZE

  # ndmin=2 keeps a file containing a single triplet as one row of three ids
  data_array = np.loadtxt(path, dtype=str, ndmin=2)
  # data_array = data_array[0:1000] <--- can be used to test with small amount of data
  dataset = tf.data.Dataset.from_tensor_slices(data_array) # num1 num2 num3 as string
  num_triplets = int(dataset.cardinality().numpy())

  def to_images(num_triplet):
    # replace the id triplet by the corresponding (stacked) images
    return get_image_triplet(num_triplet, is_train)

  if is_train:
    # take()/skip() expect integer counts, so the split point is made an
    # explicit int and the two sizes always add up to the full dataset
    TRAIN_DATASET_SIZE = int(TRAIN_VALID_SPLIT * num_triplets)
    VAL_DATASET_SIZE = num_triplets - TRAIN_DATASET_SIZE

    train_dataset = dataset.take(TRAIN_DATASET_SIZE)\
                            .shuffle(BATCH_SIZE, reshuffle_each_iteration=True)\
                            .repeat()

    val_dataset = dataset.skip(TRAIN_DATASET_SIZE).repeat()

    train_dataset = train_dataset.map(to_images, num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(to_images, num_parallel_calls=AUTOTUNE)

    return train_dataset, val_dataset

  TEST_DATASET_SIZE = num_triplets
  # apply transformation to images
  return dataset.map(to_images, num_parallel_calls=AUTOTUNE)

## Loading the datasets
Load the train, validation and test datasets, with a split of 80%/20% for the validation

In [None]:
train_dataset, val_dataset = load_dataset(path_to_train, is_train=True)

train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

test_dataset = load_dataset(path_to_test, is_train=False).batch(TEST_BATCH_SIZE)

# benchmark how long a loading a batch takes
tfds.benchmark(train_dataset, batch_size=BATCH_SIZE, num_iter=10)

# TRAIN_DATASET_SIZE and VAL_DATASET_SIZE are set inside load_dataset according to
# TRAIN_VALID_SPLIT; they must not be split a second time here, otherwise the
# training size shrinks below the real training split (and shrinks further on
# every re-run of this cell).

# show the three images of the first triplet in the first training batch
for batch in train_dataset.take(1):
  plt.figure(figsize=(10, 10))
  images = batch[0][0]  # batch = (images, labels); pick the first triplet
  for i in range(3):
    plt.subplot(3, 3, i + 1)
    plt.imshow(images[i] / 255)
    plt.axis('off')

Use buffered prefetching to remove I/O bottleneck in image loading

In [None]:
print(f"type train dataset: {type(train_dataset)}")
# give every pipeline the same prefetch buffer of 4 elements
train_dataset, val_dataset, test_dataset = (
    pipeline.prefetch(buffer_size=4)
    for pipeline in (train_dataset, val_dataset, test_dataset)
)

To avoid overfitting and enhance the dataset we introduce data augmentation during training by randomly flipping and rotating the images. This layer is automatically only active during training

In [None]:
# augmentation pipeline: random horizontal flip followed by a random rotation (factor 0.2)
data_augm_layer = tf.keras.Sequential()
data_augm_layer.add(tf.keras.layers.RandomFlip('horizontal'))
data_augm_layer.add(tf.keras.layers.RandomRotation(0.2))

We also need to preprocess the images to bring them into the correct format, although **for Efficient net, this is not required** as the preprocessing layer is part of the model already. For Resnet we would use:

In [None]:
# preproc_layer = tf.keras.applications.resnet.preprocess_input 

# TODO Data Selection

To speed up training and convergence, we select the part of the data that is most relevant for training. Triplets where the reference image is very similar to the positive match and very different from the negative match don't provide a meaningful training signal. Imagine the corner case of the anchor image being white, the correct match also being white and the incorrect match being black: this triplet will be matched correctly with very little influence from the network. As detailed in this paper: https://arxiv.org/pdf/1503.03832.pdf, it is ideal to train on those triplets that are hard to match, i.e. where, just looking at the images, the similar ones look very different and the different ones look similar.



# Loss
We implement a triplet loss as shown in this example for image similarity estimation https://keras.io/examples/vision/siamese_network/.

First we find the squared difference between the extracted features to get an estimate of similarity. We want the difference between the reference image and the similar image to be small, i.e. the loss should decrease if this difference decreases. We also want to maximize the difference between the reference image and the different image, i.e. the loss should decrease if this difference increases. We achieve the latter by putting a "-" sign in front of the second difference.

The loss is then
‖f(A) - f(P)‖² - ‖f(A) - f(N)‖²

Where f(.) are the output features of the network.

Additionally we can enforce a margin between positive and negative pairs by adding it to the cost function as detailed in https://arxiv.org/pdf/1503.03832.pdf. Since the margin is added to the argument of the softplus, this can be imagined as shifting the softplus function to the left by the margin.

Unlike the example linked above, we finally constrain the loss to be always positive with a softplus function, which guarantees that the loss always remains differentiable.

In [72]:
# margin added to the distance gap inside the triplet loss
SIMILARITY_MARGIN = 0.1

def calc_difference(out_features):
    '''Squared feature distances reference<->similar and reference<->different.

    The three feature sets are read from the last axis of out_features
    (index 0: considered image, 1: alleged similar, 2: alleged different).
    Returns the pair (sim_diff, dif_diff), each being the squared differences
    summed over axis 1 and therefore non-negative.'''
    def _squared_distance(lhs, rhs):
        # sum of squared differences
        return tf.reduce_sum(tf.square(lhs - rhs), axis=1)

    reference = out_features[..., 0]
    return (_squared_distance(reference, out_features[..., 1]),
            _squared_distance(reference, out_features[..., 2]))

def triplet_loss_function(y_true,out_features):
    '''Softplus triplet loss, averaged over all entries.

    The signature follows the (y_true, y_pred) convention for loss functions;
    y_true is not used. out_features is handed to calc_difference to obtain
    the similar/different distances.'''
    similar_dist, different_dist = calc_difference(out_features)
    margin_gap = similar_dist - different_dist + SIMILARITY_MARGIN
    smoothed = tf.math.softplus(margin_gap)
    return tf.reduce_mean(smoothed)

# Profiling

To find bottlenecks, we add a profiler which writes to the log directory

In [73]:
# TensorBoard output is written to the "log/" folder below the project folder
log_dir = f"{path_to_folder}log/"

# profiling only active for batches 10 to 15
tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    profile_batch=(10, 15),
    write_steps_per_second=True,
)


# Creating The Model (LASSE UND ADRIAN)

The model should have a pretrained encoder, apply it to all three images and stitch the results together with custom layers. ADRIAN AND LASSE

In [74]:
class model_manager:
    """Builds, trains and post-processes the siamese triplet network.

    The Keras model takes a stack of three images per sample, shape
    (3, img_height, img_width, 3), and outputs the three feature vectors
    stacked along the last axis. After add_predictor() the model outputs a
    0/1 decision per triplet instead.

    Attributes:
        IMG_HEIGHT, IMG_WIDTH: spatial size of each input image.
        inputs: the Keras input tensor for the image triplets.
        model: the current tf.keras.Model.
    """

    def __init__(self,img_height,img_width):
        """Assemble the augmentation -> encoder -> decoder branches.

        Args:
            img_height: height of every input image in pixels.
            img_width: width of every input image in pixels.
        """
        self.IMG_HEIGHT = img_height
        self.IMG_WIDTH = img_width
        self.inputs = tf.keras.Input(shape=(3, self.IMG_HEIGHT, self.IMG_WIDTH, 3))
        # data augmentation: random horizontal flip and random rotation (factor 0.1);
        # uses the same non-experimental layer path as the rest of the notebook
        data_augmentation = tf.keras.Sequential(
            [
                tf.keras.layers.RandomFlip("horizontal"),
                tf.keras.layers.RandomRotation(0.1),
            ]
        )
        #EfficientNet does not need preprocessing and expects inputs in tensor form in the range 0-255
        encoder =  tf.keras.applications.EfficientNetB4(
            include_top=False, input_shape=(self.IMG_HEIGHT, self.IMG_WIDTH, 3),
            weights="imagenet")
        # the pretrained encoder is frozen
        encoder.trainable = False
        # decoder: pooled encoder features -> 32 values, L2-normalized
        decoder = tf.keras.Sequential([
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(32),
            tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))
        ])
        # the same augmentation/encoder/decoder objects are applied to all three images
        image, similar, different = self.inputs[:, 0, ...], self.inputs[:, 1, ...], self.inputs[:, 2, ...]
        image_features = decoder(encoder(data_augmentation(image)))
        similar_features = decoder(encoder(data_augmentation(similar)))
        different_features = decoder(encoder(data_augmentation(different)))
        out_features = tf.stack([image_features,similar_features,different_features],axis=-1)
        self.model = tf.keras.Model(inputs=self.inputs,outputs=out_features)
        print("Successfully built basic model!")

    def compile(self):
        """Compile the model with Adam (learning rate 0.002) and the triplet loss."""
        print("Compilation Initiated...")
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
                            loss=triplet_loss_function,
                            metrics=[triplet_loss_function])
        print("NN compiled successfully!")

    def fit(self,_train_ds, _valid_ds,  _epochs=1 ,_verbose=1):
        """Train the model on already batched datasets.

        Args:
            _train_ds: batched training dataset.
            _valid_ds: batched validation dataset.
            _epochs: number of epochs to train.
            _verbose: verbosity forwarded to Keras.

        The number of steps per epoch is derived from the module-level
        TRAIN_DATASET_SIZE and BATCH_SIZE.
        """
        print(f"Train Batch size: {BATCH_SIZE}")
        print(f"Validation Batch size: {BATCH_SIZE}")
        print(f"Epochs: {_epochs}")
        # ceil ensures we see all data at least once; Keras expects an integer step count
        train_steps = int(np.ceil(TRAIN_DATASET_SIZE/BATCH_SIZE))
        print(f"Train Steps: {train_steps}")
        # hardcode val steps as we don't need to take all data
        val_steps = 5 #np.ceil(VAL_DATASET_SIZE/BATCH_SIZE)
        print(f"Validation Steps: {val_steps}")
        print(f"Starting fitting procedure...")
        # batch_size is intentionally not passed: the datasets are already batched
        self.model.fit(_train_ds,
                       epochs=_epochs,
                       validation_data=_valid_ds,
                       verbose=_verbose,
                       steps_per_epoch=train_steps,
                       validation_steps=val_steps,
                       callbacks=[tb_callback]) # this adds the profiler
        print(f"Fitting procedure finished!")

    def add_predictor(self):
        """Replace self.model by a model that outputs the 0/1 decision.

        The new output is 1 (int8) where the distance to the third image is
        greater than or equal to the distance to the second image, else 0.
        """
        sim_diff, dif_diff = calc_difference(self.model.output)
        prediction = tf.cast(tf.greater_equal(dif_diff,sim_diff),tf.int8)
        self.model = tf.keras.Model(inputs=self.model.input,outputs=prediction)

# Main

In [None]:
# check if we have available GPU
has_gpu = bool(tf.config.list_physical_devices('GPU'))
if not has_gpu:
  print("TRAINING WITHOUT GPU MIGHT TAKE FOREVER")

# folder where the trained model is saved to / loaded from
model_dir = path_to_folder + "model_10"

with tf.device('/device:GPU:0' if has_gpu else '/device:CPU:0'):
  # build the network with the same image size the input pipeline resizes to
  manager = model_manager(IMG_HEIGHT, IMG_WIDTH)
  #check for saved model or create new
  if os.path.isdir(model_dir):
    # as we load only for predicting, we can set compile to false and don't
    # need to pass the loss function
    manager.model = tf.keras.models.load_model(model_dir, compile=False)
    print("Successfully loaded model!!!!")
    manager.model.summary()
  else:
    manager.compile()
    manager.model.summary()
    manager.fit(train_dataset, val_dataset, _epochs=10)
    manager.model.save(model_dir)

  # switch the model output from features to the 0/1 decision per triplet
  manager.add_predictor()
  print("Start Prediction")
  print(f"Batch Size: {TEST_BATCH_SIZE}")
  print(f"Dataset Size: {TEST_DATASET_SIZE}")
  # ceil ensures we see all data at least once; Keras expects an integer step count
  test_steps = int(np.ceil(TEST_DATASET_SIZE/TEST_BATCH_SIZE))
  print(f"Steps: {test_steps}")
  # uncomment to test out with one batch:
  # predictions = manager.model.predict(test_dataset.take(1), verbose=1, steps=1)
  predictions = manager.model.predict(test_dataset, verbose=1, steps=test_steps)
  np.savetxt(path_to_folder + 'predictions.txt', predictions,fmt='%i')

# Analyse Runtime

In [None]:
%load_ext tensorboard
%tensorboard --logdir=log_dir

# if nothing shows up, uncomment the following instead open a browser and open http://localhost:8080 
# %tensorboard --logdir=log_dir --host=127.0.0.1 --port=8080

# TODO: Visualize Results

# Testing Section
