In [None]:
# Imports
import numpy as np
import tensorflow as tf
import tensorflow.keras
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import os
import skimage
from skimage import io
import seaborn as sn


# Seed: a single value reused for NumPy, TensorFlow and the Keras data generators
SEED = 2727
np.random.seed(SEED)  # seeds NumPy's global random generator
tf.random.set_seed(SEED)  # seeds TensorFlow's global random generator

# Render matplotlib figures inside the notebook output
%matplotlib inline

# Autocompletion: turn off the Jedi-based completer
%config Completer.use_jedi = False

In [None]:
# Architecture: directory layout and file names used throughout the notebook
path_train = "ships_data/train"  # training/validation images, one sub-directory per category
path_test = "ships_data/test"  # held-out test images, filled later by setup_data
submission_file = "ships_competition.npz"  # competition images, read with np.load

In [None]:
# Start from a clean state, then unpack the Kaggle archive
!rm -rf ships_data
!tar xzf /kaggle/input/navire-2022-libre/ships.tgz
!ls
# ships32 is presumably the directory created by the archive — verify with the listing
!ls ships32
# Lay the data out as ships_data/train (the former ships32) plus an empty ships_data/test
!mkdir ships_data
!mv ships32 ships_data/train
!mkdir ships_data/test
# Show the resulting layout
!ls
!ls ships_data
!ls ships_data/train

Load all the data and clean it: black-and-white (non-RGB) images are removed, because they would break the rest of the pipeline. As this only concerns some pictures, I delete them instead of converting them to colour myself.

In [None]:
def is_rgb_shape(im):
    """Tell whether ``im.shape`` has exactly three axes, the third one of length 3."""
    dims = im.shape
    if len(dims) != 3:
        return False
    return dims[2] == 3

def get_info_clean_data(path):
    """
    Load data from a directory
    Get info of the data
    Clean the data if necessary

    ``path`` must contain one sub-directory per category. Categories are
    numbered following the sorted order of the sub-directory names; entries of
    ``path`` that are not directories are ignored.

    Warning: images that are not RGB (see ``is_rgb_shape``) are DELETED from
    disk and left out of the returned data.

    Arguments:
        * path: root directory of the dataset

    Return:
        * An array of images
        * An array of labels corresponding to the images
        * A dataframe describing the images
          (columns: filename, category, width, height, dtype)
        * A map of label to class/category name
    """
    # Only sub-directories are categories; sorting makes the label numbering deterministic
    category_names = sorted(name for name in os.listdir(path)
                            if os.path.isdir(os.path.join(path, name)))
    label_to_category = dict(enumerate(category_names))

    labels = []
    imgs = []
    info = []

    for curr_label, category_name in label_to_category.items():
        dirname = os.path.join(path, category_name)
        # os.listdir returns files in arbitrary order: sort so the image order is reproducible
        for basename in sorted(os.listdir(dirname)):
            filename = os.path.join(dirname, basename)

            img = skimage.io.imread(filename)
            # The image is not RGB, so it is not added (otherwise it breaks everything)
            if not is_rgb_shape(img):
                os.remove(filename)
                continue

            # An image array is (rows, columns, channels):
            # shape[0] is the height and shape[1] is the width
            height, width = img.shape[:2]
            imgs.append(img)
            info.append([filename, curr_label, width, height, img.dtype])
            labels.append(curr_label)

    df_input = pd.DataFrame(info, columns=["filename", "category", "width", "height", "dtype"])
    return np.array(imgs), np.array(labels), df_input, label_to_category

In [None]:
# Load the training images (the call deletes the non-RGB files it finds under path_train)
imgs, labels, df_input, label_to_category = get_info_clean_data(path_train)

# Display the label -> category name mapping
label_to_category

We can already notice that the images seem to share the same shape. But let's make sure of it by diving deeper into the metadata of the input.

In [None]:
# Per-image metadata: filename, category, width, height, dtype
df_input

Let's display some pictures to check that the loading works and to see what they look like (the pictures are really small... I have some difficulty recognizing a boat in some of them).

In [None]:
# Show the first image of every category on a grid
nb_image_row = 5
nb_categories = len(label_to_category)
# Ceiling division: enough rows to hold every category, whatever their number
nb_rows = -(-nb_categories // nb_image_row)

plt.figure(figsize=(15,5))
for i in range(nb_categories):
    plt.subplot(nb_rows, nb_image_row, i + 1)
    plt.title(label_to_category[i])
    # np.argmax on the boolean mask gives the index of the first image labelled i
    plt.imshow(imgs[np.argmax(labels==i)])
    plt.axis('off')

plt.show()

In [None]:
# Reference dimensions: those of the first image described in df_input
first_width = df_input.width[0]
first_height = df_input.height[0]

# Number of images whose width / height differs from the reference
nb_other_width = len(df_input.width[df_input.width != first_width])
nb_other_height = len(df_input.height[df_input.height != first_height])

print("Number of images with a different width:", nb_other_width)
print("Every image has the same width which is", first_width)

print("Number of images with a different height:", nb_other_height)
print("Every image has the same height which is", first_height)

print("We can see that every image has the same shape which is", imgs[0].shape)
# Keep the first two axes only (the last axis, the channels, is dropped)
target_shape = imgs[0].shape[:2]

In [None]:
# Display the number of pictures in each category
# sort_index: bars follow the label order whatever order value_counts returns
category_counts = df_input.category.value_counts(sort=False).sort_index()
# Label the bars with the category names instead of the numeric labels
category_counts.index = [label_to_category[label] for label in category_counts.index]

ax = category_counts.plot.bar()
ax.set_xlabel('category')
ax.set_ylabel('number of images')
ax.set_title("Number of images per category")
# plt.show() renders the figure without printing a stray `[]` as cell output
plt.show()

The categories are unbalanced: they do not all contain the same number of images. It is important to know that the neural network will have an unbalanced input. Let's be careful about the model overfitting some categories when training the neural network.

Let's look at the submission pictures to check that they have the same format.

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use it on trusted files
X_submission = np.load(submission_file, allow_pickle=True)["X"]
# assumes 8-bit pixel values, rescaled here to [0, 1] for display — TODO confirm
X_submission = X_submission.astype('float32') / 255

# The submission images define the size used by the generators and the model
target_shape = X_submission.shape[1:3]
# A bare expression in the middle of a cell is not shown: display it explicitly
display((X_submission.shape, X_submission.dtype, target_shape))

# Preview the first images (15 at most, fewer if the file holds fewer)
nb_preview = min(15, len(X_submission))
plt.figure(figsize=(15,5))
for i in range(nb_preview):
    plt.subplot(3, 5, i + 1)
    plt.imshow(X_submission[i])
    plt.axis('off')
plt.show()

We need to split the set of images into three sets:

    train set: data used to train the model (improve the model accuracy).
    validation set: test the model accuracy during the training.
    test set: test the model accuracy after the training.

As we are going to use a keras.ImageDataGenerator, it will handle the train set, validation set and test set automatically. However, it is still required to split data into two directories:

    Test data will be located in ships_data/test. In order to be fair with the test data, we want the same number of images for each category.
    Train/validation data will be located in ships_data/train.


In [None]:
def setup_data(dir_input, dir_output, label_to_category, nb_test_img_per_category=None):
    """
    Move some images from an input directory to an output directory

    For every category, the first `nb_test_img_per_category` files (in sorted
    file name order) of `dir_input/<category>` are moved to
    `dir_output/<category>`. Missing output directories are created; an output
    directory that already exists is not an error.

    Arguments:
        * dir_input: the input directory
        * dir_output: the output directory
        * label_to_category: used to get the category name (thus the directory name)
        * nb_test_img_per_category: number of images per category
          (None moves every image)
    """
    for class_name in label_to_category.values():
        class_path = os.path.join(dir_input, class_name)
        new_class_path = os.path.join(dir_output, class_name)
        # Creates missing parent directories and tolerates an existing directory
        os.makedirs(new_class_path, exist_ok=True)

        # os.listdir returns files in arbitrary order: sort so that the same
        # files are selected on every run
        filenames = sorted(os.listdir(class_path))

        # A None slice bound selects every file
        for filename in filenames[:nb_test_img_per_category]:
            os.rename(os.path.join(class_path, filename),
                      os.path.join(new_class_path, filename))

In [None]:
# Hold out 200 images per category: move them from the train directory to the test directory
# NOTE: df_input was built before this move, so some of its filenames no longer exist on disk
setup_data(path_train,
           path_test,
           label_to_category,
           nb_test_img_per_category=200)

Data augmentation is a way to improve the dataset, as input images are going to be randomly modified. ImageDataGenerators from keras are going to be used. We are going to have two generators: test_datagen for the test set and train_datagen for the training set. The latter is going to have more work, as it is going to be used for the training and we want more diversity in the training set.

Moreover, a pre-trained neural network is going to be used. In this case, input images must all be preprocessed by the same and specific preprocess function for resnet neural network keras.applications.resnet.preprocess_input. An automatic way of preprocessing input images is to give this function to a generator.

The generators also load data on the fly, which avoids memory issues. Images are saved in directories. Then, the generator will automatically load each image and find its category (from the directory name).

Finally, one of the features of the generator is to automatically split training data and validation data.

To recap, the generators will do the following work:

    preprocess input (always)
    load data on the fly (always)
    data augmentation (if needed)
    split training and validation data (if needed)

# ImageDataGenerator for test set

The test set is here only to test the neural network accuracy. Thus, it is not needed to apply some random modifications to these images meaning data augmentation is not needed. Also, data are not split. The test images will be saved in order to check them after a prediction.


In [None]:
# Directory handed to the test generator as save_to_dir
path_gen_test = "gen_test"
!mkdir gen_test

In [None]:
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# No augmentation for the test set: only the ResNet preprocessing function is applied
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Categories are classed in an alphabetical order
# shuffle=False keeps the images in a fixed order; later cells compare the
# predictions with test_generator.classes and rely on it
# batch_size is not given here, so the Keras default applies
test_generator = test_datagen.flow_from_directory(
        path_test,
        target_size=target_shape,
        seed=SEED,
        shuffle=False,
        save_to_dir=path_gen_test,  # the generated images are also written to this directory
        save_format="jpeg",
        interpolation="bicubic")

In [None]:
# Fraction of the images of path_train reserved for validation
validation_split = 0.1

# Training images get random augmentation (rotation, horizontal flip) on top of
# the ResNet preprocessing
train_datagen = ImageDataGenerator(rotation_range=20,
                                   preprocessing_function=preprocess_input,
                                   horizontal_flip=True,
                                   validation_split=validation_split)

# Validation images must NOT be augmented, otherwise val_accuracy (monitored by
# the callbacks) is measured on randomly distorted images. The generator uses the
# same directory and the same validation_split as train_datagen so that the
# "training" and "validation" subsets stay complementary.
validation_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                                        validation_split=validation_split)

train_generator = train_datagen.flow_from_directory(
        path_train,
        target_size=target_shape,
        batch_size=64,
        seed=SEED,
        subset="training",
        interpolation="bicubic")

validation_generator = validation_datagen.flow_from_directory(
        path_train,
        target_size=target_shape,
        batch_size=64,
        seed=SEED,
        subset="validation",
        interpolation="bicubic")

# Model Creation

For this competition, it is required to use a problem solving technique called transfer learning. It focuses on storing knowledge gained while solving one problem and applying it to a different but related problem. The generic problem is images classification. Our related problem is ship classification.

In this notebook, we are going to use a pre-trained convolutional neural network (CNN) model and add some custom layers on top of it. The transfer of learning comes from the pre-trained CNN model. The knowledge of the pre-trained model takes the form of feature maps. Feature maps are the output of several consecutive convolutions. Each feature map contains information that will be processed by the outer layers.

This model (ResNet50) is light in terms of computation. As a result, the training time with this model was reasonable and the predictions were quite accurate as it can be seen below.


In [None]:
# Same height and width as target_shape, plus 3 colour channels
input_shape = (target_shape[0], target_shape[1], 3) # same shape
input_shape

In [None]:
from tensorflow.keras.applications import ResNet50

# Pre-trained base: ImageNet weights, without the original classification
# layers (include_top=False); pooling='max' selects the pooling applied to the
# output of the last convolutional block
model_resnet50 = ResNet50(include_top=False,
                          weights="imagenet",
                          input_shape=input_shape,
                          pooling='max')

In [None]:
# Size and structure of the pre-trained base
print("Number of layers : ", len(model_resnet50.layers))
model_resnet50.summary()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization

# Classification head fed with the ResNet50 output: a batch normalisation
# followed by a softmax layer with one unit per category
custom_layers = [
    Input(shape=model_resnet50.output.shape[1:]),
    BatchNormalization(name='custom_bn'),
    Dense(len(label_to_category), activation='softmax'),
]

model_custom = Sequential(name='custom_model')
for custom_layer in custom_layers:
    model_custom.add(custom_layer)


print("Number of layers :", len(model_custom.layers))
model_custom.summary()

In [None]:
from tensorflow.keras.optimizers import Adam

# Full model: the pre-trained base followed by the custom classification head
model = Sequential()
for sub_model in (model_resnet50, model_custom):
    model.add(sub_model)

# Count the layers of both sub-models, looked up by name in the full model
resnet_part = model.get_layer(name='resnet50')
custom_part = model.get_layer(name='custom_model')
print("Number of layers : ", len(resnet_part.layers) + len(custom_part.layers))
model.summary()

# Training
The model is simply trained with the training and validation data.

For the Optimizer, the Adam optimizer is going to be used because it seems to be the most efficient.

Two callbacks are also used to make the training more convenient. The first callback saves the best weights. Indeed, during the training, the accuracy might vary. Therefore, we want to save the best weights throughout the training. At the end of training, the best weights (with the greatest validation accuracy) are loaded into the model. The second callback stops the training when the validation accuracy has not improved for 5 epochs in a row. This is useful to avoid wasting training time when it is not needed.

There are two training steps. The first step is to only train the layers of the custom model placed on top. The ResNet50 layers are frozen. The feature maps from ResNet50 already give great information to classify ships. During this step, 70% of validation accuracy is reached. We can also notice that a single dense layer is actually enough, which confirms the efficiency of ResNet50. Then, a second training is performed. This time, the whole model (the ResNet50 model and the custom model) is trainable. During this second step, the learning rate is very low because it is a fine-tuning step. The weights of the ResNet50 model should only be slightly modified.

In [None]:
# First training step settings: Adam with a 1e-4 learning rate,
# categorical cross-entropy loss, accuracy as the reported metric
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Freeze the model
model_resnet50.trainable = False
# Freeze the BatchNormalization of model custom
model_custom.get_layer('custom_bn').trainable = False

# A change of `trainable` is only guaranteed to be taken into account by a
# subsequent compile(): recompile with the settings of the first training step
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Callbacks

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

# Keep on disk only the weights with the highest validation accuracy seen so far
checkpoint_filepath = 'model_checkpoint'
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                            save_weights_only=True,
                                            monitor='val_accuracy',
                                            mode='max',
                                            save_best_only=True,
                                            verbose=1)

# Stop a training run once val_accuracy has not improved for `patience` epochs
model_earlystop_callback = EarlyStopping(monitor='val_accuracy',
                                         patience=5,
                                         verbose=1)

Train

In [None]:
# Maximum number of epochs of the first training step (early stopping may end it sooner)
nb_epochs = 10

# Model weights are saved at the end of every epoch, if it is the best seen
# so far.
model_history = model.fit(x=train_generator,
                          epochs=nb_epochs,
                          verbose=1,
                          validation_data=validation_generator,
                          callbacks=[model_checkpoint_callback, model_earlystop_callback]
)

In [None]:
# Unfreeze the layers resnet50
model_resnet50.trainable = True
# Unfreeze the BatchNormalization of the model custom
model_custom.get_layer('custom_bn').trainable = True

# low learning rate
# Recompiling after changing `trainable` applies the new flags together with
# the new optimizer
model.compile(optimizer=Adam(learning_rate=1e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# Maximum number of epochs of the fine-tuning step (early stopping may end it sooner)
nb_epochs_tuning = 20

# Second training step, run with the same generators and callbacks as the first one
model_history_tunning = model.fit(x=train_generator,
                                  epochs=nb_epochs_tuning,
                                  verbose=1,
                                  validation_data=validation_generator,
                                  callbacks=[model_checkpoint_callback, model_earlystop_callback]
)

# Check training results

It is now time to see the result of the training steps. History of the training is put in a DataFrame. Then, we plot the train accuracy and validation accuracy over epochs.

In [None]:
# One row per epoch: first training step, then fine-tuning step
hist_custom_df = pd.DataFrame(model_history.history)
hist_tuning_df = pd.DataFrame(model_history_tunning.history)
# ignore_index numbers the rows 0..n-1 over both steps; without it the index
# restarts at 0 for the fine-tuning rows and contains duplicate labels
hist_df = pd.concat([hist_custom_df, hist_tuning_df], ignore_index=True)
hist_df.head()

In [None]:
# Last recorded epochs (end of the fine-tuning step)
hist_df.tail()

In [None]:
# Number of epochs actually run during the first training step: early stopping
# may have ended it before nb_epochs, so the recorded history is the reference
nb_epochs_custom = len(model_history.history['accuracy'])

plt.plot(hist_df.accuracy.to_numpy())
plt.plot(hist_df.val_accuracy.to_numpy())
plt.title('Model accuracy over epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.xlim([0, hist_df.index.size])
# Vertical line at the first epoch of the fine-tuning step
plt.axvline(x=nb_epochs_custom, color='green')
plt.legend(['train accuracy', 'validation accuracy', 'fine-tuning time'], loc='upper left')
plt.show()

In [None]:
# load the best model: restore the weights written by the ModelCheckpoint callback
model.load_weights(checkpoint_filepath)

In [None]:
# The model is compiled with metrics=['accuracy']: index 1 of the result is the accuracy
metrics = model.evaluate(test_generator)
test_accuracy = metrics[1]
print(f"The model accuracy over the test data {test_accuracy * 100:.2f}%")

In [None]:
# Persist the whole model (not only the weights) under "output_model"
model.save(filepath="output_model")

# Confusion Matrix

A confusion matrix is used to evaluate the quality of the output of a classifier. The diagonal elements represent the number of points for which the predicted label is equal to the true label, while off-diagonal elements are those that are mislabeled by the classifier. The higher the diagonal values of the confusion matrix the better, indicating many correct predictions.

The true labels are represented by the rows and the predictions by the columns.

In [None]:
def my_plot_confusion_matrix(conf_matrix, classes, title=None):
    """
    Draw a confusion matrix as an annotated heatmap

    Arguments:
        * conf_matrix: square matrix, true labels as rows and predictions as columns
        * classes: iterable of class names, in the label order of the matrix
        * title: optional title of the figure
    """
    # `classes` labels both the rows and the columns: materialise it once so
    # that dict views and one-shot iterables (generators) work too
    class_names = list(classes)
    df_cm = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
    plt.figure(figsize=(10,5))
    sn.set(font_scale=1.4) # size of the labels (changes the global seaborn settings)
    sn.heatmap(df_cm, cmap='Oranges', fmt='g', annot=True, annot_kws={"size": 11}) # font size
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    if title:
        plt.title(title)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion matrix
# argmax over axis 1 turns each row of class scores into a predicted label;
# test_generator was created with shuffle=False, so the predictions are
# expected to line up with test_generator.classes
y_pred = model.predict(test_generator).argmax(axis=1)
conf_matrix = confusion_matrix(test_generator.classes, y_pred)

In [None]:
# Rows and columns are named after the categories, in label order
my_plot_confusion_matrix(conf_matrix, label_to_category.values(), title="Confusion matrix from the model prediction")

The confusion matrix looks great. Most of the values are on the diagonal. The model has one main difficulty: it mixes smallfish and vsmallfish up. That is understandable, as these two kinds of ships look alike.

In [None]:
from sklearn.metrics import classification_report

# Per-category scores on the test set, reported under the category names
print(classification_report(test_generator.classes, y_pred, target_names=label_to_category.values()))

All the scores are close to 1 besides the vsmallfish and smallfish as it could be seen from the confusion matrix.

# Submission

In this part, the submission to the competition is made. The submission images are loaded from the ships_competition.npz file. Then, the model predicts the category of the images. The result is stored in a CSV file. This CSV file is the submission file.

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use it on trusted files
X_submission = np.load(submission_file, allow_pickle=True)["X"]
# Copy divided by 255 — presumably a [0, 1] version kept for display; confirm the raw value range
X_submission_1 = X_submission.astype(float)/255
# Model input: the same preprocessing function as the one given to the data generators
X_submission = preprocess_input(X_submission.astype(float))
X_submission.shape, X_submission.dtype

In [None]:
# check that the data are ok and the predict seems ok
plt.figure(figsize=(15,5))
X = X_submission
pred = model.predict(X)
# Preview the first images (15 at most, fewer if the file holds fewer)
for i in range(min(15, len(X))):
    plt.subplot(3, 5, i + 1)
    plt.title(label_to_category[pred[i].argmax()])
    # X holds the output of preprocess_input, which is model input and not a
    # displayable image: show the copy divided by 255 instead
    plt.imshow(X_submission_1[i])
    plt.axis('off')
plt.show()

In [None]:
# Predicted label of every submission image (this reuses the name y_pred of the test-set predictions)
y_pred = model.predict(X_submission).argmax(axis=1)
# One row per image: the row index is written as the "Id" column, the label as "Category"
df = pd.DataFrame({"Category":y_pred})
df.to_csv("submission.csv", index_label="Id")

In [None]:
from IPython.display import FileLink
# Show a link to the generated file in the notebook output
FileLink("submission.csv")

In [None]:
# Quick look at the first lines of the submission file
!head submission.csv