# Pawpularity Contest

Submissions are scored on the root mean squared error **RMSE**.

Guides to use:
*   Good Paper ==> https://dl.acm.org/doi/pdf/10.1145/3209693.3209698
*   Multi Input ==> https://www.kaggle.com/yaniv256/tensorflow-multi-input-pet-pawpularity-model
*   Transfer Learning ==> https://tfhub.dev/

Things to do in order to increase efficiency:
1.  Check the correlation between the tags and Pawpularity and keep only the useful ones!
2.  Use Transfer Learning and get a better model like ResNet!
3.  Add more tags to the dataset by using a pretrained classification model
4.  Try common techniques for dealing with imbalanced data like:
  *  Class weighting
  *  Oversampling
5.  Try different Learning Rates and Optimizers


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import numpy as np
import tensorflow
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, Input, Concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.losses import MeanSquaredError, MeanSquaredLogarithmicError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import plot_model
import math

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
import datetime, os

In [None]:
# Display the GPU device name TensorFlow reports for this runtime.
tensorflow.test.gpu_device_name()

In [None]:
# Metadata tag columns; the length of this list sizes the model's tabular input.
tabular_columns = [
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
]

In [None]:
# Two-branch regression model: a small CNN over the image and an MLP over the
# tabular tags, concatenated and mapped to a single linear output.
image_input = Input(shape=(300, 300, 3))
# `shape` is documented as a tuple, so pass a 1-tuple rather than a bare int.
tabular_input = Input(shape=(len(tabular_columns),))

# Image branch: four stages of (conv, conv, 2x2 max-pool). The first two stages
# open with a 5x5 kernel, the last two with 3x3; the second conv is always 3x3.
image_x = image_input
for first_kernel in (5, 5, 3, 3):
    image_x = Conv2D(filters=32, kernel_size=(first_kernel, first_kernel), activation='relu')(image_x)
    image_x = Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(image_x)
    image_x = MaxPool2D((2, 2))(image_x)
image_x = Flatten()(image_x)
image_x = Dense(64, activation="relu", kernel_regularizer=tensorflow.keras.regularizers.l2())(image_x)

# Tabular branch: four 16-unit dense layers, only the last one L2-regularized.
tabular_x = tabular_input
for _ in range(3):
    tabular_x = Dense(16, activation="relu")(tabular_x)
tabular_x = Dense(16, activation="relu", kernel_regularizer=tensorflow.keras.regularizers.l2())(tabular_x)

# Merge both branches along the feature axis and regress a single value.
x = Concatenate(axis=1)([image_x, tabular_x])

x = Dense(10, activation="relu")(x)
output = Dense(1, activation="linear")(x)

model = Model(inputs=[image_input, tabular_input], outputs=[output])
model.summary()

In [None]:
# Render the model graph, annotating each layer with its input/output shapes.
plot_model(model, show_shapes=True)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the Drive root: the relative paths used later (./train.csv, ./train/, ./test/) resolve here.
cd /content/drive/MyDrive

In [None]:
# !unzip petfinder-pawpularity-score.zip

In [None]:
# Load the training table and preview its first rows.
df = pd.read_csv('./train.csv')
df.head()

In [None]:
# Turn each Id into its image file name. Only rows that do not already end in
# '.jpg' are changed, so re-running this cell does not append the suffix twice.
needs_suffix = ~df['Id'].str.endswith('.jpg', na=False)
df.loc[needs_suffix, 'Id'] = df.loc[needs_suffix, 'Id'] + '.jpg'
df['Id']

In [None]:
# For SGD we have to normalize the target; uncomment the next line to scale it by its maximum.
# df['Pawpularity'] = df['Pawpularity'] / df['Pawpularity'].max()
df['Pawpularity']

In [None]:
# Histogram of the target column on a single 15x10 figure.
fig, ax = plt.subplots(figsize=(15, 10))
df['Pawpularity'].hist(ax=ax);

## As we can see, the data are imbalanced. We must do something about it later.

## Show 9 random images

In [None]:
import random

# Draw a 3x3 grid of randomly picked training images, each titled with its score.
rows, cols = 3, 3
fig, axs = plt.subplots(rows, cols, figsize=(15, 15))
fig.subplots_adjust(top=0.99, bottom=0.01, hspace=0.2, wspace=0.4)
for panel in axs.flat:
    # Pick a row index; the same image may be drawn more than once.
    random_image = random.randint(0, len(df) - 1)
    img = mpimg.imread('./train/' + df['Id'][random_image])
    panel.imshow(img)
    panel.axis('off')
    panel.set_title(f'Pawpularity: {df["Pawpularity"][random_image]}', {'fontsize': 20})

In [None]:
def preprocess(image, tabular):
    """Load one training image and split its tabular row into features and target.

    Args:
        image: File name of the image, relative to ``./train/``.
        tabular: 1-D row whose last element is the target and whose preceding
            elements are the tag features.

    Returns:
        ``((image, features), target)`` where ``image`` has been decoded as a
        3-channel JPEG, cast to float32, divided by 255 and resized to 300x300.
    """
    image_string = tensorflow.io.read_file('./train/' + image)
    image = tensorflow.image.decode_jpeg(image_string, channels=3)
    image = tensorflow.cast(image, tensorflow.float32) / 255.0
    # A central fraction of 1.0 keeps the whole frame; lower it to crop borders.
    image = tensorflow.image.central_crop(image, 1.0)
    image = tensorflow.image.resize(image, (300, 300))
    # Split relative to the end of the row instead of at a fixed index, so the
    # split stays correct when tag columns are removed from the table.
    return (image, tabular[:-1]), tabular[-1]

In [None]:
# Separate the image file names from every other column of the table.
images = df['Id']
rest_of_data = df.drop(columns='Id')
rest_of_data.head()

# TO DO

* Find correlation and keep only tags that matter
* Guide https://towardsdatascience.com/annotated-heatmaps-in-5-simple-steps-cc2a0660a27d

In [None]:
# Training input pipeline.
# Shuffle the file-name/row pairs *before* decoding, with a buffer that covers
# the whole table: this reshuffles all examples instead of a 216-element
# window, and the buffer holds short strings rather than decoded 300x300 images.
# Decoding then runs in parallel; element order after the shuffle is preserved.
train = (
    tensorflow.data.Dataset.from_tensor_slices((images, rest_of_data))
    .shuffle(len(images))
    .map(preprocess, num_parallel_calls=tensorflow.data.AUTOTUNE)
    .batch(62)
    .prefetch(2)
)

Despite the widespread popularity of Adam, recent research papers have noted that it can fail to converge to an optimal solution under specific settings. The paper *Improving Generalization Performance by Switching from Adam to SGD* demonstrates that adaptive optimization techniques such as Adam generalize poorly compared to SGD.

In [None]:
# Settings
# Learning rates for the Adam and SGD optimizers. The trailing #@param markers
# are Colab form annotations and describe a slider for each value.
adam_lr =  0.004#@param {type:"slider", min:0.0001, max:0.01, step:0.0001}
sgd_lr =  0.01#@param {type:"slider", min:0.001, max:0.09, step:0.001}

In [None]:
# tf.keras.optimizers.Adam(
#     learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
#     name='Adam', **kwargs
# )

# tf.keras.optimizers.SGD(
#     learning_rate=0.01, momentum=0.0, nesterov=False, name="SGD", **kwargs
# )

# Lets tweak learning rate to see rate of conversion
adam = Adam(learning_rate = adam_lr);
sgd = SGD(learning_rate = sgd_lr);
mse_loss = MeanSquaredError();
msle_loss  = MeanSquaredLogarithmicError (); #For SGD
rmse = RootMeanSquaredError(name='rmse');

# Clear any logs from previous runs
!rm -rf ./logs

logdir = os.path.join("logs", datetime.datetime.today().strftime('%Y-%m-%d-%H:%M'))
tensorboard_callback = TensorBoard(logdir, histogram_freq=1)

reduce_lr = ReduceLROnPlateau(monitor='rmse', patience=3, verbose=1, factor=0.75, min_lr=0.00001);

early_stop = EarlyStopping(
    monitor="rmse",
    min_delta=0.01,
    patience=10,
    verbose=1,
    mode="min",
    baseline=None,
    restore_best_weights=True,
);

def scheduler(epoch, learning_rate):
  """Return the learning rate to use for ``epoch``.

  The rate is left untouched for the first five epochs (0-4); from epoch 5
  on it is multiplied by ``exp(-0.1)`` on every call.

  Args:
    epoch: Zero-based epoch index.
    learning_rate: Learning rate currently in use.

  Returns:
    ``learning_rate`` unchanged while ``epoch < 5``, otherwise the decayed
    rate as a plain Python float (not a Tensor).
  """
  if epoch < 5:
    return learning_rate
  # math.exp(-0.1) equals ~0.9
  return float(learning_rate * math.exp(-0.1))

# Epoch-based decay callback. It is created here but is not one of the entries
# of the `callbacks` list assembled at the end of this cell.
learning_scheduler = LearningRateScheduler(scheduler)

# Optimize mean squared error with Adam; report MAE and RMSE while training.
model.compile(
    optimizer=adam,
    loss=mse_loss,
    metrics=['mae', rmse],
)
callbacks = [
    tensorboard_callback,
    reduce_lr,
    early_stop,
]

In [None]:
# Upper bound on the number of training epochs; the #@param marker is a Colab
# form annotation describing a slider for this value.
epochs =  100#@param {type:"slider", min:10, max:300, step:10}
# Train on the `train` dataset, passing the `callbacks` list to Keras.
model.fit(train, epochs=epochs, verbose=1, callbacks=callbacks)

In [None]:
path = logdir + '/train'
%tensorboard --logdir path

In [None]:
# Save the trained model under a name stamped with the current minute.
save_stamp = datetime.datetime.today().strftime('%Y-%m-%d-%H:%M')
model.save(f'my_model_{save_stamp}')

# Create submissions csv

For each Id in the test set, you must predict a score for the target variable, Pawpularity. The file should contain a header and have the following format:

Id, Pawpularity \
0008dbfb52aa1dc6ee51ee02adf13537, 99.24 \
0014a7b528f1682f0cf3b73a991c17a0, 61.71 \
0019c1388dfcd30ac8b112fb4250c251, 6.23 \
00307b779c82716b240a24f028b0031b, 9.43 \
00320c6dd5b4223c62a9670110d47911, 70.89 \
etc.

In [None]:
# Load the test table and preview its first rows.
test_df = pd.read_csv('./test.csv')
test_df.head()

In [None]:
# Keep `Id` untouched (it is needed for the submission) and store the image
# file name in a separate `file_path` column.
test_df['file_path'] = test_df['Id'] + '.jpg';
test_df.head()

In [None]:
def test_preprocess(image, tabular):
    """Load one test image and pair it with its tabular features.

    Args:
        image: File name of the image, relative to ``./test/``.
        tabular: Feature row for the image; passed through unchanged.

    Returns:
        ``((image, tabular), 0)``: the image decoded as a 3-channel JPEG, cast
        to float32, divided by 255 and resized to 300x300, followed by a
        constant ``0`` in the label position.
    """
    raw_bytes = tensorflow.io.read_file('./test/' + image)
    decoded = tensorflow.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tensorflow.cast(decoded, tensorflow.float32) / 255.0
    # Central fraction 1.0: the full frame is kept.
    cropped = tensorflow.image.central_crop(scaled, 1.0)
    resized = tensorflow.image.resize(cropped, (300, 300))
    return (resized, tabular), 0

test_images = test_df['file_path']
# Select exactly the tag columns the model input was sized with, instead of
# dropping the known non-feature columns. Dropping stops working as soon as
# another column (e.g. the predicted `Pawpularity`) is added to `test_df` and
# this cell is run again.
rest_of_test_data = test_df[tabular_columns]
rest_of_test_data.head()

In [None]:
# Test input pipeline: no shuffling, so predictions stay in the row order of test_df.
test_source = tensorflow.data.Dataset.from_tensor_slices((test_images, rest_of_test_data))
test = test_source.map(test_preprocess).batch(8).prefetch(2)

In [None]:
# Run inference and flatten the model output to a 1-D array.
raw_predictions = model.predict(test)
predicted_scores = raw_predictions.ravel()
predicted_scores

In [None]:
# Attach the predictions and keep only the two columns the submission needs.
test_df['Pawpularity'] = predicted_scores
submission_df = test_df[['Id', 'Pawpularity']].copy()
submission_df

In [None]:
# Write the submission to a CSV named after the current minute, without the index column.
submission_stamp = datetime.datetime.today().strftime('%Y-%m-%d-%H:%M')
file_name = f'submission{submission_stamp}.csv'
submission_df.to_csv(file_name, index=False)