# Pawpularity Contest

Submissions are scored on the root mean squared error **RMSE**.

Guides to use:
*   Good Paper ==> https://dl.acm.org/doi/pdf/10.1145/3209693.3209698
*   Multi Input ==> https://www.kaggle.com/yaniv256/tensorflow-multi-input-pet-pawpularity-model
*   Transfer Learning ==> https://tfhub.dev/

Things to do in order to increase efficiency:
1.  Check the correlation between the tags and Pawpularity and keep only the useful ones!
2.  Use Transfer Learning and get a better model like ResNet!
3.  Add more tags to the dataset by using a pretrained model of classification
4.  Try common techniques for dealing with imbalanced data like:
  *  Class weighting
  *  Oversampling
5.  Try different Learning Rates and Optimizers


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, Input, Concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.losses import MeanSquaredError, MeanSquaredLogarithmicError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import plot_model
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import math

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
import datetime, os

In [None]:
# Display the GPU device name reported by TensorFlow as the cell output.
tensorflow.test.gpu_device_name()

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive

In [None]:
# Read the training table from the current directory and preview its first rows.
df = pd.read_csv('./train.csv')
df.head()

In [None]:
# Turn every Id into an image file name by appending the '.jpg' extension.
# NOTE: running this cell twice appends the extension twice.
df['Id'] += '.jpg'
df['Id']

In [None]:
# Histogram of the raw Pawpularity scores.
fig, ax = plt.subplots(figsize=(15, 10))
# The trailing semicolon keeps the notebook from echoing the Axes repr.
df['Pawpularity'].hist(ax=ax);

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# 'Id' holds file-name strings at this point; recent pandas versions raise
# when DataFrame.corr() meets a non-numeric column, so only numeric columns
# are passed on (older versions dropped such columns silently, so the result
# is the same there).
plt.figure(figsize=(15, 10))
ax = sns.heatmap(df.select_dtypes(include='number').corr());

In [None]:
# Bucket each Pawpularity score into a 20-point band: floor(score / 20).
# Scores below 100 land in classes 0-4; a score of exactly 100 lands in class 5.
df['Class'] = (df['Pawpularity'] // 20).astype(int)
# Only the file name, the target and the band are needed from here on.
df = df.loc[:, ['Id', 'Pawpularity', 'Class']]
df.describe()

In [None]:
# One frame per band label 0..5.
df_0, df_1, df_2, df_3, df_4, df_5 = (
    df[df['Class'] == band] for band in range(6)
)
# Rows with a score of exactly 100 (class 5) are folded into the top band so
# they are not lost; their 'Class' value itself stays 5.
df_4 = pd.concat([df_4, df_5], axis=0)

In [None]:
# Count the rows in every band and print each band's share of the data.
len_0, len_1, len_2, len_3, len_4 = (
    len(part) for part in (df_0, df_1, df_2, df_3, df_4)
)
band_sizes = {
    '0-20': len_0,
    '20-40': len_1,
    '40-60': len_2,
    '60-80': len_3,
    '80-100': len_4,
}
total = sum(band_sizes.values())
print(f'Total {total} rows')
for band, size in band_sizes.items():
    print(f'Pawpularity {band} is only {(size/total)*100:.2f}% percentage of data with {size} records')

In [None]:
# Undersample the two most populated bands (classes 1 and 2) to at most
# `keep_rows` rows each so the score distribution is more balanced.
keep_rows = 2500 #@param {type:"slider", min:1000, max:5000, step:100}
# Both bands are guarded the same way: DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement), so a band that is
# already small enough is left untouched.
if keep_rows < len(df_1):
    df_1 = df_1.sample(keep_rows)
if keep_rows < len(df_2):
    df_2 = df_2.sample(keep_rows)
# Recombine all bands and shuffle the rows.
balanced_df = pd.concat([df_0, df_1, df_2, df_3, df_4], axis=0)
balanced_df = balanced_df.sample(frac=1)

In [None]:
# Summary statistics of the rebalanced frame.
balanced_df.describe()

In [None]:
# Histogram of Pawpularity after rebalancing.
fig, ax = plt.subplots(figsize=(15, 10))
# The trailing semicolon keeps the notebook from echoing the Axes repr.
balanced_df['Pawpularity'].hist(ax=ax);

In [None]:
# Hold out the first `keep_from_class` rows of every class label (0..5) for
# validation and train on the rest.
# (Alternative that was tried: train_test_split(balanced_df, test_size=0.002).)
keep_from_class = 3
holdout_parts = []
training_parts = []
for class_label in range(6):
    class_rows = balanced_df[balanced_df['Class'] == class_label]
    holdout_parts.append(class_rows.iloc[:keep_from_class])
    training_parts.append(class_rows.iloc[keep_from_class:])

# Keep the per-class names bound, as the original cell did.
values_0, values_1, values_2, values_3, values_4, value_5 = holdout_parts
df_0, df_1, df_2, df_3, df_4, df_5 = training_parts

# Training rows are shuffled; validation rows stay grouped by class.
train_df = pd.concat(training_parts, axis=0).sample(frac=1)
val_df = pd.concat(holdout_parts, axis=0)

In [None]:
# Summary statistics of the validation hold-out.
val_df.describe()

In [None]:
# Show the validation rows themselves.
val_df

In [None]:
import random

# Draw a 3x3 grid of randomly chosen training images, each titled with its score.
rows, cols = 3, 3
fig, axs = plt.subplots(rows, cols, figsize=(15, 15))
fig.subplots_adjust(top=0.99, bottom=0.01, hspace=0.2, wspace=0.4)
# `axs.flat` walks the grid row by row, the same order as nested row/column loops.
for panel in axs.flat:
    random_image = random.randint(0, len(df) - 1)
    # Label-based lookup: relies on `df` still carrying its original integer index.
    img = mpimg.imread('./train/' + df.loc[random_image, 'Id'])
    panel.imshow(img)
    panel.axis('off')
    panel.set_title(f'Pawpularity: {df.loc[random_image, "Pawpularity"]}', {'fontsize': 20})

In [None]:
# Build the ImageNet-pretrained InceptionResNetV2 backbone without its
# classification head, for 300x300 RGB input with max pooling at the end.
#
# The constructor is reached through the `applications` module rather than the
# bare imported name: this cell rebinds `InceptionResNetV2` to the model
# instance (later cells call it under that name), so calling the bare name
# again on a re-run would call the instance instead of the class and fail.
from tensorflow.keras import applications

InceptionResNetV2 = applications.InceptionResNetV2(
    weights='imagenet',
    include_top=False,
    input_tensor=None,
    input_shape=(300, 300, 3),
    pooling='max',
)

In [None]:
# Training images get light augmentation (flip, rotation, zoom); validation
# images are only rescaled.
# NOTE(review): both generators scale pixels to [0, 1] -- confirm this is the
# input range the pretrained backbone expects.
train_data_generator = ImageDataGenerator(
    rescale=1.0 / 255.0,
    horizontal_flip=True,
    rotation_range=15,
    zoom_range=0.2,
)
val_data_generator = ImageDataGenerator(rescale=1.0 / 255.0)

# Settings shared by both iterators: images are read from ./train/ by file
# name, and the raw Pawpularity value is the regression target.
flow_options = dict(
    directory="./train/",
    x_col="Id",
    y_col="Pawpularity",
    class_mode="raw",
    target_size=(300, 300),
)

train = train_data_generator.flow_from_dataframe(
    dataframe=train_df,
    batch_size=24,
    shuffle=True,
    **flow_options,
)

# The whole validation hold-out is served as a single, unshuffled batch.
validation = val_data_generator.flow_from_dataframe(
    dataframe=val_df,
    batch_size=len(val_df),
    shuffle=False,
    **flow_options,
)

In [None]:
# Running totals kept across repeated training runs: the number of epochs
# trained so far and the per-epoch RMSE curves. Re-running this cell resets them.
epochs_count = 0
rmse_history, val_rmse_history = [], []

In [None]:
# Assemble the regression network: image -> pretrained backbone -> one linear unit.
image_input = Input(shape=(300, 300, 3));

# `InceptionResNetV2` here is the backbone model instance created earlier in
# this notebook (the name was rebound from the class to the instance).
image_x = InceptionResNetV2(image_input);
# NOTE(review): the backbone was built with pooling='max', so its output is
# presumably already 2-D and this Flatten changes nothing -- confirm.
image_x = Flatten()(image_x);
# A single linear output unit: the predicted Pawpularity score.
output = Dense(1, activation="linear")(image_x);

model = Model(inputs=image_input,outputs=output);
model.summary()

In [None]:
# Settings
# Learning rates for the two optimizers built in the next cell. The trailing
# `#@param` annotations are Colab form directives and must stay on these lines.
adam_lr =  0.0003#@param {type:"slider", min:0.0001, max:0.01, step:0.0001}
sgd_lr =  0.01#@param {type:"slider", min:0.001, max:0.09, step:0.001}

In [None]:
# Optimizers, built from the learning rates chosen in the settings cell so
# their effect on convergence can be compared.
adam = Adam(learning_rate=adam_lr)
sgd = SGD(learning_rate=sgd_lr)

# Candidate losses (the log variant was intended for use with SGD) and the
# RMSE metric, named 'rmse' so the callbacks below can refer to it.
mse_loss = MeanSquaredError()
msle_loss = MeanSquaredLogarithmicError()
rmse = RootMeanSquaredError(name='rmse')

# Cut the learning rate to a quarter when the training RMSE stalls for 3 epochs.
# NOTE(review): this watches the training metric ('rmse'), whereas early
# stopping watches 'val_rmse' -- confirm this is intended.
reduce_lr = ReduceLROnPlateau(
    monitor='rmse',
    factor=0.25,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

# Stop once validation RMSE has not improved by at least 0.02 for 7 epochs and
# roll back to the best weights seen.
early_stop = EarlyStopping(
    monitor="val_rmse",
    mode="min",
    min_delta=0.02,
    patience=7,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)

def scheduler(epoch, learning_rate):
    """Return the learning rate to use for the given epoch.

    The rate is left unchanged for the first three epochs (0, 1, 2) and is
    multiplied by exp(-0.1) on every later epoch.

    Args:
        epoch: Zero-based epoch index.
        learning_rate: The current learning rate.

    Returns:
        The new learning rate as a plain Python float. The decay factor is
        computed with ``math.exp`` rather than ``tensorflow.math.exp`` so the
        result is a float, not a tensor, which is what the Keras
        ``LearningRateScheduler`` callback is documented to expect.
    """
    new_learning_rate = float(learning_rate)
    if epoch >= 3:
        new_learning_rate *= math.exp(-0.1)
    print(f'Learning rate = {new_learning_rate:.6f}')
    return new_learning_rate

# Wrap the schedule function in a callback and collect every callback used by fit().
learning_scheduler = LearningRateScheduler(scheduler)
callbacks = [learning_scheduler, early_stop, reduce_lr]

# Train with Adam on mean squared error; report MAE and RMSE.
model.compile(
    optimizer=adam,
    loss=mse_loss,
    metrics=['mae', rmse],
)

In [None]:
# Number of epochs for this run (Colab form slider), then train with the
# callbacks configured above, validating on the hold-out iterator.
epochs =  5#@param {type:"slider", min:1, max:300, step:10}
history = model.fit(train, validation_data=validation, epochs=epochs, verbose=1, callbacks=callbacks)

In [None]:
# Add this run's results to the running totals so repeated fit() calls build
# one continuous curve.
# NOTE: `epochs` is the requested count; early stopping may have ended the run sooner.
epochs_count = epochs_count + epochs
rmse_history.extend(history.history['rmse'])
val_rmse_history.extend(history.history['val_rmse'])

In [None]:
# Plot training and validation RMSE. The first two recorded epochs are left
# out of both curves, so x = 0 on the plot is the third recorded epoch.
plt.figure(figsize=(15, 7))
for curve in (rmse_history, val_rmse_history):
    plt.plot(curve[2:])
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.title(f'Adam LR:{adam_lr} + Epochs:{epochs_count} + Keep {keep_rows}')
plt.legend(['Train_RMSE', 'Validation_RMSE'])
plt.show()

In [None]:
# model.save( 'my_model_' + datetime.datetime.today().strftime('%Y-%m-%d-%H:%M'))

# Create submissions csv

For each Id in the test set, you must predict a value for the target variable, Pawpularity (a score, not a probability). The file should contain a header and have the following format:

Id, Pawpularity \
0008dbfb52aa1dc6ee51ee02adf13537, 99.24 \
0014a7b528f1682f0cf3b73a991c17a0, 61.71 \
0019c1388dfcd30ac8b112fb4250c251, 6.23 \
00307b779c82716b240a24f028b0031b, 9.43 \
00320c6dd5b4223c62a9670110d47911, 70.89 \
etc.

In [None]:
# Read the test table from the current directory and preview its first rows.
test_df = pd.read_csv('./test.csv')
test_df.head()

In [None]:
# Keep 'Id' untouched (it is needed for the submission file) and store the
# image file name in a separate column.
test_df['file_path'] = test_df['Id'] + '.jpg';
test_df.head()

In [None]:
def test_preprocess(image, tabular):
    """Load one test image and prepare it as model input.

    Args:
        image: File name of the image, relative to ``./test/``.
        tabular: The remaining columns for the same row. Accepted so the
            function can be mapped over an (image, tabular) dataset, but not
            forwarded: the model built in this notebook has a single image
            input, and yielding ``(image, tabular)`` as the input would hand
            it two inputs.

    Returns:
        An ``(image, 0)`` pair: the decoded RGB image scaled to [0, 1] and
        resized to 300x300, plus a dummy label.
    """
    image_string = tensorflow.io.read_file('./test/' + image)
    image = tensorflow.image.decode_jpeg(image_string, channels=3)
    # Same 1/255 rescaling as the training and validation generators.
    image = tensorflow.cast(image, tensorflow.float32) / 255.0
    # A central fraction of 1.0 keeps the full frame.
    image = tensorflow.image.central_crop(image, 1.0)
    image = tensorflow.image.resize(image, (300, 300))
    return image, 0

# Image file names on one side, every remaining column (all but 'Id' and
# 'file_path') on the other.
test_images = test_df['file_path']
rest_of_test_data = test_df.drop(columns=['Id', 'file_path'])
rest_of_test_data.head()

In [None]:
# Pair every image file name with its remaining columns, run each pair through
# test_preprocess, and serve the result in batches of 8 with a prefetch of 2.
test = tensorflow.data.Dataset.from_tensor_slices((test_images, rest_of_test_data)).map(test_preprocess).batch(8).prefetch(2)

In [None]:
# Predict on the test dataset and flatten the predictions to a 1-D array.
predicted_scores = model.predict(test).reshape(-1);
predicted_scores

In [None]:
# Attach the predictions and keep only the two columns the submission needs.
test_df['Pawpularity'] = predicted_scores
submission_df = test_df[['Id', 'Pawpularity']]
submission_df

In [None]:
# Write the submission to a time-stamped CSV file, without the index column.
timestamp = datetime.datetime.today().strftime('%Y-%m-%d-%H:%M')
file_name = f'submission{timestamp}.csv'
submission_df.to_csv(file_name, index=False)