In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import sample
import cv2

In [2]:
# Training metadata table; later cells read its "Id" and "Pawpularity" columns.
dat = pd.read_csv(filepath_or_buffer="../input/petfinder-pawpularity-score/train.csv")

In [3]:
# Pawpularity targets as a NumPy array, divided by 100.
y = dat["Pawpularity"].to_numpy() / 100

In [4]:
class data:
    """Serve batches of RGB images (and optional labels) for a list of image ids.

    Images are read from ``<path>/<id>.jpg`` with OpenCV, resized and converted
    from BGR to RGB. Labels, when supplied, are looked up by id with ``.loc``
    and divided by 100.

    Parameters
    ----------
    path : str
        Directory holding the ``.jpg`` files (a trailing separator is optional).
    ids : list of str
        Image ids (file names without the ``.jpg`` extension) this loader serves.
    x, y : int
        Target size handed to ``cv2.resize`` as ``(x, y)``.
    labels : object supporting ``.loc[list_of_ids].to_numpy()``, optional
        Label lookup indexed by image id. When ``None``, batches carry
        ``None`` in place of the label array.
    """

    def __init__(self,path,ids,x=224,y=224,labels=None):
        self.x = x
        self.y = y
        self.labels = labels
        self.image_list = ids
        self.path = path
        # Index of the next sequential batch (only used when shuffle=False).
        self.batch = 0

    def _select_ids(self,batch_size,shuffle):
        """Pick the ids of the next batch without touching the disk."""
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if shuffle:
            # Random draw without replacement; the sequential cursor is untouched.
            return sample(self.image_list,batch_size)

        n_images = len(self.image_list)
        start = self.batch*batch_size
        if start >= n_images:
            # Cursor ran past the end (e.g. batch_size changed between calls).
            self.batch = 0
            start = 0
        batch_list = self.image_list[start:start+batch_size]

        # Advance and wrap as soon as the next slice would be empty, so an
        # empty batch is never produced when len(ids) % batch_size == 0.
        self.batch += 1
        if self.batch*batch_size >= n_images:
            self.batch = 0
        return batch_list

    def _read_image(self,image_id):
        """Load one image as an RGB array of the configured size."""
        file_path = os.path.join(self.path,image_id+".jpg")
        raw = cv2.imread(file_path)
        if raw is None:
            # cv2.imread signals a missing or unreadable file by returning None.
            raise FileNotFoundError("could not read image: "+file_path)
        return cv2.cvtColor(cv2.resize(raw,(self.x,self.y)),cv2.COLOR_BGR2RGB)

    def load_batch(self,batch_size=1,shuffle=False):
        """Return ``(images, labels)`` for one batch.

        With ``shuffle=True`` the batch is a random sample of the ids; with
        ``shuffle=False`` the ids are walked in order, wrapping around at the
        end (the last batch of a pass may be shorter than ``batch_size``).

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive, or (from ``random.sample``) if it
            exceeds the number of ids when ``shuffle=True``.
        FileNotFoundError
            If an image file cannot be read.
        """
        batch_list = self._select_ids(batch_size,shuffle)
        images = np.array([self._read_image(image) for image in batch_list])
        if self.labels is None:
            labels = None
        else:
            labels = self.labels.loc[batch_list].to_numpy()/100

        return images,labels

    def loader(self,batch_size=1,shuffle=False):
        """Endless generator yielding ``load_batch(batch_size, shuffle)`` results."""
        while True:
            x,y = self.load_batch(batch_size,shuffle)
            yield x,y

# Split the ids, in file order, into 80% train / 10% validation / 10% test.
ids = dat["Id"].to_list()
n_ids = len(ids)
train_end = int(n_ids*0.8)
val_end = int(n_ids*0.9)
train_ids, val_ids, test_ids = ids[:train_end], ids[train_end:val_end], ids[val_end:]

# Image directory and the id -> Pawpularity lookup shared by every loader.
path = "../input/petfinder-pawpularity-score/train/"
labels = dat.set_index("Id")["Pawpularity"]

# One loader over all ids plus one per split.
c = data(path,ids,labels=labels)
c_train = data(path,train_ids,labels=labels)
c_val = data(path,val_ids,labels=labels)
c_test = data(path,test_ids,labels=labels)

In [5]:
from tensorflow import keras 
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.losses import *
from tensorflow.keras.optimizers import *

In [6]:
import tensorflow 
from tensorflow.nn import sigmoid_cross_entropy_with_logits as scel
from tensorflow.math import reduce_mean as rmean
def loss(y_true,y_pred):
    """Sigmoid cross-entropy averaged over the last axis.

    ``y_pred`` is passed as the *logits* argument, i.e. it is expected to be
    un-activated scores.
    NOTE(review): the model head in this notebook already applies a sigmoid,
    so feeding its output here would apply the sigmoid twice. Confirm before
    using this as the training loss.
    """
    per_element = scel(labels=y_true,logits=y_pred)
    return rmean(per_element,axis=-1)
from tensorflow.keras.callbacks import ModelCheckpoint

In [7]:
from tensorflow.keras.applications import EfficientNetB7
# ImageNet-pretrained EfficientNetB7 without its classification top.
cnn_model = EfficientNetB7(include_top=False, weights='imagenet')

# Mark every backbone layer as non-trainable.
for layer in cnn_model.layers:
    layer.trainable = False

# Head: global average pooling followed by two small fully-connected layers.
x = cnn_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=0.1)(x)
x = Dense(units=32, activation='relu')(x)

# Output unit. A ReLU variant was tried for plain regression:
# predictions = Dense(1, activation='relu')(x)
# The sigmoid keeps the output in (0, 1), the same scale as the loader's labels/100.
predictions = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=cnn_model.input, outputs=predictions)

In [8]:
# Save the full model (not only its weights) whenever val_loss reaches a new minimum.
checkpoint_filepath = 'efnetb7_1.h5'
model_checkpoint_callback = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

In [9]:
## TO DECIDE THE TRAINING PARAMS
# The head ends in a sigmoid and the loaders emit Pawpularity/100, so the
# model is trained with MSE on the [0, 1] scale.
# model.compile(optimizer=Adam(1e-5),loss="mse")     ####REGRESSION MODEL
model.compile(optimizer=Adam(1e-4),loss="mse")

# Model.fit accepts Python generators directly; fit_generator is deprecated
# and no longer available in current Keras releases. The loaders never stop,
# so steps_per_epoch / validation_steps bound each epoch.
model.fit(c_train.loader(batch_size=512,shuffle=True),
          validation_data=c_val.loader(batch_size=128),
          validation_steps=10,steps_per_epoch=10,epochs=6,
          callbacks=[model_checkpoint_callback])

In [10]:
#### ACTUAL TRAINING
# Resume from the best checkpoint written by the checkpoint callback and
# continue with a smaller learning rate.
model = load_model(checkpoint_filepath)
model.compile(optimizer=Adam(1e-5),loss="mse")

# Model.fit accepts Python generators directly; fit_generator is deprecated
# and no longer available in current Keras releases.
model.fit(c_train.loader(batch_size=128,shuffle=True),
          validation_data=c_val.loader(batch_size=128),
          validation_steps=10,steps_per_epoch=40,epochs=3,
          callbacks=[model_checkpoint_callback])

In [11]:
## VISUALIZE THE DISTRIBUTION OF ERROR (MAE)
# Score the saved checkpoint on the held-out test ids.
model = load_model("efnetb7_1.h5")
images = np.array([
    cv2.cvtColor(cv2.resize(cv2.imread(path+image+".jpg"),(224,224)),cv2.COLOR_BGR2RGB)
    for image in test_ids
])
y = labels.loc[test_ids]
y_pred = model.predict(images)

# Labels as a column vector so they line up with the (n, 1) predictions.
y_column = y.to_numpy().reshape((-1,1))

# Absolute error with the labels scaled down by 100 to match the predictions.
plt.hist(np.abs(y_column/100-y_pred))

# Root-mean-squared error with the predictions scaled up by 100 instead.
squared_error = (y_column-y_pred*100)**2
print("METRIC : ",np.sqrt(np.sum(squared_error)/(y_pred.shape[0])))

In [12]:
# Drop the reference to the decoded image array.
del images

# Histogram of y, in its own figure.
plt.figure()
plt.hist(x=y)

# Histogram of y_pred, in its own figure.
plt.figure()
plt.hist(x=y_pred)

In [13]:
testpath = "../input/petfinder-pawpularity-score/test/"
# File names without their extension are the submission ids.
ids = [os.path.splitext(t)[0] for t in os.listdir(testpath)]

batch_size=200
y_pred=np.zeros((len(ids),1))
# Step through *all* ids, including the final partial batch; iterating over
# len(ids)//batch_size skipped every image when fewer than batch_size exist.
for start in range(0,len(ids),batch_size):
    batch_ids = ids[start:start+batch_size]
    # Same 224x224 RGB input as the training and validation pipeline.
    images = np.array([cv2.cvtColor(cv2.resize(cv2.imread(testpath+image+".jpg"),(224,224)),cv2.COLOR_BGR2RGB) for image in batch_ids])
    # Functional models expose predict(), not predict_proba(). The network
    # outputs Pawpularity/100, so scale back to the 0-100 range.
    y_pred[start:start+len(batch_ids)] = model.predict(images)*100


def round2(x,y=None):
    """Round ``x`` to ``y`` digits after nudging it up by 1e-15."""
    return round(x+1e-15,y)


y_pred = [round2(t[0],2) for t in y_pred]

In [14]:
# Two-column submission table, written without the index column.
df = pd.DataFrame({"Id": ids, "Pawpularity": y_pred})
df.to_csv("submission.csv",index=False)

In [15]:
# Display the submission table as the cell output.
df