## Libraries and Data Loading

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

sns.set_palette(sns.color_palette("Set2"))

import sklearn
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as tfl

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, DaskLGBMRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans

from PIL import Image
import os

np.random.seed(0)
tf.random.set_seed(0)

Load the data and add a `path` column containing each image's file path (built from `Id`)

In [None]:
# Read the training table and derive each image's file path from its `Id`.
train = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv').assign(
    path=lambda df: '/kaggle/input/petfinder-pawpularity-score/train/' + df['Id'] + '.jpg'
)
train.head(3)

In [None]:
# Read the test table and derive each image's file path from its `Id`.
test = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv').assign(
    path=lambda df: '/kaggle/input/petfinder-pawpularity-score/test/' + df['Id'] + '.jpg'
)

## Exploratory Data Analysis (EDA)

check the shape and NULL values

In [None]:
print(f'shape: {train.shape}')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
train.info()

Statistics of the data

In [None]:
# Summary statistics of the training table (rendered as the cell output).
train.describe()

distribution of `Pawpularity`

In [None]:
# Histogram of the target column on an explicitly created 10x4 inch axes.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(data=train, x='Pawpularity', ax=ax);

Some visuals 😁

In [None]:
# One count plot per column in positions 1..12 of `train`, laid out on a 3x4 grid.
fig, axes = plt.subplots(3, 4, figsize=(20, 10))

for ax, column in zip(axes.flat, train.columns[1:13]):
    sns.countplot(data=train, x=column, ax=ax)

# tight_layout() only rearranges axes that already exist, so it has to run
# after the subplots are drawn (calling it on the empty figure does nothing).
fig.tight_layout()

## Feature Engineering

Add the width, height, and file size of each image to the data

In [None]:
def size_and_shape(row):
    """Return the image dimensions and file size for one table row.

    Parameters
    ----------
    row : mapping with a ``'path'`` entry
        A row of the train/test table; ``row['path']`` is the image file path.

    Returns
    -------
    pd.Series
        Three values: ``img.size[0]``, ``img.size[1]`` (stored downstream as
        ``width`` and ``height``) and the on-disk size in bytes.
    """
    path = row['path']
    # Open inside a context manager so the file handle is released right away;
    # this function is applied to every row, and leaked handles add up.
    with Image.open(path) as img:
        width, height = img.size[0], img.size[1]
    return pd.Series([width, height, os.path.getsize(path)])

Normalize the data

In [None]:
scale = MinMaxScaler()

# Raw (width, height, file size) triples for every image.
train_dims = train.apply(size_and_shape, axis=1).values
test_dims = test.apply(size_and_shape, axis=1).values

# Fit the scaler on the training images only and reuse the fitted min/max for
# the test images. Re-fitting on the test set would map the same raw size to a
# different scaled value in train and test, so the models would see
# inconsistent features. (Test values may therefore fall slightly outside [0, 1].)
train[['width', 'height', 'size']] = pd.DataFrame(scale.fit_transform(train_dims), index=train.index)
test[['width', 'height', 'size']] = pd.DataFrame(scale.transform(test_dims), index=test.index)

Create our features -> target variables

In [None]:
# Target is the popularity score; the features are every remaining column
# except the identifier and the image path.
y = train['Pawpularity']
X = train.drop(columns=['Id', 'Pawpularity', 'path'])

Add clusters to the data (improved performance 😄)

In [None]:
# Fit 8 clusters on the training features, then label both tables with the
# fitted model so train and test share the same cluster ids.
k = KMeans(n_clusters=8, random_state=0).fit(X)

X['cluster'] = k.predict(X)
test['cluster'] = k.predict(test.drop(columns=['Id', 'path']))

Add Principal Component Analysis features to the data (also improved performance 🤩)

In [None]:
p = PCA()

p.fit(X)

# Give the components string names. Joining an unnamed frame would add integer
# column labels (0, 1, ...) next to the existing string labels, and newer
# scikit-learn versions reject feature frames with mixed-type column names.
pca_cols = [f'pca_{i}' for i in range(p.n_components_)]

# Pass the original index explicitly so the join aligns row-for-row even if
# the frames do not carry a default RangeIndex.
X = X.join(pd.DataFrame(p.transform(X), columns=pca_cols, index=X.index))
test = test.join(pd.DataFrame(p.transform(test.drop(['Id', 'path'], axis=1)),
                              columns=pca_cols, index=test.index))

Split our data into train and validate (here named test 😅)

In [None]:
# Pin the split explicitly: without random_state the result depends on how much
# of the global NumPy RNG has been consumed, so re-running this cell (or any
# cell above it) would silently produce a different train/validation split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Model Training

Simple train+eval function

In [None]:
def eval_model(model):
    """Fit ``model`` on the training split and score it on the held-out split.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the notebook-level ``X_train`` / ``y_train``.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``X_test`` / ``y_test``.
    """
    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    # Take the square root explicitly instead of passing `squared=False`: that
    # keyword is deprecated in recent scikit-learn releases and later removed,
    # while sqrt(MSE) works on every version.
    return np.sqrt(mean_squared_error(y_test, preds))

Here I used multiple models (and also tuned their parameters manually 😫)

In [None]:
# Hand-tuned regressors; every one is seeded with 0 for repeatable results.

rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    max_features=9,
    random_state=0,
)

gb = GradientBoostingRegressor(
    n_estimators=40,
    subsample=0.44,
    random_state=0,
)

extra = ExtraTreesRegressor(
    max_depth=4,
    min_samples_leaf=2,
    max_features=6,
    max_leaf_nodes=20,
    random_state=0,
)

cat = CatBoostRegressor(
    num_trees=11,
    learning_rate=0.26,
    l2_leaf_reg=2.5,
    random_strength=0.9,
    random_seed=0,
    verbose=0,
)

light = LGBMRegressor(
    num_leaves=4,
    subsample_for_bin=20,
    min_split_gain=0.1,
    min_child_samples=25,
    reg_lambda=0.4,
    random_state=0,
)

xgb = XGBRegressor(
    learning_rate=0.1,
    n_estimators=70,
    max_depth=2,
    min_child_weight=2,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.2,
    seed=0,
)

# Name -> estimator registry; insertion order is the evaluation order below.
models = dict(rf=rf, gb=gb, extra=extra, cat=cat, light=light, xgb=xgb)

# Fit each regressor on the training split and report its validation RMSE.
for model, regressor in models.items():
    print(model, 'RMSE:', eval_model(regressor))

In [None]:
# Ensemble every regressor registered in `models` as (name, estimator) pairs.
# To vote over a subset only (e.g. 'cat', 'light', 'gb'), filter `models` here.
models_vote = list(models.items())

vote = VotingRegressor(estimators=models_vote)

# eval_model() fits the estimator itself before scoring, so a separate
# vote.fit(...) beforehand would only train the whole ensemble twice.
eval_model(vote)

## Submit

In [None]:
# test['Pawpularity'] = vote.predict(test.drop(['Id', 'path'], axis=1))
# test[['Id', 'Pawpularity']].to_csv('submission.csv', index=False)

## Add Images

To be Continued...<br>
**If liked, Please Upvote :D**

In [None]:
# Show the first 12 training images in a 2x6 grid, titled with their score.
plt.figure(figsize=(20, 7))

for i in range(12):
    plt.subplot(2, 6, i+1)
    # Close each file as soon as it has been drawn instead of leaving 12
    # handles open for the rest of the session.
    with Image.open(train['path'].iloc[i]) as img:
        plt.imshow(img)
    # Positional lookup, matching how the image path is selected above.
    title = plt.title(train['Pawpularity'].iloc[i])
    plt.setp(title, color='r')

In [None]:
# Attach the image path and the target to each split so a single frame can feed
# the image + metadata pipeline.
#
# * `.loc` is used because X_train.index / X_test.index hold index *labels* of
#   `train`; `.iloc` with labels is only correct while `train` has a default
#   RangeIndex.
# * `.assign` returns a new frame, so X_val is a separate object and X_test
#   stays feature-only (the previous `X_val = X_test` alias silently added the
#   `path` and `y` columns to X_test as well).
# NOTE: X_train is rebound to the extended frame because later cells read
# `path` / `y` from it; re-run the split cell before re-fitting tabular models.
X_train = X_train.assign(path=train.loc[X_train.index, 'path'], y=y_train)

X_val = X_test.assign(path=train.loc[X_test.index, 'path'], y=y_test)

In [None]:
# Pipeline constants: let tf.data pick the prefetch depth, resize every image
# to IMG_SIZE x IMG_SIZE, and group BATCH_SIZE samples per batch.
AUTOTUNE = tf.data.experimental.AUTOTUNE
IMG_SIZE = 224
BATCH_SIZE = 64

@tf.function
def process_img(path: str, X_meta, val: bool = False) -> tuple:
    """Load one JPEG, resize and preprocess it, and pair it with its metadata.

    Parameters
    ----------
    path : str or scalar string tensor
        Location of the JPEG file to read.
    X_meta
        Metadata belonging to this image; returned unchanged. Inside a
        ``Dataset.map`` this is a tensor slice, not a DataFrame.
    val : bool, default False
        When False, random augmentation (horizontal flip, saturation and
        contrast jitter in [0.95, 1.05]) is applied; when True it is skipped.

    Returns
    -------
    tuple
        ``(img, X_meta)`` where ``img`` is a float64 tensor of shape
        ``(IMG_SIZE, IMG_SIZE, 3)``.
    """
    img = tf.io.decode_jpeg(tf.io.read_file(path), channels=3)
    img = tf.cast(img, dtype=tf.float64)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    img = keras.applications.efficientnet.preprocess_input(img)
    img = tf.cast(img, dtype=tf.float64)

    # `val` is a plain Python bool, so this branch is resolved when the
    # function is traced rather than at run time.
    if not val:
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_saturation(img, 0.95, 1.05)
        img = tf.image.random_contrast(img, 0.95, 1.05)

    return (img, X_meta)


@tf.function
def process_img_label(path: str, X_meta, label: int, val=False) -> tuple:
    """Build one supervised sample: ``((image, metadata), label)``.

    Parameters
    ----------
    path : str or scalar string tensor
        Location of the JPEG file, forwarded to ``process_img``.
    X_meta
        Metadata for this image, forwarded to ``process_img``.
    label
        Target value for this image; returned unchanged.
    val : bool, default False
        Forwarded to ``process_img``; True disables augmentation.

    Returns
    -------
    tuple
        ``(process_img(path, X_meta, val), label)``.
    """
    # The image is processed exactly once. An earlier extra call
    # `process_img(path, val)` passed `val` in the metadata slot and threw the
    # result away, so it only added redundant decoding work.
    return process_img(path, X_meta, val), label


    
def img_meta_data(X, val=False) -> tf.data.Dataset:
    """Build a batched tf.data pipeline of images plus tabular metadata.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a ``'path'`` column with image file paths. If a ``'y'``
        column is present it is used as the label. All remaining columns are
        treated as numeric metadata.
    val : bool, default False
        Forwarded to the image loader; True disables random augmentation.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(image, metadata)`` when ``X`` has no ``'y'`` column,
        otherwise batches of ``((image, metadata), label)``.

    Notes
    -----
    This function is intentionally not wrapped in ``tf.function``: it only
    assembles the pipeline in Python from a DataFrame, and the per-sample work
    is already traced inside ``Dataset.map``.
    """
    paths = X['path'].tolist()
    # `'y' in X.columns == False` is a chained comparison that also evaluates
    # `X.columns == False`; test membership directly instead.
    has_label = 'y' in X.columns
    # Everything except the path and the label is metadata for the model.
    meta = X.drop(columns=['path', 'y'], errors='ignore').to_numpy(dtype='float32')

    if not has_label:
        data = tf.data.Dataset.from_tensor_slices((paths, meta))
        # Forward `val` explicitly; mapping process_img directly would always
        # use its default (augmentation on), even for inference data.
        samples = data.map(lambda path, meta_row: process_img(path, meta_row, val))
    else:
        labels = X['y'].to_numpy()
        # Slice path, metadata and label together so map() receives three
        # tensors per sample rather than a DataFrame-like object.
        data = tf.data.Dataset.from_tensor_slices((paths, meta, labels))
        samples = data.map(
            lambda path, meta_row, label: process_img_label(path, meta_row, label, val)
        )

    return samples.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [None]:
# Scratch cell: display the training table after the feature-engineering steps above.
train

In [None]:
# Scratch cell: build the image + metadata dataset from the training frame
# (X_train carries the `path` and `y` columns added in an earlier cell).
img_meta_data(X_train)

In [None]:
# img_meta_data takes one combined frame (path + metadata + optional `y`) and
# a `val` flag; the previous three-positional-argument call did not match that
# signature. Build the validation dataset with augmentation switched off.
img_meta_data(X_val, val=True)

In [None]:
# NOTE(review): `meta_data` is not defined in the visible part of this notebook,
# so these calls raise NameError on a fresh kernel unless it is defined
# elsewhere — confirm, or build the datasets with `img_meta_data` instead.
# Also note that X_train already carries the `path` and `y` columns added in an
# earlier cell, so it is no longer a feature-only frame at this point.
train_meta = meta_data(X_train, y_train)
val_meta = meta_data(X_test, y_test)
test_meta = meta_data(test.drop(['Id', 'path'], axis=1))

In [None]:
# NOTE(review): `img_data` is not defined in the visible part of this notebook,
# so these calls raise NameError on a fresh kernel unless it is defined
# elsewhere — confirm, or build the datasets with `img_meta_data` instead.
# `.iloc[...index]` selects by position using index labels; that presumably
# relies on `train` having a default RangeIndex — verify.
train_imgs = img_data(train['path'].iloc[X_train.index], y_train)
val_imgs = img_data(train['path'].iloc[X_test.index], y_test, val=True)
test_imgs = img_data(test['path'])

In [None]:
# Scratch cell: (rows, columns) of the training frame.
X_train.shape

In [None]:
# Pretrained backbone loaded from disk and frozen so only the new head trains.
eff_model = keras.models.load_model('/kaggle/input/keras-applications-models/EfficientNetB0.h5')
eff_model.trainable = False


# Two inputs: the resized RGB image and a 32-value metadata vector.
# NOTE(review): 32 presumably equals the number of tabular feature columns
# (including cluster and PCA columns) — verify against the metadata frame.
img_input = tfl.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
meta_input = tfl.Input(shape=(32,))

# Use a dedicated name for the network tensors instead of reusing `X`, which
# holds the tabular feature DataFrame; overwriting it would break any re-run
# of the earlier cells that read `X`.
# assumes eff_model outputs a 2-D (batch, features) tensor so it can be
# concatenated with the metadata vector — TODO confirm
hidden = eff_model(img_input)
hidden = tfl.BatchNormalization()(hidden)

con = tfl.concatenate([hidden, meta_input])

hidden = tfl.Dense(64, activation='relu')(con)
hidden = tfl.Dense(64, activation='relu')(hidden)

hidden = tfl.Dropout(0.2)(hidden)

# Single linear unit: the regression output.
out = tfl.Dense(1)(hidden)

model = keras.Model(inputs=[img_input, meta_input], outputs=out)

In [None]:
# Early stopping: give up after 5 epochs without improvement of the monitored
# quantity (Keras default) and roll the model back to its best weights.
early_stop = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)

# Learning rate starts at 1e-3 and is multiplied by 0.96 every 100 steps,
# in discrete jumps (staircase) rather than continuously.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    staircase=True,
    decay_rate=0.96,
    decay_steps=100,
    initial_learning_rate=1e-3,
)

In [None]:
# Regression setup: optimise MSE with Adam on the decaying learning-rate
# schedule, and report RMSE as the tracked metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    metrics=[keras.metrics.RootMeanSquaredError()],
    loss='mse',
)

model.summary()

In [None]:
# Scratch cell: inspect the validation metadata dataset object.
val_meta

In [None]:
# Scratch cell: inspect the training image dataset object.
train_imgs

In [None]:
# NOTE(review): Dataset.concatenate appends the elements of `train_meta` after
# those of `train_imgs`; it does not pair each image with its metadata row.
# The model above is built with two inputs (image, metadata), so a paired
# (zipped) dataset is probably what is intended here — confirm.
ds = train_imgs.concatenate(train_meta)

In [None]:
# Train on `ds` with Keras' default fit settings. No validation data, epoch
# count or callbacks are passed, so the `early_stop` callback defined above is
# currently unused; the commented-out arguments below are an unfinished sketch.
history = model.fit(ds)
#                 validation_data=np.array([val_imgs, val_meta]),
#                 epochs = 20)#,
#                 #callbacks=[early_stop])

In [None]:
# Scratch cell: preview the test predictions scaled by 100 (the same
# computation is repeated in the submission cell below).
model.predict(test_imgs)*100

In [None]:
# Predict on the test images and write the two-column submission file.
# NOTE(review): the factor 100 only makes sense if the training labels were
# scaled to [0, 1]; `y` is taken from the raw `Pawpularity` column above, so
# confirm how the labels are scaled inside the dataset helper.
test['Pawpularity'] = 100 * model.predict(test_imgs)
test.loc[:, ['Id', 'Pawpularity']].to_csv('submission.csv', index=False)