In [32]:
# Importing the relevant Python libraries

%reset -f

# General libraries
import numpy as np
import pandas as pd
import scipy
import os
import cv2
from PIL import Image

# ML libraries
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from fastai.vision.all import *
import optuna

# Changing a few default options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold = 1e6)

In [33]:
# Loading the RESNET34 model using Fastai and using transfer training to train a CNN to find out if there is a cat in a picture or not.

# # Kaggle directory
#path_model= os.path.abspath("cat_model.pkl")

# Local directory
path_model= os.path.abspath("cat_model.pkl")

def is_cat(x): return x[0].isupper()

try: # Loading the pre-trained model if it exists
    learn = load_learner(path_model)
except:  # If the pre-trained model does not exist, we need to train it.
    path = untar_data(URLs.PETS)/'images' # Downloading and extracting the pictures used to do the transfer training

    dls = ImageDataLoaders.from_name_func(path, get_image_files(path), valid_pct=0.2, seed=42, label_func=is_cat, item_tfms=Resize(224)) # Loading and pre-processing step - Fastai

    learn = cnn_learner(dls, resnet34, metrics=error_rate) # Instantiating the model
    learn.fine_tune(1) # Training the model
    learn.export(path_model) # Exporting the model to a file

In [34]:
# Function used to extract features from a given picture

def image_features(img,filename):
    list_feat = dict() # Initialising a new dictionary
    list_feat['Id']=filename.replace(".jpg", "") # The Id is the name of the file
    list_feat['img_x']=img.shape[0] # Dimension of the image - x axis
    list_feat['img_y']=img.shape[1] # Dimension of the image - y axis
    
    with learn.no_bar(), learn.no_logging():
        _,_,probs = learn.predict(img)  # Determining the probability that there is a cat in the picture
        list_feat['prob_cat']=probs[1].numpy() # Adding it to the dictionary
                                   
    return list_feat

In [35]:
# Extracting features from all pictures

try: # If the data has already been processed and stored in two separate files
    data_train=pd.read_csv('data_train.csv')
    data_test=pd.read_csv('data_test.csv')
except: # If the data has not been processed yet
    
    # # Kaggle directory
    # dir_gen='/kaggle/input/petfinder-pawpularity-score'
    # dir_train='/kaggle/input/petfinder-pawpularity-score/train'
    # dir_test='/kaggle/input/petfinder-pawpularity-score/test'

    # Local directory
    dir_gen='../data'
    dir_train='../data\\train'
    dir_test='../data\\test'

    data_train=pd.read_csv(os.path.join(dir_gen, 'train.csv')) # Loading the initial data as provided by Kaggle
    data_test=pd.read_csv(os.path.join(dir_gen, 'test.csv')) # Loading the initial data as provided by Kaggle

    image_data_train=[] 
    image_data_test=[]

    for dirname, _, filenames in os.walk(dir_gen): # Listing all files and directories in dir_gen
        for filename in filenames: # Looping over the filenames
            if filename[-3:]=='jpg': # Checking whether the file is a jpg image
                img = cv2.imread(os.path.join(dirname, filename)) # Loading the image
                if dirname==dir_train: # If the image is in the training set
                    image_data_train.append(image_features(img,filename)) # Computing the features and appending to the list
                elif dirname==dir_test: # If the image is in the testing set
                    image_data_test.append(image_features(img,filename)) # Computing the features and appending to the list

    df_add_train = pd.DataFrame(image_data_train) # Converting the list to a dataframe
    data_train = pd.merge(data_train, df_add_train, how="inner", left_on='Id', right_on='Id') # Merging the two dataframes
    data_train.set_index('Id', inplace=True) # Setting the index

    df_add_test = pd.DataFrame(image_data_test) # Converting the list to a dataframe
    data_test = pd.merge(data_test, df_add_test, how="inner", left_on='Id', right_on='Id') # Merging the two dataframes
    data_test.set_index('Id', inplace=True) # Setting the index

In [36]:
data_train.head(15) # Checking the first 15 rows of the dataframe corresponding to the training set

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,img_x,img_y,prob_cat
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,720,405,4.942153e-07
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,774,1032,1.0
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,960,720,0.0008513125
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,720,405,1.401318e-06
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,960,540,0.001987255
5,001dd4f6fafb890610b1635f967ea081,0,0,1,0,0,0,0,0,0,0,0,1,74,960,540,5.182422e-09
6,0023b8a3abc93c712edd6120867deb53,0,1,1,1,0,0,0,0,1,1,0,0,22,960,720,1.94621e-06
7,0031d6a9ef7340f898c3e05f92c7bb04,0,1,1,0,0,0,1,1,0,0,1,0,35,1280,1280,0.02425766
8,0042bc5bada6d1cf8951f8f9f0d399fa,0,1,1,1,0,0,0,0,0,0,0,0,53,960,720,1.0
9,0049cb81313c94fa007286e9039af910,0,1,1,1,0,0,0,0,0,0,0,0,21,847,1279,1.0


In [37]:
data_test.head(15) # Checking the first 15 rows of the dataframe corresponding to the testing set

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,img_x,img_y,prob_cat
0,4128bae22183829d2b5fea10effdb0c3,1,0,1,0,0,1,1,0,0,1,0,1,128,128,0.000891
1,43a2262d7738e3d420d453815151079e,0,1,0,0,0,0,1,1,0,0,0,0,128,128,0.000472
2,4e429cead1848a298432a0acad014c9d,0,0,0,1,0,1,1,1,0,1,1,1,128,128,0.000619
3,80bc3ccafcc51b66303c2c263aa38486,1,0,1,0,0,0,0,0,0,0,1,0,128,128,0.000552
4,8f49844c382931444e68dffbe20228f4,1,1,1,0,1,1,0,1,0,1,1,0,128,128,0.000462
5,b03f7041962238a7c9d6537e22f9b017,0,0,1,1,1,1,1,1,1,0,1,0,128,128,0.001552
6,c978013571258ed6d4637f6e8cc9d6a3,1,0,0,0,1,1,0,1,0,1,1,1,128,128,0.001804
7,e0de453c1bffc20c22b072b34b54e50f,1,0,1,0,0,0,0,0,1,0,0,1,128,128,0.00047


In [38]:
data_train.describe() # Generating descriptive statistics about the training set

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,img_x,img_y,prob_cat
count,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0,9912.0
mean,0.027643,0.772599,0.903955,0.861582,0.009988,0.067797,0.129338,0.049637,0.166263,0.172014,0.061239,0.07042,38.039044,904.284302,804.426251,0.4980068
std,0.163957,0.419175,0.294668,0.345356,0.099444,0.251409,0.335591,0.217204,0.372335,0.377411,0.23978,0.255866,20.59199,156.90598,270.211921,0.4848534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,113.0,90.0,3.600814e-12
25%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,908.75,675.0,1.229047e-05
50%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,960.0,720.0,0.4742918
75%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,960.0,960.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,1280.0,1280.0,1.0


In [39]:
# Checking if there is missing data in the training set
data_train_NaN=data_train[data_train.isnull().any(axis=1)]
data_train_NaN.head()
# Conclusion: there is no missing data in the training set

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,img_x,img_y,prob_cat


In [40]:
# Checking if there is missing data in the testing set
data_test_NaN=data_test[data_test.isnull().any(axis=1)]
data_test_NaN.head()
# Conclusion: there is no missing data in the testing set

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,img_x,img_y,prob_cat


In [41]:
Features=list(set(data_train.columns.values)-{'Pawpularity', 'Id'}) # Listing all features (i.e. removing the Id and output vectors)
Best_Feat=[]

for feature in sorted(Features): # Looping over all features
    try:
        pearson_coef, p_value = scipy.stats.pearsonr(data_train['Pawpularity'], data_train[feature]) # Computing the correlation coefficient and p-value between that feature and the Pawpularity score
        print(feature,"- Pawpularity: the Pearson correlation coefficient is", round(pearson_coef,3), " with a P-value of", format(p_value,".3E")) 
        if abs(pearson_coef)>0.00 and p_value<.05: # Checking if the correlation coefficient is statistically significant
            Best_Feat.append(feature) # Appending the feature to the list of best features
    except:
        pass

print('\nThe best features are:',Best_Feat) # Printing the best features 

Accessory - Pawpularity: the Pearson correlation coefficient is 0.013  with a P-value of 1.859E-01
Action - Pawpularity: the Pearson correlation coefficient is -0.001  with a P-value of 8.913E-01
Blur - Pawpularity: the Pearson correlation coefficient is -0.024  with a P-value of 1.909E-02
Collage - Pawpularity: the Pearson correlation coefficient is 0.002  with a P-value of 8.631E-01
Eyes - Pawpularity: the Pearson correlation coefficient is -0.007  with a P-value of 5.057E-01
Face - Pawpularity: the Pearson correlation coefficient is 0.008  with a P-value of 4.248E-01
Group - Pawpularity: the Pearson correlation coefficient is 0.016  with a P-value of 1.011E-01
Human - Pawpularity: the Pearson correlation coefficient is 0.004  with a P-value of 6.917E-01
Info - Pawpularity: the Pearson correlation coefficient is -0.005  with a P-value of 6.374E-01
Near - Pawpularity: the Pearson correlation coefficient is 0.001  with a P-value of 9.206E-01
Occlusion - Pawpularity: the Pearson correla

In [42]:
X_norm=data_train[Best_Feat].append(data_test[Best_Feat]) # Appending the training and testing sets

X_norm=preprocessing.StandardScaler(with_mean=True, with_std=True).fit_transform(X_norm) # Normalising the features

# Defining the feature matrices and output vector
Y=data_train['Pawpularity'].values
X=X_norm[0:len(Y),:]  
X_test=X_norm[len(Y):len(X_norm),:]

# Checking the shape of the resulting matrices/vectors
print('Shape of Y:', Y.shape)
print('Shape of X:', X.shape)
print('Shape of X_test:',X_test.shape)

Shape of Y: (9912,)
Shape of X: (9912, 4)
Shape of X_test: (8, 4)


In [43]:
# Training the Gradient Boosting algorithm and optimising the hyperparameters using Optuna

clf = GradientBoostingRegressor(loss='squared_error', learning_rate=0.05, random_state=2) # Gradient Boosting Regressor

# Listing the hyperparameters to optimise along with the corresponding minimum and maximum admissible values and the type of distribution
param_distributions = {
    "ccp_alpha": optuna.distributions.UniformDistribution(0, 1e-5),
    "subsample": optuna.distributions.UniformDistribution(0.5, 1),
    "n_estimators": optuna.distributions.IntUniformDistribution(1, 600),  
    "max_depth": optuna.distributions.IntUniformDistribution(1, 5),
    #"min_samples_split": optuna.distributions.IntUniformDistribution(2, 10),
    #"min_samples_leaf": optuna.distributions.IntUniformDistribution(1, 5),
}

# Optuna is used to find the optimal hyperparameters
optuna_search = optuna.integration.OptunaSearchCV(
    clf, param_distributions, n_trials=100, verbose=2, n_jobs=-1, cv=5, scoring='neg_root_mean_squared_error'
)

optuna_search.fit(X, Y)

print("Best trial:")
trial = optuna_search.study_.best_trial

print("  Value: ", -trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    
pred_GB=optuna_search.predict(X) # Predicting the output using the training set
acc_training_GB=metrics.mean_squared_error(Y,pred_GB, squared=False) # Computing the error using the training set

print('\nRMSE - Training set:',acc_training_GB)
print('Mean RMSE - CV set:',-trial.value)

  optuna_search = optuna.integration.OptunaSearchCV(
[32m[I 2022-01-13 11:09:28,877][0m A new study created in memory with name: no-name-17161d5d-6253-41ad-a083-49b85e2e06ee[0m
[32m[I 2022-01-13 11:09:28,877][0m Searching the best hyperparameters using 9912 samples...[0m
[32m[I 2022-01-13 11:09:30,768][0m Trial 0 finished with value: -20.290465691530255 and parameters: {'ccp_alpha': 8.152829101606788e-06, 'subsample': 0.5358419530089343, 'n_estimators': 42, 'max_depth': 2}. Best is trial 0 with value: -20.290465691530255.[0m
[32m[I 2022-01-13 11:09:30,939][0m Trial 1 finished with value: -20.286484407898772 and parameters: {'ccp_alpha': 3.689990210582204e-06, 'subsample': 0.9706147235989453, 'n_estimators': 43, 'max_depth': 2}. Best is trial 1 with value: -20.286484407898772.[0m
[32m[I 2022-01-13 11:09:33,977][0m Trial 7 finished with value: -20.290227312944204 and parameters: {'ccp_alpha': 4.883164857410489e-06, 'subsample': 0.9199173607822524, 'n_estimators': 113, 'max_

Best trial:
  Value:  20.278767010762547
  Params: 
    ccp_alpha: 5.690293143117928e-07
    subsample: 0.6759573704682605
    n_estimators: 41
    max_depth: 3

RMSE - Training set: 20.145144825137592
Mean RMSE - CV set: 20.278767010762547


In [44]:
Y_test = optuna_search.predict(X_test) # Predicting the output

# Writing the results to a file
final_data=np.column_stack((data_test.index.astype(str), np.transpose(Y_test).astype(float)))
np.savetxt("submission.csv", final_data, delimiter=",", header='Id,Pawpularity', fmt='%s,%f', comments='')