In [None]:
#pip install statsmodels
!pip install setuptools numpy scipy scikit-learn cython
import six

In [None]:
pip install scikit-garden

In [None]:
import os
import cv2
import time
import json
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt


from PIL import Image
from enum import Enum
from glob import glob
from tqdm import trange
from pprint import pprint
from colorama import Fore
from tqdm import tqdm_notebook
from sklearn.utils import shuffle
from IPython.display import display


from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.model_selection import *


# Neural Network Models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.layers import Input, Concatenate, BatchNormalization
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, Dropout


# Regression Models
import six
import sys
sys.modules['sklearn.externals.six'] = six
from skgarden import RandomForestQuantileRegressor,MondrianForestRegressor,MondrianTreeRegressor
from sklearn.linear_model import Lasso
#from skgarden import RandomForestQuantileRegressor
from statsmodels.formula.api import quantreg
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor,VotingRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from skgarden.quantile import ExtraTreesQuantileRegressor,DecisionTreeQuantileRegressor

# Silence every warning category for the rest of the session
# (note that this also hides deprecation notices from the libraries above).
warnings.filterwarnings("ignore")
# Sentinel passed to tf.data `map(num_parallel_calls=...)` and `prefetch(...)` below
# so that TensorFlow picks those values itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE



In [None]:
# Fix every random seed so that results are reproducible

def set_random(seed=100):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy's global generator, Python's ``random`` module and
    TensorFlow's global generator with the same value, and exports that
    value as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all generators. Defaults to 100, the
            value that used to be hard-coded here.
    """
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    # PYTHONHASHSEED is read when an interpreter starts, so this setting is
    # picked up by processes launched from here, not by the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Generate data folds for cross validation

def fold_generator(data, target='Pawpularity', cv = 5):
    """Return a shuffled copy of ``data`` with a stratified ``Fold`` column.

    Rows are shuffled, the target is coarsened into score levels with
    ``int(score / 10)``, and ``StratifiedKFold`` is run on those levels so
    that every fold receives a similar mix of levels.

    Args:
        data: DataFrame holding at least the ``target`` column.
        target: Name of the numeric column used for stratification.
        cv: Number of folds.

    Returns:
        A new DataFrame (shuffled, index reset) containing the input columns
        plus an integer ``Fold`` column with values ``0 .. cv - 1`` naming
        the fold in which each row is used for validation.
    """
    # StratifiedKFold is used without its own shuffling, so the row order
    # produced here decides which rows end up in which fold.
    data = shuffle(data).reset_index(drop=True)

    # For scores in 1..100 this yields levels 0..10 (a score of exactly 100
    # forms its own level). The levels are kept in a local Series so that no
    # temporary column is written into (and later dropped from) the frame.
    score_level = data[target].map(lambda x: int(x / 10))

    # Integer buffer: assigning through `.loc` into a not-yet-existing column
    # would create a float column (0.0, 1.0, ...).
    fold_ids = np.full(len(data), -1, dtype=int)
    splitter = StratifiedKFold(n_splits=cv)
    for fid, (_, valid_idx) in enumerate(splitter.split(X=data, y=score_level)):
        fold_ids[valid_idx] = fid

    data['Fold'] = fold_ids
    return data

In [None]:
# Self-defined cross validation function

def cross_valid(df, model, cv = 5, features = None, target = 'Pawpularity'):
    """Cross-validate ``model`` on pre-assigned folds and return the mean RMSE.

    For each fold id in ``range(cv)`` the rows whose ``Fold`` value differs
    from the id are used for fitting and the rows whose ``Fold`` equals the
    id are used for validation. The same ``model`` object is re-fitted on
    every fold, so after the call it holds the fit from the last fold.

    Args:
        df: DataFrame with a ``Fold`` column, the feature columns and the
            target column.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. An object
            whose class is named ``CatBoostRegressor`` is fitted with
            ``verbose=False``.
        cv: Number of folds to iterate over.
        features: Feature column names. Defaults to the twelve photo
            metadata columns that used to be hard-coded here.
        target: Name of the target column.

    Returns:
        Tuple ``(rmse_train, rmse_valid)``: the RMSE on the fitting rows and
        on the validation rows, each averaged over the ``cv`` folds.

    Raises:
        ValueError: If ``cv`` is smaller than 1.
    """
    if cv < 1:
        raise ValueError("cv must be at least 1, got {}".format(cv))

    if features is None:
        features = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
    else:
        features = list(features)

    # CatBoost logs every iteration unless told otherwise at fit time.
    is_catboost = type(model).__name__ == 'CatBoostRegressor'

    rmse_train = 0
    rmse_valid = 0

    for fold in range(cv):
        train_df = df.loc[df['Fold'] != fold].reset_index(drop = True)
        valid_df = df.loc[df['Fold'] == fold].reset_index(drop = True)

        train_X, train_Y = train_df[features], train_df[target]
        valid_X, valid_Y = valid_df[features], valid_df[target]

        if is_catboost:
            model.fit(train_X, train_Y, verbose=False)
        else:
            model.fit(train_X, train_Y)

        rmse_train += np.sqrt(mean_squared_error(train_Y, model.predict(train_X)))
        rmse_valid += np.sqrt(mean_squared_error(valid_Y, model.predict(valid_X)))

    return rmse_train/cv, rmse_valid/cv

In [None]:
# Reshape image

def image_reshape(image_path, image_size):
    """Read a JPEG file and return it as a square float32 image.

    Args:
        image_path: Path of the JPEG file to read.
        image_size: Side length, in pixels, of the square output.

    Returns:
        The image decoded with 3 channels, resized to
        ``(image_size, image_size)``, cast to float32 and divided by 255.
    """
    target_shape = (image_size, image_size)

    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)

    # Bring every picture to the same spatial shape
    resized = tf.image.resize(pixels, target_shape)

    # Scale the channel values by 1/255
    return tf.cast(resized, tf.float32) / 255.0

In [None]:
# Build the image-loading function that is mapped over the dataset records

def load_image(is_trainset, image_size=224):
    """Return a function that turns an image path into a model-ready image.

    The returned function reads the file, decodes it as a 3-channel JPEG,
    casts it to float32, resizes it to ``(image_size, image_size)`` and
    applies ``tf.keras.applications.efficientnet.preprocess_input``.

    Args:
        is_trainset: When true, the returned function takes
            ``(img_path, label)`` and returns ``(image, label)`` with the
            label passed through untouched; otherwise it takes ``img_path``
            only and returns the image.
        image_size: Side length of the square output image. Defaults to 224,
            the value that used to be hard-coded here.

    Returns:
        The loader function described above.
    """
    def just_reshape(img_path):
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.cast(img, tf.float32)
        img = tf.image.resize(img, (image_size, image_size))
        img = tf.keras.applications.efficientnet.preprocess_input(img)
        return img

    def reshape_with_label(img_path, label):
        return just_reshape(img_path), label

    return reshape_with_label if is_trainset else just_reshape

In [None]:
# Image augmentation

def augment_image(is_trainset):
    """Return a function that applies light random augmentation to an image.

    The augmentation is a random left/right flip followed by random contrast
    and random saturation adjustments, both drawn from the range 0.95-1.05.

    Args:
        is_trainset: When true, the returned function takes ``(img, label)``
            and returns ``(augmented_img, label)``; otherwise it takes and
            returns the image only.

    Returns:
        The augmentation function described above.
    """
    def just_augment(img):
        flipped = tf.image.random_flip_left_right(img)
        recontrasted = tf.image.random_contrast(flipped, 0.95, 1.05)
        return tf.image.random_saturation(recontrasted, 0.95, 1.05)

    if not is_trainset:
        return just_augment

    def augment_with_label(img, label):
        # The label is carried along unchanged
        return just_augment(img), label

    return augment_with_label

In [None]:
# Use a tf.data.Dataset input pipeline to speed up dataset processing

def construct_tf_dataset(df, batch_size, is_trainset = False,
                         use_augmentation = False, use_shuffle = False):
    """Build a batched, prefetched ``tf.data.Dataset`` of images from ``df``.

    Args:
        df: DataFrame with a ``Path`` column and, when ``is_trainset`` is
            true, a ``Pawpularity`` column.
        batch_size: Number of records per batch.
        is_trainset: When true the dataset yields ``(image, label)`` pairs,
            otherwise images only.
        use_augmentation: When true, map the function returned by
            ``augment_image(is_trainset)`` over the loaded records.
        use_shuffle: When true, shuffle the records and reshuffle them on
            every iteration over the dataset.

    Returns:
        The assembled ``tf.data.Dataset``.
    """
    load_image_function = load_image(is_trainset)
    augment_image_function = augment_image(is_trainset)

    if is_trainset:
        records = (df['Path'].values, df['Pawpularity'].values)
    else:
        records = df['Path'].values
    dataset = tf.data.Dataset.from_tensor_slices(records)

    if use_shuffle:
        # Shuffle the lightweight path/label records *before* the images are
        # decoded: the buffer then holds strings rather than decoded float
        # images, so it can span the whole table and give a full shuffle.
        dataset = dataset.shuffle(max(len(df), 1), reshuffle_each_iteration=True)

    dataset = dataset.map(load_image_function, num_parallel_calls=AUTOTUNE)

    if use_augmentation:
        dataset = dataset.map(augment_image_function, num_parallel_calls=AUTOTUNE)

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [None]:
# Seed all random number generators before any data is shuffled or any model is fitted

set_random()

In [None]:
# Dataset path

# Every input lives under one competition directory
DATA_DIR = "../input/petfinder-pawpularity-score"

# Tabular files
train_csv = DATA_DIR + "/train.csv"
test_csv = DATA_DIR + "/test.csv"
# NOTE(review): this variable is not used in the cells visible here; confirm the
# file name (Kaggle competitions commonly ship `sample_submission.csv`).
submission_csv = DATA_DIR + "/submission.csv"

# Image folders
train_dir = DATA_DIR + "/train"
test_dir = DATA_DIR + "/test"

In [None]:
# Load csv and image path to dataframe

def _jpg_path_in(folder):
    """Return a function mapping an image Id to ``<folder>/<Id>.jpg``."""
    def to_path(image_id):
        return folder + '/' + image_id + '.jpg'
    return to_path


# Training table plus the location of each training image
data_df = pd.read_csv(train_csv)
data_df['Path'] = data_df['Id'].map(_jpg_path_in(train_dir))

# Test table plus the location of each test image
test_df = pd.read_csv(test_csv)
test_df['Path'] = test_df['Id'].map(_jpg_path_in(test_dir))

In [None]:
# Sanity check: preview the loaded training table (including the new 'Path' column)

display(data_df)

In [None]:
# Assign every training row to one of 5 stratified cross-validation folds
# (fold_generator returns a shuffled copy with the fold id in the 'Fold' column)

data_df_cv = fold_generator(data_df, target = 'Pawpularity', cv = 5)

In [None]:
# Sanity check: preview the shuffled table with its 'Fold' column

display(data_df_cv)

In [None]:
# Result set

# One (initially empty) list per result column; the evaluation loop appends
# one entry to each list for every model it scores.
Result_set = {column: [] for column in ("Model", "rmse_train", "rmse_valid")}

In [None]:
# Candidate regressors, keyed by the label that appears in the results table.
Model_dict = {
    "Lasso":Lasso(alpha = 0.1),
    "SVR": SVR(C=1.0, epsilon=0.2, kernel ='rbf'),
    "Ridge": Ridge(alpha = 1.0),
    "KernelRidge":KernelRidge(alpha=0.1),
    "ElasticNet": ElasticNet(random_state=0),
    "XGBRegressor": XGBRegressor(n_estimators=1000, max_depth=5, eta=0.01, subsample=0.7, colsample_bytree=0.6),
    "LGBMRegressor": LGBMRegressor(),
    "LinearRegression": LinearRegression(),
    # The wrapped estimator is passed positionally because its keyword name
    # differs between scikit-learn releases (`base_estimator` vs `estimator`).
    # verbose=0 keeps the ten inner CatBoost fits from flooding the output.
    "BaggingRegressor": BaggingRegressor(CatBoostRegressor(verbose=0), n_estimators=10, random_state=0),
    # `noise` and `n_features` are not ExtraTreesRegressor constructor
    # parameters; passing them raised a TypeError before any model was scored.
    "ExtraTreesRegressor": ExtraTreesRegressor(n_estimators=100, random_state=0),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(max_depth=2, random_state=0),
    "GaussianProcessRegressor": GaussianProcessRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(n_estimators=100),
    "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=5),
    "CatBoostRegressor": CatBoostRegressor(iterations=200, learning_rate=0.01),
    "random forest quantile regression" : RandomForestQuantileRegressor(random_state=0, min_samples_split=10, n_estimators=1000),
    "mondrian tree" : MondrianTreeRegressor(random_state=1, max_depth=2),
    "Mondrian Forest Regression":MondrianForestRegressor(random_state=1, max_depth=2),
    "extra trees qunatile regressor":ExtraTreesQuantileRegressor(n_estimators=100, random_state=0),
    "Decision tree Quantile regressor":DecisionTreeQuantileRegressor(random_state=0),
}

Model_list = list(Model_dict.keys())

# Score every candidate with 5-fold cross-validation and record its mean RMSE.
# NOTE: results are appended to Result_set, so re-running this cell without
# re-creating Result_set first duplicates every row.
for model_name, model in tqdm_notebook(Model_dict.items(), total=len(Model_list)):
    rmse_train, rmse_valid = cross_valid(data_df_cv, model, cv = 5)

    Result_set["Model"].append(model_name)
    Result_set["rmse_train"].append(rmse_train)
    Result_set["rmse_valid"].append(rmse_valid)

In [None]:
# Display result

# Tabulate the cross-validation scores, lowest validation RMSE first
Result_df = pd.DataFrame.from_dict(Result_set)
Result_df.sort_values(by="rmse_valid")

In [None]:
# Display result again
# Plot the validation RMSE of every evaluated model (plt comes from the imports cell).
valid_rmse = Result_set["rmse_valid"]
model_labels = Result_set["Model"]
positions = list(range(len(valid_rmse)))

fig, ax = plt.subplots(figsize=(12, 6))
# One line with one point per model; the series is RMSE, not an accuracy.
ax.plot(positions, valid_rmse, marker='o', label='Validation RMSE')
ax.set_xlabel('Models')
ax.set_ylabel('RMSE Values')
# Label each point with its model name instead of a fixed 1..20 range,
# so every evaluated model is visible and identifiable.
ax.set_xticks(positions)
ax.set_xticklabels(model_labels, rotation=90)
ax.set_title('RMSE Scores of Regression Models')
ax.legend()
fig.tight_layout()

# Save before showing: once the figure has been shown it may already be
# closed, in which case a later savefig would write an empty image.
fig.savefig("Regrmodels.png")
plt.show()
        



In [None]:
# Prediction, use model vr
# NOTE(review): `vr` is not defined in any cell visible here; it must be bound to
# a fitted regressor (presumably a VotingRegressor - confirm) before this cell runs.

X = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

# Work on an explicit copy so the new columns are not assigned into a slice of test_df
test_X = test_df[X].copy()
test_X['Pawpularity'] = vr.predict(test_X)
test_X['Id'] = test_df['Id']

submission_vr = test_X[['Id', 'Pawpularity']]
submission_vr.to_csv("submission.csv", index = False)

display(submission_vr)