# PetFinder.my Adoption Prediction: Feature Engineering

### David Mora Garrido, Bachelor Dissertation (2nd part)

In [None]:
!pip install rfpimp

import category_encoders as ce
import glob
import json
import matplotlib.pyplot as plt
import math
import numpy as np
import os
import pandas as pd
import random
import rfpimp
import seaborn as sns
import string
import time
import utils_tfg_pet_adoption_eda as utils
import warnings
import xgboost as xgb

from collections import defaultdict
from IPython.display import clear_output
from itertools import cycle
# from scipy.stats import skew
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import f_classif, mutual_info_classif, mutual_info_regression
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, StandardScaler, label_binarize
# from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

pd.set_option('display.max_columns', None)
# plt.style.available

In [None]:
# Single seed shared by the RNG initialisation and the `random_state` arguments used in this notebook.
seed = 27912

In [None]:
# Seed Python's and NumPy's global random generators with the shared seed.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it here
# presumably only affects child processes, not hash randomisation in this kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

In [None]:
# Load the training table and the three ID -> label lookup tables (breeds, colors, states).
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
breeds = pd.read_csv('../input/petfinder-adoption-prediction/PetFinder-BreedLabels.csv')
colors = pd.read_csv('../input/petfinder-adoption-prediction/PetFinder-ColorLabels.csv')
states = pd.read_csv('../input/petfinder-adoption-prediction/PetFinder-StateLabels.csv')

In [None]:
# ID -> name lookup dictionaries built from the label tables.
# For breeds and colors, ID 0 is mapped to NaN (presumably "not specified").
breeds_dict = {0: np.nan}
breeds_dict.update(zip(breeds["BreedID"], breeds["BreedName"]))

colors_dict = {0: np.nan}
colors_dict.update(zip(colors["ColorID"], colors["ColorName"]))

states_dict = dict(zip(states["StateID"], states["StateName"]))

In [None]:
# Name of the label column in the training table.
target = "AdoptionSpeed"

In [None]:
# Separate the feature columns (X) from the label column (y).
X = train.drop(target, axis=1)
y = train[target]

In [None]:
class LeftJoinReplace(BaseEstimator, TransformerMixin):
    """Replace coded values in selected columns using a lookup dictionary.

    Parameters
    ----------
    values_dict : dict
        Mapping ``old value -> new value`` passed to ``Series.replace``.
    variables : list of str
        Columns of ``X`` on which the replacement is applied.
    """

    def __init__(self, values_dict, variables):
        self.values_dict = values_dict
        self.variables = variables

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``values_dict`` applied to every column in ``variables``."""
        X = X.copy()
        for var in self.variables:
            # Assign the result back instead of calling replace(..., inplace=True) on the
            # column selection: in-place mutation of a selected column is not propagated to
            # the frame when pandas Copy-on-Write is active.
            X[var] = X[var].replace(self.values_dict)
        return X

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform in one call."""
        return self.fit(X, y).transform(X, y)

In [None]:
# To be passed to FunctionTransformer
def replace_integers_by_strings(X):
    """Return a copy of ``X`` after applying the integer-code -> label mappings below.

    The actual replacement is delegated to ``utils.replace_val_categorical``, which is
    called on the copy (its return value is not used here).
    Meant to be wrapped in a ``FunctionTransformer``.
    """
    yes_no_unsure = {1: 'Yes', 2: 'No', 3: 'Not Sure'}
    label_maps = {
        'Type': {1: 'Dog', 2: 'Cat'},
        'Gender': {1: 'Male', 2: 'Female', 3: 'Mixed'},
        'MaturitySize': {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large', 0: 'Not Specified'},
        'FurLength': {1: 'Short', 2: 'Medium', 3: 'Long', 0: 'Not Specified'},
    }
    # The three treatment columns share the same coding; each gets its own dict object.
    for treatment in ('Vaccinated', 'Dewormed', 'Sterilized'):
        label_maps[treatment] = dict(yes_no_unsure)
    label_maps['Health'] = {1: 'Healthy', 2: 'Minor Injury', 3: 'Serious Injury', 0: 'Not Specified'}

    relabelled = X.copy()
    utils.replace_val_categorical(relabelled, label_maps)
    return relabelled

In [None]:
# To be passed to FunctionTransformer
def has_significant_name(X):
    """Add a binary ``HasName`` column telling whether ``Name`` looks like a real name.

    A name is considered a placeholder (``HasName == 0``) when it is missing, when its
    text contains ``'no name'`` (case-insensitive), or when it is shorter than 3 characters.
    Meant to be wrapped in a ``FunctionTransformer``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the extra int64 column ``HasName``.
    """
    X = X.copy()
    # Missing values are detected with pd.isna rather than an identity check against
    # np.nan, which misses None and NaN objects that are not the np.nan singleton.
    text = X["Name"].map(lambda value: "" if pd.isna(value) else str(value))
    is_placeholder = (
        text.str.lower().str.contains("no name", regex=False)
        | (text.str.len() < 3)  # also true for missing names (empty text)
    ).astype(bool)
    X["HasName"] = (~is_placeholder).astype("int64")
    return X

In [None]:
class BreedFrequencyEncoding(BaseEstimator, TransformerMixin):
    """Add ``Breed1_freq_encode``: the count encoding of ``Breed1`` learned at fit time.

    Rows for which the count encoder yields NaN receive the most common breed count
    observed during fitting.
    """

    def fit(self, X, y):
        breed = X["Breed1"]
        self.count_encoder = ce.CountEncoder()
        self.count_encoder.fit(breed)
        # "Count of counts": the frequency value shared by the largest number of breeds.
        occurrences_per_breed = breed.value_counts()
        self.most_common_value = occurrences_per_breed.value_counts().index[0]
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X["Breed1_freq_encode"] = self.count_encoder.transform(X["Breed1"])
        not_encoded = X["Breed1_freq_encode"].isna()
        X.loc[not_encoded, "Breed1_freq_encode"] = self.most_common_value
        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

    
class BreedTargetEncoding(BaseEstimator, TransformerMixin):
    """Target-encode ``Breed1`` once per target class (one-vs-rest).

    The target is one-hot encoded and a separate ``ce.TargetEncoder`` is fitted against
    each resulting indicator column; ``transform`` adds one ``Breed1_<class column>``
    feature per fitted encoder.

    Parameters
    ----------
    minimum_group_threshold : int
        Forwarded as ``min_samples_leaf`` to ``ce.TargetEncoder``.
    smoothing : float
        Forwarded as ``smoothing`` to ``ce.TargetEncoder``.
    """

    def __init__(self, minimum_group_threshold=1, smoothing=1.0):
        self.minimum_group_threshold = minimum_group_threshold
        self.smoothing = smoothing

    def fit(self, X, y):
        breed = X["Breed1"].copy()
        labels = y.astype(str)

        label_encoder = ce.OneHotEncoder(use_cat_names=True)
        label_encoder.fit(labels)
        class_indicators = label_encoder.transform(labels)

        # One encoder per one-hot target column, keyed by that column's name.
        self.fitted_target_encoders = {
            class_name: ce.TargetEncoder(
                min_samples_leaf=self.minimum_group_threshold,
                smoothing=self.smoothing,
            ).fit(breed, class_indicators[class_name])
            for class_name in class_indicators.columns
        }
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for class_name in self.fitted_target_encoders:
            encoder = self.fitted_target_encoders[class_name]
            X[f'Breed1_{str(class_name)}'] = encoder.transform(X["Breed1"])
        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)
    

class BreedsOneHotSVD(BaseEstimator, TransformerMixin):
    """One-hot encode ``Breed1``/``Breed2`` and compress the result with truncated SVD.

    Parameters
    ----------
    svd_n_components : int
        Number of SVD components, i.e. number of ``Breed_svd_<i>`` columns added.
    seed : int
        ``random_state`` of the ``TruncatedSVD``.
    """

    def __init__(self, svd_n_components=10, seed=27912):
        self.svd_n_components = svd_n_components
        self.seed = seed

    def fit(self, X, y):
        self.encoder = ce.OneHotEncoder(use_cat_names=True)
        self.svd = TruncatedSVD(n_components=self.svd_n_components, random_state=self.seed)
        one_hot_breeds = self.encoder.fit_transform(X[["Breed1", "Breed2"]])
        self.svd.fit(one_hot_breeds)
        return self

    def transform(self, X, y=None):
        X = X.copy()
        one_hot_breeds = self.encoder.transform(X[["Breed1", "Breed2"]])
        components = self.svd.transform(one_hot_breeds)
        component_names = ["Breed_svd_%d" % i for i in range(self.svd_n_components)]
        svd_frame = pd.DataFrame(components, index=X.index.copy(), columns=component_names)
        # Index-aligned left join of the SVD components onto the input rows.
        return X.merge(svd_frame, left_index=True, right_index=True, how="left")
    

class BreedEncoding(BaseEstimator, TransformerMixin):
    """Facade that applies one of the breed-encoding strategies defined in this file.

    Parameters
    ----------
    enc_type : {"target_and_frequency", "one-hot_svd"}
        ``"target_and_frequency"`` chains ``BreedTargetEncoding`` and
        ``BreedFrequencyEncoding``; ``"one-hot_svd"`` uses ``BreedsOneHotSVD``
        with its default parameters.
    minimum_group_threshold : int
        Forwarded to ``BreedTargetEncoding``.
    smoothing : float
        Forwarded to ``BreedTargetEncoding``.

    Raises
    ------
    ValueError
        In ``fit`` when ``enc_type`` is not one of the accepted values.
    """

    def __init__(self, enc_type="target_and_frequency", minimum_group_threshold=1, smoothing=1.0):
        self.enc_type = enc_type
        self.minimum_group_threshold = minimum_group_threshold
        self.smoothing = smoothing

    def fit(self, X, y):
        self.encoders = []
        if self.enc_type == "target_and_frequency":
            self.encoders.append(BreedTargetEncoding(self.minimum_group_threshold,
                                                     self.smoothing))
            self.encoders.append(BreedFrequencyEncoding())
        elif self.enc_type == "one-hot_svd":
            self.encoders.append(BreedsOneHotSVD())
        else:
            raise ValueError(f"{self.enc_type} is not a valid value for 'enc_type' (target_and_frequency/one-hot_svd)")

        for encoder in self.encoders:
            encoder.fit(X, y)
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for encoder in self.encoders:
            X_mod = encoder.transform(X, y)
            # Keep the new columns in the order the encoder produced them. A set difference
            # would order them by string hash, making the output column order (and thus
            # anything order-sensitive downstream) vary between interpreter runs.
            added_columns = [column for column in X_mod.columns if column not in X.columns]
            X = pd.merge(X, X_mod[added_columns], left_index=True, right_index=True)
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X, y)

In [None]:
def has_pure_breed(X):
    """Return a copy of ``X`` with a binary ``PureBreed`` column.

    ``PureBreed`` is 0 when either breed is "Mixed Breed", when either breed is one of
    the generic "Domestic ... Hair" labels, or when both breeds are filled in; otherwise 1.
    """
    X = X.copy()
    generic_domestic = ["Domestic Short Hair", "Domestic Medium Hair", "Domestic Long Hair"]
    primary = X["Breed1"]
    secondary = X["Breed2"]

    declared_mixed = primary.eq("Mixed Breed") | secondary.eq("Mixed Breed")
    generic_label = primary.isin(generic_domestic) | secondary.isin(generic_domestic)
    two_breeds_given = primary.notna() & secondary.notna()

    X["PureBreed"] = 1
    X.loc[declared_mixed | generic_label | two_breeds_given, "PureBreed"] = 0
    return X

In [None]:
def breed_matches_fur_length(X):
    """Return a copy of ``X`` with a binary ``BreedMatchesFurLength`` column.

    The flag is 0 only for rows whose ``Breed1`` is one of the "Domestic <x> Hair"
    labels while the lower-cased ``FurLength`` text does not occur in the lower-cased
    breed name (e.g. "Domestic Short Hair" with fur length "Long"); otherwise 1.

    The check is done with a positional boolean mask, so it is correct even when the
    index of ``X`` contains duplicated labels, and it avoids the per-row Series
    construction of ``DataFrame.iterrows``.
    """
    X = X.copy()
    domestic_x_hair = {"Domestic Short Hair", "Domestic Medium Hair", "Domestic Long Hair"}
    inconsistent = np.fromiter(
        (
            str(breed) in domestic_x_hair and str(fur).lower() not in str(breed).lower()
            for breed, fur in zip(X["Breed1"], X["FurLength"])
        ),
        dtype=bool,
        count=len(X),
    )
    X["BreedMatchesFurLength"] = (~inconsistent).astype("int64")
    return X

In [None]:
class BreedImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Breed1`` values for dogs and cats.

    If ``Breed2`` is available it is copied into ``Breed1``; otherwise the most frequent
    ``Breed1`` of the same ``Type`` seen during ``fit`` is used. Rows whose ``Type`` is
    neither "Dog" nor "Cat" are left untouched.
    """

    def fit(self, X, y):
        def most_frequent_breed(pet_type):
            return X.loc[X["Type"] == pet_type, "Breed1"].value_counts().index[0]

        self.most_common_dog_breed = most_frequent_breed("Dog")
        self.most_common_cat_breed = most_frequent_breed("Cat")
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Both masks are taken before any assignment: promotion only touches rows that have
        # a Breed2, the fallback only rows without one, so the two steps never overlap.
        primary_missing = X["Breed1"].isna()
        secondary_missing = X["Breed2"].isna()
        fallbacks = (("Dog", self.most_common_dog_breed),
                     ("Cat", self.most_common_cat_breed))

        for pet_type, _ in fallbacks:
            promote = primary_missing & ~secondary_missing & (X["Type"] == pet_type)
            X.loc[promote, "Breed1"] = X.loc[promote, "Breed2"]

        for pet_type, fallback_breed in fallbacks:
            unknown = primary_missing & secondary_missing & (X["Type"] == pet_type)
            X.loc[unknown, "Breed1"] = fallback_breed
        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

In [None]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``columns`` and append the indicator columns to the frame.

    As a side task, ``transform`` also turns an existing ``Type`` column into a
    0/1 flag (0 for "Cat", 1 for anything else).

    Parameters
    ----------
    columns : list of str
        Columns passed to ``ce.OneHotEncoder``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y):
        encoder = ce.OneHotEncoder(use_cat_names=True)
        encoder.fit(X[self.columns], y)
        self.one_hot_encoder = encoder
        return self

    def transform(self, X, y=None):
        X = X.copy()
        if "Type" in X.columns:
            is_cat = X["Type"] == "Cat"
            X["Type"] = np.where(is_cat, 0, 1)
        indicators = self.one_hot_encoder.transform(X[self.columns])
        # Index-aligned join; the original categorical columns are kept.
        return X.merge(indicators, left_index=True, right_index=True)

In [None]:
class OrdinalVariableEncoder(BaseEstimator, TransformerMixin):
    """Encode ordered categorical columns either ordinally or as one-hot indicators.

    Parameters
    ----------
    columns : list of str
        Columns to encode.
    enc_type : {"ordinal", "one-hot"}
        Encoding strategy.
    mapping : list of dict, optional
        ``ce.OrdinalEncoder`` style mapping (``{"col": name, "mapping": {value: code}}``
        entries). Required when ``enc_type == "ordinal"``; entries are looked up by
        their ``"col"`` key, so their order need not match ``columns``.

    Raises
    ------
    ValueError
        In ``fit`` when ``enc_type`` is not one of the accepted values.
    """

    def __init__(self, columns, enc_type="ordinal", mapping=None):
        self.columns = columns
        self.enc_type = enc_type
        self.mapping = mapping

    def fit(self, X, y=None):
        if self.enc_type == "ordinal":
            self.encoder = ce.OrdinalEncoder(mapping=self.mapping)
        elif self.enc_type == "one-hot":
            self.encoder = ce.OneHotEncoder(use_cat_names=True)
        else:
            # The accepted spelling is "one-hot" (with a hyphen), as tested above.
            raise ValueError(f"{self.enc_type} is not a valid value for 'enc_type' (ordinal/one-hot)")

        self.encoder.fit(X[self.columns], y)
        if self.enc_type == "ordinal":
            # Look mappings up by column name instead of by list position, so a mapping
            # list ordered differently from `columns` cannot be applied to the wrong column.
            mapping_by_column = {entry["col"]: entry["mapping"] for entry in self.mapping}
            self.most_common_values = {}
            for column in self.columns:
                column_mapping = mapping_by_column[column]
                # Most frequent training value that actually has an ordinal code.
                most_common_value_column = [value for value in X[column].value_counts().index
                                            if value in column_mapping][0]
                self.most_common_values[column] = float(column_mapping[most_common_value_column])
        return self

    def transform(self, X, y=None):
        X = X.copy()
        columns_encoding = self.encoder.transform(X[self.columns])
        if self.enc_type == "ordinal":
            for column in self.columns:
                # If it is Not Specified, then the ordinal condition is harder to hold,
                # so in those cases (encoded as -1) we set the most repeated value in training
                columns_encoding.loc[columns_encoding[column] == -1.0, column] = \
                    self.most_common_values[column]
            columns_encoding.rename(lambda x: f"{x}_ordinal", axis=1, inplace=True)
        X = X.merge(columns_encoding, left_index=True, right_index=True)
        return X

In [None]:
# Scratch check of the "most common mapped value" lookup, using the raw integer codes of X.
# NOTE: `ordinal_vars_mapping` is reassigned further down with string-keyed mappings.
ordinal_vars_mapping = [
    {"col": "MaturitySize", "mapping": {1: 0, 2: 1, 3: 2, 4: 3}},
    {"col": "FurLength", "mapping": {1: 0, 2: 1, 3: 2}},
    {"col": "Health", "mapping": {1: 0, 2: 1, 3: 2}},
]
# X_train["Health"].value_counts().keys()
# Ordinal code (as float) of the most frequent MaturitySize value that has a mapping entry.
float(ordinal_vars_mapping[0]["mapping"][
    list(filter(lambda x: x in ordinal_vars_mapping[0]["mapping"],
                X["MaturitySize"].value_counts().keys()))[0]])

In [None]:
class ReplaceState(BaseEstimator, TransformerMixin):
    """Add a numeric ``StateGDP`` column derived from the ``State`` name.

    Parameters
    ----------
    gdp_per_capita : dict
        Mapping ``state name -> GDP value``.
    impute_nan_value : float
        Value used for rows whose state has no entry in ``gdp_per_capita``
        (including missing states).
    """

    def __init__(self, gdp_per_capita, impute_nan_value):
        self.gdp_per_capita = gdp_per_capita
        self.impute_nan_value = impute_nan_value

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Harmonise the two state names that are spelled differently in the GDP table.
        # Assigned back rather than replaced in place on the column selection, which is
        # not propagated to the frame under pandas Copy-on-Write.
        X["State"] = X["State"].replace({"Melaka": "Malacca", "Pulau Pinang": "Penang"})
        # `map` yields NaN for states without a GDP entry, so they reach the imputation
        # below instead of being left as text (which made pd.to_numeric raise).
        X["StateGDP"] = pd.to_numeric(X["State"].map(self.gdp_per_capita))
        X["StateGDP"] = X["StateGDP"].fillna(self.impute_nan_value)
        return X

In [None]:
class ReplaceRescuerID(BaseEstimator, TransformerMixin):
    """Add ``RescuerCount``: the count encoding of ``RescuerID`` learned at fit time.

    Rows for which the count encoder yields NaN receive the most common rescuer count
    observed during fitting.
    """

    def fit(self, X, y):
        rescuer = X["RescuerID"]
        self.count_encoder = ce.CountEncoder()
        self.count_encoder.fit(rescuer)
        # "Count of counts": the number of listings shared by the largest number of rescuers.
        listings_per_rescuer = rescuer.value_counts()
        self.most_common_value = listings_per_rescuer.value_counts().index[0]
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X["RescuerCount"] = self.count_encoder.transform(X["RescuerID"])
        not_encoded = X["RescuerCount"].isna()
        X.loc[not_encoded, "RescuerCount"] = self.most_common_value
        return X

In [None]:
class CustomDiscretizer(BaseEstimator, TransformerMixin):
    """Optionally add discretised / binary versions of some numeric columns.

    Parameters
    ----------
    bins_age : int
        When greater than 1, ``Age`` is binned with a k-means ``KBinsDiscretizer``
        into that many bins (column ``Age_disc``); otherwise age is left alone.
    quantity : bool
        Add ``Quantity_disc`` (0 for <= 1, 1 for 2..6, 2 for >= 7).
    fee, video_amt, photo_amt : bool
        Add ``HasFee`` / ``HasVideo`` / ``HasPhoto`` flags (1 when the amount is > 0).
    """

    def __init__(self, bins_age=-1, quantity=True, fee=True, video_amt=True, photo_amt=True):
        self.bins_age = bins_age
        self.quantity = quantity
        self.fee = fee
        self.video_amt = video_amt
        self.photo_amt = photo_amt

    def fit(self, X, y):
        if self.bins_age > 1:
            discretizer = KBinsDiscretizer(strategy='kmeans', encode='ordinal',
                                           n_bins=self.bins_age)
            discretizer.fit(X["Age"].values.reshape(-1, 1))
            self.age_discretizer = discretizer
        return self

    def transform(self, X, y=None):
        X = X.copy()

        if self.bins_age > 1:
            age_as_column = X["Age"].values.reshape(-1, 1)
            X["Age_disc"] = self.age_discretizer.transform(age_as_column)
            X["Age_disc"] = X["Age_disc"].astype('int64')

        if self.quantity:
            amount = X["Quantity"]
            buckets = (
                (amount <= 1, 0),
                ((amount >= 2) & (amount <= 6), 1),
                (amount >= 7, 2),
            )
            # Rows matching none of the ranges are left as NaN.
            for rows, code in buckets:
                X.loc[rows, "Quantity_disc"] = code

        indicator_specs = (
            (self.fee, "HasFee", "Fee"),
            (self.video_amt, "HasVideo", "VideoAmt"),
            (self.photo_amt, "HasPhoto", "PhotoAmt"),
        )
        for enabled, flag_column, source_column in indicator_specs:
            if enabled:
                X[flag_column] = np.where(X[source_column] > 0, 1, 0)
        return X
            

In [None]:
class ColumnRemover(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from the frame.

    Parameters
    ----------
    columns : list of str
        Column labels removed by ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        # Work on a copy so the caller's frame is never touched.
        trimmed = X.copy()
        return trimmed.drop(columns=self.columns)

In [None]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
    """Standardise (zero mean, unit standard deviation) a selection of columns.

    The columns are ``numeric_columns`` plus every column of the fitted frame whose
    name matches one of the engineered-feature patterns (see ``_is_engineered_feature``).
    Means and standard deviations are computed with pandas (``Series.mean`` /
    ``Series.std``) at fit time. Columns whose standard deviation is not positive
    are set to 0.

    Parameters
    ----------
    numeric_columns : list of str
        Columns that are always standardised.
    """

    _SUFFIX_TOKENS = ("_ordinal", "img_", "desc_", "_num", "_mean", "_sum", "_var")

    def __init__(self, numeric_columns):
        self.numeric_columns = numeric_columns

    @classmethod
    def _is_engineered_feature(cls, column):
        """Tell whether a column name matches one of the engineered-feature patterns."""
        name = str(column)
        breed_encoding = "Breed" in name and "Fur" not in name and "Pure" not in name
        return breed_encoding or any(token in name for token in cls._SUFFIX_TOKENS)

    def fit(self, X, y=None):
        pattern_columns = [column for column in X.columns if self._is_engineered_feature(column)]
        # De-duplicate while keeping order: a column that is both listed explicitly and
        # matched by a pattern would otherwise be standardised twice in `transform`.
        self.columns_to_standardize = list(
            dict.fromkeys(list(self.numeric_columns) + pattern_columns))
        self.means = {}
        self.stds = {}
        for column in self.columns_to_standardize:
            self.means[column] = X[column].mean()
            self.stds[column] = X[column].std()
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for column in self.columns_to_standardize:
            mean = self.means[column]
            std = self.stds[column]
            if std > 0:
                X[column] = (X[column] - mean) / std
            else:
                # Constant column (or undefined std): no information to scale.
                X[column] = 0
        return X

In [None]:
class PrintStep(BaseEstimator, TransformerMixin):
    """Debugging pass-through step: shows the frame and its ``info()`` summary.

    ``transform`` returns a copy of its input unchanged.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X, y=None):
        display(X)
        # DataFrame.info() prints its report itself and returns None, so it is called
        # directly; wrapping it in display() additionally showed a spurious "None".
        X.info()
        return X.copy()

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X, y)

In [None]:
# State name -> GDP value, consumed by ReplaceState to build the StateGDP feature.
# NOTE(review): year, currency and source of these figures are not stated here - confirm.
gdp_per_capita = {
    "Kuala Lumpur": 129472,
    "Labuan": 77798,
    "Penang": 55243,
    "Selangor": 54995,
    "Sarawak": 53358,
    "Malacca": 49172,
    "Negeri Sembilan": 45373,
    "Johor": 37342,
    "Pahang": 36474,
    "Perak": 31668,
    "Terengganu": 30933,
    "Perlis": 25656,
    "Sabah": 25326,
    "Kedah": 22412,
    "Kelantan": 14300
}

# Ordinal codes for the string labels produced by replace_integers_by_strings.
# "Not Specified" has no code on purpose: OrdinalVariableEncoder replaces it afterwards.
ordinal_vars_mapping = [
    {"col": "MaturitySize", "mapping": {"Small": 0, "Medium": 1, "Large": 2, "Extra Large": 3}},
    {"col": "FurLength", "mapping": {"Short": 0, "Medium": 1, "Long": 2}},
    {"col": "Health", "mapping": {"Healthy": 0, "Minor Injury": 1, "Serious Injury": 2}},
]


# Raw columns dropped near the end of the pipeline, once their derived features exist.
columns_to_be_removed_1 = ["Name", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                         "Vaccinated", "Dewormed", "Sterilized", "State", "RescuerID", "Description",
                         "PetID", "MaturitySize", "FurLength", "Health"]

# Columns always standardised by CustomStandardScaler (pattern-matched ones are added by it).
numeric_columns_1 = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt", "StateGDP", "RescuerCount"]


# Base feature-engineering pipeline ("pipeline 1"). Step order matters: IDs are first
# replaced by labels, then features are derived/encoded, and only then are the raw
# columns dropped and the numeric features standardised.
pipeline_1_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency", minimum_group_threshold=1, smoothing=1.0)),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                                "Color3", "Vaccinated", "Dewormed", "Sterilized"])),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    # NOTE(review): 46450 is the fallback GDP for unmatched states; its origin is not shown here.
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    # With these arguments the discretizer adds no columns (all options switched off).
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_1)),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_1))
]

# Smoke test: fit and apply the whole pipeline on the full training data and show the result.
pipeline = Pipeline(steps=pipeline_1_transformers)
result = pipeline.fit_transform(X,y)
# clear_output(wait=True)
display(result)

**average classification report -->** https://stackoverflow.com/a/42567557

**https://arxiv.org/pdf/2008.05756.pdf**

In [None]:
def plot_precision_recall_curves(ax, y_test, y_score):
    """Draw per-class and micro-averaged precision-recall curves on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    y_test : array-like
        True labels; binarised against the fixed classes 0..4.
    y_score : 2-D array
        Per-class scores, indexed as ``y_score[:, class]``.
    """
    y_test = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
    n_classes = y_test.shape[1]

    # Curves and average precision for each class.
    precision, recall, average_precision = {}, {}, {}
    for class_idx in range(n_classes):
        class_truth = y_test[:, class_idx]
        class_score = y_score[:, class_idx]
        precision[class_idx], recall[class_idx], _ = precision_recall_curve(class_truth, class_score)
        average_precision[class_idx] = average_precision_score(class_truth, class_score)

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(),
                                                                    y_score.ravel())
    average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")

    palette = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
    lines, labels = [], []

    # Background iso-F1 curves; only the last one drawn is registered in the legend.
    recall_grid = np.linspace(0.01, 1)
    for f_score in np.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        drawable = iso_precision >= 0
        handle, = ax.plot(recall_grid[drawable], iso_precision[drawable], color='gray', alpha=0.2)
        ax.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, iso_precision[45] + 0.02))
    lines.append(handle)
    labels.append('iso-f1 curves')

    handle, = ax.plot(recall["micro"], precision["micro"], color='gold', lw=2)
    lines.append(handle)
    labels.append('micro-average Precision-recall (area = {0:0.2f})'
                  ''.format(average_precision["micro"]))

    for class_idx, color in zip(range(n_classes), palette):
        handle, = ax.plot(recall[class_idx], precision[class_idx], color=color, lw=2)
        lines.append(handle)
        labels.append('Precision-recall for class {0} (area = {1:0.2f})'
                      ''.format(class_idx, average_precision[class_idx]))

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Multi-class Precision-Recall curve')
    ax.legend(lines, labels, loc=(0.1, -.5), prop=dict(size=12))

    
def round_reg_predictions(y_pred, coefficients):
    """Turn continuous predictions into the classes 0..4 using four cut points.

    A prediction gets class ``k`` for the first ``k`` with ``pred < coefficients[k]``
    (``k`` in 0..3) and class 4 when it is below none of them. The input is not
    modified; an int64 copy is returned.
    """
    raw = np.asarray(y_pred)
    below_cut = [raw < coefficients[k] for k in range(4)]
    rounded = y_pred.copy()
    # np.select takes the first matching condition, like the equivalent if/elif chain.
    rounded[:] = np.select(below_cut, [0, 1, 2, 3], default=4)
    return rounded.astype('int64')


def evaluate_model(model, cv, X, y, model_type, display_results=True,
                   display_plots=True, coefficients=(0.5, 1.5, 2.5, 3.5)):
    """Cross-validate ``model`` and report accuracy, quadratic weighted kappa and more.

    A fresh clone of ``model`` is fitted on every fold produced by ``cv.split(X, y)``.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) compatible with ``sklearn.base.clone``.
    cv : cross-validation splitter
        Object exposing ``split(X, y)``.
    X, y : pandas objects
        Features and target; rows are selected positionally with ``iloc``.
    model_type : {"classification", "regression"}
        For regression, predictions are turned into classes with
        ``round_reg_predictions`` and RMSE is tracked too; for classification,
        ``predict_proba`` outputs are collected for the precision-recall plot.
    display_results : bool
        Print the metrics and the pooled classification report.
    display_plots : bool
        Additionally plot the average confusion matrix (and, for classification,
        the precision-recall curves). Only used when ``display_results`` is true.
    coefficients : sequence of 4 floats
        Cut points used to round regression outputs.

    Returns
    -------
    tuple
        ``(mean fit time, mean accuracy, mean QWK)`` for classification,
        ``(mean fit time, mean RMSE, mean accuracy, mean QWK)`` for regression.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two accepted values.
    """
    if model_type not in ("classification", "regression"):
        raise ValueError(
            f"{model_type} is not a valid value for 'model_type' (classification/regression)")
    is_regression = model_type == "regression"

    orig_model = model
    fit_times = []
    rmse_values = []
    accuracy_scores = []
    kappa_scores = []
    confusion_matrices = []

    # Variables for average classification report
    original_class = []
    predicted_class = []

    # Per-fold true labels / class scores, pooled later for the precision-recall plot.
    fold_y_test = []
    fold_y_score = []

    for train_index, test_index in cv.split(X, y):
        print(f"CV Iteration {len(fit_times)+1}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = clone(orig_model)
        start = time.perf_counter()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.fit(X_train, y_train)
        fit_times.append(time.perf_counter() - start)

        y_pred = model.predict(X_test)

        if is_regression:
            # sqrt(MSE) instead of the `squared=False` keyword, which newer
            # scikit-learn releases no longer accept.
            rmse_values.append(np.sqrt(mean_squared_error(y_test, y_pred)))
            y_pred = round_reg_predictions(y_pred, coefficients)
        else:
            fold_y_test.append(np.asarray(y_test))
            fold_y_score.append(model.predict_proba(X_test))

        original_class.extend(y_test)
        predicted_class.extend(y_pred)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        kappa_scores.append(cohen_kappa_score(y_test, y_pred, weights='quadratic'))
        confusion_matrices.append(confusion_matrix(y_test, y_pred, normalize='true'))

    if display_results:
        print("-----------------RESULTS-----------------")
        print(f"Mean fit time: {np.mean(fit_times)} s")
        if is_regression:
            print("RMSE:", rmse_values)
            print("Average RMSE:", np.mean(rmse_values))
        print("\nAccuracy:", accuracy_scores)
        print("QWK:", kappa_scores)
        print("\nAverage accuracy:", np.mean(accuracy_scores))
        print("Average QWK:", np.mean(kappa_scores))
        print("\nAverage classification report:")
        print(classification_report(original_class, predicted_class))

        if display_plots:
            disp = ConfusionMatrixDisplay(confusion_matrix=np.mean(confusion_matrices, axis=0))
            plt.style.use('default')
            ncols = 1 if is_regression else 2
            figsize = (6, 5) if is_regression else (18, 6)
            _, ax = plt.subplots(nrows=1, ncols=ncols, figsize=figsize)
            # plt.subplots returns a bare Axes for a single panel; normalise to a sequence.
            ax = [ax] if is_regression else ax.flatten()
            ax[0].set_title("\nAverage confusion matrix", fontsize='18')
            disp.plot(ax=ax[0])
            ax[0].grid(False)

            if not is_regression:
                plot_precision_recall_curves(ax[1],
                                             np.concatenate(fold_y_test),
                                             np.concatenate(fold_y_score))

            plt.show()

    if is_regression:
        return np.mean(fit_times), np.mean(rmse_values), np.mean(accuracy_scores), np.mean(kappa_scores)
    return np.mean(fit_times), np.mean(accuracy_scores), np.mean(kappa_scores)

We will evaluate the extensions of the previous base pipeline with 5-fold cross-validation on 4 different models, both to estimate how much performance improves and to determine which models are worth using in hyperparameter tuning. Moreover, since the CNNs that we will see in later sections are trained on a single training-validation split, we will also evaluate performance on that split:

In [None]:
# 5-fold stratified cross-validation splitter, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

**Why precision-recall instead of ROC in this case --> https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/**

https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score

https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve

https://buildmedia.readthedocs.org/media/pdf/xgboost/latest/xgboost.pdf **1.5.9 --> non-determinism... (see also https://stackoverflow.com/questions/65523909/what-features-of-xgboost-are-affected-by-seed-random-state)**

**(all the cells below need X_train_CNN, y_train_CNN, X_val_CNN and y_val_CNN, defined at the beginning of the Training + image features extraction section)**

In [None]:
# Baseline classifiers evaluated on top of pipeline 1, all seeded with the shared seed.
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

evaluation_columns_1 = ["Pipeline", "Model", "Average fit time", "Average accuracy",
                        "Average QWK", "Single split accuracy", "Single split QWK"]
# Result rows are collected in a plain list: DataFrame.append was removed in pandas 2.0.
evaluation_rows_1 = []
global_evaluation_results_1 = pd.DataFrame([], columns=evaluation_columns_1)

for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    # 5-fold cross-validated scores.
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_1_transformers + [('model', model)]),
        cv, X, y, model_type="classification")

    # Scores on the single training/validation split used for the CNNs.
    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_1_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows_1.append({
        "Pipeline": 1,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })
    # Rebuilt after every model so partial results survive an interrupted run.
    global_evaluation_results_1 = pd.DataFrame(evaluation_rows_1, columns=evaluation_columns_1)

## Adding description metadata

In [None]:
def include_description_length(X):
    """Return a copy of ``X`` with ``DescriptionLength``: the character count of ``Description``.

    Missing descriptions count as empty text (length 0).
    """
    X = X.copy()
    description_text = X["Description"].fillna('').map(str)
    X["DescriptionLength"] = description_text.map(len)
    return X

In [None]:
# One-off conversion (kept for reference) that produced the CSV loaded below
# from the original JSON metadata file:
# path = '../input/tfg-pet-adoption-data/train_description_metadata.json'
# description_metadata_json = utils.load_json(path)
# description_metadata = pd.DataFrame.from_dict(description_metadata_json, orient='index')
# description_metadata.drop(["DescriptionNumEntities"], axis=1, inplace=True)
# description_metadata.rename(lambda x: x if x == "DescriptionLanguage" else x + "_num", axis=1, inplace=True)
# description_metadata.to_csv("train_description_metadata.csv")

# Description metadata table; its first CSV column becomes the index
# (joined against PetID by IncludeDescriptionMetadata).
description_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_description_metadata.csv",
    index_col=0)
description_metadata

In [None]:
class IncludeDescriptionMetadata(BaseEstimator, TransformerMixin):
    """Left-join the description metadata table onto the frame by ``PetID``.

    Rows with a missing or empty ``Description`` get 0.0 in the score, magnitude and
    max-salience columns.

    Parameters
    ----------
    description_metadata : pandas.DataFrame
        Metadata table whose index is matched against the ``PetID`` column.
    """

    def __init__(self, description_metadata):
        self.description_metadata = description_metadata

    def fit(self, X, y):
        return self

    def transform(self, X, y=None):
        enriched = X.copy().merge(self.description_metadata, how="left",
                                  left_on="PetID", right_index=True)
        description = enriched["Description"]
        without_text = description.isna() | (description == '')
        zeroed_columns = ["DescriptionScore_num", "DescriptionMagnitude_num",
                          "DescriptionMaxSalience_num"]
        enriched.loc[without_text, zeroed_columns] = 0.0
        return enriched

In [None]:
class CorrectDescriptionLanguage(BaseEstimator, TransformerMixin):
    """Fill in undetected description languages and group the rare ones.

    Rows that have a description but no detected language are labelled
    ``"ms"`` (ISO code for Malay). Languages seen fewer than
    ``minimum_group_size`` times during ``fit`` are relabelled ``"others"``.

    Parameters
    ----------
    minimum_group_size : int, default=10
        Minimum number of training rows a language needs to keep its own label.
    """

    def __init__(self, minimum_group_size=10):
        self.minimum_group_size = minimum_group_size
        # Kept here (instead of only in ``fit``) so ``transform`` on an unfitted
        # instance keeps its historical behaviour.
        self.languages_freq = {}

    def fit(self, X, y=None):
        """Count language frequencies in ``X`` and decide which ones are kept."""
        counts = dict(X["DescriptionLanguage"].value_counts())
        undetected = X[(X["Description"].notna()) &
                       (X["DescriptionLanguage"].isna())].shape[0]
        # Add to (rather than overwrite) any "ms" rows that were already detected.
        counts["ms"] = counts.get("ms", 0) + undetected

        rare_languages = [language for language, freq in counts.items()
                          if freq < self.minimum_group_size]
        freq_other_languages = sum(counts[language] for language in rare_languages)
        for language in rare_languages:
            del counts[language]
        counts["others"] = freq_other_languages

        self.languages_freq = counts
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``DescriptionLanguage`` corrected."""
        X = X.copy()
        has_description = X["Description"].notna()
        X.loc[has_description & (X["DescriptionLanguage"].isna()),
              "DescriptionLanguage"] = "ms" # ISO code for Malay language
        # Only non-empty descriptions are regrouped; empty ones keep their value.
        X.loc[~(X["DescriptionLanguage"].isin(self.languages_freq)) &
              has_description &
              (X["Description"] != ''), "DescriptionLanguage"] = "others"
        return X

Is it worth imputing the missing values (Malay descriptions) with a strategy such as KNN or iterative imputation based on a decision tree? Given that there are more than 500 such cases and that some of the description metadata variables are skewed, it could be. We can try different strategies, including simple mean imputation, and see how convenient each one is.

In [None]:
class CustomIterativeImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with ``IterativeImputer``, keeping the DataFrame.

    Columns whose name contains ``"img_"`` or ``"desc_"`` are excluded from the
    imputation and passed through unchanged.

    Parameters
    ----------
    estimator : estimator object
        Regressor used by ``IterativeImputer`` for each feature. It is cloned in
        ``fit``, so the instance given here is never fitted or modified.
    """

    def __init__(self, estimator=DecisionTreeRegressor(max_depth=5, random_state=seed)):
        self.estimator = estimator

    def fit(self, X, y=None):
        """Fit the imputer on the columns selected for imputation."""
        self.columns_to_impute = [
            column for column in X.columns
            if "img_" not in str(column) and "desc_" not in str(column)]
        # Clone into the imputer instead of overwriting the hyper-parameter:
        # ``fit`` must not change constructor arguments (scikit-learn contract).
        self.imputer = IterativeImputer(estimator=clone(self.estimator),
                                        skip_complete=True,
                                        random_state=seed)
        self.imputer.fit(X[self.columns_to_impute])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns imputed."""
        X = X.copy()
        X[self.columns_to_impute] = self.imputer.transform(X[self.columns_to_impute])
        return X

In [None]:
# Pipeline 2: the previous feature pipeline plus description-derived features
# (description length and description metadata).

# Columns passed to the 'drop_columns' step below.
columns_to_be_removed_2 = ["Name", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                         "Vaccinated", "Dewormed", "Sterilized", "State", "RescuerID", "PetID",
                        "MaturitySize", "FurLength", "Health", "DescriptionLanguage", "Description"]

# Columns passed to the final CustomStandardScaler step.
numeric_columns_2 = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt", "StateGDP", "RescuerCount",
                   "DescriptionLength"]

pipeline_2_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    # Description features introduced by this pipeline.
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_2)),
    # Imputation runs after the raw columns are dropped and before scaling.
    ('impute_malay_desc', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_2)),
]

# Smoke run: fit and apply the transformers (no model) on X, y with warnings
# silenced, then display the resulting feature table.
pipeline = Pipeline(steps=pipeline_2_transformers)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = pipeline.fit_transform(X,y)
display(result)

In [None]:
# Evaluate pipeline 2 with four baseline classifiers: cross-validation
# (evaluate_model) plus a single train/validation split
# (evaluate_model_single_split).
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

evaluation_columns_2 = ["Pipeline", "Model", "Average fit time", "Average accuracy",
                        "Average QWK", "Single split accuracy", "Single split QWK"]
global_evaluation_results_2 = pd.DataFrame([], columns=evaluation_columns_2)
# Rows are collected in a list and the table is rebuilt from it:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
evaluation_rows_2 = []

for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_2_transformers + [('model', model)]),
        cv, X, y, model_type="classification")

    single_accuracy, single_QWK = evaluate_model_single_split(
        Pipeline(steps=pipeline_2_transformers + [('model', model)]),
        X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
        model_type="classification", display_results=True)

    evaluation_rows_2.append({
        "Pipeline": 2,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })
    # Refresh the table every iteration so partial results survive an interrupted run.
    global_evaluation_results_2 = pd.DataFrame(evaluation_rows_2,
                                               columns=evaluation_columns_2)

## Adding profile image metadata

In [None]:
# The commented-out lines below record how the CSV was produced from the raw JSON
# profile-image metadata (kept for reference only; they are not executed).
# path = '../input/tfg-pet-adoption-data/train_profile_images_metadata.json'
# profile_image_metadata_json = utils.load_json(path)
# profile_image_metadata = pd.DataFrame.from_dict(profile_image_metadata_json, orient='index')
# profile_image_metadata.drop(["faces"], axis=1, inplace=True)
# profile_image_metadata.rename(columns={
#     'sum_pixelFraction':'ImageMetadataSumPixelFraction_num',
#     'max_pet_topicality':'ImageMetadataMaxPetTopicality_num',
#     'num_entities':'ImageMetadataNumEntities_num',
#     'desc_concatenation':'ImageMetadataDescription',
#     'has_text':'ImageMetadataHasText'}, inplace=True)
# profile_image_metadata.to_csv("train_profile_images_metadata.csv")

# Load the pre-computed metadata of each pet's profile image; the first CSV column
# becomes the index (presumably the PetID -- verify against the CSV).
profile_image_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_profile_images_metadata.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
profile_image_metadata

In [None]:
# The commented-out lines below record how the CSV was produced from the raw JSON
# metadata of every image (kept for reference only; they are not executed).
# path = "../input/tfg-pet-adoption-data/train_all_images_metadata.json"
# all_train_images_metadata_json = utils.load_json(path)
# all_train_images_metadata = pd.DataFrame.from_dict(all_train_images_metadata_json, orient='index')
# all_train_images_metadata.drop(["faces"], axis=1, inplace=True)
# all_train_images_metadata.rename(columns={
#     'sum_pixelFraction':'ImageMetadataSumPixelFraction',
#     'max_pet_topicality':'ImageMetadataMaxPetTopicality',
#     'num_entities':'ImageMetadataNumEntities',
#     'desc_concatenation':'ImageMetadataDescription',
#     'has_text':'ImageMetadataHasText'}, inplace=True)
# all_train_images_metadata.to_csv("all_train_images_metadata.csv")

# Load the per-image metadata (one row per image); the first CSV column becomes the
# index (presumably the image file name "<PetID>-<n>" -- verify against the CSV).
all_images_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_metadata.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
all_images_metadata

In [None]:
def aggregate_images_metadata(all_images_metadata, type_data="train"):
    """Collapse per-image metadata into one row per pet and save it as CSV.

    The index of ``all_images_metadata`` holds image file names of the form
    ``<PetID>-<n>``; everything before the last ``'-'`` is taken as the PetID.
    Per pet, the numeric columns are aggregated with mean, sum and variance
    (missing results replaced by 0), the text descriptions are joined with
    spaces and the "has text" flags are summed.

    The result is written to ``all_{type_data}_images_metadata_agg.csv`` in the
    working directory and also returned. The input frame is not modified.
    """
    def _pet_id_from_filename(filename):
        text = str(filename)
        return text[:text.rindex('-')]  # filename to PetID

    def _flat_name(column_pair):
        base, statistic = column_pair
        return f"{base}_{statistic}" if statistic != "" else f"{base}"

    numeric_columns = ["ImageMetadataSumPixelFraction", "ImageMetadataMaxPetTopicality",
                       "ImageMetadataNumEntities"]

    per_image = all_images_metadata.reset_index().rename(columns={"index": "PetID"})
    per_image["ImageMetadataHasText"] = per_image["ImageMetadataHasText"].astype('int64')
    per_image["PetID"] = per_image["PetID"].apply(_pet_id_from_filename)

    # Numbers: mean / sum / variance per pet, with flattened column names.
    numbers_per_pet = per_image[["PetID"] + numeric_columns].groupby(
        ["PetID"]).agg(["mean", "sum", "var"])
    numbers_per_pet.columns = [
        _flat_name(pair) for pair in numbers_per_pet.columns.to_flat_index()]
    numbers_per_pet.fillna(0, inplace=True)

    # Booleans: number of images of the pet that contain text.
    flags_per_pet = per_image[["PetID", "ImageMetadataHasText"]].groupby(["PetID"]).sum()

    # Texts: all image descriptions of the pet joined by a space.
    texts = per_image[["PetID", "ImageMetadataDescription"]].copy()
    texts["ImageMetadataDescription"] = texts["ImageMetadataDescription"].astype(str)
    texts_per_pet = texts.groupby(["PetID"])["ImageMetadataDescription"].apply(" ".join)

    per_pet = numbers_per_pet
    for part in (texts_per_pet, flags_per_pet):
        per_pet = per_pet.merge(part, left_index=True, right_index=True)

    per_pet.rename(columns={"ImageMetadataHasText": "ImageMetadataHasText_sum"},
                   inplace=True)
    per_pet.to_csv(f"all_{type_data}_images_metadata_agg.csv")

    return per_pet

In [None]:
# The aggregation was run once offline (commented-out call below); here the saved
# per-pet result is loaded from CSV, with the first CSV column as the index.
# all_train_images_metadata_text_agg = aggregate_images_metadata(all_train_images_metadata)
all_images_metadata_text_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_metadata_agg.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
all_images_metadata_text_agg

In [None]:
class IncludeProfileImageMetadata(BaseEstimator, TransformerMixin):
    """Left-join image metadata onto the data by ``PetID``.

    Parameters
    ----------
    profile_image_metadata : pandas.DataFrame
        Metadata of the profile image only; its index is matched against the
        ``PetID`` column. Used when ``aggregate_metadata`` is False.
    all_images_metadata_agg : pandas.DataFrame, optional
        Metadata aggregated over all the images of each pet, indexed the same
        way. Used (and required) when ``aggregate_metadata`` is True.
    aggregate_metadata : bool, default=False
        Selects which of the two frames is merged.
    """

    def __init__(self, profile_image_metadata, all_images_metadata_agg=None,
                 aggregate_metadata=False):
        self.profile_image_metadata = profile_image_metadata
        self.all_images_metadata_agg = all_images_metadata_agg
        self.aggregate_metadata = aggregate_metadata

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` and
        ``fit_transform(X)`` work outside a supervised pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the image metadata columns appended to ``X``."""
        if self.aggregate_metadata:
            metadata = self.all_images_metadata_agg
        else:
            metadata = self.profile_image_metadata
        # ``merge`` already returns a new frame, so ``X`` is never modified.
        X = X.merge(metadata, left_index=False, right_index=True,
                    left_on="PetID", how="left")
        # ``str(...)`` keeps the substring test safe for non-string column labels.
        columns = [column for column in X.columns
                   if "ImageMetadata" in str(column) and column != "ImageMetadataDescription"]
        # Pets without photos have no image information: neutral values, not NaN.
        without_photos = X["PhotoAmt"] == 0
        X.loc[without_photos, columns] = 0
        X.loc[without_photos, "ImageMetadataDescription"] = ''
        return X

In [None]:
class CorrectWrongType(BaseEstimator, TransformerMixin):
    """Correct the ``Type`` (Cat/Dog) of pets whose image labels contradict it.

    A row is reviewed when it has a non-empty image description that does not
    mention its declared type while the pet topicality of the image is positive.
    The new type is then decided, in order of priority, from:

    1. a cat/dog synonym in the pet ``Name``;
    2. which kind of synonym is more frequent in the ``Description``;
    3. the type registered for ``Breed1`` in the breeds file.

    When the final type disagrees with the breed's type, ``Breed1`` is replaced
    by the most common breed of the final type seen during ``fit``.

    Parameters
    ----------
    breeds_file : pandas.DataFrame
        Breed catalogue with ``BreedName`` and ``Type`` columns.
    cat_equivalents, dog_equivalents : set of str
        Lower-case words that identify a cat / a dog.
    delete_characters_desc : set of str
        Characters replaced by spaces before splitting the description in words.
    """

    def __init__(self, breeds_file,
                 cat_equivalents={'cat', 'kitty', 'kitten', 'cats', 'kitties', 'kittens'},
                 dog_equivalents={'dog', 'doggie', 'doggy', 'doggo', 'pup', 'puppy',
                                 'dogs', 'pups', 'puppies'},
                 delete_characters_desc=set(string.punctuation)):

        self.breeds_file = breeds_file
        self.cat_equivalents = cat_equivalents
        self.dog_equivalents = dog_equivalents
        self.delete_characters_desc = delete_characters_desc

    def fit(self, X, y=None):
        """Prepare the breed catalogue and learn the most common breed per type."""
        # Work on a private copy so the caller's catalogue is not altered.
        self.breeds_file = self.breeds_file.copy()
        # Presumably replaces the numeric codes in place -- verify in utils.
        utils.replace_val_categorical(self.breeds_file,
                                      replace_dict={'Type': {1: 'Dog', 2: 'Cat'}})
        self.most_common_breed = {}
        for pet_type in ("Cat", "Dog"):
            self.most_common_breed[pet_type] = X.loc[
                X["Type"] == pet_type, "Breed1"].value_counts().keys()[0]
        return self

    def _type_from_text(self, row):
        """Return "Cat"/"Dog" inferred from the name or description, else None."""
        name_words = set(str(row["Name"]).lower().split())
        if len(self.cat_equivalents & name_words) > 0:
            return "Cat"
        if len(self.dog_equivalents & name_words) > 0:
            return "Dog"

        # Delete punctuation characters in order to find type equivalents
        # ('cat' not found if we leave 'cat.' in the description, for example)
        description = str(row["Description"]).lower()
        for char in self.delete_characters_desc:
            description = description.replace(char, ' ')
        description_words = description.split()
        num_cat_equivalents = sum(description_words.count(word)
                                  for word in self.cat_equivalents)
        num_dog_equivalents = sum(description_words.count(word)
                                  for word in self.dog_equivalents)
        if num_cat_equivalents > num_dog_equivalents:
            return "Cat"
        if num_dog_equivalents > num_cat_equivalents:
            return "Dog"
        return None

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``Type`` (and ``Breed1``) corrected."""
        X = X.copy()
        # Hoisted out of the loop: which topicality column(s) this frame carries.
        topicality_columns = [
            column for column in ("ImageMetadataMaxPetTopicality",
                                  "ImageMetadataMaxPetTopicality_mean")
            if column in X.columns]
        # Both conditions must hold. The previous ``|`` was true for every row
        # (NaN != '' is True), so the filter never excluded anything.
        has_image_description = (X["ImageMetadataDescription"].notna() &
                                 (X["ImageMetadataDescription"] != ''))

        for index, row in X[has_image_description].iterrows():
            image_description = str(row["ImageMetadataDescription"]).lower()
            if str(row["Type"]).lower() in image_description:
                continue  # image labels agree with the declared type
            if not any(row[column] > 0 for column in topicality_columns):
                continue  # no pet recognised in the image: nothing to contrast

            type_from_breed = self.breeds_file.loc[
                self.breeds_file["BreedName"] == row["Breed1"], "Type"].values[0]

            new_type = self._type_from_text(row)
            if new_type is None:
                # Get the Type from the external breeds file
                if type_from_breed == row["Type"]:
                    # NOTE(review): when breed and declared type agree, the type is
                    # still forced to "Cat"; behaviour kept as is -- confirm intent.
                    new_type = "Cat"
                else:
                    new_type = type_from_breed
            X.loc[index, "Type"] = new_type

            # Keep Breed1 coherent with the (possibly corrected) type.
            if type_from_breed != new_type:
                X.loc[index, "Breed1"] = self.most_common_breed[new_type]

        return X

In [None]:
# Pipeline 3: pipeline 2 plus the profile image metadata and the correction of
# wrongly declared pet types.

# Columns passed to the 'drop_columns' step below.
columns_to_be_removed_3 = ["Name", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                         "Vaccinated", "Dewormed", "Sterilized", "State", "RescuerID", "PetID",
                         "MaturitySize", "FurLength", "Health", "DescriptionLanguage",
                         "Description", "ImageMetadataDescription"]

# Columns passed to the final CustomStandardScaler step.
numeric_columns_3 = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt", "StateGDP", "RescuerCount",
                   "DescriptionLength"]

pipeline_3_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    # Steps introduced by this pipeline; they run before the breed encoding because
    # CorrectWrongType may rewrite both Type and Breed1.
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_3)),
    ('impute_malay_desc', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_3))
]

# Smoke run: fit and apply the transformers (no model) on X, y with warnings
# silenced, then display the resulting feature table.
pipeline = Pipeline(steps=pipeline_3_transformers)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = pipeline.fit_transform(X,y)
display(result)

In [None]:
# Evaluate pipeline 3 with four baseline classifiers: cross-validation
# (evaluate_model) plus a single train/validation split
# (evaluate_model_single_split).
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

evaluation_columns_3 = ["Pipeline", "Model", "Average fit time", "Average accuracy",
                        "Average QWK", "Single split accuracy", "Single split QWK"]
global_evaluation_results_3 = pd.DataFrame([], columns=evaluation_columns_3)
# Rows are collected in a list and the table is rebuilt from it:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
evaluation_rows_3 = []

for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_3_transformers + [('model', model)]),
        cv, X, y, model_type="classification")

    single_accuracy, single_QWK = evaluate_model_single_split(
        Pipeline(steps=pipeline_3_transformers + [('model', model)]),
        X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
        model_type="classification", display_results=True)

    evaluation_rows_3.append({
        "Pipeline": 3,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })
    # Refresh the table every iteration so partial results survive an interrupted run.
    global_evaluation_results_3 = pd.DataFrame(evaluation_rows_3,
                                               columns=evaluation_columns_3)

## Adding profile image properties

In [None]:
# The commented-out lines below record how the CSV was produced from the raw JSON
# profile-image properties (kept for reference only; they are not executed).
# path = '../input/tfg-pet-adoption-data/train_profile_images_properties.json'
# profile_image_properties_json = utils.load_json(path)
# profile_image_properties = pd.DataFrame.from_dict(profile_image_properties_json, orient='index')
# profile_image_properties.rename(columns={
#     "dullness": "ImagePropertyDullness_num",
#     "whiteness": "ImagePropertyWhiteness_num",
#     "blurrness": "ImagePropertyBlurrness_num",
#     "size": "ImagePropertySize_num",
#     "width": "ImagePropertyWidth_num",
#     "height": "ImagePropertyHeight_num"
# }, inplace=True)
# profile_image_properties.to_csv("train_profile_images_properties.csv")

# Load the pre-computed properties of each pet's profile image; the first CSV column
# becomes the index (presumably the PetID -- verify against the CSV).
profile_image_properties = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_profile_images_properties.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
profile_image_properties

In [None]:
# The commented-out lines below record how the CSV was produced from the raw JSON
# properties of every image, including the aspect ratio computed from the width and
# height rounded to the nearest hundred (kept for reference only; not executed).
# path = '../input/tfg-pet-adoption-data/all_train_images_properties.json'
# all_images_properties_json = utils.load_json(path)
# all_images_properties = pd.DataFrame.from_dict(all_images_properties_json, orient='index')
# all_images_properties.drop(["PetID"], axis=1, inplace=True)
# all_images_properties.rename(columns={
#     "dullness": "ImagePropertyDullness",
#     "whiteness": "ImagePropertyWhiteness",
#     "blurrness": "ImagePropertyBlurrness",
#     "size": "ImagePropertySize",
#     "width": "ImagePropertyWidth",
#     "height": "ImagePropertyHeight"
# }, inplace=True)
# rounded_width = round(all_images_properties["ImagePropertyWidth"]/100)*100
# rounded_height = round(all_images_properties["ImagePropertyHeight"]/100)*100
# all_images_properties["ImagePropertyAspectRatio"] = np.divide(rounded_width, rounded_height, out=np.zeros_like(rounded_width), where=rounded_height!=0)
# all_images_properties.to_csv("all_train_images_properties.csv")

# Load the per-image properties (one row per image); the first CSV column becomes the
# index (presumably the image file name "<PetID>-<n>" -- verify against the CSV).
all_images_properties = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_properties.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
all_images_properties

In [None]:
def aggregate_images_properties(all_images_properties, type_data="train"):
    """Collapse per-image properties into one row per pet and save it as CSV.

    The index of ``all_images_properties`` holds image file names of the form
    ``<PetID>-<n>``; everything before the last ``'-'`` is taken as the PetID.
    Every column is aggregated per pet with mean, sum and variance, and the
    missing results are replaced by 0.

    The result is written to ``all_{type_data}_images_properties_agg.csv`` in
    the working directory and also returned. The input frame is not modified.
    """
    def _pet_id_from_filename(filename):
        text = str(filename)
        return text[:text.rindex('-')]  # filename to PetID

    def _flat_name(column_pair):
        base, statistic = column_pair
        return f"{base}_{statistic}" if statistic != "" else f"{base}"

    per_image = all_images_properties.reset_index().rename(columns={"index": "PetID"})
    per_image["PetID"] = per_image["PetID"].apply(_pet_id_from_filename)

    per_pet = per_image.groupby(["PetID"]).agg(["mean", "sum", "var"])
    per_pet.columns = [_flat_name(pair) for pair in per_pet.columns.to_flat_index()]
    per_pet.fillna(0, inplace=True)

    per_pet.to_csv(f"all_{type_data}_images_properties_agg.csv")

    return per_pet

In [None]:
# The aggregation was run once offline (commented-out call below); here the saved
# per-pet result is loaded from CSV, with the first CSV column as the index.
# all_images_properties_agg = aggregate_images_properties(all_images_properties)
all_images_properties_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_properties_agg.csv",
    index_col=0)
# Trailing expression: displayed as the cell output in the notebook.
all_images_properties_agg

In [None]:
class IncludeProfileImageProperties(BaseEstimator, TransformerMixin):
    """Left-join image properties onto the data by ``PetID``.

    Parameters
    ----------
    profile_image_properties : pandas.DataFrame
        Properties of the profile image only; its index is matched against the
        ``PetID`` column. Used when ``aggregated_properties`` is False.
    aggregated_images_properties : pandas.DataFrame, optional
        Properties aggregated over all the images of each pet, indexed the same
        way. Used (and required) when ``aggregated_properties`` is True.
    aggregated_properties : bool, default=False
        Selects which of the two frames is merged.
    """

    def __init__(self, profile_image_properties,
                 aggregated_images_properties=None,
                 aggregated_properties=False):
        self.profile_image_properties = profile_image_properties
        self.aggregated_images_properties = aggregated_images_properties
        self.aggregated_properties = aggregated_properties

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional so ``fit(X)`` and
        ``fit_transform(X)`` work outside a supervised pipeline."""
        return self

    @staticmethod
    def _property_columns(columns, keywords):
        """Return the ImageProperty columns whose name contains any keyword."""
        # ``str(...)`` keeps the substring test safe for non-string column labels.
        return [column for column in columns
                if "ImageProperty" in str(column)
                and any(keyword in str(column) for keyword in keywords)]

    def transform(self, X, y=None):
        """Return a new frame with the image property columns appended to ``X``."""
        if self.aggregated_properties:
            properties = self.aggregated_images_properties
        else:
            properties = self.profile_image_properties
        # ``merge`` already returns a new frame, so ``X`` is never modified.
        X = X.merge(properties, left_index=False, right_index=True,
                    left_on="PetID", how="left")
        columns_nan = self._property_columns(
            X.columns, ("Dullness", "Whiteness", "Blurrness"))
        columns_zero = self._property_columns(
            X.columns, ("Size", "Width", "Height", "AspectRatio"))
        without_photos = X["PhotoAmt"] == 0
        # Quality measures of a non-existent photo are unknown (left to the
        # imputer); its dimensions are simply zero.
        X.loc[without_photos, columns_nan] = np.nan
        X.loc[without_photos, columns_zero] = 0
        return X

In [None]:
def include_aspect_ratio(X):
    """Return a copy of ``X`` with the aspect ratio of the profile image.

    Width and height are first rounded to the nearest hundred; the ratio
    width / height is stored in ``ImagePropertyAspectRatio_num`` and is 0
    where the rounded height is 0. Nothing is added when the frame already
    carries the aggregated ``ImagePropertyAspectRatio_mean`` column.
    """
    result = X.copy()
    if "ImagePropertyAspectRatio_mean" in result.columns:
        return result

    widths = ((result["ImagePropertyWidth_num"] / 100).round() * 100).to_numpy()
    heights = ((result["ImagePropertyHeight_num"] / 100).round() * 100).to_numpy()
    ratios = np.zeros_like(widths)
    # Divide only where it is defined; the remaining positions keep the 0 default.
    np.divide(widths, heights, out=ratios, where=heights != 0)
    result["ImagePropertyAspectRatio_num"] = ratios
    return result

In [None]:
# Pipeline 4: pipeline 3 plus the profile image properties and the aspect ratio
# derived from them.

# Columns passed to the 'drop_columns' step below.
columns_to_be_removed_4 = ["Name", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                         "Vaccinated", "Dewormed", "Sterilized", "State", "RescuerID", "PetID",
                         "MaturitySize", "FurLength", "Health", "DescriptionLanguage",
                         "Description", "ImageMetadataDescription"]

# Columns passed to the final CustomStandardScaler step.
numeric_columns_4 = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt", "StateGDP", "RescuerCount",
                   "DescriptionLength"]
    
pipeline_4_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    # Step introduced by this pipeline; it merges on PetID, so it must run before
    # 'drop_columns' removes that column.
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_4)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    # The imputer now also fills the image properties set to NaN for pets without photos.
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_4))
]

# Smoke run: fit and apply the transformers (no model) on X, y with warnings
# silenced, then display the resulting feature table.
pipeline = Pipeline(steps=pipeline_4_transformers)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = pipeline.fit_transform(X,y)
display(result)

In [None]:
# Evaluate pipeline 4 with four baseline classifiers: cross-validation
# (evaluate_model) plus a single train/validation split
# (evaluate_model_single_split).
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

evaluation_columns_4 = ["Pipeline", "Model", "Average fit time", "Average accuracy",
                        "Average QWK", "Single split accuracy", "Single split QWK"]
global_evaluation_results_4 = pd.DataFrame([], columns=evaluation_columns_4)
# Rows are collected in a list and the table is rebuilt from it:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
evaluation_rows_4 = []

for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_4_transformers + [('model', model)]),
        cv, X, y, model_type="classification")

    single_accuracy, single_QWK = evaluate_model_single_split(
        Pipeline(steps=pipeline_4_transformers + [('model', model)]),
        X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
        model_type="classification", display_results=True)

    evaluation_rows_4.append({
        "Pipeline": 4,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })
    # Refresh the table every iteration so partial results survive an interrupted run.
    global_evaluation_results_4 = pd.DataFrame(evaluation_rows_4,
                                               columns=evaluation_columns_4)

In [None]:
# Show the evaluation tables of the four pipelines one after another for comparison.
display(global_evaluation_results_1)
display(global_evaluation_results_2)
display(global_evaluation_results_3)
display(global_evaluation_results_4)

In [None]:
# _, ax = plt.subplots(figsize=(15,15))
# xgb.plot_importance(model, ax=ax)

https://stats.stackexchange.com/questions/421545/multiple-imputation-by-chained-equations-mice-explained#:~:text=MICE%20is%20a%20multiple%20imputation,are%20missing%20completely%20at%20random).

https://cran.r-project.org/web/packages/miceRanger/vignettes/miceAlgorithm.html

https://contrib.scikit-learn.org/category_encoders/targetencoder.html (https://contrib.scikit-learn.org/category_encoders/_modules/category_encoders/target_encoder.html#TargetEncoder.fit) and https://dl.acm.org/citation.cfm?id=507538

https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps

## Extracting features from the images using CNN

**Base source code to extract image features from: https://www.kaggle.com/christofhenkel/extract-image-features-from-pretrained-nn**

In [None]:
import cv2
import keras.backend as K
import shutil
import tensorflow as tf

from IPython.display import Image

from keras.applications import DenseNet121, DenseNet201, InceptionResNetV2, NASNetLarge, VGG16, Xception
from keras.applications.densenet import preprocess_input as preprocess_input_densenet
from keras.applications.inception_resnet_v2 import preprocess_input as preprocess_input_inceptionresnetv2
from keras.applications.nasnet import preprocess_input as preprocess_input_nasnet
from keras.applications.vgg16 import preprocess_input as preprocess_input_vgg16
from keras.applications.xception import preprocess_input as preprocess_input_xception
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Add, Average, AveragePooling1D, Concatenate, Dense, Dropout, GlobalAveragePooling2D, Input, Lambda, Maximum
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.image import array_to_img, ImageDataGenerator
from keras.utils import to_categorical, plot_model

from sklearn.model_selection import train_test_split
# from sklearn.utils.validation import check_is_fitted

!pip install tensorflow-addons
from tensorflow_addons.metrics import CohenKappa as cohen_kappa_keras

In [None]:
class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    """Transformer that obtains image features per pet and merges them into X.

    The features can come from four sources, tried in this order by
    ``select_method_and_extract``:

    1. ``loaded_features``: an already loaded DataFrame of features.
    2. A model built in ``fit`` from ``cnn_backbone`` (when
       ``construct_from_cnn_backbone`` is True), fed with images.
    3. A given ``model``, fed with images (``from_image`` truthy) or with the
       rows of ``input_features_file`` (``from_image`` falsy).
    4. The CSV file ``input_features_file`` read as-is.

    Optionally the features are reduced with ``TruncatedSVD``, saved to CSV,
    aggregated per pet (mean, sum, var) when several photos per pet are used,
    and merged into ``X`` as ``img_*`` columns.

    Parameters
    ----------
    construct_from_cnn_backbone : bool or None
        If True, ``fit`` builds ``self.model`` from ``cnn_backbone``. ``fit``
        raises ``ValueError`` if it is None.
    from_image : bool or None
        With a given ``model``, whether its inputs are images or feature rows.
    cnn_backbone : callable or None
        Called as ``cnn_backbone(input_tensor=..., include_top=False,
        weights=...)``; its ``__name__`` is used in saved file names.
    images_directory : str or None
        Directory holding the images, named ``{PetID}-{photo number}.jpg``.
    preprocess_input : callable or None
        Applied to each RGB float32 image before prediction.
    img_size : int or None
        Side length of the square images fed to the model.
    average : bool
        If True, an ``AveragePooling1D(4)`` is applied over the pooled features.
    cnn_backbone_weights_file : str or None
        Passed as ``weights`` to the backbone; ``"imagenet"`` is used if None.
    input_features_file : str or None
        CSV file of features whose first column is the index.
    model : object or None
        Object exposing ``predict``; overwritten by ``create_model``.
    save : bool
        If True, ``transform`` writes the resulting features to CSV.
    model_name : str or None
        Name used in the saved file name when ``model`` is given.
    svd_n_components : int or None
        If not None, number of ``TruncatedSVD`` components fitted in ``fit``.
    include_feats : bool
        If True, the features are merged into ``X`` with an ``img_`` prefix.
    loaded_features : pandas.DataFrame or None
        Previously extracted features, indexed by PetID or by file name.
    multiple_instances_per_petid : bool
        If True, all photos of each pet (``PhotoAmt``) are used and the
        features are aggregated per pet.
    use_utils_only : bool
        If True, ``fit`` and ``transform`` raise ``RuntimeError``.
    debug : bool
        If True, prints information about the features.
    """

    def __init__(self, construct_from_cnn_backbone=None, from_image=None, cnn_backbone=None,
                 images_directory=None, preprocess_input=None, img_size=None, average=False,
                 cnn_backbone_weights_file=None, input_features_file=None, model=None,
                 save=False, model_name=None, svd_n_components=None, include_feats=True,
                 loaded_features=None, multiple_instances_per_petid=False,
                 use_utils_only=False, debug=False):
        self.construct_from_cnn_backbone = construct_from_cnn_backbone
        self.from_image = from_image
        self.cnn_backbone = cnn_backbone
        self.images_directory = images_directory
        self.preprocess_input = preprocess_input
        self.img_size = img_size
        self.average = average
        self.cnn_backbone_weights_file = cnn_backbone_weights_file
        self.input_features_file = input_features_file
        self.model = model
        self.save = save
        self.model_name = model_name
        self.svd_n_components = svd_n_components
        self.include_feats = include_feats
        self.loaded_features = loaded_features
        self.multiple_instances_per_petid = multiple_instances_per_petid
        self.use_utils_only = use_utils_only
        self.debug = debug

    def resize_to_square(self, im):
        """Resize ``im`` so its longest side is ``img_size`` and pad it with black.

        The aspect ratio is kept; the padding is split between both sides so
        the result is an ``img_size`` x ``img_size`` image.
        """
        old_size = im.shape[:2]  # old_size is in (height, width) format
        ratio = float(self.img_size) / max(old_size)
        new_size = tuple(int(x * ratio) for x in old_size)
        # cv2.resize expects the new size in (width, height) format
        im = cv2.resize(im, (new_size[1], new_size[0]))
        delta_w = self.img_size - new_size[1]
        delta_h = self.img_size - new_size[0]
        top, bottom = delta_h // 2, delta_h - (delta_h // 2)
        left, right = delta_w // 2, delta_w - (delta_w // 2)
        color = [0, 0, 0]
        return cv2.copyMakeBorder(im, top, bottom, left, right,
                                  cv2.BORDER_CONSTANT, value=color)

    def load_image(self, pet_id, num_photo=None):
        """Load, square, convert to RGB float32 and preprocess one image.

        ``num_photo`` defaults to the first photo of the pet.

        Raises
        ------
        FileNotFoundError
            If the image file cannot be read.
        """
        path = f'{self.images_directory}/{pet_id}-{1 if num_photo is None else num_photo}.jpg'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise FileNotFoundError(f"Could not read image '{path}'")
        new_image = self.resize_to_square(image)
        new_image = new_image[..., ::-1].astype(np.float32)  # BGR to RGB + uint8 to float32
        return self.preprocess_input(new_image)

    def create_model(self):
        """Build ``self.model``: backbone + global average pooling (+ 1D pooling)."""
        inp = Input((self.img_size, self.img_size, 3))
        if self.cnn_backbone_weights_file is not None:
            weights = self.cnn_backbone_weights_file
        else:
            weights = "imagenet"
        backbone = self.cnn_backbone(input_tensor=inp, include_top=False, weights=weights)
        x = backbone.output
        x = GlobalAveragePooling2D()(x)
        if self.average:
            # Average every 4 consecutive features of the pooled vector
            x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)
            x = AveragePooling1D(4)(x)
            out = Lambda(lambda x: x[:, :, 0])(x)
        else:
            out = x

        self.model = Model(inp, out)

    def get_input_features(self, pet_id, num_photo=None):
        """Return the row of ``input_features_df`` for a pet (or one of its photos)."""
        return self.input_features_df.loc[f"{pet_id}{'-'+str(num_photo)+'.jpg' if num_photo is not None else ''}", :]

    def extract(self, pet_ids_or_filenames, input_type):
        """Run ``self.model.predict`` in batches and return the predictions.

        Parameters
        ----------
        pet_ids_or_filenames : sequence of str
            PetIDs, or file names ``{PetID}-{n}.jpg`` when
            ``multiple_instances_per_petid`` is True.
        input_type : {"image", "features"}
            Whether the model inputs are images or rows of
            ``input_features_file``.

        Returns
        -------
        pandas.DataFrame
            One row per element of ``pet_ids_or_filenames``, indexed by it.
            Inputs that could not be loaded are predicted from all zeros.

        Raises
        ------
        ValueError
            If ``input_type`` is not one of the supported values.
        """
        if input_type == "features":
            self.input_features_df = pd.read_csv(self.input_features_file, index_col=0)
            get_input = self.get_input_features
            sample_shape = (self.input_features_df.shape[1],)
        elif input_type == "image":
            get_input = self.load_image
            sample_shape = (self.img_size, self.img_size, 3)
        else:
            raise ValueError(f"Unknown input_type '{input_type}'; expected 'image' or 'features'")

        batch_size = 16
        # Ceiling division: no empty trailing batch when the number of inputs
        # is a multiple of batch_size (or zero)
        n_batches = math.ceil(len(pet_ids_or_filenames) / batch_size)
        features = {}
        n_failed = 0
        for b in tqdm(range(n_batches)):
            start = b * batch_size
            end = (b + 1) * batch_size
            batch_pets = pet_ids_or_filenames[start:end]
            batch_inputs = np.zeros((len(batch_pets),) + sample_shape)

            for i, pet_id_or_filename in enumerate(batch_pets):
                try:
                    if self.multiple_instances_per_petid:
                        split_index = pet_id_or_filename.rindex('-')
                        pet_id = pet_id_or_filename[:split_index]
                        num_photo = pet_id_or_filename[split_index + 1:-4]  # -4 to not include .jpg
                        batch_inputs[i] = get_input(pet_id, num_photo)
                    else:
                        batch_inputs[i] = get_input(pet_id_or_filename)
                except Exception:
                    # Best effort: an input that cannot be loaded (e.g. a pet
                    # without photos) is left as zeros
                    n_failed += 1

            batch_preds = self.model.predict(batch_inputs)
            for i, pet_id_or_filename in enumerate(batch_pets):
                features[pet_id_or_filename] = batch_preds[i]

        if self.debug and n_failed:
            print(f"{n_failed} inputs could not be loaded and were replaced by zeros")

        return pd.DataFrame.from_dict(features, orient='index')

    def select_method_and_extract(self, pet_ids_or_filenames):
        """Return the features DataFrame from the first available source."""
        if self.loaded_features is not None:
            features_df = self.loaded_features.copy()
        # Only the above option is recommended when this method is called from self.fit and
        # we are doing cross validation, grid search, etc. The options below should only be
        # executed if we have not extracted and saved features before
        elif self.construct_from_cnn_backbone:
            features_df = self.extract(pet_ids_or_filenames, input_type="image")
        elif self.model is not None:
            if self.from_image:
                features_df = self.extract(pet_ids_or_filenames, input_type="image")
            else:
                features_df = self.extract(pet_ids_or_filenames, input_type="features")
        else:
            features_df = pd.read_csv(self.input_features_file, index_col=0)

        return features_df

    def get_filenames(self, pet_ids, photo_amts):
        """Return ``{PetID}-{n}.jpg`` for every photo of every pet.

        The first photo is always included, even if the amount is 0.
        """
        filenames = []
        for pet_id, photo_amt in zip(pet_ids, photo_amts):
            filenames.append(f"{pet_id}-1.jpg")
            for i in range(2, int(photo_amt) + 1):
                filenames.append(f"{pet_id}-{i}.jpg")
        return filenames

    def fit(self, X, y=None):
        """Validate the configuration, build the model and fit the SVD if requested.

        Raises
        ------
        RuntimeError
            If ``use_utils_only`` is True.
        ValueError
            If the parameters given are not enough for the selected mode.
        """
        if self.use_utils_only:
            raise RuntimeError("Cannot fit transformer as only its utils can be used")

        if self.construct_from_cnn_backbone is None:
            raise ValueError("'construct_from_cnn_backbone' cannot be None if this "
                             "transformer is not used just for its utils")
        if self.construct_from_cnn_backbone:
            if self.cnn_backbone is None or self.images_directory is None \
                    or self.preprocess_input is None or self.img_size is None:
                raise ValueError("Not enough parameters given that "
                                 "'construct_from_cnn_backbone' is True")
        else:
            if self.model is None and (self.input_features_file is None and
                                       self.loaded_features is None):
                raise ValueError("If you don't specify 'model', either 'input_features_file' "
                                 "or 'loaded_features' must be passed as parameter")
            if self.from_image and (self.model is None or self.images_directory is None
                                    or self.preprocess_input is None or self.img_size is None):
                raise ValueError("Not enough parameters given that "
                                 "'construct_from_cnn_backbone' is False and 'from_image' is True")
            if self.model is not None and self.input_features_file is None and not self.from_image:
                raise ValueError("'input_features_file' cannot be None given that "
                                 "'construct_from_cnn_backbone' is False and 'from_image' is False")
            if self.save and self.model is not None and self.model_name is None:
                raise ValueError("You have to specify the name of the model if you want to store "
                                 "the features")

        if self.construct_from_cnn_backbone:
            self.create_model()

        if self.svd_n_components is not None:
            self.svd = TruncatedSVD(n_components=self.svd_n_components,
                                    random_state=seed)
            pet_ids = X["PetID"].values

            if self.multiple_instances_per_petid:
                photo_amts = X["PhotoAmt"].values
                pet_ids_or_filenames = self.get_filenames(pet_ids, photo_amts)
            else:
                pet_ids_or_filenames = pet_ids

            features_df = self.select_method_and_extract(pet_ids_or_filenames)
            # Fit only on the rows that belong to X
            self.svd.fit(features_df.loc[pet_ids_or_filenames, :])

        return self

    def save_features(self, features_df, prefix=None):
        """Write ``features_df`` to a CSV file whose name describes its origin.

        Errors raised while writing are printed instead of propagated. If no
        file name can be derived from the configuration, nothing is saved and
        a message is printed.
        """
        svd_str = f"_svd-{self.svd.components_.shape[0]}" if self.svd_n_components is not None else ""
        prefix_str = f"{prefix}_" if prefix is not None else ""
        if self.construct_from_cnn_backbone:
            average_str = "_avg" if self.average else ""
            new_filename = f'{prefix_str}image_features_{self.cnn_backbone.__name__}_in-{self.img_size}{average_str}{svd_str}.csv'
        elif self.model is not None:
            new_filename = f'{prefix_str}image_features_{self.model_name}_in-{self.img_size}{svd_str}.csv'
        elif svd_str != "" and self.input_features_file is not None:
            # Base name of the input file (without extension) + SVD suffix
            base_name = os.path.splitext(os.path.basename(self.input_features_file))[0]
            new_filename = f'{base_name}{svd_str}.csv'
        else:
            print("Image features not saved: no file name can be derived from the given parameters")
            return

        try:
            features_df.to_csv(new_filename)
            if self.debug:
                print(f"Image features file {new_filename} succesfully saved")
        except Exception as e:
            print(e)

    def transform(self, X, y=None):
        """Return a copy of ``X``, with the image features merged if requested.

        Raises
        ------
        RuntimeError
            If ``use_utils_only`` is True.
        """
        if self.use_utils_only:
            raise RuntimeError("Cannot use transformer as only its utils can be used")
        X = X.copy()
        pet_ids = X["PetID"].values

        if self.multiple_instances_per_petid:
            photo_amts = X["PhotoAmt"].values
            pet_ids_or_filenames = self.get_filenames(pet_ids, photo_amts)
        else:
            pet_ids_or_filenames = pet_ids

        features_df = self.select_method_and_extract(pet_ids_or_filenames)

        if self.svd_n_components is not None and features_df is not None:
            features_df = pd.DataFrame(self.svd.transform(features_df.loc[pet_ids_or_filenames, :]),
                                       index=pet_ids_or_filenames)

        if self.debug:
            print(f"Number of image features: {features_df.shape[1]}\n")
            display(features_df.head())

        if self.multiple_instances_per_petid:
            if self.save:
                self.save_features(features_df, prefix="ALL")
            # Collapse the per-photo rows into one row per pet
            features_df = features_df.reset_index()
            features_df.rename(columns={"index": "PetID"}, inplace=True)
            features_df["PetID"] = features_df["PetID"].apply(lambda x: str(x)[:str(x).rindex('-')])  # filename to PetID
            features_df = features_df.groupby(["PetID"]).agg(["mean", "sum", "var"])
            features_df.columns = [f"{x}_{y}" if y != "" else f"{x}" for x, y in features_df.columns.to_flat_index()]
            # NaN values produced by the aggregation are replaced by 0
            features_df.fillna(0, inplace=True)
            if self.debug:
                print("Aggregated features:\n")
                display(features_df.head())
            if self.save:
                self.save_features(features_df, prefix="AGGREGATED")
        elif self.save:
            self.save_features(features_df)

        if self.include_feats:
            features_df.rename(lambda x: f"img_{x}", axis=1, inplace=True)
            X = X.merge(features_df, left_index=False, right_index=True,
                        left_on="PetID", how="left")

        return X

**The code to load an image from https://www.kaggle.com/christofhenkel/extract-image-features-from-pretrained-nn is not correct, since cv2 loads images in BGR format but preprocess_input expects them to be in RGB, plus the type should be float32 and cv2 loads uint8. See https://github.com/keras-team/keras/blob/be4cef42ab21d85398fb6930ec5419a3de8a7d71/keras/applications/densenet.py#L364, https://github.com/keras-team/keras/blob/be4cef42ab21d85398fb6930ec5419a3de8a7d71/keras/applications/imagenet_utils.py#L166, https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/img_to_array, https://stackoverflow.com/questions/50746096/how-to-match-cv2-imread-to-the-keras-image-img-load-output, https://github.com/keras-team/keras/issues/10279 and https://stackoverflow.com/questions/48677128/what-is-the-right-way-to-preprocess-images-in-keras-while-fine-tuning-pre-traine.**

In [None]:
# This extractor is only used here for its images directory and its square-resize helper
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=DenseNet121,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
)

# PetIDs of the 5 profile images with the greatest aspect ratio
by_aspect_ratio_desc = result.sort_values('ImagePropertyAspectRatio_num', ascending=False)
greatest_aspect_ratios_ids = X.loc[by_aspect_ratio_desc.head(5).index, "PetID"].values

# PetIDs of the 5 profile images with the smallest aspect ratio, leaving out
# the rows that have the minimum amount of photos
above_min_photo_amt = result.loc[result["PhotoAmt"] > result["PhotoAmt"].min()]
by_aspect_ratio_asc = above_min_photo_amt.sort_values('ImagePropertyAspectRatio_num')
smallest_aspect_ratios_ids = X.loc[by_aspect_ratio_asc.head(5).index, "PetID"].values

# One row of 5 resized images per group
for group_ids, group_title in (
        (greatest_aspect_ratios_ids, "Resized images that had the greatest aspect ratios"),
        (smallest_aspect_ratios_ids, "Resized images that had the smallest aspect ratios")):
    _, ax = plt.subplots(nrows=1, ncols=5, figsize=(20,4))
    ax = ax.flatten()
    for i, pet_id in enumerate(group_ids):
        image = cv2.imread(f'{ife.images_directory}/{pet_id}-1.jpg')
        new_image = ife.resize_to_square(image)
        new_image = new_image[...,::-1] # BGR to RGB
        ax[i].set_xticks([])
        ax[i].set_yticks([])
        ax[i].imshow(new_image)

    plt.suptitle(group_title, fontsize=20)
    plt.show()

In [None]:
# Ask TensorFlow for the GPUs it can see and report when the list is empty
physical_devices = tf.config.list_physical_devices("GPU")

if len(physical_devices) == 0:
    print("No GPU devices are used in the host.")

In [None]:
# Show at most 32 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 32)

In [None]:
def to_latex(df, filename, groupby_keys=[]):
    df.index += 1
    if len(groupby_keys) > 0:
        df = df.groupby(groupby_keys).mean().reset_index()
    df.to_latex(f"{filename}.tex", index=False, column_format='c'*df.shape[1])
    tex_file = []
    with open(f"{filename}.tex", 'r') as f:
        tex_file = f.readlines()
    with open(f"{filename}.tex", 'w') as f:
        f.writelines(["\\begin{table}[H]\n", "\\centering\n"] + tex_file + ["\\caption{}\n", "\\end{table}\n"])

In [None]:
def plot_history(results, variables, nrows, ncols, figsize):
    """Plot the train and validation curves of each variable, one per subplot.

    ``results`` must have, for every name in ``variables``, a column with that
    name and another one prefixed by ``val_``; each row is one epoch. Subplots
    of the grid that are left without a variable are hidden.
    """
    plt.style.use('default')
    _, grid = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    panels = [grid] if nrows*ncols <= 1 else grid.flatten()
    epochs = list(range(1, results.shape[0] + 1))
    for position, panel in enumerate(panels):
        if position >= len(variables):
            # No variable left for this subplot
            panel.set_axis_off()
            continue
        metric = variables[position]
        panel.plot(epochs, results[metric])
        panel.plot(epochs, results[f'val_{metric}'])
        panel.set_title(f'Model {metric}')
        panel.set_ylabel(metric)
        panel.set_xlabel('epoch')
        panel.legend(['train', 'val'], loc='upper left')
    plt.show()

### DenseNet121

(run of Version 34 of the cell below was done with the BGR to RGB modification)

In [None]:
# Extract and save the DenseNet121 features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=DenseNet121,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

In [None]:
# Features that were extracted with the images still in BGR format
image_features_densenet121_bgr = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_in-256_BGR.csv",
    index_col=0,
)
image_features_densenet121_bgr.head()

By simply looking at some of the values of the two previous dataframes, we can see that having changed the input format to `preprocess_input` from BGR to RGB has had some effect.

### DenseNet201

In [None]:
# Extract and save the DenseNet201 features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=DenseNet201,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

### InceptionResNetV2

In [None]:
# Extract and save the InceptionResNetV2 features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=InceptionResNetV2,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_inceptionresnetv2,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

### NASNetLarge

In [None]:
# Extract and save the NASNetLarge features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=NASNetLarge,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_nasnet,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

### VGG16

In [None]:
# Extract and save the VGG16 features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=VGG16,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_vgg16,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

### Xception

In [None]:
# Extract and save the Xception features of the profile images
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=Xception,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_xception,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

# The features are not merged into X (include_feats=False), so the result is discarded
_ = ife.fit_transform(X, y)

**If we take a look at the distributions of some of the extracted features, we can see that DenseNets give us normally distributed variables, while the rest are right skewed...**

As the number of features that we get from each CNN is different, we will apply SVD to reduce the number of features and also to determine which pretrained CNN we should focus on.

In [None]:
# Previously saved features of every pretrained CNN, keyed by the CNN name
features_dfs = {}
for model in [DenseNet121, DenseNet201, InceptionResNetV2, NASNetLarge, VGG16, Xception]:
    features_csv = f"../input/tfg-pet-adoption-data/image_features_{model.__name__}_in-256.csv"
    features_dfs[model.__name__] = pd.read_csv(features_csv, index_col=0)

Let's also define the columns to be removed and the base numeric columns that will be passed to the corresponding transformers in the pipelines that we will use to evaluate the effect of the extracted image features along with the previous data:

In [None]:
# Columns dropped by the 'drop_columns' step; "PetID" is kept at that point
# because it is removed by a later, separate step
columns_to_be_removed_image_feats_eval = [
    "Name",
    "Breed1", "Breed2",
    "Gender",
    "Color1", "Color2", "Color3",
    "Vaccinated", "Dewormed", "Sterilized",
    "State",
    "RescuerID", #"PetID",
    "MaturitySize", "FurLength", "Health",
    "DescriptionLanguage", "Description",
    "ImageMetadataDescription",
]

# Base numeric columns passed to the scaler
numeric_columns_image_feats_eval = [
    "Age", "Quantity", "Fee",
    "VideoAmt", "PhotoAmt",
    "StateGDP", "RescuerCount", "DescriptionLength",
]

In [None]:
# Preprocessing steps shared by every evaluation of this cell. The `None` entry
# (4th from the end) is a placeholder that is replaced by the image feature
# extractor in each iteration of the loop below.
pipeline_transformers = [
    ('replace_breeds',
     LeftJoinReplace(values_dict=breeds_dict, variables=["Breed1", "Breed2"])),
    ('replace_colors',
     LeftJoinReplace(values_dict=colors_dict, variables=["Color1", "Color2", "Color3"])),
    ('replace_states',
     LeftJoinReplace(values_dict=states_dict, variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder',
     OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                            enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer',
     CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                       photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata',
     IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder',
     CustomOneHotEncoder(columns=["Gender", "Color1", "Color2", "Color3", "Vaccinated",
                                  "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval)),
]

evaluation_results = pd.DataFrame(
    [],
    columns=["CNN", "SVD-n_components", "Average fit time", "Average accuracy",
             "Average QWK"])

# Evaluate an XGBClassifier for every pretrained CNN and number of SVD components
for cnn in [DenseNet121, DenseNet201, InceptionResNetV2, NASNetLarge, VGG16, Xception]:
    for svd_n_components in [16, 32, 64]:
        # The features come from the previously loaded tables, not from the images
        ife = ImageFeatureExtractor(
            construct_from_cnn_backbone=False,
            loaded_features=features_dfs[cnn.__name__],
            svd_n_components=svd_n_components,
        )
        pipeline_transformers[-4] = ('image_features_extractor', ife)

        model = xgb.XGBClassifier(
            eval_metric='mlogloss',
            random_state=seed,
            n_jobs=-1,
            use_label_encoder=False,
        )
        print(f"\n\n*************** XGBClassifier with image features from {cnn.__name__}, SVD {svd_n_components} ***************")
        full_pipeline = Pipeline(steps=pipeline_transformers + [('model', model)])
        avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
            full_pipeline, cv, X, y, model_type="classification", display_results=False)

        results_row = {
            "CNN": cnn.__name__,
            "SVD-n_components": svd_n_components,
            "Average fit time": avg_fit_time,
            "Average accuracy": avg_accuracy,
            "Average QWK": avg_QWK,
        }
        evaluation_results = evaluation_results.append(results_row, ignore_index=True)

display(evaluation_results)

In [None]:
# Mean of the results of each CNN over the evaluated numbers of SVD components
evaluation_results.groupby("CNN").mean()

We can see that, in terms of both average accuracy and average QWK, the configurations that give the best results are DenseNet121 with 16 SVD components and NASNetLarge with the same number of components. However, as the number of features that NASNetLarge yields is greater than that of DenseNet121, the average fit time is over 60% higher; consequently, and taking into account that the training time (both forward and backward propagation) would be significantly higher due to the complexity of the network, the best option to continue with is DenseNet121.

In [None]:
# Remove the dictionary with the features of every CNN (presumably to release memory)
del features_dfs
# Export the results of the CNN / SVD comparison to a .tex file
to_latex(evaluation_results, "evaluation_results_XGBoostClassifier_varying-CNNs_SVD")

### SVD n_components?

As we saw previously, when we evaluated our model with image features extracted from different pretrained CNNs, the best results were obtained using a smaller number of components when applying SVD (in particular, 16); moreover, more features mean more fitting time. We also saw that using the raw features (without Average Pooling) is not feasible in terms of time (1096 features in total...) and that better results were obtained applying SVD with 16, 32 or 64 components.

Let's see now what is the result of applying SVD with a smaller number of components to the image features:

In [None]:
# Previously saved DenseNet121 features (no extra average pooling)
image_features_densenet121 = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_in-256.csv",
    index_col=0,
)

In [None]:
# Preprocessing steps shared by every evaluation of this cell. The `None` entry
# (4th from the end) is a placeholder that is replaced by the image feature
# extractor in each iteration of the loop below.
pipeline_transformers = [
    ('replace_breeds',
     LeftJoinReplace(values_dict=breeds_dict, variables=["Breed1", "Breed2"])),
    ('replace_colors',
     LeftJoinReplace(values_dict=colors_dict, variables=["Color1", "Color2", "Color3"])),
    ('replace_states',
     LeftJoinReplace(values_dict=states_dict, variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder',
     OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                            enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer',
     CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                       photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata',
     IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder',
     CustomOneHotEncoder(columns=["Gender", "Color1", "Color2", "Color3", "Vaccinated",
                                  "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval)),
]

evaluation_results = pd.DataFrame(
    [],
    columns=["SVD-n_components", "Average fit time", "Average accuracy", "Average QWK"])

# Evaluate an XGBClassifier for each number of SVD components of the DenseNet121 features
for svd_n_components in [4, 8, 12, 16, 20, 24, 28]:
    # The features come from the previously loaded table, not from the images
    ife = ImageFeatureExtractor(
        construct_from_cnn_backbone=False,
        loaded_features=image_features_densenet121,
        svd_n_components=svd_n_components,
    )
    pipeline_transformers[-4] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(
        eval_metric='mlogloss',
        random_state=seed,
        n_jobs=-1,
        use_label_encoder=False,
    )
    print(f"\n\n*************** XGBClassifier with image features from DenseNet121, SVD {svd_n_components} ***************")
    full_pipeline = Pipeline(steps=pipeline_transformers + [('model', model)])
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
        full_pipeline, cv, X, y, model_type="classification", display_results=True,
        display_plots=False)

    results_row = {
        "SVD-n_components": svd_n_components,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
    }
    evaluation_results = evaluation_results.append(results_row, ignore_index=True)


display(evaluation_results)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_varying-SVD-n_components")

It seems that, for the best balance between average accuracy and average QWK, reducing the dimensionality with SVD to 4 or 16 components is the best option.

### Average?

Let's try to apply now an additional layer of Average Pooling with a size of 4 (so instead of 1024 features we get 256), as the author of the base source code originally proposes, and see if it is better than not applying it:

In [None]:
# Previously saved DenseNet121 features obtained with the extra average pooling
image_features_densenet121_avg = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_in-256_avg.csv",
    index_col=0,
)

In [None]:
# Preprocessing steps shared by every run of this experiment. The `None` entry is a
# placeholder: on each iteration it is replaced by the ImageFeatureExtractor under test.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,  # slot for ('image_features_extractor', ife)
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    # Every step needs its own trailing comma: two adjacent tuples without a comma are parsed
    # as a call on the first tuple, which raises "TypeError: 'tuple' object is not callable".
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval)),
]

# Locate the placeholder once instead of relying on a hard-coded negative offset.
image_features_slot = pipeline_transformers.index(None)

evaluation_results = pd.DataFrame([], columns=["SVD-n_components",
                        "Average fit time", "Average accuracy", "Average QWK"])

# Evaluate XGBClassifier for several SVD sizes applied to the loaded image features.
for svd_n_components in [4, 8, 12, 16, 20, 24, 28, 32]:
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=image_features_densenet121_avg,
                                svd_n_components=svd_n_components)

    pipeline_transformers[image_features_slot] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    print(f"\n\n*************** XGBClassifier with image features from DenseNet121, AVG + SVD {svd_n_components} ***************")
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    # One result row per SVD size.
    evaluation_results = evaluation_results.append({"SVD-n_components": svd_n_components,
                        "Average fit time": avg_fit_time, "Average accuracy": avg_accuracy,
                        "Average QWK": avg_QWK}, ignore_index=True)

display(evaluation_results)
to_latex(evaluation_results, "evaluation_results_XGBoostClassifier_DenseNet121__avg__varying-SVD-n_components")

If we look at the obtained results, both for the raw features and for those reduced using SVD, and compare them to those obtained before without applying Average Pooling, we can see that it is better to continue working without applying this last operation, as the results are worse in the average QWK (maybe we are losing information by losing some important dimensions).

### Image size?

Let's try to vary now the input image size, which we will resize every image to, and see whether 256x256, the one we have used so far, is a good option.

In [None]:
# Options that stay the same for every input size; only `img_size` changes per run.
fixed_extractor_options = dict(
    construct_from_cnn_backbone=True,
    cnn_backbone=DenseNet121,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
)

for img_size in (224, 384, 512):
    print(f"\nDenseNet121 output features extracted from {img_size}x{img_size} resized images:")
    ife = ImageFeatureExtractor(img_size=img_size, **fixed_extractor_options)

    # The transformed output is discarded; the extractor is configured with save=True.
    _ = ife.fit_transform(X, y)

In [None]:
# Map each input image size to the DataFrame of DenseNet121 features stored for it.
densenet121_var_img_size_dfs = {
    img_size: pd.read_csv(
        f"../input/tfg-pet-adoption-data/image_features_DenseNet121_in-{img_size}.csv",
        index_col=0)
    for img_size in (224, 256, 384, 512)
}

In [None]:
# Preprocessing steps shared by every run of this experiment. The `None` entry is a
# placeholder: on each iteration it is replaced by the ImageFeatureExtractor under test.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,  # slot for ('image_features_extractor', ife)
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    # Every step needs its own trailing comma: two adjacent tuples without a comma are parsed
    # as a call on the first tuple, which raises "TypeError: 'tuple' object is not callable".
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval)),
]

# Locate the placeholder once instead of relying on a hard-coded negative offset.
image_features_slot = pipeline_transformers.index(None)

evaluation_results = pd.DataFrame([], columns=["Input size", "SVD-n_components",
                        "Average fit time", "Average accuracy", "Average QWK"])

# Evaluate XGBClassifier for every (input image size, SVD size) combination.
for img_size in [224, 256, 384, 512]:
    for svd_n_components in [4, 8, 12, 16, 20, 24, 28]:
        ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                    loaded_features=densenet121_var_img_size_dfs[img_size],
                                    svd_n_components=svd_n_components)

        pipeline_transformers[image_features_slot] = ('image_features_extractor', ife)

        model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                              use_label_encoder=False)
        print(f"\n\n*************** XGBClassifier with image features from DenseNet121, {img_size}x{img_size} input, SVD {svd_n_components} ***************")
        avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                    Pipeline(steps=pipeline_transformers + [('model', model)]),
                    cv, X, y, model_type="classification", display_results=False)

        # One result row per (input size, SVD size) pair.
        evaluation_results = evaluation_results.append({"Input size": img_size,
                            "SVD-n_components": svd_n_components, "Average fit time": avg_fit_time,
                            "Average accuracy": avg_accuracy, "Average QWK": avg_QWK},
                                                       ignore_index=True)

display(evaluation_results)

In [None]:
# Mean of the two quality metrics for each input image size.
quality_metric_columns = ["Average accuracy", "Average QWK"]
evaluation_results.groupby(["Input size"]).mean()[quality_metric_columns]

In [None]:
# The per-size feature tables are no longer needed once the evaluation has finished.
del densenet121_var_img_size_dfs

# Export the results twice: as they are, and with "Input size" passed as third argument.
latex_table_name = "evaluation_results_XGBoostClassifier_DenseNet121__var-img_size__var-SVD-n_components"
to_latex(evaluation_results, latex_table_name)
to_latex(evaluation_results, latex_table_name + "__GROUPED", ["Input size"])

As we can see, the best balance between the average accuracy and average QWK is obtained with an image size of 384x384, but the results are very close to those obtained with 256x256 images, so there is no need to change the input image size. Moreover, if we want to perform transfer learning and train the FC layers (top) of a pretrained CNN, it will be better to resize to 256x256 instead of 384x384 or 512x512, as in those latter cases it will take more time to preprocess; in addition, if we want to avoid resizing the images every time we train the CNN, we could resize every image once and save them into a new dataset, so a size of 256x256 will also be better in terms of disk storage.

### Training + extraction?

Now, we will perform transfer learning: taking the architecture of the DenseNet121 convolutional layers, the ones that allow us to extract high-level features from input images, and then add some Dense (fully connected) layers on top, in order to perform training on those, while freezing the weights of the bottom layers so that they still retain the weights used for imagenet (that is, we are changing the previous classifier, for imagenet, by another one, that is really what is going on). The goal now is to check whether the Dense layers can give us better features than the ones obtained previously (the best so far, evaluated with XGBoostClassifier and 5-CV, are those obtained from the raw image features, that is, no Average Pooling after the GlobalAveragePooling2D that condenses and flattens the output of the series of, mainly, convolutions, and then reducing those 1024 features to 16 using SVD), by means of training the weights between the layers on top to our target, AdoptionSpeed, or even some of the most representative predictor variables.

First, we will resize all the train images (we will use all the images, not just the profile images) and we will save them in order to create a new dataset, so that we can avoid the resizing step in the preprocessing function:

In [None]:
def resize_and_save_train_images(directory, img_size):
    """Resize every training image and store the resized copies in ``directory``.

    Images that already exist in ``directory`` are not regenerated, so an interrupted run
    can be resumed. After the loop, ``directory`` is archived as ``resized_train_images.zip``
    in the current working directory. Errors are reported with ``print`` rather than raised
    (best-effort behaviour).

    Parameters
    ----------
    directory : str
        Output directory; it is created (including missing parents) when it does not exist.
    img_size : int
        Forwarded to ``ImageFeatureExtractor(img_size=...)``, whose ``resize_to_square``
        performs the actual resizing.
    """
    try:
        # makedirs also handles nested output paths and an already existing directory.
        os.makedirs(directory, exist_ok=True)
        ife = ImageFeatureExtractor(use_utils_only=True, img_size=img_size)
        for image_path in tqdm(glob.iglob("../input/petfinder-adoption-prediction/train_images/*.jpg")):
            new_image_path = f"{directory}/{os.path.basename(image_path)}"
            if os.path.exists(new_image_path):
                continue
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None for unreadable files; skip that file instead of
                # letting a single corrupt image abort the whole run.
                print(f"Could not read image, skipping: {image_path}")
                continue
            resized_image = ife.resize_to_square(image)
            resized_image = resized_image[...,::-1].astype(np.float32) # BGR to RGB + uint8 to float32
            pil_image = array_to_img(resized_image)
            pil_image.save(new_image_path)
        shutil.make_archive("resized_train_images", "zip", directory)
    except Exception as e:
        print(e)

In [None]:
# resize_and_save_train_images("./resized_train_images", 256)

Now, as we will use the class `ImageDataGenerator` in order to try different data augmentation methods and its `flow_from_dataframe` method, we will have to create a `DataFrame` with a column for the name of each file and other columns for the target values (one-hot encoded for classification, raw for regression).

In [None]:
def get_dataframe_CNN_training_AdoptionSpeed(directory, train_df):
    """Build (and save) the per-image table used to train the CNNs on AdoptionSpeed.

    Every ``*.jpg`` file in ``directory`` yields one row holding the file name, the
    AdoptionSpeed of the pet it belongs to, and the one-hot columns ``AdoptionSpeed_0`` to
    ``AdoptionSpeed_4``. The pet id is the part of the file name before the last ``-``.
    The table is also written to ``dataframe_CNN_training.csv`` in the working directory.

    Parameters
    ----------
    directory : str
        Directory that contains the image files.
    train_df : pandas.DataFrame
        Must provide the columns ``PetID`` and ``AdoptionSpeed``.

    Returns
    -------
    pandas.DataFrame
        One row per image, indexed 0..n-1.

    Raises
    ------
    KeyError
        If an image belongs to a pet id that is not present in ``train_df``.
    ValueError
        If a file name contains no ``-``.
    """
    # Build the PetID -> AdoptionSpeed lookup once instead of filtering train_df for every
    # image. The first occurrence of a PetID wins, as with the former `.values[0]`.
    adoption_speed_by_pet_id = (
        train_df.drop_duplicates(subset="PetID")
        .set_index("PetID")["AdoptionSpeed"]
        .to_dict()
    )

    rows = []
    for image_path in tqdm(glob.iglob(f"{directory}/*.jpg")):
        filename = os.path.basename(image_path)
        pet_id = filename[:filename.rindex('-')]
        y_value = adoption_speed_by_pet_id[pet_id]

        row = {"filename": filename, "AdoptionSpeed": y_value}
        # One-hot encoding of the target: all zeros, then flag the actual class.
        row.update({f"AdoptionSpeed_{i}": 0 for i in range(5)})
        row[f"AdoptionSpeed_{y_value}"] = 1
        rows.append(row)

    # A list of records already produces a 0..n-1 index, so no reset_index is needed.
    filenames_and_target_values_df = pd.DataFrame(rows)
    filenames_and_target_values_df.to_csv("dataframe_CNN_training.csv", index=False)

    return filenames_and_target_values_df

In [None]:
# filenames_and_target_values = get_dataframe_CNN_training_AdoptionSpeed(
#     "../input/tfg-pet-adoption-resized-train-images", train)

In [None]:
def _pet_id_from_filename(filename):
    """Return the part of the file name that precedes its last '-'."""
    filename = str(filename)
    return filename[:filename.rindex('-')]


# Per-image table saved previously, extended with the PetID each image belongs to.
filenames_and_target_values = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/dataframe_CNN_training.csv")
filenames_and_target_values["PetID"] = filenames_and_target_values["filename"].map(_pet_id_from_filename)
filenames_and_target_values

Now, we will create two `ImageDataGenerator` instances: one for training, with data augmentation, and another for validation, without data augmentation. This way we will validate on unseen images.

In [None]:
# Both generators apply the DenseNet preprocessing function; only the training one
# additionally enables horizontal flips.
val_generator = ImageDataGenerator(preprocessing_function=preprocess_input_densenet)
train_generator = ImageDataGenerator(preprocessing_function=preprocess_input_densenet,
                                     horizontal_flip=True)

Now, we will divide the DataFrame with the filenames and their corresponding target value (both 'raw' and one-hot encoded) into a training set and a validation set. The split is made at the PetID level (this way, a single instance will have all its images either in the training or in the validation set, but not in both, so that our validation estimation is not optimistic, as happened in Version 58 and before, since some instances could be validated with a model trained on some of its images), and it is stratified on AdoptionSpeed, especially in order to have instances with outcome 0 in the validation set:

In [None]:
# 80/20 split of the pets, stratified on AdoptionSpeed. Only the two parts of the first
# array (PetID + AdoptionSpeed) are kept; the split of the target array is not needed.
pet_ids_and_target = train[["PetID", "AdoptionSpeed"]]
split_parts = train_test_split(
    pet_ids_and_target,
    train["AdoptionSpeed"],
    test_size=0.2,
    stratify=train["AdoptionSpeed"],
    random_state=seed,
)
pet_ids_train, pet_ids_val = split_parts[0], split_parts[1]

In [None]:
# Relative frequency of each AdoptionSpeed value among the training pets.
pet_ids_train.loc[:, "AdoptionSpeed"].value_counts(normalize=True)

In [None]:
# Relative frequency of each AdoptionSpeed value among the validation pets.
pet_ids_val.loc[:, "AdoptionSpeed"].value_counts(normalize=True)

In [None]:
set_pet_ids_train = set(pet_ids_train["PetID"])

# An image goes to the training table when its pet is in the training split, and to the
# validation table otherwise.
image_belongs_to_train_pet = filenames_and_target_values["PetID"].isin(set_pet_ids_train)
filenames_and_target_values_train_no_instance_overlap = (
    filenames_and_target_values.loc[image_belongs_to_train_pet].copy()
)
filenames_and_target_values_val_no_instance_overlap = (
    filenames_and_target_values.loc[~image_belongs_to_train_pet].copy()
)

In [None]:
# Show the image table restricted to the pets of the training split.
filenames_and_target_values_train_no_instance_overlap

In [None]:
# Show the image table restricted to the pets outside the training split.
filenames_and_target_values_val_no_instance_overlap

In [None]:
# Sanity check: print the pet ids and the file names shared by both tables.
train_image_rows = filenames_and_target_values_train_no_instance_overlap
val_image_rows = filenames_and_target_values_val_no_instance_overlap
for checked_column in ("PetID", "filename"):
    print(set(train_image_rows[checked_column]) & set(val_image_rows[checked_column]))

In [None]:
# Relative frequency of each AdoptionSpeed value among the training images.
filenames_and_target_values_train_no_instance_overlap.loc[:, "AdoptionSpeed"].value_counts(normalize=True)

In [None]:
# Relative frequency of each AdoptionSpeed value among the validation images.
filenames_and_target_values_val_no_instance_overlap.loc[:, "AdoptionSpeed"].value_counts(normalize=True)

**Look how the proportion of images with outcome 4 is considerably smaller than the original (28%), since we know that when profiles don't have images, it is very likely that the outcome is 4, plus the fact that we will train over all the images, not just the profile images (so the decrease is more than 4.4%, while the instances with no profile image whose outcome is 4 are approximately 1.4% of the total; this is due to the fact that many of the profiles have more than one photo).**

Now, we have to take into account that if we want to test how well the extracted features perform using some model like XGBClassifier in a 5-CV validation strategy, as before, we may obtain optimistic results, since we will train the models with a single Training+Validation split (that is, in each iteration of the 5-CV we may validate extracting features from instances whose profile image was used in the CNN training). Of course we could perform a 5-CV for the training of the CNN, but this would take so much time that it would be impossible to train some of the models that we will see (which train with multiple data augmentation strategies) without exceeding Kaggle's session time limit of 9 hours. Hence, we keep the single split training+validation of the CNNs, but in addition to the 5-CV training and validation strategy, we will also validate the single split of the cell above:

In [None]:
# Row masks marking the pets of the CNN training split, computed separately for X and train.
X_row_in_train_split = X["PetID"].isin(set_pet_ids_train)
target_row_in_train_split = train["PetID"].isin(set_pet_ids_train)

# Every part is shuffled with the same seed.
X_train_CNN = X.loc[X_row_in_train_split].copy().sample(frac=1, random_state=seed)
y_train_CNN = (train.loc[target_row_in_train_split, "AdoptionSpeed"]
               .copy().sample(frac=1, random_state=seed))
X_val_CNN = X.loc[~X_row_in_train_split].copy().sample(frac=1, random_state=seed)
y_val_CNN = (train.loc[~target_row_in_train_split, "AdoptionSpeed"]
             .copy().sample(frac=1, random_state=seed))

In [None]:
# First five shuffled training rows.
X_train_CNN.head(n=5)

In [None]:
# First five shuffled training targets.
y_train_CNN.head(n=5)

In [None]:
# First five shuffled validation rows.
X_val_CNN.head(n=5)

In [None]:
# First five shuffled validation targets.
y_val_CNN.head(n=5)

In [None]:
def evaluate_model_single_split(model, X_train, X_test, y_train, y_test, model_type,
                           display_results=True, coefficients=[0.5, 1.5, 2.5, 3.5]):
    """Fit a clone of ``model`` on one train split and score it on one test split.

    The estimator passed in is left untouched: a clone is fitted, with warnings silenced
    during the fit. When ``model_type`` is ``'regression'`` the RMSE is computed on the raw
    predictions, which are then converted with ``round_reg_predictions(y_pred, coefficients)``
    before accuracy and quadratic weighted kappa are computed.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator (or pipeline).
    X_train, X_test, y_train, y_test
        Training / evaluation data and targets.
    model_type : str
        ``'regression'`` enables the RMSE + rounding path; any other value is handled as
        classification.
    display_results : bool
        Whether to print fit time, metrics and the classification report.
    coefficients : list
        Forwarded to ``round_reg_predictions``; only used for regression.

    Returns
    -------
    tuple
        ``(accuracy, quadratic weighted kappa)`` on the test split.
    """
    is_regression = model_type == 'regression'
    estimator = clone(model)

    fit_started = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        estimator.fit(X_train, y_train)
    fit_seconds = time.time() - fit_started

    predictions = estimator.predict(X_test)
    if is_regression:
        # RMSE must use the continuous outputs, so it is computed before rounding.
        rmse_value = mean_squared_error(y_test, predictions, squared=False)
        predictions = round_reg_predictions(predictions, coefficients)

    accuracy_value = accuracy_score(y_test, predictions)
    qwk_value = cohen_kappa_score(y_test, predictions, weights='quadratic')

    if not display_results:
        return accuracy_value, qwk_value

    print("\n----------------- RESULTS SINGLE TRAIN-VAL SPLIT -----------------")
    print(f"Fit time: {fit_seconds} s")
    if is_regression:
        print("\nRMSE:", rmse_value)
        print("Accuracy:", accuracy_value)
    else:
        print("\nAccuracy:", accuracy_value)
    print("QWK:", qwk_value)
    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    return accuracy_value, qwk_value

#### Training on AdoptionSpeed

In this section, we will train three different models on the target variable, AdoptionSpeed. These models treat the predictions as classification, regression and ordinal regression, respectively.

##### Classification

First of all, we define the iterators that we will use to extract batches of training and validation images:

In [None]:
batch_size = 32
target_size = (256,256)
classification_target_columns = [f"AdoptionSpeed_{i}" for i in range(5)]

# Settings common to both iterators; they differ only in the source DataFrame, the
# generator and whether the rows are shuffled.
classification_flow_options = dict(
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col=classification_target_columns,
    target_size=target_size,
    class_mode="raw",
    batch_size=batch_size,
    seed=seed,
)

classification_train_iterator = train_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_train_no_instance_overlap,
    shuffle=True,
    **classification_flow_options
)

classification_val_iterator = val_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_val_no_instance_overlap,
    shuffle=False,
    **classification_flow_options
)

But first of all, as we are going to compare these three models using the results (from the 5-CV) obtained when running XGBClassifier using the extracted image features, we will first define a common setup for the Dense layers that we are going to add on top of DenseNet121's backbone. We will define the number of Dense layers (1 or 2) and the number of neurons of each one (16, 32 or 64) by training and validating the classification model on 5 epochs:

**https://www.tensorflow.org/addons/api_docs/python/tfa/metrics/CohenKappa (https://github.com/tensorflow/addons/tree/master)**

In [None]:
# Candidate stacks of Dense layers (units per layer) placed on top of the frozen backbone.
combinations = [
    [16],
    [32],
    [64],
    [64,32],
    [64,16],
    [32,16]
]

evaluation_results = pd.DataFrame([], columns=["Combination", "loss",
                        "accuracy", "qwk", "val_loss", "val_accuracy", "val_qwk"])

epochs = 5
batch_size = 32

for combination in combinations:
    print(f"\n\n{len(combination)} layer{'s' if len(combination) > 1 else ''} {tuple(combination)}")
    inp = Input((256,256,3))
    backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")

    x = backbone.output
    x = GlobalAveragePooling2D()(x)

    for n_units in combination:
        x = Dense(n_units, activation="relu")(x)

    out = Dense(5, activation="softmax")(x)

    # Freezing DenseNet121 backbone
    for layer in backbone.layers:
        layer.trainable = False

    model = Model(inp,out)

    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer = Adam(learning_rate=0.001)
    metrics = ["accuracy", cohen_kappa_keras(num_classes=5, name="qwk", weightage="quadratic")]
    loss = "categorical_crossentropy"

    model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

    # The iterators already yield batches, so `batch_size` must not be passed to fit().
    history = model.fit(x=classification_train_iterator,
                        validation_data=classification_val_iterator,
                        epochs=epochs)

    train_loss = history.history["loss"]
    train_accuracy = history.history["accuracy"]
    train_qwk = history.history["qwk"]
    validation_loss = history.history["val_loss"]
    validation_accuracy = history.history["val_accuracy"]
    validation_qwk = history.history["val_qwk"]

    # One row per epoch for the current combination.
    per_epoch_metrics = zip(train_loss, train_accuracy, train_qwk,
                            validation_loss, validation_accuracy, validation_qwk)
    for loss_, accuracy_, qwk_, val_loss_, val_accuracy_, val_qwk_ in per_epoch_metrics:
        evaluation_results = evaluation_results.append({
            "Combination": f"{len(combination)}_{tuple(combination)}",
            "loss": loss_, "accuracy": accuracy_, "qwk": qwk_,
            "val_loss": val_loss_, "val_accuracy": val_accuracy_,
            "val_qwk": val_qwk_
        }, ignore_index=True)

evaluation_results.to_csv("tuning_n-layers_size_DenseNet121_transfer-learning.csv", index=False)
to_latex(evaluation_results, "tuning_n-layers_size_DenseNet121_transfer-learning")
to_latex(evaluation_results, "tuning_n-layers_size_DenseNet121_transfer-learning__GROUPED", ["Combination"])

In [None]:
# Reload the stored tuning results instead of repeating the training runs.
evaluation_results = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/tuning_n-layers_size_DenseNet121_transfer-learning.csv")
evaluation_results

In [None]:
# Metrics averaged over the epochs of each Dense-layer combination.
evaluation_results.groupby(by=["Combination"]).mean()

As we can see, the best setup in terms of all the validation metrics is the one with 2 layers, the first one 64 and the second one (penultimate layer of the CNN) 16. Thus, we will create the three models that will be trained on more epochs using this architecture of dense layers:

Now, let's define the classification CNN model with the aforementioned Dense layers setup:

In [None]:
# Classifier: DenseNet121 backbone (ImageNet weights, no top) + global average pooling
# + Dense(64) + Dense(16) + 5-way softmax output.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out = Dense(5, activation="softmax")(x)

# Freezing DenseNet121 backbone
for layer in backbone.layers:
    layer.trainable = False

classification_model = Model(inp,out)

# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = Adam(learning_rate=0.001)
metrics = ["accuracy", cohen_kappa_keras(num_classes=5, name="qwk", weightage="quadratic")]
loss = "categorical_crossentropy"

classification_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
epochs = 5
batch_size = 32

classification_evaluation_results = pd.DataFrame([], columns=["loss",
                        "accuracy", "qwk", "val_loss", "val_accuracy", "val_qwk"])

# The iterators already yield batches, so `batch_size` must not be passed to fit().
classification_history = classification_model.fit(x=classification_train_iterator,
                    validation_data=classification_val_iterator,
                    epochs=epochs)

train_loss = classification_history.history["loss"]
train_accuracy = classification_history.history["accuracy"]
train_qwk = classification_history.history["qwk"]
validation_loss = classification_history.history["val_loss"]
validation_accuracy = classification_history.history["val_accuracy"]
validation_qwk = classification_history.history["val_qwk"]

# One row per epoch.
per_epoch_metrics = zip(train_loss, train_accuracy, train_qwk,
                        validation_loss, validation_accuracy, validation_qwk)
for loss_, accuracy_, qwk_, val_loss_, val_accuracy_, val_qwk_ in per_epoch_metrics:
    classification_evaluation_results = classification_evaluation_results.append({
        "loss": loss_, "accuracy": accuracy_, "qwk": qwk_,
        "val_loss": val_loss_, "val_accuracy": val_accuracy_,
        "val_qwk": val_qwk_
    }, ignore_index=True)

# Persist the trained model and its training history.
classification_model.save(f"DenseNet121_classification_64-16_{epochs}-epochs.h5")
classification_evaluation_results.to_csv(f"DenseNet121_classification_64-16_{epochs}-epochs.csv", index=False)
to_latex(classification_evaluation_results, f"DenseNet121_classification_64-16_{epochs}-epochs")

In [None]:
# Reload the stored training history and plot loss, accuracy and QWK in a 1x3 grid.
classification_evaluation_results = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/DenseNet121_classification_64-16_5-epochs.csv")
classification_history_metrics = ["loss", "accuracy", "qwk"]
plot_history(classification_evaluation_results, classification_history_metrics, 1, 3, (20,5))

In [None]:
# Restore the classifier saved after 5 epochs.
classification_model = load_model(
    filepath="../input/tfg-pet-adoption-data/DenseNet121_classification_64-16_5-epochs.h5")

In [None]:
# Truncate the classifier at its third layer from the end (the 64-unit layer, per the
# model name below) and use the truncated network as feature extractor.
classification_feature_layer = classification_model.layers[-3]
model = Model(inputs=classification_model.input,
              outputs=classification_feature_layer.output)

ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    model_name="DenseNet121_classification_layer-64_5-epochs",
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    save=True,
    debug=True,
    include_feats=False,
)

_ = ife.fit_transform(X, y)

In [None]:
# Load the stored features, preview them, and count how often each per-column standard
# deviation value occurs.
classification_model_layer_64_features_5e = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/image_features_DenseNet121_classification_layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(classification_model_layer_64_features_5e.head(n=5))
classification_model_layer_64_features_5e.describe().loc["std"].value_counts()

In [None]:
# Truncate the classifier at its penultimate layer (the 16-unit layer, per the model name
# below) and use the truncated network as feature extractor.
classification_feature_layer = classification_model.layers[-2]
model = Model(inputs=classification_model.input,
              outputs=classification_feature_layer.output)

ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    model_name="DenseNet121_classification_layer-16_5-epochs",
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    save=True,
    debug=True,
    include_feats=False,
)

_ = ife.fit_transform(X, y)

In [None]:
# Load the stored features, preview them, and count how often each per-column standard
# deviation value occurs.
classification_model_layer_16_features_5e = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/image_features_DenseNet121_classification_layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(classification_model_layer_16_features_5e.head(n=5))
classification_model_layer_16_features_5e.describe().loc["std"].value_counts()

##### Regression

In [None]:
batch_size = 32
target_size = (256,256)

# Settings common to both iterators; the target is the raw AdoptionSpeed column.
regression_flow_options = dict(
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col="AdoptionSpeed",
    target_size=target_size,
    class_mode="raw",
    batch_size=batch_size,
    seed=seed,
)

regression_train_iterator = train_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_train_no_instance_overlap,
    shuffle=True,
    **regression_flow_options
)

regression_val_iterator = val_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_val_no_instance_overlap,
    shuffle=False,
    **regression_flow_options
)

In [None]:
# Regressor: DenseNet121 backbone (ImageNet weights, no top) + global average pooling
# + Dense(64) + Dense(16) + a single linear output unit.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone
for layer in backbone.layers:
    layer.trainable = False

regression_model = Model(inp,out)

# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
epochs = 5
batch_size = 32

regression_evaluation_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

# The iterators already yield batches, so `batch_size` must not be passed to fit().
regression_history = regression_model.fit(x=regression_train_iterator,
                    validation_data=regression_val_iterator,
                    epochs=epochs)

train_loss = regression_history.history["loss"]
train_mae = regression_history.history["mean_absolute_error"]
validation_loss = regression_history.history["val_loss"]
validation_mae = regression_history.history["val_mean_absolute_error"]

# One row per epoch.
per_epoch_metrics = zip(train_loss, train_mae, validation_loss, validation_mae)
for loss_, mae_, val_loss_, val_mae_ in per_epoch_metrics:
    regression_evaluation_results = regression_evaluation_results.append({
        "loss": loss_,
        "mean_absolute_error": mae_,
        "val_loss": val_loss_,
        "val_mean_absolute_error": val_mae_
    }, ignore_index=True)

# Persist the trained model and its training history.
regression_model.save(f"DenseNet121_regression_64-16_{epochs}-epochs.h5")
regression_evaluation_results.to_csv(f"DenseNet121_regression_64-16_{epochs}-epochs.csv", index=False)
to_latex(regression_evaluation_results, f"DenseNet121_regression_64-16_{epochs}-epochs")

In [None]:
# Reload the stored training history and plot loss and MAE in a 1x2 grid.
regression_evaluation_results = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/DenseNet121_regression_64-16_5-epochs.csv")
regression_history_metrics = ["loss", "mean_absolute_error"]
plot_history(regression_evaluation_results, regression_history_metrics, 1, 2, (12,3))

In [None]:
# Restore the regressor saved after 5 epochs.
regression_model = load_model(
    filepath="../input/tfg-pet-adoption-data/DenseNet121_regression_64-16_5-epochs.h5")

In [None]:
# Truncate the regressor at its third layer from the end (the 64-unit layer, per the
# model name below) and use the truncated network as feature extractor.
regression_feature_layer = regression_model.layers[-3]
model = Model(inputs=regression_model.input,
              outputs=regression_feature_layer.output)

ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    model_name="DenseNet121_regression_layer-64_5-epochs",
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    save=True,
    debug=True,
    include_feats=False,
)

_ = ife.fit_transform(X, y)

In [None]:
# Load the stored features, preview them, and count how often each per-column standard
# deviation value occurs.
regression_model_layer_64_features_5e = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/image_features_DenseNet121_regression_layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_layer_64_features_5e.head(n=5))
regression_model_layer_64_features_5e.describe().loc["std"].value_counts()

In [None]:
# Truncate the regressor at its penultimate layer (the 16-unit layer, per the model name
# below) and use the truncated network as feature extractor.
regression_feature_layer = regression_model.layers[-2]
model = Model(inputs=regression_model.input,
              outputs=regression_feature_layer.output)

ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    model_name="DenseNet121_regression_layer-16_5-epochs",
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    save=True,
    debug=True,
    include_feats=False,
)

_ = ife.fit_transform(X, y)

In [None]:
# Load the stored features, preview them, and count how often each per-column standard
# deviation value occurs.
regression_model_layer_16_features_5e = pd.read_csv(
    filepath_or_buffer="../input/tfg-pet-adoption-data/image_features_DenseNet121_regression_layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_layer_16_features_5e.head(n=5))
regression_model_layer_16_features_5e.describe().loc["std"].value_counts()

##### Ordinal regression

In [None]:
batch_size = 32
target_size = (256,256)

# Settings common to both iterators; the target is the raw AdoptionSpeed column.
ordinal_regression_flow_options = dict(
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col="AdoptionSpeed",
    target_size=target_size,
    class_mode="raw",
    batch_size=batch_size,
    seed=seed,
)

ordinal_regression_train_iterator = train_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_train_no_instance_overlap,
    shuffle=True,
    **ordinal_regression_flow_options
)

ordinal_regression_val_iterator = val_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_val_no_instance_overlap,
    shuffle=False,
    **ordinal_regression_flow_options
)

**https://arxiv.org/pdf/1901.07884.pdf, https://github.com/ck37/coral-ordinal, https://colab.research.google.com/drive/1AQl4XeqRRhd7l30bmgLVObKt5RFPHttn**

**Issue https://github.com/ck37/coral-ordinal/issues/1 solved in github repo, but not in pip**

In [None]:
# Install coral-ordinal from the bundled copy of the GitHub repository
# instead of pip: the sources are copied into the working directory and
# installed with setup.py before the package is imported.
!cp -r ../input/tfg-pet-adoption-data/coral-ordinal-master/coral-ordinal-master/* ./
!python setup.py install
import coral_ordinal as coral

In [None]:
# DenseNet121 (ImageNet weights, no classification top) followed by global
# average pooling, two dense layers (64 and 16 units) and a CORAL ordinal
# output layer for the 5 AdoptionSpeed classes.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out = coral.CoralOrdinal(num_classes=5)(x)

# Freezing DenseNet121 backbone
for layer in backbone.layers:
    layer.trainable = False

ordinal_regression_model = Model(inp,out)

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# optimizers either ignore with a warning or reject.
optimizer = Adam(learning_rate=0.001)
metrics = [coral.MeanAbsoluteErrorLabels()]
loss = coral.OrdinalCrossEntropy(num_classes=5)

ordinal_regression_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
epochs = 5
batch_size = 32

# Metrics kept in the per-epoch results table, in the order they are saved.
ordinal_regression_history_columns = [
    "loss", "mean_absolute_error_labels",
    "val_loss", "val_mean_absolute_error_labels",
]

# No `batch_size` is passed to fit(): the iterators already yield batches,
# and Keras asks not to specify it for generator-like inputs.
ordinal_regression_history = ordinal_regression_model.fit(
                    x=ordinal_regression_train_iterator,
                    validation_data=ordinal_regression_val_iterator,
                    epochs=epochs)

# history.history maps each metric name to a list with one value per epoch,
# so the table is built in one step (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every call).
ordinal_regression_evaluation_results = pd.DataFrame(
    ordinal_regression_history.history,
    columns=ordinal_regression_history_columns)

ordinal_regression_model.save(f"DenseNet121_ordinal_regression_64-16_{epochs}-epochs.h5")
ordinal_regression_evaluation_results.to_csv(f"DenseNet121_ordinal_regression_64-16_{epochs}-epochs.csv", index=False)
to_latex(ordinal_regression_evaluation_results, f"DenseNet121_ordinal_regression_64-16_{epochs}-epochs")

In [None]:
# Reload the saved per-epoch metrics of the ordinal regression model and
# plot the loss and the label MAE.
ordinal_regression_evaluation_results = pd.read_csv(
    "../input/tfg-pet-adoption-data/DenseNet121_ordinal_regression_64-16_5-epochs.csv"
)
plot_history(
    ordinal_regression_evaluation_results,
    ["loss", "mean_absolute_error_labels"],
    1,
    2,
    (10, 3),
)

In [None]:
# The saved model uses coral-ordinal's layer, loss and metric classes, so
# they are handed to the loader as custom objects.
ordinal_regression_model = load_model(
    "../input/tfg-pet-adoption-data/DenseNet121_ordinal_regression_64-16_5-epochs.h5",
    custom_objects={
        "MeanAbsoluteErrorLabels": coral.MeanAbsoluteErrorLabels,
        "OrdinalCrossEntropy": coral.OrdinalCrossEntropy,
        "CoralOrdinal": coral.CoralOrdinal,
    },
)

In [None]:
# Cut the ordinal regression network at its third-to-last layer (the
# 64-unit dense layer when the loaded model matches the architecture built
# in this notebook) and use its activations as image features.
model = Model(
    inputs=ordinal_regression_model.input,
    outputs=ordinal_regression_model.layers[-3].output,
)
ife = ImageFeatureExtractor(
    model=model,
    model_name="DenseNet121_ordinal_regression_layer-64_5-epochs",
    construct_from_cnn_backbone=False,
    from_image=True,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    include_feats=False,
    save=True,
    debug=True,
)

# The transformed output is discarded; presumably the point of this call is
# the features file written because save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Reload the stored layer-64 features of the ordinal regression model; the
# first CSV column is used as the index.
ordinal_regression_model_layer_64_features_5e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_ordinal_regression_layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(ordinal_regression_model_layer_64_features_5e.head())
# How many feature columns share each standard deviation value (a count at
# 0 means constant columns).
ordinal_regression_model_layer_64_features_5e.describe().loc["std"].value_counts()

In [None]:
# Cut the ordinal regression network at its second-to-last layer (the
# 16-unit dense layer when the loaded model matches the architecture built
# in this notebook) and use its activations as image features.
model = Model(
    inputs=ordinal_regression_model.input,
    outputs=ordinal_regression_model.layers[-2].output,
)
ife = ImageFeatureExtractor(
    model=model,
    model_name="DenseNet121_ordinal_regression_layer-16_5-epochs",
    construct_from_cnn_backbone=False,
    from_image=True,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    include_feats=False,
    save=True,
    debug=True,
)

# The transformed output is discarded; presumably the point of this call is
# the features file written because save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Reload the stored layer-16 features of the ordinal regression model; the
# first CSV column is used as the index.
ordinal_regression_model_layer_16_features_5e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_ordinal_regression_layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(ordinal_regression_model_layer_16_features_5e.head())
# How many feature columns share each standard deviation value (a count at
# 0 means constant columns).
ordinal_regression_model_layer_16_features_5e.describe().loc["std"].value_counts()

Let's check how well XGBClassifier performs with the image features extracted from the three previous models (it can handle useless variables, that is, those with std = 0 or very close to 0, but we will create a transformer to get rid of those variables, since they have no predictive potential):

In [None]:
class UselessVariablesRemover(BaseEstimator, TransformerMixin):
    """Drop numeric columns that are (almost) constant.

    A column is removed when its sample standard deviation, measured on the
    data passed to ``fit``, is strictly below ``tolerance``. Non-numeric
    columns are never removed.

    Parameters
    ----------
    tolerance : float
        Standard deviation threshold below which a column is dropped.

    Attributes
    ----------
    columns_to_remove : list
        Labels of the columns selected for removal by the last ``fit``
        (empty before fitting).
    """

    def __init__(self, tolerance):
        self.tolerance = tolerance
        self.columns_to_remove = []

    def fit(self, X, y=None):
        """Record which numeric columns of ``X`` have std below the tolerance.

        ``y`` is ignored; it is accepted only for pipeline compatibility.
        Returns ``self``.
        """
        # Only the standard deviation is needed, so it is computed directly
        # instead of building the full describe() table. Restricting to
        # numeric columns also keeps this working when X has none, where
        # describe() has no "std" row at all.
        numeric_std = X.select_dtypes(include=[np.number]).std()
        # A NaN std (e.g. a single-row frame) compares False and is kept.
        self.columns_to_remove = list(numeric_std.index[numeric_std < self.tolerance])
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the columns selected in ``fit``.

        Raises ``KeyError`` if a selected column is missing from ``X``.
        """
        # drop() already returns a new object, so X itself is left untouched.
        return X.drop(columns=self.columns_to_remove)

In [None]:
# Preprocessing steps shared by every evaluation below. The `None` entry is
# a placeholder that each loop iteration replaces with an image feature
# extractor built from one of the precomputed feature tables.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Locate the placeholder once instead of hard-coding its position, so steps
# can be added or removed around it without silently replacing another step.
image_features_step_index = pipeline_transformers.index(None)

evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]

image_features_dataframes = {
    "AdoptionSpeed, classification_model_layer-64": classification_model_layer_64_features_5e,
    "AdoptionSpeed, classification_model_layer-16": classification_model_layer_16_features_5e,
    "AdoptionSpeed, regression_model_layer-64": regression_model_layer_64_features_5e,
    "AdoptionSpeed, regression_model_layer-16": regression_model_layer_16_features_5e,
    "AdoptionSpeed, ordinal_regression_model_layer-64": ordinal_regression_model_layer_64_features_5e,
    "AdoptionSpeed, ordinal_regression_model_layer-16": ordinal_regression_model_layer_16_features_5e
}

# One result row per feature set; the table is built once after the loop
# (DataFrame.append no longer exists in pandas 2.x).
evaluation_rows = []
for features_description, loaded_features in image_features_dataframes.items():
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features)
    pipeline_transformers[image_features_step_index] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier with image features from DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")
    # Cross-validated evaluation on the whole training data.
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    # Evaluation on the fixed train/validation split used for the CNNs.
    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({"Model description": model_description,
                        "Average fit time": avg_fit_time, "Average accuracy": avg_accuracy,
                        "Average QWK": avg_QWK, "Single split accuracy": single_accuracy,
                        "Single split QWK": single_QWK})

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Show the full model descriptions; the previous column width is restored
# when the block exits, even if display() fails.
with pd.option_context('display.max_colwidth', None):
    display(evaluation_results)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_AdoptionSpeed_5-epochs")

The best balance between accuracy and QWK in the single split validation is obtained with the regression model, while the ordinal regression performs significantly worse in this split. Moreover, this is not a matter of "classification and ordinal regression may need more epochs than regression", since, for example, the average QWK in 5-CV is better using features from both layers of the ordinal regression model than the regression model.

#### Training on predictor variables

First of all, we will create a class in order to include it in the pipeline and get the mutual information between the predictor variables and the target in each 5-CV iteration (we will create a dataframe with as many rows as the number of predictor variables and two columns: classification and regression, as we will consider the target as both using the functions `mutual_info_classif` and `mutual_info_regression`, respectively; then, in each iteration, we just sum up the mutual information with the target of each variable):

In [None]:
# Accumulator read by MutualInfoPredictorVars when it is instantiated; it
# must be replaced by a DataFrame (columns "classification" and
# "regression") before any instance is created.
mutual_info_df_5cv = None

class MutualInfoPredictorVars(BaseEstimator, TransformerMixin):
    """Pass-through step that accumulates mutual information with the target.

    On every ``fit`` the mutual information between each column of ``X`` and
    ``y`` is computed twice, with ``mutual_info_classif`` and with
    ``mutual_info_regression``, and added to the "classification" and
    "regression" columns of the shared ``mutual_info_df_5cv`` DataFrame (one
    row per predictor). ``transform`` returns the data unchanged.

    Parameters
    ----------
    numeric_columns : list
        Columns treated as continuous. Columns whose name contains "Breed",
        "_ordinal", "img_" or "desc_" are treated as continuous as well;
        every other column is treated as discrete.
    seed : int
        Random state passed to both mutual information estimators.
    """

    # Name fragments marking a column as continuous even when it is not
    # listed in ``numeric_columns``.
    _CONTINUOUS_MARKERS = ("Breed", "_ordinal", "img_", "desc_")

    def __init__(self, numeric_columns, seed):
        self.numeric_columns = numeric_columns
        self.seed = seed
        # Global accumulator as it exists at construction time.
        self.df = mutual_info_df_5cv

    def fit(self, X, y):
        """Add the mutual information scores of ``X`` vs. ``y`` to ``self.df``.

        Raises ``KeyError`` if a column of ``X`` has no row in ``self.df``.
        Returns ``self``.
        """
        # Build the continuous set locally. Extending self.numeric_columns in
        # place would modify the list object supplied by the caller (which
        # may be shared with other pipeline steps) and grow it on every fit.
        continuous_columns = set(self.numeric_columns)
        continuous_columns.update(
            column for column in X.columns
            if any(marker in str(column) for marker in self._CONTINUOUS_MARKERS))

        self.discrete_columns = [column for column in X.columns
                                 if column not in continuous_columns]
        self.index_discrete_columns = [i for i, column in enumerate(X.columns)
                                       if column not in continuous_columns]

        estimators = (("classification", mutual_info_classif),
                      ("regression", mutual_info_regression))
        for target_kind, estimate_mutual_info in estimators:
            scores = estimate_mutual_info(X, y,
                            discrete_features=self.index_discrete_columns,
                            random_state=self.seed)
            # Scores are summed over fits (one fit per cross-validation fold).
            for column, score in zip(X.columns, scores):
                self.df.loc[column, target_kind] += score

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without modifying it."""
        return X.copy()

In [None]:
# Raw columns dropped once their encoded/derived versions exist.
columns_to_be_removed = ["Name", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                         "Vaccinated", "Dewormed", "Sterilized", "State", "RescuerID", "Description",
                         "PetID", "MaturitySize", "FurLength", "Health"]

numeric_columns = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt", "StateGDP", "RescuerCount"]

# Rows of the accumulator: one per predictor that reaches the model.
columns_mutual_info = [
    'Type', 'Age', 'MaturitySize_ordinal', 'FurLength_ordinal', 'Health_ordinal',
    'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'HasName', 'PureBreed',
    'Breed1_AdoptionSpeed_0', 'Breed1_AdoptionSpeed_2', 'Breed1_AdoptionSpeed_3',
    'Breed1_AdoptionSpeed_1', 'Breed1_AdoptionSpeed_4', 'Breed1_freq_encode',
    'Gender_Male', 'Gender_Female', 'Gender_Mixed', 'Color1_Black', 'Color1_Brown',
    'Color1_Cream', 'Color1_Gray', 'Color1_Golden', 'Color1_White', 'Color1_Yellow',
    'Color2_White', 'Color2_Brown', 'Color2_nan', 'Color2_Gray', 'Color2_Cream',
    'Color2_Yellow', 'Color2_Golden', 'Color3_nan', 'Color3_White', 'Color3_Cream',
    'Color3_Gray', 'Color3_Yellow', 'Color3_Golden', 'Vaccinated_No',
    'Vaccinated_Not Sure', 'Vaccinated_Yes', 'Dewormed_No', 'Dewormed_Not Sure',
    'Dewormed_Yes', 'Sterilized_No', 'Sterilized_Not Sure', 'Sterilized_Yes',
    'StateGDP', 'RescuerCount'
]

# Float zeros: the accumulated mutual information values are floats, and an
# integer-typed frame would have to be upcast on the first addition.
mutual_info_df_5cv = pd.DataFrame(0.0, index=columns_mutual_info,
                                  columns=["classification", "regression"])

pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                                "Color3", "Vaccinated", "Dewormed", "Sterilized"])),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed)),
    # Each step receives its own copy of the list, so a step that modifies
    # its argument cannot change what the other step (or later cells) sees.
    ('custom_standard_scaler', CustomStandardScaler(list(numeric_columns))),
    ('get_mutual_info', MutualInfoPredictorVars(list(numeric_columns),
                                                seed))
]


model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
# The model scores are not needed here; the run only serves to fit the
# pipeline on every fold so the mutual information gets accumulated.
_ = evaluate_model(Pipeline(steps=pipeline_transformers + [('model', model)]),
               cv, X, y, model_type="classification", display_results=False)

display(mutual_info_df_5cv)

In [None]:
# Ten predictors with the highest accumulated mutual information when the
# target is treated as categorical.
mutual_info_df_5cv.sort_values(by='classification', ascending=False).index[:10].tolist()

In [None]:
# Ten predictors with the highest accumulated mutual information when the
# target is treated as continuous.
mutual_info_df_5cv.sort_values(by='regression', ascending=False).index[:10].tolist()

As we can see, the predictor variables that individually have the greatest predictive power, whether the target is treated as categorical or continuous, are RescuerCount, Age and Breed1. Thus, we will create a dataframe with the filename of each image (all training images, not just profile images) and the values of these variables (including the logarithmic transformation of RescuerCount and Age; Breed1 will be one-hot encoded).

In [None]:
def get_dataframe_CNN_training_predictors(directory, train_X_df, train_y_df, columns_before,
                               columns_after, transformations, drop_after,
                               output_path="dataframe_CNN_training_predictors.csv"):
    """Build (and save) a table with one row per image and its target values.

    Parameters
    ----------
    directory : str
        Folder searched for images named ``<PetID>*.jpg``.
    train_X_df : pandas.DataFrame
        Predictors; must contain "PetID" and every column in
        ``columns_before``.
    train_y_df : pandas.Series
        Target; its name identifies the target column in ``transformations``.
    columns_before : list of str
        Columns taken from ``train_X_df`` before any transformation.
    columns_after : list of str
        Columns that only exist after earlier transformations have run.
    transformations : dict
        Maps a column name to the list of transformers applied, in order,
        when that column is reached (``columns_before`` first, then
        ``columns_after``).
    drop_after : list of str
        Columns dropped once all transformations have been applied.
    output_path : str, optional
        Where the resulting table is written as CSV.

    Returns
    -------
    pandas.DataFrame
        One row per image found, with a "filename" column followed by the
        values of the corresponding pet.
    """
    train_X_df = train_X_df[["PetID"] + columns_before].copy()
    # Remember the target name up front: a transformation may turn the
    # target into an object without a `.name` (a DataFrame or an array),
    # which would break the comparison on later iterations.
    target_name = train_y_df.name
    add_y = False
    for column in columns_before + columns_after:
        if column not in transformations:
            continue
        if column == target_name:
            for transformation in transformations[column]:
                train_y_df = transformation.fit_transform(train_y_df)
            add_y = True
        # NOTE(review): the transformations registered for the target column
        # are also applied to the predictors right after; kept as in the
        # original, confirm whether that is intended.
        for transformation in transformations[column]:
            train_X_df = transformation.fit_transform(train_X_df, train_y_df)

    train_df = train_X_df.drop(columns=drop_after)
    if add_y:
        if isinstance(train_y_df, pd.DataFrame):
            train_df[train_y_df.columns] = train_y_df
        else:
            transformed_name = getattr(train_y_df, "name", None)
            train_df[target_name if transformed_name is None else transformed_name] = train_y_df

    train_df.set_index("PetID", inplace=True)

    # Every image of a pet gets the same values as the pet itself.
    filenames_and_target_values = {}
    for pet_id, pet_row in tqdm(train_df.iterrows(), total=len(train_df)):
        values_to_include = dict(pet_row)
        for image_path in glob.iglob(f"{directory}/{pet_id}*.jpg"):
            # basename() copes with whatever path separator glob returns.
            filename = os.path.basename(image_path)
            filenames_and_target_values[filename] = values_to_include

    filenames_and_target_values_df = pd.DataFrame.from_dict(filenames_and_target_values, orient="index")
    filenames_and_target_values_df = filenames_and_target_values_df.reset_index()
    filenames_and_target_values_df.rename(columns={'index': 'filename'}, inplace=True)
    filenames_and_target_values_df.to_csv(output_path, index=False)

    return filenames_and_target_values_df

In [None]:
# Necessary to use BreedImputer below
def replace_type_integers(X):
    """Return a copy of ``X`` after replacing the "Type" codes 1 and 2.

    The replacement itself (1 -> 'Dog', 2 -> 'Cat') is delegated to
    ``utils.replace_val_categorical``, which is applied to the copy; the
    input DataFrame is not modified.
    """
    type_labels = {1: 'Dog', 2: 'Cat'}
    X_replaced = X.copy()
    utils.replace_val_categorical(X_replaced, {'Type': type_labels})
    return X_replaced

In [None]:
# One-hot encode Breed1 after ce.OrdinalEncoder (starts from 1)
# and save it in one column for multioutput training
def one_hot_encoder_breed1(X):
    """Return a copy of ``X`` with zero-based "Breed1" and its one-hot lists.

    "Breed1" is shifted down by one, and a new "Breed1_onehot" column holds,
    for every row, the one-hot vector of that shifted value as a Python list.
    """
    encoded = X.copy()
    zero_based_breed = encoded['Breed1'] - 1
    encoded['Breed1'] = zero_based_breed
    one_hot_rows = to_categorical(zero_based_breed.values).tolist()
    encoded["Breed1_onehot"] = one_hot_rows
    return encoded

In [None]:
#Transformer to include logarithmic transformation of RescuerCount and Age
#(we will train on the raw data, and also the transformed ones):
class LogTransform(BaseEstimator, TransformerMixin):
    """Add a ``log(1 + x)`` version of each selected column.

    For every column ``c`` in ``columns`` a new column ``c_log`` is added;
    the original columns are kept.

    Parameters
    ----------
    columns : list of str
        Names of the columns to transform.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is optional and ignored. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``<column>_log`` columns added.

        Raises ``KeyError`` if a configured column is missing from ``X``.
        """
        X = X.copy()
        for column in self.columns:
            # log1p computes log(1 + x) and stays accurate for x close to 0.
            X[f"{column}_log"] = np.log1p(X[column])
        return X

In [None]:
# ordinal_encoder = ce.OrdinalEncoder(cols=["Breed1"])

# filenames_and_targets_predictors = get_dataframe_CNN_training_predictors(
#     directory="../input/tfg-pet-adoption-resized-train-images",
#     train_X_df=X,
#     train_y_df=y,
#     columns_before=["Age", "Type", "RescuerID", "Breed1", "Breed2"],
#     columns_after=["RescuerCount"],
#     transformations={
#         "Type": [FunctionTransformer(func=replace_type_integers)],
#         "RescuerID": [ReplaceRescuerID()],
#         "Breed1": [LeftJoinReplace(values_dict=breeds_dict,
#                                    variables=["Breed1", "Breed2"]),
#                    BreedImputer(),
#                    ordinal_encoder,
#                    FunctionTransformer(func=one_hot_encoder_breed1)],
#         "Age": [LogTransform(["Age"])],
#         "RescuerCount": [LogTransform(["RescuerCount"])]
#     },
#     drop_after=["Type", "RescuerID", "Breed1", "Breed2"]
# )

In [None]:
# Reload the per-image table saved earlier.
filenames_and_targets_predictors = pd.read_csv(
    "../input/tfg-pet-adoption-data/dataframe_CNN_training_predictors.csv"
)
# The one-hot vectors come back from the CSV as text; parse them into lists.
filenames_and_targets_predictors["Breed1_onehot"] = (
    filenames_and_targets_predictors["Breed1_onehot"].apply(json.loads)
)
# The PetID is the part of the filename before its last '-'.
filenames_and_targets_predictors["PetID"] = filenames_and_targets_predictors["filename"].apply(
    lambda filename: str(filename)[:str(filename).rindex('-')]
)
filenames_and_targets_predictors

In [None]:
# Column dtypes and non-null counts of the per-image table.
filenames_and_targets_predictors.info()

We can see that all the values of the column Breed1_onehot have a length of 175, the number of observed different breeds in the training dataset:

In [None]:
# Distribution of the one-hot vector lengths (a single value is expected).
filenames_and_targets_predictors["Breed1_onehot"].map(len).value_counts()

The following is the correspondence between each ordinal encoded value (position `value` - 1 in the list) and the Breed (see Version 48):

In [None]:
# ordinal_encoder.mapping[0]["mapping"]

Now, let's create the training and validation split, the same as before. As the order in which the filenames appear in the dataframe with the AdoptionSpeed values is not the same one as the order of this dataframe, we will look into the `filename` values of the training and validation sets of that dataframe:

In [None]:
# Images of pets in the training set of the earlier split go to the
# training frame ...
filenames_and_targets_predictors_train_no_instance_overlap = filenames_and_targets_predictors[
    filenames_and_targets_predictors["PetID"].isin(set_pet_ids_train)
].copy()
# ... and the images of every other pet go to the validation frame, so no
# pet contributes images to both.
filenames_and_targets_predictors_val_no_instance_overlap = filenames_and_targets_predictors[
    ~filenames_and_targets_predictors["PetID"].isin(set_pet_ids_train)
].copy()

In [None]:
# The new split must contain exactly the same files as the AdoptionSpeed
# split, for training and for validation (both lines should print True).
for _reference_split, _predictors_split in (
    (filenames_and_target_values_train_no_instance_overlap,
     filenames_and_targets_predictors_train_no_instance_overlap),
    (filenames_and_target_values_val_no_instance_overlap,
     filenames_and_targets_predictors_val_no_instance_overlap),
):
    print(sorted(_reference_split["filename"].values) == sorted(_predictors_split["filename"].values))

# Training and validation must share neither files nor pets (both lines
# should print an empty set).
for _shared_key in ("filename", "PetID"):
    print(set(filenames_and_targets_predictors_train_no_instance_overlap[_shared_key].values)
          & set(filenames_and_targets_predictors_val_no_instance_overlap[_shared_key].values))

**https://keras.io/api/models/model_training_apis/#compile-method --> The loss value that will be minimized by the model will then be the weighted sum of all individual losses, weighted by the loss_weights coefficients.**

As we have multiple outputs of different types, we will define a different loss function for each one; each loss will be weighted according to the mean mutual information value (classification and regression), and the weights of the FC network will then be updated during backpropagation according to that weighted sum.

In [None]:
# Weight of each output: mean of its "classification" and "regression"
# mutual information values.
weight_rescuercount = mutual_info_df_5cv.loc["RescuerCount"].mean()
weight_age = mutual_info_df_5cv.loc["Age"].mean()
# Breed1 is represented by its target-encoded columns, averaged together.
breed1_rows = [row_label for row_label in mutual_info_df_5cv.index
               if "Breed1_AdoptionSpeed_" in row_label]
weight_breed1 = mutual_info_df_5cv.loc[breed1_rows].mean().mean()

# Normalize so that they sum 1.0
loss_weights = np.array([weight_rescuercount, weight_age, weight_breed1])
loss_weights = loss_weights / loss_weights.sum()
display(loss_weights)

Now, let's create the multioutput model and compile it using the appropriate loss functions and the weight of each one, which we have just computed:

In [None]:
# DenseNet121 (ImageNet weights, no classification top) followed by global
# average pooling and two dense layers (64 and 16 units) that feed three
# output heads: two linear regressions and a 175-way softmax.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out1 = Dense(1, activation="linear", name="out_rescuer")(x) #RescuerCount (either same or log)
out2 = Dense(1, activation="linear", name="out_age")(x) #Age (either same or log)
out3 = Dense(175, activation="softmax", name="out_breed")(x) #Breed1

# Freezing DenseNet121 backbone
for layer in backbone.layers:
    layer.trainable = False

multioutput_model = Model(inputs=inp, outputs=[out1,out2,out3])

# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# optimizers either ignore with a warning or reject.
optimizer = Adam(learning_rate=0.001)
metrics = {"out_breed": "accuracy"}
# One loss per output, keyed by the output layer name.
loss = {"out_rescuer": "mean_squared_error",
        "out_age": "mean_squared_error",
        "out_breed": "categorical_crossentropy"}
# Same order as loss_weights: RescuerCount, Age, Breed1.
loss_weights_dict = {"out_rescuer": loss_weights[0],
                "out_age": loss_weights[1],
                "out_breed": loss_weights[2]}

multioutput_model.compile(loss=loss, loss_weights=loss_weights_dict, metrics=metrics, optimizer=optimizer)

The following are the last layers of our multioutput model:

In [None]:
# from keras.utils import plot_model
# plot_model(multioutput_model, show_shapes=True)

In [None]:
# Show the stored diagram of the model's last layers.
Image("../input/tfg-pet-adoption-data/model_zoom_FC.png")

As in previous training processes, we have to define the ImageDataGenerator instances for training and validation (the first one with data augmentation, for now just horizontal flipping of images) and the corresponding iterators:

In [None]:
# Training images get DenseNet preprocessing plus random horizontal flips.
multioutput_train_generator = ImageDataGenerator(
    preprocessing_function=preprocess_input_densenet,
    horizontal_flip=True,
)

# Validation images get DenseNet preprocessing only.
multioutput_val_generator = ImageDataGenerator(preprocessing_function=preprocess_input_densenet)

In [None]:
batch_size = 32
target_size = (256, 256)

# Training batches: one target per model output (RescuerCount, Age, Breed1
# one-hot), reshuffled with a fixed seed.
multioutput_train_iterator = multioutput_train_generator.flow_from_dataframe(
    dataframe=filenames_and_targets_predictors_train_no_instance_overlap,
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col=["RescuerCount", "Age", "Breed1_onehot"],
    class_mode="multi_output",
    target_size=target_size,
    batch_size=batch_size,
    seed=seed,
    shuffle=True,
)

# Validation batches: same configuration, but kept in dataframe order.
multioutput_val_iterator = multioutput_val_generator.flow_from_dataframe(
    dataframe=filenames_and_targets_predictors_val_no_instance_overlap,
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col=["RescuerCount", "Age", "Breed1_onehot"],
    class_mode="multi_output",
    target_size=target_size,
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
)

**?? https://stackoverflow.com/questions/50571641/compilation-options-of-a-multi-output-model-multiple-losses-loss-weighting, https://github.com/keras-team/keras/issues/10306**

**The training below was done using the scaled versions of RescuerCount and Age (x-mean/std):**

In [None]:
epochs = 30
batch_size = 32

multioutput_evaluation_results = pd.DataFrame([], columns=["loss",
                        "out_rescuer_loss", "out_age_loss", "out_breed_loss",
                        "out_breed_accuracy", "val_loss", "val_out_rescuer_loss",
                        "val_out_age_loss", "val_out_breed_loss",
                        "val_out_breed_accuracy"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
checkpoint_callback = ModelCheckpoint(
    "DenseNet121_multioutput_64-16_{epoch:02d}-epoch_val_loss-{val_loss:02f}.h5",
    monitor='val_loss', save_best_only=True)

multioutput_history = multioutput_model.fit(
                    x=multioutput_train_iterator,
                    validation_data=multioutput_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[checkpoint_callback, early_stopping])

train_loss = multioutput_history.history["loss"]
train_rescuer_loss = multioutput_history.history["out_rescuer_loss"]
train_age_loss = multioutput_history.history["out_age_loss"]
train_breed_loss = multioutput_history.history["out_breed_loss"]
train_breed_accuracy = multioutput_history.history["out_breed_accuracy"]
validation_loss = multioutput_history.history["val_loss"]
validation_rescuer_loss = multioutput_history.history["val_out_rescuer_loss"]
validation_age_loss = multioutput_history.history["val_out_age_loss"]
validation_breed_loss = multioutput_history.history["val_out_breed_loss"]
validation_breed_accuracy = multioutput_history.history["val_out_breed_accuracy"]

for i in range(len(train_loss)):
    multioutput_evaluation_results = multioutput_evaluation_results.append({
        "loss": train_loss[i],
        "out_rescuer_loss": train_rescuer_loss[i],
        "out_age_loss": train_age_loss[i],
        "out_breed_loss": train_breed_loss[i],
        "out_breed_accuracy": train_breed_accuracy[i],
        "val_loss": validation_loss[i],
        "val_out_rescuer_loss": validation_rescuer_loss[i],
        "val_out_age_loss": validation_age_loss[i],
        "val_out_breed_loss": validation_breed_loss[i],
        "val_out_breed_accuracy": validation_breed_accuracy[i]
    }, ignore_index=True)

# multioutput_model.save(f"DenseNet121_multioutput_64-16_{epochs}-epochs.h5")
multioutput_evaluation_results.to_csv(f"DenseNet121_multioutput__scaled__64-16_{epochs}-epochs.csv", index=False)
to_latex(multioutput_evaluation_results, f"DenseNet121_multioutput__scaled__64-16_{epochs}-epochs")

# Zipping the checkpoints
!mkdir scaled_no_log
!mv ./DenseNet121_multioutput_64*.h5 scaled_no_log
shutil.make_archive("DenseNet121_multioutput_scaled_no_log_checkpoints", "zip", "./scaled_no_log")

In [None]:
# Reload the saved per-epoch metrics of the multi-output model and plot the
# total loss, the three per-output losses and the breed accuracy.
multioutput_evaluation_results = pd.read_csv(
    "../input/tfg-pet-adoption-data/DenseNet121_multioutput__scaled__64-16_30-epochs.csv"
)
plot_history(
    multioutput_evaluation_results,
    [
        "loss",
        "out_rescuer_loss",
        "out_age_loss",
        "out_breed_loss",
        "out_breed_accuracy",
    ],
    figsize=(10, 16),
    nrows=3,
    ncols=2,
)

Beyond epoch 8 the validation loss does not improve, so let's load the model after that epoch:

In [None]:
# Checkpoint saved after epoch 8 (validation loss 0.919037 according to
# its file name).
multioutput_model_scaled = load_model(
    filepath="../input/tfg-pet-adoption-data/DenseNet121_multioutput__scaled__64-16_08-epoch_val_loss-0.919037.h5"
)

In [None]:
# Inspect the last ten layers to find the positions of the dense layers
# that will be used as feature extractors.
multioutput_model_scaled.layers[-10:]

In [None]:
# Cut the multi-output network at its fifth-to-last layer (presumably the
# 64-unit dense layer, judging by the model name used below) and use its
# activations as image features.
model = Model(
    inputs=multioutput_model_scaled.input,
    outputs=multioutput_model_scaled.layers[-5].output,
)
ife = ImageFeatureExtractor(
    model=model,
    model_name="DenseNet121_multioutput__scaled__layer-64_8-epochs",
    construct_from_cnn_backbone=False,
    from_image=True,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    include_feats=False,
    save=True,
    debug=True,
)

# The transformed output is discarded; presumably the point of this call is
# the features file written because save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Reload the stored layer-64 features of the multi-output model; the first
# CSV column is used as the index.
multioutput_model_scaled_layer_64_features_8e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_multioutput__scaled__layer-64_8-epochs_in-256.csv",
    index_col=0,
)
display(multioutput_model_scaled_layer_64_features_8e.head())
# How many feature columns share each standard deviation value (a count at
# 0 means constant columns).
multioutput_model_scaled_layer_64_features_8e.describe().loc["std"].value_counts()

In [None]:
# Cut the multi-output network at its fourth-to-last layer (presumably the
# 16-unit dense layer, judging by the model name used below) and use its
# activations as image features.
model = Model(
    inputs=multioutput_model_scaled.input,
    outputs=multioutput_model_scaled.layers[-4].output,
)
ife = ImageFeatureExtractor(
    model=model,
    model_name="DenseNet121_multioutput__scaled__layer-16_8-epochs",
    construct_from_cnn_backbone=False,
    from_image=True,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    include_feats=False,
    save=True,
    debug=True,
)

# The transformed output is discarded; presumably the point of this call is
# the features file written because save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Reload the stored layer-16 features of the multi-output model; the first
# CSV column is used as the index.
multioutput_model_scaled_layer_16_features_8e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_multioutput__scaled__layer-16_8-epochs_in-256.csv",
    index_col=0,
)
display(multioutput_model_scaled_layer_16_features_8e.head())
# How many feature columns share each standard deviation value (a count at
# 0 means constant columns).
multioutput_model_scaled_layer_16_features_8e.describe().loc["std"].value_counts()

In [None]:
batch_size = 32
target_size = (256,256)

# Target columns for the three outputs (log-transformed RescuerCount and Age,
# plus the one-hot Breed1) and the options common to both iterators.
log_target_columns = ["RescuerCount_log", "Age_log", "Breed1_onehot"]
shared_flow_options = dict(
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    target_size=target_size,
    class_mode="multi_output",
    batch_size=batch_size,
    seed=seed,
)

# Training batches are shuffled; each call receives its own copy of the
# target-column list.
multioutput_log_train_iterator = multioutput_train_generator.flow_from_dataframe(
    dataframe=filenames_and_targets_predictors_train_no_instance_overlap,
    y_col=list(log_target_columns),
    shuffle=True,
    **shared_flow_options
)

# Validation batches keep the dataframe order.
multioutput_log_val_iterator = multioutput_val_generator.flow_from_dataframe(
    dataframe=filenames_and_targets_predictors_val_no_instance_overlap,
    y_col=list(log_target_columns),
    shuffle=False,
    **shared_flow_options
)

**The training below was done using the logarithmic transformation of RescuerCount and Age (log(1+x)), without scaling:**

In [None]:
epochs = 30
batch_size = 32

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
checkpoint_callback = ModelCheckpoint(
    "DenseNet121_multioutput__log__64-16_{epoch:02d}-epoch_val_loss-{val_loss:02f}.h5",
    monitor='val_loss', save_best_only=True)

multioutput_evaluation_results = pd.DataFrame([], columns=["loss",
                        "out_rescuer_loss", "out_age_loss", "out_breed_loss",
                        "out_breed_accuracy", "val_loss", "val_out_rescuer_loss",
                        "val_out_age_loss", "val_out_breed_loss",
                        "val_out_breed_accuracy"])
    
multioutput_history = multioutput_model.fit(
                    x=multioutput_log_train_iterator,
                    validation_data=multioutput_log_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[checkpoint_callback, early_stopping])

train_loss = multioutput_history.history["loss"]
train_rescuer_loss = multioutput_history.history["out_rescuer_loss"]
train_age_loss = multioutput_history.history["out_age_loss"]
train_breed_loss = multioutput_history.history["out_breed_loss"]
train_breed_accuracy = multioutput_history.history["out_breed_accuracy"]
validation_loss = multioutput_history.history["val_loss"]
validation_rescuer_loss = multioutput_history.history["val_out_rescuer_loss"]
validation_age_loss = multioutput_history.history["val_out_age_loss"]
validation_breed_loss = multioutput_history.history["val_out_breed_loss"]
validation_breed_accuracy = multioutput_history.history["val_out_breed_accuracy"]

for i in range(len(train_loss)):
    multioutput_evaluation_results = multioutput_evaluation_results.append({
        "loss": train_loss[i],
        "out_rescuer_loss": train_rescuer_loss[i],
        "out_age_loss": train_age_loss[i],
        "out_breed_loss": train_breed_loss[i],
        "out_breed_accuracy": train_breed_accuracy[i],
        "val_loss": validation_loss[i],
        "val_out_rescuer_loss": validation_rescuer_loss[i],
        "val_out_age_loss": validation_age_loss[i],
        "val_out_breed_loss": validation_breed_loss[i],
        "val_out_breed_accuracy": validation_breed_accuracy[i]
    }, ignore_index=True)

# multioutput_model.save(f"DenseNet121_multioutput__log__64-16_{epochs}-epochs.h5")
multioutput_evaluation_results.to_csv(f"DenseNet121_multioutput__log__64-16_{epochs}-epochs.csv", index=False)
to_latex(multioutput_evaluation_results, f"DenseNet121_multioutput__log__64-16_{epochs}-epochs")

# Zipping the checkpoints
!mkdir log_no_scaled
!mv ./DenseNet121_multioutput__log__*.h5 log_no_scaled
shutil.make_archive("DenseNet121_multioutput_log_no_scaled_checkpoints", "zip", "./log_no_scaled")

In [None]:
# Reload the stored per-epoch metrics of the log-target run and plot them on a
# 3x2 grid.
multioutput_log_history_csv = "../input/tfg-pet-adoption-data/DenseNet121_multioutput__log__64-16_30-epochs.csv"
multioutput_evaluation_results = pd.read_csv(multioutput_log_history_csv)

multioutput_plotted_metrics = [
    "loss",
    "out_rescuer_loss",
    "out_age_loss",
    "out_breed_loss",
    "out_breed_accuracy",
]
plot_history(multioutput_evaluation_results, multioutput_plotted_metrics,
             nrows=3, ncols=2, figsize=(10,16))

In this case, we will load the model saved after the 8th epoch, as the validation loss does not improve beyond that point:

In [None]:
# Restore the "log" multi-output DenseNet121 from the checkpoint whose filename
# marks it as epoch 08 (val_loss 1.582031).
log_checkpoint_path = "../input/tfg-pet-adoption-data/DenseNet121_multioutput__log__64-16_08-epoch_val_loss-1.582031.h5"
multioutput_model_log = load_model(log_checkpoint_path)

In [None]:
# Truncate the "log" multi-output network at layers[-5]. model_name labels this
# as the 64-unit layer -- NOTE(review): confirm against the model's layer order.
feature_layer = multioutput_model_log.layers[-5]
model = Model(inputs=multioutput_model_log.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_multioutput__log__layer-64_8-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
multioutput_model_log_layer_64_features_8e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_multioutput__log__layer-64_8-epochs_in-256.csv",
    index_col=0,
)
display(multioutput_model_log_layer_64_features_8e.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
multioutput_model_log_layer_64_features_8e.describe().loc["std"].value_counts()

In [None]:
# Truncate the "log" multi-output network at layers[-4]. model_name labels this
# as the 16-unit layer -- NOTE(review): confirm against the model's layer order.
feature_layer = multioutput_model_log.layers[-4]
model = Model(inputs=multioutput_model_log.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_multioutput__log__layer-16_8-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
multioutput_model_log_layer_16_features_8e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_multioutput__log__layer-16_8-epochs_in-256.csv",
    index_col=0,
)
display(multioutput_model_log_layer_16_features_8e.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
multioutput_model_log_layer_16_features_8e.describe().loc["std"].value_counts()

In [None]:
# Evaluate an XGBClassifier on the tabular pipeline plus each set of image
# features extracted from the multi-output DenseNet121 models, using both
# cross-validation and the single CNN train/validation split.

# Preprocessing steps shared by every run. The None entry is a placeholder that
# is replaced on each iteration by the image-features step.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Locate the placeholder once, before it gets overwritten, instead of relying on
# a hard-coded negative index that breaks silently when steps are added/removed.
image_features_step_idx = pipeline_transformers.index(None)

evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append is deprecated (removed in pandas 2.0).
evaluation_rows = []

image_features_dataframes = {
    "multioutput_model_scaled_layer-64": multioutput_model_scaled_layer_64_features_8e,
    "multioutput_model_scaled_layer-16": multioutput_model_scaled_layer_16_features_8e,
    "multioutput_model_log_layer-64": multioutput_model_log_layer_64_features_8e,
    "multioutput_model_log_layer-16": multioutput_model_log_layer_16_features_8e
}

for features_description, loaded_features in image_features_dataframes.items():
    # Serve the precomputed features instead of running a CNN.
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features)
    pipeline_transformers[image_features_step_idx] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier with image features from DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Show full model descriptions, then go back to a column width of 50.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_multioutput_8-epochs-all")

Using some of the predictor variables with the highest mutual information with AdoptionSpeed as target variables in the CNN does not seem to yield better results than using AdoptionSpeed as the CNN's target variable.

#### Data augmentation and Dropout

First of all, let's train the regression model using more data augmentation strategies:

In [None]:
# Augmentations applied to training images only: horizontal flips, zoom and
# horizontal/vertical shifts. Rotation and brightness are left disabled.
train_augmentation_options = dict(
    horizontal_flip=True,
    zoom_range=[0.5, 1.0],
#     rotation_range=30,
#     brightness_range=[0.7, 1.0],
    width_shift_range=0.3,
    height_shift_range=0.3,
)
adoptionspeed_train_generator_da = ImageDataGenerator(
    preprocessing_function=preprocess_input_densenet,
    **train_augmentation_options
)

# Validation images only go through the DenseNet preprocessing function.
adoptionspeed_val_generator = ImageDataGenerator(preprocessing_function=preprocess_input_densenet)

In [None]:
target_size = (256,256)
batch_size = 32

# Options common to both iterators: AdoptionSpeed is passed through unchanged
# as the target (class_mode="raw").
adoptionspeed_flow_options = dict(
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col="AdoptionSpeed",
    target_size=target_size,
    class_mode="raw",
    batch_size=batch_size,
    seed=seed,
)

# Augmented, shuffled training batches.
regression_adoptionspeed_train_iterator_da = adoptionspeed_train_generator_da.flow_from_dataframe(
    dataframe=filenames_and_target_values_train_no_instance_overlap,
    shuffle=True,
    **adoptionspeed_flow_options
)

# Validation batches in dataframe order, without augmentation.
regression_adoptionspeed_val_iterator = adoptionspeed_val_generator.flow_from_dataframe(
    dataframe=filenames_and_target_values_val_no_instance_overlap,
    shuffle=False,
    **adoptionspeed_flow_options
)

In [None]:
# Regression network for AdoptionSpeed: ImageNet-initialised DenseNet121
# backbone (no top) -> global average pooling -> Dense(64) -> Dense(16) ->
# a single linear output.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone (only the new dense head is trained)
for layer in backbone.layers:
    layer.trainable = False

regression_model_data_aug = Model(inp,out)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_data_aug.compile(loss=loss, metrics=metrics, optimizer=optimizer)

Now, we will increase the number of epochs as the inputs will vary more than before:

In [None]:
epochs = 30
batch_size = 32

regression_da_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__data-aug__64-16_{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

regression_da_history = regression_model_data_aug.fit(
                    x=regression_adoptionspeed_train_iterator_da,
                    validation_data=regression_adoptionspeed_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_da_history.history["loss"]
train_mae = regression_da_history.history["mean_absolute_error"]
validation_loss = regression_da_history.history["val_loss"]
validation_mae = regression_da_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_da_training_results = regression_da_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

regression_model_data_aug.save(f"DenseNet121_regression__data-aug__64-16_{epochs}-epochs.h5")
regression_da_training_results.to_csv(f"DenseNet121_regression__data-aug__64-16_{epochs}-epochs.csv", index=False)
to_latex(regression_da_training_results, f"DenseNet121_regression__data-aug__64-16_{epochs}-epochs")

# Zipping the checkpoints
!mkdir regression_da_no_dropout
!mv ./DenseNet121_regression__data-aug__*.h5 regression_da_no_dropout
shutil.make_archive("DenseNet121_regression__data-aug__no-dropout__checkpoints", "zip", "./regression_da_no_dropout")

In [None]:
# Reload the stored per-epoch metrics of the data-augmentation run and plot
# loss and MAE side by side.
regression_da_history_csv = "../input/tfg-pet-adoption-data/DenseNet121_regression__data-aug__64-16_30-epochs.csv"
regression_da_training_results = pd.read_csv(regression_da_history_csv)
plot_history(
    regression_da_training_results,
    ["loss", "mean_absolute_error"],
    1, 2, (12,3),
)

The training process stopped after epoch 13 as the validation loss didn't improve since epoch 8. Let's load the model after epoch 8:

In [None]:
# Restore the data-augmentation regression model from a saved checkpoint.
# NOTE(review): the accompanying text refers to epoch 8, but this file is the
# "06-epochs" checkpoint (val_loss 1.171272) -- confirm which one is intended.
data_aug_checkpoint_path = "../input/tfg-pet-adoption-data/DenseNet121_regression__data-aug__64-16_06-epochs_val_loss-1.171272.h5"
regression_model_data_aug = load_model(data_aug_checkpoint_path)

In [None]:
# Truncate the data-augmentation model at layers[-3]. With the head built in
# this notebook (Dense 64 -> Dense 16 -> Dense 1) that is the Dense(64) layer,
# assuming the loaded checkpoint has that same architecture.
feature_layer = regression_model_data_aug.layers[-3]
model = Model(inputs=regression_model_data_aug.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__data-aug__layer-64_6-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_data_aug_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__data-aug__layer-64_6-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_data_aug_layer_64_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_data_aug_layer_64_features.describe().loc["std"].value_counts()

In [None]:
# Truncate the data-augmentation model at layers[-2]. With the head built in
# this notebook (Dense 64 -> Dense 16 -> Dense 1) that is the Dense(16) layer,
# assuming the loaded checkpoint has that same architecture.
feature_layer = regression_model_data_aug.layers[-2]
model = Model(inputs=regression_model_data_aug.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__data-aug__layer-16_6-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_data_aug_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__data-aug__layer-16_6-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_data_aug_layer_16_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_data_aug_layer_16_features.describe().loc["std"].value_counts()

Now, let's train it without data augmentation (just horizontal flipping, as before) but with Dropout layers, using the same number of epochs as in the original training (5). The goal is to see whether more of the extracted features become useful (we saw that the outputs of the Dense 16 layer were all 0 except for one feature). We will test 3 setups: dropout before the Dense 64 layer, dropout before the Dense 16 layer, and dropout before both.

In [None]:
target_size = (256,256)
batch_size = 32

# Training generator limited to horizontal flips plus DenseNet preprocessing.
adoptionspeed_train_generator = ImageDataGenerator(horizontal_flip=True,
                                                   preprocessing_function=preprocess_input_densenet)

# Shuffled training batches with AdoptionSpeed passed through unchanged as the
# target (class_mode="raw").
train_flow_options = dict(
    dataframe=filenames_and_target_values_train_no_instance_overlap,
    directory="../input/tfg-pet-adoption-resized-train-images/",
    x_col="filename",
    y_col="AdoptionSpeed",
    target_size=target_size,
    class_mode="raw",
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
)
regression_adoptionspeed_train_iterator = adoptionspeed_train_generator.flow_from_dataframe(**train_flow_options)

In [None]:
# Regression network with Dropout(0.25) placed before the Dense(64) layer:
# DenseNet121 backbone (no top) -> global average pooling -> Dropout ->
# Dense(64) -> Dense(16) -> a single linear output.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(64, activation="relu")(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone (only the new head is trained)
for layer in backbone.layers:
    layer.trainable = False

regression_model_dropout_64 = Model(inp,out)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Regression network with Dropout(0.25) placed before the Dense(16) layer:
# DenseNet121 backbone (no top) -> global average pooling -> Dense(64) ->
# Dropout -> Dense(16) -> a single linear output.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone (only the new head is trained)
for layer in backbone.layers:
    layer.trainable = False

regression_model_dropout_16 = Model(inp,out)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_16.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Regression network with Dropout(0.25) before both dense layers:
# DenseNet121 backbone (no top) -> global average pooling -> Dropout ->
# Dense(64) -> Dropout -> Dense(16) -> a single linear output.
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone (only the new head is trained)
for layer in backbone.layers:
    layer.trainable = False

regression_model_dropout_64_16 = Model(inp,out)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Train each of the three dropout variants for a fixed number of epochs (no
# early stopping or checkpoints) and export the model and its per-epoch metrics.
epochs = 5
batch_size = 32

# The key is embedded in the output file names.
regression_models_dropout = {
    "regression__dropout_64": regression_model_dropout_64,
    "regression__dropout_16": regression_model_dropout_16,
    "regression__dropout_64-16": regression_model_dropout_64_16
}

history_columns = ["loss",
                   "mean_absolute_error", "val_loss",
                   "val_mean_absolute_error"]

for model_desc, regression_model in regression_models_dropout.items():
    # NOTE(review): batch_size is also passed here although the iterators
    # already yield batches of that size; the Keras docs advise against it.
    regression_history = regression_model.fit(
                        x=regression_adoptionspeed_train_iterator,
                        validation_data=regression_adoptionspeed_val_iterator,
                        batch_size=batch_size, epochs=epochs)

    # One row per epoch, built in a single step from History.history: the
    # row-by-row DataFrame.append used previously is deprecated (removed in
    # pandas 2.0). Indexing with the column list still raises KeyError if an
    # expected metric is missing.
    regression_dropout_training_results = pd.DataFrame(regression_history.history)[history_columns]

    regression_model.save(f"DenseNet121_{model_desc}__64-16_{epochs}-epochs.h5")
    regression_dropout_training_results.to_csv(f"DenseNet121_{model_desc}__64-16_{epochs}-epochs.csv", index=False)
    to_latex(regression_dropout_training_results, f"DenseNet121_{model_desc}__64-16_{epochs}-epochs")

In [None]:
# Plot the stored training history (loss and MAE) of each dropout variant.
# Note that this rebinds regression_models_dropout to a list of name fragments.
regression_models_dropout = ["dropout_64", "dropout_16", "dropout_64-16"]
for model_desc in regression_models_dropout:
    dropout_history_csv = f"../input/tfg-pet-adoption-data/DenseNet121_regression__{model_desc}__64-16_5-epochs.csv"
    regression_training_results_dropout = pd.read_csv(dropout_history_csv)
    plot_history(
        regression_training_results_dropout,
        ["loss", "mean_absolute_error"],
        1, 2, (12,3),
    )

Let's extract the outputs of the 64-neuron and 16-neuron layers of each model with dropout:

In [None]:
# Restore the saved 5-epoch model with dropout before the Dense(64) layer.
dropout_64_model_path = "../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64__64-16_5-epochs.h5"
regression_model_dropout_64 = load_model(dropout_64_model_path)

In [None]:
# Truncate the dropout-64 model at layers[-3]. With the head built in this
# notebook (Dropout -> Dense 64 -> Dense 16 -> Dense 1) that is the Dense(64)
# layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_64.layers[-3]
model = Model(inputs=regression_model_dropout_64.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_64__layer-64_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_64_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_layer_64_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_64_layer_64_features.describe().loc["std"].value_counts()

In [None]:
# Truncate the dropout-64 model at layers[-2]. With the head built in this
# notebook (Dropout -> Dense 64 -> Dense 16 -> Dense 1) that is the Dense(16)
# layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_64.layers[-2]
model = Model(inputs=regression_model_dropout_64.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_64__layer-16_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_64_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_layer_16_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_64_layer_16_features.describe().loc["std"].value_counts()

In [None]:
# Restore the saved 5-epoch model with dropout before the Dense(16) layer.
dropout_16_model_path = "../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_16__64-16_5-epochs.h5"
regression_model_dropout_16 = load_model(dropout_16_model_path)

In [None]:
# Truncate the dropout-16 model at layers[-4]. With the head built in this
# notebook (Dense 64 -> Dropout -> Dense 16 -> Dense 1) that is the Dense(64)
# layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_16.layers[-4]
model = Model(inputs=regression_model_dropout_16.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_16__layer-64_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_16_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_16__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_16_layer_64_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_16_layer_64_features.describe().loc["std"].value_counts()

In [None]:
# Truncate the dropout-16 model at layers[-2]. With the head built in this
# notebook (Dense 64 -> Dropout -> Dense 16 -> Dense 1) that is the Dense(16)
# layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_16.layers[-2]
model = Model(inputs=regression_model_dropout_16.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_16__layer-16_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_16_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_16__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_16_layer_16_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_16_layer_16_features.describe().loc["std"].value_counts()

In [None]:
# Restore the saved 5-epoch model with dropout before both dense layers.
dropout_64_16_model_path = "../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64-16__64-16_5-epochs.h5"
regression_model_dropout_64_16 = load_model(dropout_64_16_model_path)

In [None]:
# Truncate the dropout-64-16 model at layers[-4]. With the head built in this
# notebook (Dropout -> Dense 64 -> Dropout -> Dense 16 -> Dense 1) that is the
# Dense(64) layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_64_16.layers[-4]
model = Model(inputs=regression_model_dropout_64_16.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_64-16__layer-64_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_64_16_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64-16__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_16_layer_64_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_64_16_layer_64_features.describe().loc["std"].value_counts()

In [None]:
# Truncate the dropout-64-16 model at layers[-2]. With the head built in this
# notebook (Dropout -> Dense 64 -> Dropout -> Dense 16 -> Dense 1) that is the
# Dense(16) layer, assuming the loaded file has that same architecture.
feature_layer = regression_model_dropout_64_16.layers[-2]
model = Model(inputs=regression_model_dropout_64_16.input,
              outputs=feature_layer.output)

# Feed the original training images through the truncated model using the
# project's ImageFeatureExtractor (presumably persisting them, given save=True).
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name=f"DenseNet121_regression__dropout_64-16__layer-16_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is not used in this cell.
_ = ife.fit_transform(X, y)

In [None]:
# Load the exported layer activations; the CSV's first column becomes the index.
regression_model_dropout_64_16_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64-16__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_16_layer_16_features.head())  # first 5 rows

# Tally the per-column standard deviations: a high count at 0.0 would mean that
# many features are constant across all rows.
regression_model_dropout_64_16_layer_16_features.describe().loc["std"].value_counts()

In [None]:
# Evaluate an XGBClassifier on the tabular pipeline plus each set of image
# features extracted from the data-augmentation and dropout regression models,
# using both cross-validation and the single CNN train/validation split.

# Preprocessing steps shared by every run. The None entry is a placeholder that
# is replaced on each iteration by the image-features step.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Locate the placeholder once, before it gets overwritten, instead of relying on
# a hard-coded negative index that breaks silently when steps are added/removed.
image_features_step_idx = pipeline_transformers.index(None)

evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append is deprecated (removed in pandas 2.0).
evaluation_rows = []

image_features_dataframes = {
    "AdoptionSpeed, regression_model_data-aug_layer-64_6e": regression_model_data_aug_layer_64_features,
    "AdoptionSpeed, regression_model_data-aug_layer-16_6e": regression_model_data_aug_layer_16_features,
    "AdoptionSpeed, regression_model_dropout-64_layer-64_5e": regression_model_dropout_64_layer_64_features,
    "AdoptionSpeed, regression_model_dropout-64_layer-16_5e": regression_model_dropout_64_layer_16_features,
    "AdoptionSpeed, regression_model_dropout-16_layer-64_5e": regression_model_dropout_16_layer_64_features,
    "AdoptionSpeed, regression_model_dropout-16_layer-16_5e": regression_model_dropout_16_layer_16_features,
    "AdoptionSpeed, regression_model_dropout-64-16_layer-64_5e": regression_model_dropout_64_16_layer_64_features,
    "AdoptionSpeed, regression_model_dropout-64-16_layer-16_5e": regression_model_dropout_64_16_layer_16_features
}

for features_description, loaded_features in image_features_dataframes.items():
    # Serve the precomputed features instead of running a CNN.
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features)
    pipeline_transformers[image_features_step_idx] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier, DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Show full model descriptions, then go back to a column width of 50.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_regression__data-aug__dropout")

It is no surprise that, within a limited number of epochs, the models with a single Dropout layer perform better than those with two Dropout layers. However, since in the next section we will unfreeze some of the DenseNet121 backbone layers and train with data augmentation, it will be preferable to use two Dropout layers in order to generalize better.

#### Data augmentation + Dropout + Fine-tuning

Let's train the regression model with 2 Dropout layers, using the training iterator with several data augmentation methods, for a few epochs (we will monitor the validation loss and stop training once it no longer improves):

In [None]:
# Single-image regression network: ImageNet-pretrained DenseNet121 backbone followed by
# GlobalAveragePooling -> Dropout -> Dense(64) -> Dropout -> Dense(16) -> Dense(1, linear).
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

# Freezing DenseNet121 backbone: only the layers added on top of it are trained at this stage
for layer in backbone.layers:
    layer.trainable = False

regression_model_dropout_64_16 = Model(inp,out)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
epochs = 30
batch_size = 32

regression_dropout_data_aug_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__dropout_64-16__data-aug__64-16_{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

regression_dropout_data_aug_history = regression_model_dropout_64_16.fit(
                    x=regression_adoptionspeed_train_iterator_da,
                    validation_data=regression_adoptionspeed_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_dropout_data_aug_history.history["loss"]
train_mae = regression_dropout_data_aug_history.history["mean_absolute_error"]
validation_loss = regression_dropout_data_aug_history.history["val_loss"]
validation_mae = regression_dropout_data_aug_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_dropout_data_aug_training_results = regression_dropout_data_aug_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

# regression_model_dropout_64_16.save(f"DenseNet121_regression__dropout_64-16__data-aug__64-16_{epochs}-epochs.h5")
regression_dropout_data_aug_training_results.to_csv(f"DenseNet121_regression__dropout_64-16__data-aug__64-16_{epochs}-epochs.csv", index=False)
to_latex(regression_dropout_data_aug_training_results, f"DenseNet121_regression__dropout_64-16__data-aug__64-16_{epochs}-epochs")

# Zipping the checkpoints
!mkdir regression_dropout_data_aug
!mv ./DenseNet121_regression__dropout_64-16__data-aug*.h5 regression_dropout_data_aug
shutil.make_archive("DenseNet121_regression__dropout_64-16__data-aug__checkpoints", "zip", "./regression_dropout_data_aug")

In [None]:
# Reload the per-epoch training log (the file name matches the CSV written by the training
# cell above with epochs=30) and plot the loss and MAE curves.
regression_dropout_data_aug_training_results = pd.read_csv("../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64-16__data-aug__64-16_30-epochs.csv")
plot_history(regression_dropout_data_aug_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

In [None]:
# Reload a checkpoint of the frozen-backbone model; following the ModelCheckpoint naming
# pattern used above, the file name encodes epoch 6 with a validation loss of 1.171635.
regression_model_dropout_64_16_6e = load_model("../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64-16__data-aug__64-16_06-epochs_val_loss-1.171635.h5")

In [None]:
# Cut the loaded network at layers[-4] (the Dense(64) layer in the architecture defined
# in this section) and use that sub-model as the image feature extractor.
model = Model(
    inputs=regression_model_dropout_64_16_6e.input,
    outputs=regression_model_dropout_64_16_6e.layers[-4].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__dropout_64-16__data-aug__layer-64_6-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored layer-64 image features (first CSV column is the index).
regression_model_dropout_64_16_data_aug_layer_64_features_6e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64-16__data-aug__layer-64_6-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_16_data_aug_layer_64_features_6e.head())
# How many columns share each standard-deviation value (a std of 0 means a constant feature).
regression_model_dropout_64_16_data_aug_layer_64_features_6e.describe().loc["std"].value_counts()

In [None]:
# Cut the loaded network at layers[-2] (the Dense(16) layer in the architecture defined
# in this section) and use that sub-model as the image feature extractor.
model = Model(
    inputs=regression_model_dropout_64_16_6e.input,
    outputs=regression_model_dropout_64_16_6e.layers[-2].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__dropout_64-16__data-aug__layer-16_6-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored layer-16 image features (first CSV column is the index).
regression_model_dropout_64_16_data_aug_layer_16_features_6e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__dropout_64-16__data-aug__layer-16_6-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_dropout_64_16_data_aug_layer_16_features_6e.head())
# How many columns share each standard-deviation value (a std of 0 means a constant feature).
regression_model_dropout_64_16_data_aug_layer_16_features_6e.describe().loc["std"].value_counts()

**Now, let's unfreeze the last convolutional block of the DenseNet121 backbone and compile the model with a smaller learning rate:**

In [None]:
# Print the index and name of every layer from position 375 onwards; the listing is
# used to pick the unfreezing boundary in the next cell.
for i, layer in enumerate(regression_model_dropout_64_16_6e.layers[375:], start=375):
    print(i, layer.name)

In [None]:
# Keep layers 0..417 frozen and make everything from layer 418 onwards trainable
# (NOTE(review): 418 is presumably the first layer of the last convolutional block, as
# read from the layer listing printed above - confirm).
for layer in regression_model_dropout_64_16_6e.layers[:418]:
    layer.trainable = False
for layer in regression_model_dropout_64_16_6e.layers[418:]:
    layer.trainable = True

# Recompile so the new `trainable` flags take effect, with a 10x smaller learning rate than
# in the frozen-backbone stage. `learning_rate` replaces the deprecated `lr` alias, which
# newer Keras releases no longer accept.
optimizer = Adam(learning_rate=0.0001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16_6e.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
epochs = 15
batch_size = 32

regression_fine_tune_dropout_data_aug_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=4)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

regression_fine_tune_dropout_data_aug_history = regression_model_dropout_64_16_6e.fit(
                    x=regression_adoptionspeed_train_iterator_da,
                    validation_data=regression_adoptionspeed_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_fine_tune_dropout_data_aug_history.history["loss"]
train_mae = regression_fine_tune_dropout_data_aug_history.history["mean_absolute_error"]
validation_loss = regression_fine_tune_dropout_data_aug_history.history["val_loss"]
validation_mae = regression_fine_tune_dropout_data_aug_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_fine_tune_dropout_data_aug_training_results = regression_fine_tune_dropout_data_aug_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

regression_fine_tune_dropout_data_aug_training_results.to_csv(f"DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_{epochs}-epochs.csv", index=False)
to_latex(regression_fine_tune_dropout_data_aug_training_results, f"DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_{epochs}-epochs")

# Zipping the checkpoints
!mkdir regression_fine_tune_dropout_data_aug
!mv ./DenseNet121_regression__fine-tune__dropout_64-16__data-aug*.h5 regression_fine_tune_dropout_data_aug
shutil.make_archive("DenseNet121_regression__fine-tune__dropout_64-16__data-aug__checkpoints", "zip", "./regression_fine_tune_dropout_data_aug")

In [None]:
# Reload the per-epoch fine-tuning log (the file name matches the CSV written by the
# training cell above with epochs=15) and plot the loss and MAE curves.
regression_fine_tune_dropout_data_aug_training_results = pd.read_csv("../input/tfg-pet-adoption-data/DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_15-epochs.csv")
plot_history(regression_fine_tune_dropout_data_aug_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

In [None]:
# Reload a checkpoint of the fine-tuned model; following the ModelCheckpoint naming
# pattern used above, the file name encodes epoch 5 with a validation loss of 1.150202.
regression_model_dropout_64_16_5e = load_model("../input/tfg-pet-adoption-data/DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_05-epochs_val_loss-1.150202.h5")

In [None]:
# Cut the fine-tuned network at layers[-4] (the Dense(64) layer in the architecture
# defined in this section) and use that sub-model as the image feature extractor.
model = Model(
    inputs=regression_model_dropout_64_16_5e.input,
    outputs=regression_model_dropout_64_16_5e.layers[-4].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__fine-tune__dropout_64-16__data-aug__layer-64_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored layer-64 features of the fine-tuned model (first CSV column is the index).
regression_model_fine_tune_dropout_64_16_data_aug_layer_64_features_5e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_fine_tune_dropout_64_16_data_aug_layer_64_features_5e.head())
# How many columns share each standard-deviation value (a std of 0 means a constant feature).
regression_model_fine_tune_dropout_64_16_data_aug_layer_64_features_5e.describe().loc["std"].value_counts()

In [None]:
# Cut the fine-tuned network at layers[-2] (the Dense(16) layer in the architecture
# defined in this section) and use that sub-model as the image feature extractor.
model = Model(
    inputs=regression_model_dropout_64_16_5e.input,
    outputs=regression_model_dropout_64_16_5e.layers[-2].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__fine-tune__dropout_64-16__data-aug__layer-16_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored layer-16 features of the fine-tuned model (first CSV column is the index).
regression_model_fine_tune_dropout_64_16_data_aug_layer_16_features_5e = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(regression_model_fine_tune_dropout_64_16_data_aug_layer_16_features_5e.head())
# How many columns share each standard-deviation value (a std of 0 means a constant feature).
regression_model_fine_tune_dropout_64_16_data_aug_layer_16_features_5e.describe().loc["std"].value_counts()

In [None]:
# Preprocessing steps, in execution order, that are placed before the classifier in every
# pipeline evaluated below (the list is concatenated with the ('model', ...) step).
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    # Placeholder at index -5: overwritten with the ('image_features_extractor', ...) step
    # on every iteration of the evaluation loop below. It must stay fifth from the end.
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Column layout of the summary table: one row per evaluated set of image features.
evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
# Rows are gathered in a plain list and converted to a DataFrame once, after the loop:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# copied the whole table on every iteration.
evaluation_rows = []

# Description of each feature set -> DataFrame with the image features loaded above.
image_features_dataframes = {
    "AdoptionSpeed, regression_model_dropout-64-16_data-aug_layer-64_6e": regression_model_dropout_64_16_data_aug_layer_64_features_6e,
    "AdoptionSpeed, regression_model_dropout-64-16_data-aug_layer-16_6e": regression_model_dropout_64_16_data_aug_layer_16_features_6e,
    "AdoptionSpeed, regression_model_fine-tune_dropout-64-16_data-aug_layer-64_5e": regression_model_fine_tune_dropout_64_16_data_aug_layer_64_features_5e,
    "AdoptionSpeed, regression_model_fine-tune_dropout-64-16_data-aug_layer-16_5e": regression_model_fine_tune_dropout_64_16_data_aug_layer_16_features_5e
}

for features_description, loaded_features in image_features_dataframes.items():
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features)
    # Position -5 of `pipeline_transformers` is the `None` placeholder reserved for this step.
    pipeline_transformers[-5] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                              use_label_encoder=False)
    model_description = f"XGBClassifier, DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")

    # NOTE(review): both pipelines below share the same transformer and model instances;
    # presumably the evaluation helpers clone or refit them - confirm.
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Temporarily lift the column-width limit so the long model descriptions are not truncated.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_regression__fine-tune__data-aug__dropout")

### Aggregation of the features of several images?

Let's try to include the information from all the images of each profile using their extracted features:

#### Raw features (1024) + SVD (4 and 16 components)

In [None]:
# Feature extractor built directly from the DenseNet121 backbone, configured to handle
# several images per PetID (multiple_instances_per_petid=True) without averaging.
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=True,
    cnn_backbone=DenseNet121,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    average=False,
    save=True,
    debug=True,
    include_feats=False,
    multiple_instances_per_petid=True,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Raw DenseNet121 features, single-image variant (first CSV column is the index).
image_features_densenet121 = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_in-256.csv",
    index_col=0,
)
display(image_features_densenet121.head())

In [None]:
# Raw DenseNet121 features, "ALL" variant (first CSV column is the index).
all_image_features_densenet121 = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_in-256.csv",
    index_col=0,
)
display(all_image_features_densenet121.head())

In [None]:
# Raw DenseNet121 features, "AGGREGATED" variant (first CSV column is the index).
aggregated_image_features_densenet121 = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_in-256.csv",
    index_col=0,
)
display(aggregated_image_features_densenet121.head())

In [None]:
# Preprocessing steps, in execution order, that are placed before the classifier in every
# pipeline evaluated below (the list is concatenated with the ('model', ...) step).
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    # Placeholder at index -5: overwritten with the ('image_features_extractor', ...) step
    # on every iteration of the evaluation loop below. It must stay fifth from the end.
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Column layout of the summary table: one row per (feature set, SVD size) combination.
evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
# Rows are gathered in a plain list and converted to a DataFrame once, after the loops:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# copied the whole table on every iteration.
evaluation_rows = []

# Description of each feature set -> DataFrame with the raw DenseNet121 features loaded above.
image_features_dataframes = {
    "raw-1024_single": image_features_densenet121,
    "raw-1024_all": all_image_features_densenet121,
    "raw-1024_agg_mean_sum_var": aggregated_image_features_densenet121
}

for features_description, loaded_features in image_features_dataframes.items():
    for svd_n_components in [4, 16]:
        # Only the "raw-1024_all" description contains "all", so only that feature set
        # is treated as having several rows per PetID.
        ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                    loaded_features=loaded_features,
                                    svd_n_components=svd_n_components,
                                    multiple_instances_per_petid="all" in features_description)
        # Position -5 of `pipeline_transformers` is the `None` placeholder reserved for this step.
        pipeline_transformers[-5] = ('image_features_extractor', ife)

        model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                                  use_label_encoder=False)
        model_description = f"XGBClassifier, DenseNet121, {features_description}, SVD {svd_n_components}"
        print(f"\n\n*************** {model_description} ***************")

        # NOTE(review): both pipelines below share the same transformer and model instances;
        # presumably the evaluation helpers clone or refit them - confirm.
        avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                    Pipeline(steps=pipeline_transformers + [('model', model)]),
                    cv, X, y, model_type="classification", display_results=True,
                    display_plots=False)

        single_accuracy, single_QWK = evaluate_model_single_split(
                    Pipeline(steps=pipeline_transformers + [('model', model)]),
                    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                    model_type="classification", display_results=True)

        evaluation_rows.append({
            "Model description": model_description,
            "Average fit time": avg_fit_time,
            "Average accuracy": avg_accuracy,
            "Average QWK": avg_QWK,
            "Single split accuracy": single_accuracy,
            "Single split QWK": single_QWK
        })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Temporarily lift the column-width limit so the long model descriptions are not truncated.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
# NOTE(review): the output name says "svd-4" although the table holds both the SVD 4 and
# SVD 16 results; the name is kept unchanged because other material may reference it.
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_raw-1024_single_and_aggregation_multiple_images_svd-4")

In [None]:
# Drop the raw-feature DataFrames, which are no longer referenced once the evaluation above is done.
del image_features_densenet121, all_image_features_densenet121, aggregated_image_features_densenet121

#### Fine-tuned model (dropout 64-16)

In [None]:
# Load the fine-tuned checkpoint (epoch 5, validation loss 1.150202 according to its file name).
# This is the same file that was loaded earlier as `regression_model_dropout_64_16_5e`.
regression_model_dropout_64_16_fine_tune = load_model("../input/tfg-pet-adoption-data/DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16_05-epochs_val_loss-1.150202.h5")

In [None]:
# Cut the fine-tuned network at layers[-4] (the Dense(64) layer in the architecture
# defined earlier) and extract features with multiple_instances_per_petid=True.
model = Model(
    inputs=regression_model_dropout_64_16_fine_tune.input,
    outputs=regression_model_dropout_64_16_fine_tune.layers[-4].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-64_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
    multiple_instances_per_petid=True,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Layer-64 features of the fine-tuned model, "ALL" variant (first CSV column is the index).
all_image_features_regression_model_dropout_64_16_fine_tune_layer_64 = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(all_image_features_regression_model_dropout_64_16_fine_tune_layer_64.head())

In [None]:
# Layer-64 features of the fine-tuned model, "AGGREGATED" variant (first CSV column is the index).
aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_64 = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-64_5-epochs_in-256.csv",
    index_col=0,
)
display(aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_64.head())

In [None]:
# Cut the fine-tuned network at layers[-2] (the Dense(16) layer in the architecture
# defined earlier) and extract features with multiple_instances_per_petid=True.
model = Model(
    inputs=regression_model_dropout_64_16_fine_tune.input,
    outputs=regression_model_dropout_64_16_fine_tune.layers[-2].output,
)
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    model=model,
    images_directory="../input/petfinder-adoption-prediction/train_images",
    from_image=True,
    preprocess_input=preprocess_input_densenet,
    img_size=256,
    model_name="DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-16_5-epochs",
    save=True,
    debug=True,
    include_feats=False,
    multiple_instances_per_petid=True,
)

# The transformed output is discarded; the extractor is run with save=True.
_ = ife.fit_transform(X, y)

In [None]:
# Layer-16 features of the fine-tuned model, "ALL" variant (first CSV column is the index).
all_image_features_regression_model_dropout_64_16_fine_tune_layer_16 = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(all_image_features_regression_model_dropout_64_16_fine_tune_layer_16.head())

In [None]:
# Layer-16 features of the fine-tuned model, "AGGREGATED" variant (first CSV column is the index).
aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_16 = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__fine-tune__dropout_64-16__data-aug__64-16__layer-16_5-epochs_in-256.csv",
    index_col=0,
)
display(aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_16.head())

In [None]:
# Preprocessing steps, in execution order, that are placed before the classifier in every
# pipeline evaluated below (the list is concatenated with the ('model', ...) step).
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    # Placeholder at index -5: overwritten with the ('image_features_extractor', ...) step
    # on every iteration of the evaluation loop below. It must stay fifth from the end.
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Column layout of the summary table: one row per evaluated set of image features.
evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
# Rows are gathered in a plain list and converted to a DataFrame once, after the loop:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, and it
# copied the whole table on every iteration.
evaluation_rows = []

# Single-image features versus aggregated features, for both the 64- and 16-unit layers.
image_features_dataframes = {
    "regression_model_fine-tune_dropout-64-16_data-aug_layer-64_single": regression_model_fine_tune_dropout_64_16_data_aug_layer_64_features_5e,
    "regression_model_fine-tune_dropout-64-16_data-aug_layer-64_agg_mean_sum_var": aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_64,
    "regression_model_fine-tune_dropout-64-16_data-aug_layer-16_single": regression_model_fine_tune_dropout_64_16_data_aug_layer_16_features_5e,
    "regression_model_fine-tune_dropout-64-16_data-aug_layer-16_agg_mean_sum_var": aggregated_image_features_regression_model_dropout_64_16_fine_tune_layer_16
}

for features_description, loaded_features in image_features_dataframes.items():
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features,
                                multiple_instances_per_petid=False)
    # Position -5 of `pipeline_transformers` is the `None` placeholder reserved for this step.
    pipeline_transformers[-5] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                              use_label_encoder=False)
    model_description = f"XGBClassifier, DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")

    # NOTE(review): both pipelines below share the same transformer and model instances;
    # presumably the evaluation helpers clone or refit them - confirm.
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

# Temporarily lift the column-width limit so the long model descriptions are not truncated.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_regression_fine-tune_dropout_64-16_single_and_aggregation_multiple_images")

#### Multi-input model

**https://datascience.stackexchange.com/questions/38111/classification-problem-with-many-images-per-instance**

In [None]:
# Multi-input regression network: five image inputs, each with its own frozen DenseNet121
# backbone and global average pooling. The five pooled vectors are combined element-wise
# (average, maximum, sum), concatenated and fed to a dense regression head.
inputs = []
cnns_outputs = []

for i in range(1, 6):
    inp = Input((256,256,3))
    inputs.append(inp)
    backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    cnns_outputs.append(x)

    # Freezing DenseNet121 backbone layers
    for layer in backbone.layers:
        # Suffix every layer name with the branch number (NOTE(review): presumably needed so
        # that names stay unique across the five backbones; it writes to the private
        # `_name` attribute - confirm there is no public alternative in the Keras version used).
        layer._name = layer.name + f"_{i}"
        layer.trainable = False

average = Average()(cnns_outputs)
maximum = Maximum()(cnns_outputs)
addition = Add()(cnns_outputs)

concatenation = Concatenate()([average, maximum, addition])
x = Dense(256, activation="relu")(concatenation)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
out = Dense(1, activation="linear")(x)

regression_model_dropout_64_16_multi_input = Model(inputs=inputs, outputs=out)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16_multi_input.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# plot_model(regression_model_dropout_64_16_multi_input, show_shapes=True)

In [None]:
# Show a stored picture of the model (presumably the lower part of the architecture plot
# produced by the plot_model call that is commented out above - confirm).
Image("../input/tfg-pet-adoption-data/model_bottom.png")

In [None]:
# Summary statistics of the number of photos per profile (presumably to judge how many
# image inputs the multi-input model should take - confirm).
train["PhotoAmt"].describe()

In [None]:
def get_dataframe_CNN_training_AdoptionSpeed_multi_input(train_df: pd.DataFrame, num_inputs: int) -> pd.DataFrame:
    """Build (and save as CSV) the table of image filenames for a multi-input CNN.

    For every row of ``train_df`` whose ``PhotoAmt`` is greater than zero, one
    output row is produced with ``PetID``, ``AdoptionSpeed``, ``PhotoAmt`` and
    the columns ``filename1`` ... ``filename{num_inputs}``, each holding a name
    of the form ``"{PetID}-{k}.jpg"``.

    The first photo (``-1.jpg``) is always included, in a randomly chosen slot.
    The other slots receive randomly ordered photo numbers; when a pet has
    fewer than ``num_inputs`` photos, photo numbers are repeated until every
    slot is filled.

    Randomness comes from the global ``random`` module, so the result depends
    on its current state (and on the exact order of the calls below).

    Args:
        train_df: DataFrame with at least the columns ``PetID``, ``PhotoAmt``
            and ``AdoptionSpeed``.
        num_inputs: Number of filename columns (image inputs) per pet.

    Returns:
        The resulting DataFrame with a fresh 0..n-1 index. As a side effect it
        is also written to ``dataframe_CNN_training_{num_inputs}-img_inputs.csv``
        in the current working directory.
    """
    filenames_and_target_values = {}
    
    for i, (index, row) in tqdm(enumerate(train_df.iterrows())):
        pet_id = row["PetID"]
        photo_amt = int(row["PhotoAmt"])
        adoption_speed = int(row["AdoptionSpeed"])
        # Pets without photos are left out of the table.
        if photo_amt > 0:
            columns = {}
            columns["PetID"] = pet_id
            columns["AdoptionSpeed"] = adoption_speed
            columns["PhotoAmt"] = photo_amt
            # Photo number 1 goes into a random slot.
            columns[f"filename{random.randint(1, num_inputs)}"] = f"{pet_id}-1.jpg"
            # Candidate photo numbers for the other slots, in random order.
            remaining_files = list(range(2, photo_amt+1))
            random.shuffle(remaining_files)
            if photo_amt < num_inputs:
                # Not enough distinct photos: pad up to num_inputs-1 entries,
                # first with one shuffled pass over all photo numbers
                # (including 1), then with uniformly random photo numbers.
                fill_remaining_files = list(range(1, photo_amt+1))
                random.shuffle(fill_remaining_files)
                num_remaining_files = len(remaining_files)
                for _ in range(num_remaining_files, num_inputs-1):
                    if len(fill_remaining_files) > 0:
                        remaining_files.append(fill_remaining_files.pop())
                    else:
                        remaining_files.append(random.randint(1, photo_amt))
            # Fill every slot not taken by photo 1, popping from the end of the list.
            for j in range(1, num_inputs+1):
                if f"filename{j}" not in columns:
                    columns[f"filename{j}"] =  f"{pet_id}-{remaining_files.pop()}.jpg"
            filenames_and_target_values[i] = columns
        
    filenames_and_target_values_df = pd.DataFrame.from_dict(filenames_and_target_values, orient="index")
    # Keys are positions in train_df with gaps for skipped pets; renumber them.
    filenames_and_target_values_df.reset_index(drop=True, inplace=True)
    filenames_and_target_values_df.to_csv(f"dataframe_CNN_training_{num_inputs}-img_inputs.csv", index=False)

    return filenames_and_target_values_df

In [None]:
# The table for 5 image inputs is loaded from a CSV in the input dataset
# instead of being regenerated; the generating call is kept commented out.
# filenames_and_target_values_5_img_inputs = get_dataframe_CNN_training_AdoptionSpeed_multi_input(
#     train_df=train,
#     num_inputs=5
# )
filenames_and_target_values_5_img_inputs = pd.read_csv("../input/tfg-pet-adoption-data/dataframe_CNN_training_5-img_inputs.csv")

In [None]:
# Preview the first ten rows of the 5-input filename table.
filenames_and_target_values_5_img_inputs.head(10)

In [None]:
# Split the 5-input table by pet: rows whose PetID is in `set_pet_ids_train`
# form the training part, all remaining rows form the validation part.
_is_train_pet = filenames_and_target_values_5_img_inputs["PetID"].isin(set_pet_ids_train)

filenames_and_target_values_5_img_inputs_train = \
    filenames_and_target_values_5_img_inputs[_is_train_pet].copy()
filenames_and_target_values_5_img_inputs_val = \
    filenames_and_target_values_5_img_inputs[~_is_train_pet].copy()

In [None]:
# Show the training part of the 5-input table.
filenames_and_target_values_5_img_inputs_train

In [None]:
# Show the validation part of the 5-input table.
filenames_and_target_values_5_img_inputs_val

In [None]:
# Relative frequency of each AdoptionSpeed value in the training part.
filenames_and_target_values_5_img_inputs_train["AdoptionSpeed"].value_counts(normalize=True)

In [None]:
# Relative frequency of each AdoptionSpeed value in the validation part.
filenames_and_target_values_5_img_inputs_val["AdoptionSpeed"].value_counts(normalize=True)

In [None]:
# Sanity check: PetIDs present in both parts (an empty set means no overlap).
print(set(filenames_and_target_values_5_img_inputs_train["PetID"]) & set(filenames_and_target_values_5_img_inputs_val["PetID"]))

**https://stackoverflow.com/questions/49404993/how-to-use-fit-generator-with-multiple-inputs**

In [None]:
def get_train_iterator_multi_input(num_inputs, train_dataframe, target_size, batch_size, seed=seed):
    """Endless generator of augmented training batches for the multi-input CNN.

    One Keras dataframe iterator is created per ``filename{i}`` column
    (``i`` in 1..``num_inputs``). All of them are built from the same
    augmenting ``ImageDataGenerator`` and receive the same ``shuffle=True`` and
    ``seed`` (presumably so that they walk the rows in the same order —
    confirm against the Keras iterator implementation).

    Args:
        num_inputs: Number of image inputs, i.e. of ``filename{i}`` columns.
        train_dataframe: Table holding the filename columns and ``AdoptionSpeed``.
        target_size: Size the images are loaded with.
        batch_size: Number of rows per batch.
        seed: Seed passed to every dataframe iterator.

    Yields:
        ``(images, targets)``: ``images`` is a list with the ``num_inputs``
        image batches in a random order (``random.sample``), ``targets`` are
        the targets reported by the first iterator.
    """
    augmenter = ImageDataGenerator(
        horizontal_flip=True,
        zoom_range=[0.5, 1.0],
        width_shift_range=0.3,
        height_shift_range=0.3,
        preprocessing_function=preprocess_input_densenet
    )

    # Iterators in slot order: position 0 reads "filename1", and so on.
    per_slot_iterators = [
        augmenter.flow_from_dataframe(
            dataframe=train_dataframe,
            directory="../input/tfg-pet-adoption-resized-train-images/",
            x_col=f"filename{slot}",
            y_col="AdoptionSpeed",
            target_size=target_size,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=True,
            seed=seed
        )
        for slot in range(1, num_inputs + 1)
    ]

    while True:
        batches = [next(slot_iterator) for slot_iterator in per_slot_iterators]
        image_batches = [batch[0] for batch in batches]
        # Randomly permute which model input receives which image batch.
        yield random.sample(image_batches, num_inputs), batches[0][1]

In [None]:
def get_val_iterator_multi_input(num_inputs, val_dataframe, target_size, batch_size, seed=seed):
    """Endless generator of validation batches for the multi-input CNN.

    One Keras dataframe iterator is created per ``filename{i}`` column
    (``i`` in 1..``num_inputs``), without augmentation and with
    ``shuffle=False``.

    Args:
        num_inputs: Number of image inputs, i.e. of ``filename{i}`` columns.
        val_dataframe: Table holding the filename columns and ``AdoptionSpeed``.
        target_size: Size the images are loaded with.
        batch_size: Number of rows per batch.
        seed: Seed passed to every dataframe iterator.

    Yields:
        ``(images, targets)``: ``images`` is the list of image batches in slot
        order (``filename1`` first), ``targets`` are the targets reported by
        the first iterator.
    """
    preprocessor = ImageDataGenerator(
        preprocessing_function=preprocess_input_densenet
    )

    # Iterators in slot order: position 0 reads "filename1", and so on.
    per_slot_iterators = [
        preprocessor.flow_from_dataframe(
            dataframe=val_dataframe,
            directory="../input/tfg-pet-adoption-resized-train-images/",
            x_col=f"filename{slot}",
            y_col="AdoptionSpeed",
            target_size=target_size,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=False,
            seed=seed
        )
        for slot in range(1, num_inputs + 1)
    ]

    while True:
        batches = [next(slot_iterator) for slot_iterator in per_slot_iterators]
        yield [batch[0] for batch in batches], batches[0][1]

In [None]:
epochs = 30
batch_size = 32
target_size = (256,256)

regression_multi_input_dropout_64_16_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=10)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__multi-input-5__dropout_64-16__{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

train_iterator_multi_input = get_train_iterator_multi_input(
    num_inputs=5,
    train_dataframe=filenames_and_target_values_5_img_inputs_train,
    target_size=target_size,
    batch_size=batch_size,
    seed=seed
)

val_iterator_multi_input = get_val_iterator_multi_input(
    num_inputs=5,
    val_dataframe=filenames_and_target_values_5_img_inputs_val,
    target_size=target_size,
    batch_size=batch_size,
    seed=seed
)

regression_multi_input_dropout_64_16_training_history = \
    regression_model_dropout_64_16_multi_input.fit(
                    x=train_iterator_multi_input,
                    validation_data=val_iterator_multi_input,
                    batch_size=batch_size,
                    steps_per_epoch=math.ceil(filenames_and_target_values_5_img_inputs_train.shape[0] / batch_size),
                    validation_steps=math.ceil(filenames_and_target_values_5_img_inputs_val.shape[0] / batch_size),
                    epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_multi_input_dropout_64_16_training_history.history["loss"]
train_mae = regression_multi_input_dropout_64_16_training_history.history["mean_absolute_error"]
validation_loss = regression_multi_input_dropout_64_16_training_history.history["val_loss"]
validation_mae = regression_multi_input_dropout_64_16_training_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_multi_input_dropout_64_16_training_results = regression_multi_input_dropout_64_16_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

regression_multi_input_dropout_64_16_training_results.to_csv(f"DenseNet121_regression__multi-input-5__dropout_64-16__{epochs}-epochs.h5.csv", index=False)
to_latex(regression_multi_input_dropout_64_16_training_results, f"DenseNet121_regression__multi-input-5__dropout_64-16__{epochs}-epochs.h5.csv")

# Zipping the checkpoints
!mkdir regression_multi_input_5_dropout
!mv ./DenseNet121_regression__multi-input-5*.h5 regression_multi_input_5_dropout
shutil.make_archive("DenseNet121_regression__multi-input-5__dropout_64-16__checkpoints", "zip", "./regression_multi_input_5_dropout")

In [None]:
# Load the stored training history from the input dataset and plot the "loss"
# and "mean_absolute_error" curves.
# NOTE(review): plot_history is defined elsewhere; the trailing arguments look
# like a 1x2 subplot grid and a figure size — confirm.
regression_multi_input_dropout_64_16_training_results = pd.read_csv("../input/tfg-pet-adoption-data/DenseNet121_regression__multi-input-5__dropout_64-16__30-epochs.csv")
plot_history(regression_multi_input_dropout_64_16_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

The results are poor: the lowest validation loss is 1.38, which is considerably higher than the values we obtained with the single-input CNNs. The same happens when we add a Dense 256 layer before the 64 one, in case we were underfitting. Moreover, this is probably the wrong approach: even if we only want to use the first 5 photos of a profile, when a profile has fewer photos some of them are repeated and therefore carry more weight. For example, with 3 photos we would input 1 2 3 1 2 (in whatever order), which gives more importance to the profile photo (which is okay) but also to photo 2 over photo 3, which is another problem.

### Ensemble (single input)

We will now create an ensemble of three DenseNet121 backbones: the first one without any trainable layer, the second one with the last dense block as trainable (in the code below, the backbone layers from index 418 onwards) and a Dropout of 0.1 after the GlobalAveragePooling2D layer, and the third one with the two last dense blocks as trainable (the backbone layers from index 411 onwards) and a Dropout of 0.2 after the GlobalAveragePooling2D layer. Then, we aggregate the outputs using the average and the maximum and concatenate both.

In [None]:
# Ensemble of three ImageNet-pretrained DenseNet121 backbones that share one
# 256x256 RGB input and differ in how many layers are trainable.
inp = Input((256,256,3))
cnns_outputs = []

for i in range(3):
    backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    # Backbone 0 gets no Dropout; backbones 1 and 2 get rates 0.1 and 0.2.
    if i > 0:
        x = Dropout(rate=0.1*i, seed=seed)(x)
    cnns_outputs.append(x)

    # Suffix every layer name with the backbone index (so names do not clash
    # across the three DenseNet121 instances) and set trainability:
    #   backbone 0: everything frozen
    #   backbone 1: layers with index >= 418 trainable
    #   backbone 2: layers with index >= 411 trainable
    for j, layer in enumerate(backbone.layers):
        layer._name = f"{layer.name}_{i}"
        if (i == 0) or (i == 1 and j < 418) or (i == 2 and j < 411):
            layer.trainable = False
        else:
            layer.trainable = True

# Aggregate the three pooled feature vectors element-wise and concatenate.
average = Average()(cnns_outputs)
maximum = Maximum()(cnns_outputs)

concatenation = Concatenate()([average, maximum])
x = Dropout(rate=0.25, seed=seed)(concatenation)
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
# Single linear unit: the target is regressed as one number.
out = Dense(1, activation="linear")(x)

regression_model_dropout_64_16_ensemble = Model(inputs=inp, outputs=out)

# To solve RuntimeError: Unable to create link (name already exists)
# https://stackoverflow.com/questions/64118599/getting-the-runtimeerror-unable-to-create-link-name-already-exists-with-a-mul
# Each weight gets its position in `weights` appended to the first two
# components of its name, which makes the names unique when saving.
for i, w in enumerate(regression_model_dropout_64_16_ensemble.weights):
    split_name = w.name.split('/')
    new_name = split_name[0] + '_' + str(i) + '/' + split_name[1] + '_' + str(i)
    regression_model_dropout_64_16_ensemble.weights[i]._handle_name = new_name

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16_ensemble.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Display a pre-rendered image of the ensemble stored in the input dataset;
# the live plot_model call is kept commented out.
# plot_model(regression_model_dropout_64_16_ensemble, show_shapes=True)
Image("../input/tfg-pet-adoption-data/ensemble_model_zoom_output.png")

In [None]:
epochs = 30
batch_size = 32

regression_ensemble_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__ensemble__{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

regression_model_dropout_64_16_ensemble_history = regression_model_dropout_64_16_ensemble.fit(
                    x=regression_adoptionspeed_train_iterator_da,
                    validation_data=regression_adoptionspeed_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_model_dropout_64_16_ensemble_history.history["loss"]
train_mae = regression_model_dropout_64_16_ensemble_history.history["mean_absolute_error"]
validation_loss = regression_model_dropout_64_16_ensemble_history.history["val_loss"]
validation_mae = regression_model_dropout_64_16_ensemble_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_ensemble_training_results = regression_ensemble_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

regression_ensemble_training_results.to_csv(f"DenseNet121_regression__dropout_64-16__ensemble__{epochs}-epochs.csv", index=False)
to_latex(regression_ensemble_training_results, f"DenseNet121_regression__dropout_64-16__ensemble__{epochs}-epochs")

# Zipping the checkpoints
!mkdir regression_ensemble
!mv ./DenseNet121_regression__dropout_64-16__ensemble*.h5 regression_ensemble
shutil.make_archive("DenseNet121_regression__dropout_64-16__ensemble__checkpoints", "zip", "./regression_ensemble")

In [None]:
# Load the stored training history of the first ensemble from the input
# dataset and plot the "loss" and "mean_absolute_error" curves.
regression_ensemble_training_results = pd.read_csv("../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64-16__ensemble__30-epochs.csv")
plot_history(regression_ensemble_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

In [None]:
# Truncate the first ensemble at its fourth layer from the end, i.e. the
# Dense(64) layer of the regression head, to use its activations as features.
model = Model(inputs=regression_model_dropout_64_16_ensemble.input,
              outputs=regression_model_dropout_64_16_ensemble.layers[-4].output)
# NOTE(review): ImageFeatureExtractor is defined elsewhere in this notebook;
# presumably save=True persists the extracted features under `model_name` — confirm.
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__layer-64",
                 save=True, debug=True, include_feats=False)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored Dense(64) features of the first ensemble from the input
# dataset, using the first CSV column as index.
image_features_regression_model_ensemble_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__ensemble__layer-64_in-256.csv",
    index_col=0)
display(image_features_regression_model_ensemble_layer_64_features.head(5))
# How many columns share each standard-deviation value (e.g. a count for 0.0
# would reveal constant columns).
image_features_regression_model_ensemble_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Truncate the first ensemble at its second layer from the end, i.e. the
# Dense(16) layer of the regression head, to use its activations as features.
model = Model(inputs=regression_model_dropout_64_16_ensemble.input,
              outputs=regression_model_dropout_64_16_ensemble.layers[-2].output)
# NOTE(review): ImageFeatureExtractor is defined elsewhere in this notebook;
# presumably save=True persists the extracted features under `model_name` — confirm.
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__layer-16",
                 save=True, debug=True, include_feats=False)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored Dense(16) features of the first ensemble from the input
# dataset, using the first CSV column as index.
image_features_regression_model_ensemble_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__ensemble__layer-16_in-256.csv",
    index_col=0)
display(image_features_regression_model_ensemble_layer_16_features.head(5))
# How many columns share each standard-deviation value.
image_features_regression_model_ensemble_layer_16_features.describe().loc["std",:].value_counts()

In [None]:
# Dense(64) features of the first ensemble again (layer index -4), this time
# with multiple_instances_per_petid=True.
# NOTE(review): presumably this extracts features for every image of a pet
# rather than a single one — confirm in ImageFeatureExtractor.
model = Model(inputs=regression_model_dropout_64_16_ensemble.input,
              outputs=regression_model_dropout_64_16_ensemble.layers[-4].output)
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__layer-64",
                 save=True, debug=True, include_feats=False, multiple_instances_per_petid=True)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored "ALL_" Dense(64) feature file of the first ensemble
# (presumably one row per image — confirm), first CSV column as index.
all_image_features_regression_model_ensemble_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__ensemble__layer-64_in-256.csv",
    index_col=0)
display(all_image_features_regression_model_ensemble_layer_64_features.head(5))
# How many columns share each standard-deviation value.
all_image_features_regression_model_ensemble_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Load the stored "AGGREGATED_" Dense(64) feature file of the first ensemble
# (presumably per-pet aggregates over all images — confirm), first CSV column
# as index.
aggregated_image_features_regression_model_ensemble_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__ensemble__layer-64_in-256.csv",
    index_col=0)
display(aggregated_image_features_regression_model_ensemble_layer_64_features.head(5))
# How many columns share each standard-deviation value.
aggregated_image_features_regression_model_ensemble_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Dense(16) features of the first ensemble again (layer index -2), this time
# with multiple_instances_per_petid=True.
# NOTE(review): presumably this extracts features for every image of a pet
# rather than a single one — confirm in ImageFeatureExtractor.
model = Model(inputs=regression_model_dropout_64_16_ensemble.input,
              outputs=regression_model_dropout_64_16_ensemble.layers[-2].output)
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__layer-16",
                 save=True, debug=True, include_feats=False, multiple_instances_per_petid=True)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored "ALL_" Dense(16) feature file of the first ensemble
# (presumably one row per image — confirm), first CSV column as index.
all_image_features_regression_model_ensemble_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__ensemble__layer-16_in-256.csv",
    index_col=0)
display(all_image_features_regression_model_ensemble_layer_16_features.head(5))
# How many columns share each standard-deviation value.
all_image_features_regression_model_ensemble_layer_16_features.describe().loc["std",:].value_counts()

In [None]:
# Load the stored "AGGREGATED_" Dense(16) feature file of the first ensemble
# (presumably per-pet aggregates over all images — confirm), first CSV column
# as index.
aggregated_image_features_regression_model_ensemble_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__ensemble__layer-16_in-256.csv",
    index_col=0)
display(aggregated_image_features_regression_model_ensemble_layer_16_features.head(5))
# How many columns share each standard-deviation value.
aggregated_image_features_regression_model_ensemble_layer_16_features.describe().loc["std",:].value_counts()

Let's create another ensemble, this time without Dropout layers at the output of each DenseNet121 backbone, but doubling the rate of the Dropout layer before the Dense 64 one (from 0.25 to 0.5):

In [None]:
# Second ensemble of three ImageNet-pretrained DenseNet121 backbones sharing
# one 256x256 RGB input: no per-backbone Dropout, but a stronger Dropout (0.5)
# before the Dense(64) layer.
inp = Input((256,256,3))
cnns_outputs = []

for i in range(3):
    backbone = DenseNet121(input_tensor=inp, include_top=False, weights="imagenet")
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    cnns_outputs.append(x)

    # Suffix every layer name with the backbone index (so names do not clash
    # across the three DenseNet121 instances) and set trainability:
    #   backbone 0: everything frozen
    #   backbone 1: layers with index >= 418 trainable
    #   backbone 2: layers with index >= 411 trainable
    for j, layer in enumerate(backbone.layers):
        layer._name = f"{layer.name}_{i}"
        if (i == 0) or (i == 1 and j < 418) or (i == 2 and j < 411):
            layer.trainable = False
        else:
            layer.trainable = True

# Aggregate the three pooled feature vectors element-wise and concatenate.
average = Average()(cnns_outputs)
maximum = Maximum()(cnns_outputs)

concatenation = Concatenate()([average, maximum])
x = Dropout(rate=0.5, seed=seed)(concatenation) # More dropout here
x = Dense(64, activation="relu")(x)
x = Dropout(rate=0.25, seed=seed)(x)
x = Dense(16, activation="relu")(x)
# Single linear unit: the target is regressed as one number.
out = Dense(1, activation="linear")(x)

regression_model_dropout_64_16_ensemble_2 = Model(inputs=inp, outputs=out)

# To solve RuntimeError: Unable to create link (name already exists)
# https://stackoverflow.com/questions/64118599/getting-the-runtimeerror-unable-to-create-link-name-already-exists-with-a-mul
# Each weight gets its position in `weights` appended to the first two
# components of its name, which makes the names unique when saving.
for i, w in enumerate(regression_model_dropout_64_16_ensemble_2.weights):
    split_name = w.name.split('/')
    new_name = split_name[0] + '_' + str(i) + '/' + split_name[1] + '_' + str(i)
    regression_model_dropout_64_16_ensemble_2.weights[i]._handle_name = new_name

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

regression_model_dropout_64_16_ensemble_2.compile(loss=loss, metrics=metrics, optimizer=optimizer)

In [None]:
# Display a pre-rendered image of the second ensemble stored in the input
# dataset; the live plot_model call is kept commented out.
# plot_model(regression_model_dropout_64_16_ensemble_2, show_shapes=True)
Image("../input/tfg-pet-adoption-data/ensemble_model_2_zoom.png")

In [None]:
epochs = 30
batch_size = 32

regression_ensemble_2_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'DenseNet121_regression__ensemble__2__{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

regression_model_dropout_64_16_ensemble_2_history = regression_model_dropout_64_16_ensemble_2.fit(
                    x=regression_adoptionspeed_train_iterator_da,
                    validation_data=regression_adoptionspeed_val_iterator,
                    batch_size=batch_size, epochs=epochs,
                    callbacks=[model_checkpoint, early_stopping])

train_loss = regression_model_dropout_64_16_ensemble_2_history.history["loss"]
train_mae = regression_model_dropout_64_16_ensemble_2_history.history["mean_absolute_error"]
validation_loss = regression_model_dropout_64_16_ensemble_2_history.history["val_loss"]
validation_mae = regression_model_dropout_64_16_ensemble_2_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    regression_ensemble_2_training_results = regression_ensemble_2_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

regression_ensemble_2_training_results.to_csv(f"DenseNet121_regression__dropout_64-16__ensemble__2__{epochs}-epochs.csv", index=False)
to_latex(regression_ensemble_2_training_results, f"DenseNet121_regression__dropout_64-16__ensemble__2__{epochs}-epochs")

# Zipping the checkpoints
!mkdir regression_ensemble_2
!mv ./DenseNet121_regression__dropout_64-16__ensemble__2*.h5 regression_ensemble_2
shutil.make_archive("DenseNet121_regression__dropout_64-16__ensemble__2__checkpoints", "zip", "./regression_ensemble_2")

In [None]:
# Load the stored training history of the second ensemble from the input
# dataset and plot the "loss" and "mean_absolute_error" curves.
regression_ensemble_2_training_results = pd.read_csv("../input/tfg-pet-adoption-data/DenseNet121_regression__dropout_64-16__ensemble__2__30-epochs.csv")
plot_history(regression_ensemble_2_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

In [None]:
# Replace the in-memory second ensemble with a saved checkpoint from the input
# dataset (per its file name: epoch 06, val_loss 1.160883).
regression_model_dropout_64_16_ensemble_2 = load_model("../input/tfg-pet-adoption-data/DenseNet121_regression__ensemble__2__06-epochs_val_loss-1.160883.h5")

In [None]:
# Truncate the second ensemble at its fourth layer from the end, i.e. the
# Dense(64) layer of the regression head, to use its activations as features.
model = Model(inputs=regression_model_dropout_64_16_ensemble_2.input,
              outputs=regression_model_dropout_64_16_ensemble_2.layers[-4].output)
# NOTE(review): ImageFeatureExtractor is defined elsewhere in this notebook;
# presumably save=True persists the extracted features under `model_name` — confirm.
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__2__layer-64",
                 save=True, debug=True, include_feats=False)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored Dense(64) features of the second ensemble from the input
# dataset, using the first CSV column as index.
image_features_regression_model_ensemble_2_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__ensemble__2__layer-64_in-256.csv",
    index_col=0)
display(image_features_regression_model_ensemble_2_layer_64_features.head(5))
# How many columns share each standard-deviation value.
image_features_regression_model_ensemble_2_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Truncate the second ensemble at its second layer from the end, i.e. the
# Dense(16) layer of the regression head, to use its activations as features.
model = Model(inputs=regression_model_dropout_64_16_ensemble_2.input,
              outputs=regression_model_dropout_64_16_ensemble_2.layers[-2].output)
# NOTE(review): ImageFeatureExtractor is defined elsewhere in this notebook;
# presumably save=True persists the extracted features under `model_name` — confirm.
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__2__layer-16",
                 save=True, debug=True, include_feats=False)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored Dense(16) features of the second ensemble from the input
# dataset, using the first CSV column as index.
image_features_regression_model_ensemble_2_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/image_features_DenseNet121_regression__ensemble__2__layer-16_in-256.csv",
    index_col=0)
display(image_features_regression_model_ensemble_2_layer_16_features.head(5))
# How many columns share each standard-deviation value.
image_features_regression_model_ensemble_2_layer_16_features.describe().loc["std",:].value_counts()

In [None]:
# Dense(64) features of the second ensemble again (layer index -4), this time
# with multiple_instances_per_petid=True.
# NOTE(review): presumably this extracts features for every image of a pet
# rather than a single one — confirm in ImageFeatureExtractor.
model = Model(inputs=regression_model_dropout_64_16_ensemble_2.input,
              outputs=regression_model_dropout_64_16_ensemble_2.layers[-4].output)
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__2__layer-64",
                 save=True, debug=True, include_feats=False, multiple_instances_per_petid=True)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored "ALL_" Dense(64) feature file of the second ensemble
# (presumably one row per image — confirm), first CSV column as index.
all_image_features_regression_model_ensemble_2_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__ensemble__2__layer-64_in-256.csv",
    index_col=0)
display(all_image_features_regression_model_ensemble_2_layer_64_features.head(5))
# How many columns share each standard-deviation value.
all_image_features_regression_model_ensemble_2_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Load the stored "AGGREGATED_" Dense(64) feature file of the second ensemble
# (presumably per-pet aggregates over all images — confirm), first CSV column
# as index.
aggregated_image_features_regression_model_ensemble_2_layer_64_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__ensemble__2__layer-64_in-256.csv",
    index_col=0)
display(aggregated_image_features_regression_model_ensemble_2_layer_64_features.head(5))
# How many columns share each standard-deviation value.
aggregated_image_features_regression_model_ensemble_2_layer_64_features.describe().loc["std",:].value_counts()

In [None]:
# Dense(16) features of the second ensemble again (layer index -2), this time
# with multiple_instances_per_petid=True.
# NOTE(review): presumably this extracts features for every image of a pet
# rather than a single one — confirm in ImageFeatureExtractor.
model = Model(inputs=regression_model_dropout_64_16_ensemble_2.input,
              outputs=regression_model_dropout_64_16_ensemble_2.layers[-2].output)
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
                 images_directory="../input/petfinder-adoption-prediction/train_images",
                 from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
                 model_name=f"DenseNet121_regression__ensemble__2__layer-16",
                 save=True, debug=True, include_feats=False, multiple_instances_per_petid=True)

# The return value is discarded.
_ = ife.fit_transform(X, y)

In [None]:
# Load the stored "ALL_" Dense(16) feature file of the second ensemble
# (presumably one row per image — confirm), first CSV column as index.
all_image_features_regression_model_ensemble_2_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/ALL_image_features_DenseNet121_regression__ensemble__2__layer-16_in-256.csv",
    index_col=0)
display(all_image_features_regression_model_ensemble_2_layer_16_features.head(5))
# How many columns share each standard-deviation value.
all_image_features_regression_model_ensemble_2_layer_16_features.describe().loc["std",:].value_counts()

In [None]:
# Load the stored "AGGREGATED_" Dense(16) feature file of the second ensemble
# (presumably per-pet aggregates over all images — confirm), first CSV column
# as index.
aggregated_image_features_regression_model_ensemble_2_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__ensemble__2__layer-16_in-256.csv",
    index_col=0)
display(aggregated_image_features_regression_model_ensemble_2_layer_16_features.head(5))
# How many columns share each standard-deviation value.
aggregated_image_features_regression_model_ensemble_2_layer_16_features.describe().loc["std",:].value_counts()

In [None]:
# Evaluate an XGBoost classifier on the tabular pipeline combined with each set
# of image features extracted from the two ensembles.

# Preprocessing steps shared by all runs. The `None` entry is a placeholder
# that is replaced, inside the loop, by the image-feature step of each run.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    None,
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]

# Position of the placeholder, looked up once before it gets overwritten
# (instead of relying on a hard-coded index).
image_features_step_index = pipeline_transformers.index(None)

evaluation_columns = ["Model description",
                      "Average fit time", "Average accuracy", "Average QWK",
                      "Single split accuracy", "Single split QWK"]
evaluation_rows = []
evaluation_results = pd.DataFrame([], columns=evaluation_columns)

# Description of each feature set -> DataFrame of already extracted features.
image_features_dataframes = {
    "regression_ensemble_model_layer-64_single": image_features_regression_model_ensemble_layer_64_features,
    "regression_ensemble_model_layer-64_single_agg_mean_sum_var": aggregated_image_features_regression_model_ensemble_layer_64_features,
    "regression_ensemble_model_layer-16_single": image_features_regression_model_ensemble_layer_16_features,
    "regression_ensemble_model_layer-16_single_agg_mean_sum_var": aggregated_image_features_regression_model_ensemble_layer_16_features,
    "regression_ensemble_2_model_layer-64_single": image_features_regression_model_ensemble_2_layer_64_features,
    "regression_ensemble_2_model_layer-64_single_agg_mean_sum_var": aggregated_image_features_regression_model_ensemble_2_layer_64_features,
    "regression_ensemble_2_model_layer-16_single": image_features_regression_model_ensemble_2_layer_16_features,
    "regression_ensemble_2_model_layer-16_single_agg_mean_sum_var": aggregated_image_features_regression_model_ensemble_2_layer_16_features
}

for features_description, loaded_features in image_features_dataframes.items():
    # Plug the current feature set into the placeholder slot of the pipeline.
    ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
                                loaded_features=loaded_features,
                                multiple_instances_per_petid=False)
    pipeline_transformers[image_features_step_index] = ('image_features_extractor', ife)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier, DenseNet121, {features_description}"
    print(f"\n\n*************** {model_description} ***************")
    # Evaluation with the `cv` splitter on the full data.
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    # Evaluation on the single train/validation split used for the CNNs.
    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })
    # Rebuilt after every run so the table is up to date even if a later run
    # is interrupted (DataFrame.append was removed in pandas 2.0).
    evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_columns)


# Show full model descriptions, then set the column-width limit back to 50.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_DenseNet121_regression_ensemble_single_and_aggregation_multiple_images")

The aggregated features (computed over all the images of each pet profile) extracted from layer 16 of ensemble 2 outperform all the other features extracted from the different models in both single-split validation accuracy and QWK.

**For the evaluation of all the extracted features, check Version 76.**

In [None]:
# Image-feature step for pipeline 5: uses the aggregated layer-16 features of
# ensemble 2, loaded from an existing table rather than computed by a CNN backbone.
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

# Ordered preprocessing steps of pipeline 5 (the final estimator is appended
# when each model is evaluated). The order matters: later steps consume
# columns produced by earlier ones.
pipeline_5_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_image_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    # PetID is dropped only after the image-feature step.
    # NOTE(review): presumably the extractor joins its features on PetID - confirm.
    ('drop_petid', ColumnRemover(columns=["PetID"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_image_feats_eval))
]


# Four classifiers with default hyperparameters; every one that accepts a
# random_state receives the notebook-wide seed.
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

# Display name -> estimator; the key is used in the printed banner and in the
# "Model" column of the results table.
models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

# One row per model is appended to this table by the loop below.
global_evaluation_results_5 = pd.DataFrame(
    [],
    columns=["Pipeline", "Model", "Average fit time", "Average accuracy",
             "Average QWK", "Single split accuracy", "Single split QWK"]
)

# Evaluate pipeline 5 followed by each model, with `cv` on (X, y) and on the
# fixed train/validation split. A new Pipeline object is built for each call.
for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_5_transformers + [('model', model)]),
        cv, X, y, model_type="classification")
    
    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_5_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)
    
    global_evaluation_results_5 = global_evaluation_results_5.append({
        "Pipeline": 5,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    }, ignore_index=True)

In [None]:
# Show the per-model results collected for pipeline 5.
display(global_evaluation_results_5)

## Text processing

https://github.com/ssut/py-googletrans, https://github.com/ssut/py-googletrans/issues/121, https://github.com/ssut/py-googletrans/issues/280, https://py-googletrans.readthedocs.io/en/latest/

In [None]:
!pip install --upgrade language_tool_python
import language_tool_python

In [None]:
!cp -r ../input/tfg-pet-adoption-data/pycontractions-master/pycontractions-master/* ./
!python setup.py install
from pycontractions import Contractions

In [None]:
!pip install emoji --upgrade
!pip install googletrans==3.1.0a0

import emoji
import googletrans
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

There are two options for applying TF-IDF: running a separate TF-IDF process for each language (some languages are very uncommon, so this option can be discarded), or translating the non-English descriptions to English:

In [None]:
# Quick check of googletrans on three sample descriptions, translated as a
# batch with Malay ("ms") as source language and English as destination.
# raise_exception=True is passed so that a failed request raises an error.
translator = googletrans.Translator(raise_exception=True)
translations = translator.translate([
    "cute cat and adorable I just adopt him a few month back but she actually not very well adapt with my other cats, looking forward to responsible adopter who really love cats. p/s: there is macam ada isi lebih dekat ekor dia tapi kami dah check vet dan dia ni sihat. :) if you interested just call me at",
    "Jumpa dekat kawasan semak, elsa baru sahaja beranak. Now her kittens are already 6 months. Trying to search for new owner that can give them love and attention. I cant afford to take care of them anymore as i already have 10 cats.",
    "Jumpa depan pintu rumah.. Tak tahu siapa tinggalkan.. Sangat baik dan sopan.. Sudah divaksin, vitamin, cacing dan kurap."
], src="ms", dest="en")


# Print every original text followed by its translation.
for translation in translations:
    print(f"{translation.origin} \nvvvvvvvvvv\n {translation.text}\n\n")

In [None]:
# Minimal pipeline containing only the two description-language steps; it is
# run so that a "DescriptionLanguage" column is available for the selection below.
# NOTE: this rebinds the global `pipeline_transformers` used by earlier cells.
pipeline_transformers = [
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
]

pipeline = Pipeline(steps=pipeline_transformers)
# Warnings raised while fitting are silenced for this call only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = pipeline.fit_transform(X,y)

# Keep just the columns needed for the text processing below.
pets_descriptions = result[["PetID", "DescriptionLanguage", "Description"]]

In [None]:
# Number of descriptions per language; dropna=False also counts missing languages.
pets_descriptions["DescriptionLanguage"].value_counts(dropna=False)

**https://github.com/carpedm20/emoji, https://stackoverflow.com/a/51785357, https://stackoverflow.com/a/43146653**

In [None]:
def remove_emoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp`` is not available in recent releases of the
    ``emoji`` package (the install cell does not pin a version), so
    ``emoji.replace_emoji`` is used when the installed release provides it and
    the regular-expression approach is kept as a fallback for older releases.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'', text)

def extract_emojis(s):
    """Return the characters of ``s`` that are emojis, concatenated in order.

    The check is made character by character, so only single-character emojis
    are detected. ``emoji.UNICODE_EMOJI`` is not available in recent releases
    of the ``emoji`` package (the install cell does not pin a version), so
    ``emoji.EMOJI_DATA`` is used when present and the old table otherwise.
    """
    if hasattr(emoji, "EMOJI_DATA"):
        known_emojis = emoji.EMOJI_DATA
    else:
        known_emojis = emoji.UNICODE_EMOJI['en']
    return ''.join(c for c in s if c in known_emojis)

In [None]:
# Descriptions whose language is labelled "zh", as a plain Python list.
chinese_descriptions = pets_descriptions.loc[
                pets_descriptions["DescriptionLanguage"] == "zh", "Description"].tolist()

In [None]:
# For every "zh" description containing at least one detected emoji, print it
# before and after emoji removal to inspect the effect visually.
for desc in chinese_descriptions:
    if extract_emojis(desc) != '':
        print(f"{desc}\n")
        print(remove_emoji(desc))
        print("\n\n")

In [None]:
# Work on a copy so that the flag column is not added to `train` itself.
train_copy = train.copy() 
# True when the description contains at least one detected emoji; str(x) makes
# missing (NaN) descriptions safe to scan.
train_copy["DescriptionHasEmoji"] = train_copy["Description"].apply(lambda x: extract_emojis(str(x)) != '')
train_copy["DescriptionHasEmoji"].value_counts()

In [None]:
# Bar plot of the emoji flag against `target` (helper from the EDA utilities module).
utils.plot_vert_barplot(train_copy, "DescriptionHasEmoji", target, figsize=(10,4), display_numbers=False)

In [None]:
# Language codes known to googletrans (useful to pick valid `src` values).
googletrans.LANGUAGES

In [None]:
class DescriptionTransformer(BaseEstimator, TransformerMixin):
    """Replace each pet's raw description by a cleaned English version.

    The cleaning steps are: remove emojis, translate non-English descriptions
    to English (googletrans), replace the typographic apostrophe by "'",
    expand English contractions (pycontractions) and replace punctuation
    (except "'") by whitespace. Intermediate results are kept in
    ``transformations_df``, indexed by ``PetID``, with the columns
    ``DescriptionLanguage``, ``Description``, ``translation`` and ``expanded``.

    Parameters
    ----------
    transformations_df : pandas.DataFrame or None
        Previously computed transformations. When given, nothing is recomputed
        and ``transform`` only joins its ``expanded`` column onto the input.
        When ``None``, the table is built from the first frame passed to
        ``transform`` and reused by later calls.
    kv_model : object or None
        Word-vector model passed to ``Contractions(kv_model=...)``.
    save : bool
        When true, write the table to ``descriptions_transformations.csv`` on
        every call to ``transform``.
    debug : bool
        When true, print progress messages.
    """

    def __init__(self, transformations_df=None, kv_model=None, save=False, debug=False):
        self.transformations_df = transformations_df
        self.kv_model = kv_model
        self.save = save
        self.debug = debug

    def remove_text_emojis(self, text):
        """Return ``text`` with every emoji removed."""
        # get_emoji_regexp is missing from recent releases of the emoji
        # package; prefer replace_emoji whenever the installed release has it.
        if hasattr(emoji, "replace_emoji"):
            return emoji.replace_emoji(text, replace='')
        return emoji.get_emoji_regexp().sub(u'', text)

    def remove_all_emojis(self):
        """Strip emojis from the ``Description`` column, leaving missing values untouched."""
        self.transformations_df["Description"] = self.transformations_df["Description"].apply(
            lambda x: x if pd.isna(x) else self.remove_text_emojis(str(x)))

    def get_descriptions_translations(self, X=None):
        """Fill the ``translation`` column with the English text of every description.

        English descriptions are copied as they are; the others are sent to
        googletrans in one batch per language. If the translation of a batch
        fails, the error is printed and that language keeps an empty
        translation. ``X`` is unused and only kept for backward compatibility.
        """
        translator = googletrans.Translator(raise_exception=True)
        self.transformations_df["translation"] = ''

        # Iterate over the languages of the table being built (not of any
        # global frame), so the method works on whatever data it is given.
        for language in self.transformations_df["DescriptionLanguage"].unique():
            if pd.isna(language):
                continue

            condition = self.transformations_df["DescriptionLanguage"] == language
            if language == "en":
                self.transformations_df.loc[condition, "translation"] = \
                    self.transformations_df.loc[condition, "Description"]
                continue

            if language == "others":
                src = "auto"
            elif "zh" in language:
                # NOTE(review): every "zh*" label (e.g. "zh-Hant") is sent as
                # "zh-cn"; confirm whether another code would suit some of them.
                src = "zh-cn"
            else:
                src = language

            descriptions = self.transformations_df.loc[condition, "Description"].tolist()

            if self.debug:
                print(f"Translating {len(descriptions)} descriptions, src='{src}', dest='en'")

            try:
                translations = translator.translate(descriptions, src=src, dest="en")
            except Exception as e:
                # Best effort: report the failure and continue with the next language.
                print(f"Translation failed for language '{language}': {e}")
            else:
                self.transformations_df.loc[condition, "translation"] = \
                    [translation.text for translation in translations]

    def expand_contractions(self):
        """Store the contraction-expanded ``translation`` texts in the ``expanded`` column."""
        cont = Contractions(kv_model=self.kv_model)
        cont.load_models()
        self.transformations_df["expanded"] = pd.Series(
            list(cont.expand_texts(self.transformations_df["translation"].values, precise=False)),
            index=self.transformations_df.index.copy())

    # Replace all punctuation symbols (except " ' ") by a whitespace
    def remove_punctuation(self, text, translator=str.maketrans(
            string.punctuation.replace("'", ""), " "*(len(string.punctuation)-1))):
        """Return ``text`` with every punctuation character except "'" replaced by a space."""
        return text.translate(translator)

    def fit(self, X, y=None):
        """Copy a user-provided transformations table, if any; nothing is learned.

        ``y`` is optional so that ``fit_transform(X)`` works without a dummy target.
        """
        if self.transformations_df is not None:
            self.transformations_df = self.transformations_df.copy()
        return self

    def _build_transformations(self, X):
        """Compute the whole transformations table from the descriptions in ``X``."""
        # Column selection: plain ``[[...]]`` indexing (``.loc[[...]]`` would look up row labels).
        self.transformations_df = X[["PetID", "DescriptionLanguage", "Description"]].copy()
        self.transformations_df.set_index("PetID", inplace=True)
        self.transformations_df["Description"] = self.transformations_df["Description"].replace(np.nan, '')
        if self.debug:
            print("Removing emojis from every description...")
        self.remove_all_emojis()
        if self.debug:
            print("Translating non-english descriptions...")
        self.get_descriptions_translations()
        # The typographic apostrophe is not recognised as part of a contraction.
        self.transformations_df["translation"] = self.transformations_df["translation"].apply(
            lambda x: str(x).replace("’", "'"))
        if self.debug:
            print("Expanding English language contractions...")
        self.expand_contractions()
        if self.debug:
            print("Removing punctuation...")
        self.transformations_df["expanded"] = self.transformations_df["expanded"].apply(
            lambda x: self.remove_punctuation(str(x)))

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Description`` column holds the cleaned text.

        Rows are matched through ``PetID``; pets missing from the
        transformations table end up with a missing ``Description``.
        """
        X = X.copy()
        X["Description"] = X["Description"].replace(np.nan, '')
        if self.transformations_df is None:
            self._build_transformations(X)
        else:
            self.transformations_df["Description"] = self.transformations_df["Description"].replace(np.nan, '')

        self.transformations_df["translation"] = self.transformations_df["translation"].replace(np.nan, '')
        self.transformations_df["expanded"] = self.transformations_df["expanded"].replace(np.nan, '')

        if self.save:
            if self.debug:
                print("Saving transformations to .csv file...")
            self.transformations_df.to_csv("descriptions_transformations.csv")

        X = X.merge(self.transformations_df["expanded"], left_index=False, right_index=True,
                    left_on="PetID", how="left")
        X.drop(["Description"], axis=1, inplace=True)
        X.rename(columns={"expanded": "Description"}, inplace=True)

        return X

In [None]:
# Load the stored translations of the non-English descriptions; the first CSV
# column becomes the index.
translations_df = pd.read_csv("../input/tfg-pet-adoption-data/non-english_descriptions_translations.csv", index_col=0)
translations_df

In [None]:
# Five reproducible random examples for each of the "ms", "zh" and "zh-Hant" labels.
display(translations_df.loc[translations_df["DescriptionLanguage"] == "ms"].sample(5,random_state=seed))
display(translations_df.loc[translations_df["DescriptionLanguage"] == "zh"].sample(5,random_state=seed))
display(translations_df.loc[translations_df["DescriptionLanguage"] == "zh-Hant"].sample(5,random_state=seed))

https://github.com/SanDiegoMachineLearning/QuoraInsincere, https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction, https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html, https://nlp.stanford.edu/IR-book/html/htmledition/sublinear-tf-scaling-1.html

Some of the emojis that are not detected as such are actually removed by CountVectorizer (which is used by TfidfVectorizer):

In [None]:
# Run CountVectorizer's analyzer (preprocessing + tokenisation) on one
# translated description and print the resulting tokens next to the input.
count_vectorizer = CountVectorizer(strip_accents='unicode')
count_vectorizer_analyzer = count_vectorizer.build_analyzer()
example_desc = translations_df.loc["f3fd77fe8", "translation"]
print(example_desc)
print("\nvvvvvvvvv\n")
print(*count_vectorizer_analyzer(example_desc), sep=', ')

(why 'I' and 'a' are discarded: https://stackoverflow.com/questions/20717641/countvectorizer-i-not-showing-up-in-vectorized-text)

**HOWEVER: Look how "can't" is left as "can" --> this is why we need to expand contractions**

https://medium.com/@lukei_3514/dealing-with-contractions-in-nlp-d6174300876b, https://github.com/ian-beaver/pycontractions, https://towardsdatascience.com/word-movers-distance-for-text-similarity-7492aeca71b0, https://radimrehurek.com/gensim/models/word2vec.html, https://github.com/RaRe-Technologies/gensim-data, https://nlp.stanford.edu/pubs/glove.pdf, https://nlp.stanford.edu/projects/glove/

**error during pip install pycontractions --> https://github.com/ian-beaver/pycontractions/issues/17, solved in https://github.com/Martin36/pycontractions (https://github.com/ian-beaver/pycontractions/pull/19, https://pypi.org/project/language-tool-python/#description), pull request not completed**

**Issue: https://github.com/facebookresearch/fastText/issues/171, pycontractions expects a binary file instead of a strings file (line 314 in https://github.com/Martin36/pycontractions/blob/master/pycontractions/contractions.py), so I have to initialize the model outside using glove -> word2vec format: https://radimrehurek.com/gensim/scripts/glove2word2vec.html**

In [None]:
# Convert the 25-d GloVe Twitter vectors to word2vec text format, then load
# them as KeyedVectors so the model can be handed to pycontractions.
_ = glove2word2vec("../input/glove-twitter/glove.twitter.27B.25d.txt", "w2v_glove.twitter.27B.25d.txt")
kv_model = KeyedVectors.load_word2vec_format("./w2v_glove.twitter.27B.25d.txt", binary=False)

In [None]:
# Contraction expander backed by the word vectors loaded above.
cont = Contractions(kv_model=kv_model)
cont.load_models()

In [None]:
# Five reproducible random translations used to try the contraction expansion.
descriptions_sample = translations_df.sample(5, random_state=seed)
descriptions_sample["translation"].values

In [None]:
# Expanded version of the sampled translations (list() materialises the result).
list(cont.expand_texts(descriptions_sample["translation"].values, precise=False))

In [None]:
# desc_transformer = DescriptionTransformer(kv_model=kv_model, save=True, debug=True)
# expanded_descs_df = desc_transformer.fit_transform(pets_descriptions, y=[])

In [None]:
# Load the stored description transformations (first CSV column as index) and
# turn missing texts into empty strings in the three text columns.
transformations_df = pd.read_csv("../input/tfg-pet-adoption-data/descriptions_transformations.csv", index_col=0)
transformations_df["Description"] = transformations_df["Description"].replace(np.nan, '')
transformations_df["translation"] = transformations_df["translation"].replace(np.nan, '')
transformations_df["expanded"] = transformations_df["expanded"].replace(np.nan, '')
transformations_df

In [None]:
# Original description of one example pet.
transformations_df.loc["ef14861df", "Description"]

In [None]:
# Stored English translation for the same pet.
transformations_df.loc["ef14861df", "translation"]

In [None]:
# Stored contraction-expanded text for the same pet.
transformations_df.loc["ef14861df", "expanded"]

Need to replace symbol " **’** " by " **'** ", since it cannot be detected as a contraction:

In [None]:
# Count the descriptions containing the typographic apostrophe "’" and print
# the first four of them as examples.
num_symbol = 0
for index, row in transformations_df.iterrows():
    if "’" in str(row["Description"]):
        num_symbol += 1
        if num_symbol < 5:
            print(row["Description"] + "\n")
num_symbol

The replacement was done with the code below. It is already included in the transformer, but since expanding contractions takes a long time, I only re-processed the descriptions or translations containing this symbol (more than 141 replacements were made, as some translations also contained it):

In [None]:
# for index, row in tqdm(transformations_df.iterrows()):
#     pet_id = index
#     if "’" in str(row["translation"]):
#         transformations_df.loc[pet_id, "translation"] = row["translation"].replace("’", "'")
#         transformations_df.loc[pet_id, "expanded"] = list(cont.expand_texts(
#             [transformations_df.loc[pet_id, "translation"]], precise=False))[0]
# transformations_df.to_csv("descriptions_transformations.csv")

**Summing up, the current transformations are: replace np.nan descriptions by '', remove emojis, translate non-English descriptions, expand English contractions using word embeddings and Word Mover's Distance (pycontractions), and replace punctuation (except " ' ": if the previous step did not expand it, it is because it marks a genitive or some other circumstance).**

### TF-IDF

In [None]:
class CustomTfidfVectorizer(BaseEstimator, TransformerMixin):
    """Append TF-IDF features of the ``Description`` column, reduced with TruncatedSVD.

    ``fit`` learns a ``TfidfVectorizer`` and a ``TruncatedSVD`` on the
    descriptions; ``transform`` adds the columns ``desc_0`` ...
    ``desc_{svd_n_components - 1}`` to the input frame. The ``Description``
    column itself is kept and must not contain missing values.

    Parameters
    ----------
    ngram_range, max_df, min_df, sublinear_tf
        Passed unchanged to ``TfidfVectorizer``.
    svd_n_components : int
        Number of SVD components, i.e. of generated ``desc_*`` columns.
    seed : int
        ``random_state`` of the ``TruncatedSVD``.
    debug : bool
        When true, print progress messages and matrix shapes.
    """

    def __init__(self, ngram_range=(1,1), max_df=1.0, min_df=1, sublinear_tf=False,
                 svd_n_components=16, seed=seed, debug=False):
        self.ngram_range = ngram_range
        self.max_df = max_df
        self.min_df = min_df
        self.sublinear_tf = sublinear_tf
        self.svd_n_components = svd_n_components
        self.seed = seed
        self.debug = debug

    def fit(self, X, y=None):
        """Fit the TF-IDF vocabulary and the SVD projection on ``X["Description"]``.

        ``y`` is ignored; it is optional so that ``fit_transform(X)`` works
        without a dummy target.
        """
        self.tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode',
                                                ngram_range=self.ngram_range,
                                                max_df=self.max_df,
                                                min_df=self.min_df,
                                                use_idf=True,
                                                smooth_idf=True,
                                                sublinear_tf=self.sublinear_tf)
        self.svd = TruncatedSVD(n_components=self.svd_n_components,
                                random_state=self.seed)

        # TfidfVectorizer does not accept np.nan values: the column must be clean here.
        corpus = X["Description"]
        if self.debug:
            print("Fitting TfidfVectorizer...")
        corpus_transformed = self.tfidf_vectorizer.fit_transform(corpus)
        if self.debug:
            print(f"Shape of transformed training corpus: {corpus_transformed.shape}")
            print(f"Fitting TruncatedSVD, {self.svd_n_components} components, with transformed training corpus...")
        self.svd.fit(corpus_transformed)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ``desc_*`` columns joined on the index.

        ``X`` is not modified (``merge`` returns a new frame).
        """
        corpus = X["Description"]
        if self.debug:
            print("Transforming corpus with TfidfVectorizer...")
        corpus_transformed = self.tfidf_vectorizer.transform(corpus)
        if self.debug:
            display(corpus_transformed)
            print(f"Transformed corpus shape: {corpus_transformed.shape}")
            print("Using fitted TruncatedSVD on transformed corpus")

        desc_columns = [f"desc_{i}" for i in range(self.svd_n_components)]
        desc_features_df = pd.DataFrame(self.svd.transform(corpus_transformed),
                                        index=X.index.copy(),
                                        columns=desc_columns)
        return X.merge(desc_features_df, left_index=True, right_index=True, how="left")

In [None]:
# Columns given to the 'drop_columns' step of the description-features pipeline.
columns_to_be_removed_desc_feats_eval = ["Name", "Breed1", "Breed2", "Gender", "Color1",
                                          "Color2", "Color3", "Vaccinated", "Dewormed",
                                          "Sterilized", "State", "RescuerID",
                                          "MaturitySize", "FurLength", "Health",
                                          "ImageMetadataDescription"]

# Numeric columns given to CustomStandardScaler in the same pipeline.
numeric_columns_desc_feats_eval = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt",
                                    "StateGDP", "RescuerCount", "DescriptionLength"]

Let's take the number of SVD components out of the hyperparameter tuning and fix its value now, in order to reduce the number of combinations and the fit time:

In [None]:
# Image-feature step: aggregated layer-16 features of ensemble 2, loaded from
# an existing table rather than computed by a CNN backbone.
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

# Ordered preprocessing steps for the description-features evaluation. The
# `None` entry (fifth from the end) is a placeholder that the evaluation loop
# replaces with the TF-IDF step under test.
pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    # Swap the raw description for the stored cleaned/expanded text.
    ('description_transformer', DescriptionTransformer(transformations_df=transformations_df)),
    # Placeholder for the text-feature step (see the comment above the list).
    None,
    # The helper columns are dropped only after the text-feature step has used them.
    ('drop_petid_desc', ColumnRemover(columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval))
]

# One row per evaluated configuration is appended to this table.
evaluation_results = pd.DataFrame([], columns=["Model description",
                        "Average fit time", "Average accuracy", "Average QWK",
                        "Single split accuracy", "Single split QWK"])

# Compare several numbers of SVD components for the TF-IDF features (unigrams
# and bigrams), always with a default XGBClassifier.
for svd_n_components in [8, 16, 24, 32, 48]:
    tfidf_vectorizer = CustomTfidfVectorizer(ngram_range=(1,2),
                                             svd_n_components=svd_n_components,
                                             seed=seed)
    # Fill the fifth-from-last slot of the step list with the TF-IDF step.
    pipeline_transformers[-5] = ('tfidf_vectorizer', tfidf_vectorizer)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier, TF-IDF ngram (1,2) SVD-{svd_n_components}"
    print(f"\n\n*************** {model_description} ***************")
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_results = evaluation_results.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    }, ignore_index=True)


# Lift the column-width limit only while displaying, so the long model
# descriptions are not truncated, then set it back to 50.
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_TFIDF_ngram-1-2_varying-SVD_n-components")

We will fix the number of SVD components to 16 for the TF-IDF matrix.

### Pre-trained word embeddings + CNN

https://radimrehurek.com/gensim/models/word2vec.html

https://towardsdatascience.com/word-embeddings-exploration-explanation-and-exploitation-with-code-in-python-5dac99d5d795

https://www.kaggle.com/matleonard/word-vectors

https://tfhub.dev/google/collections/universal-sentence-encoder/1, https://www.tensorflow.org/text/guide/word_embeddings (https://github.com/tensorflow/hub/issues/244, https://github.com/tensorflow/hub/issues/572)

**We will use this model: https://keras.io/examples/nlp/pretrained_word_embeddings/**

In [None]:
from keras.initializers import Constant
from keras.layers import Conv1D, Embedding, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.experimental.preprocessing import TextVectorization
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Expanded description texts of the pets in the fixed train/validation split,
# looked up by PetID, together with the matching targets, as NumPy arrays.
X_train_text = transformations_df.loc[X_train_CNN["PetID"].values, "expanded"].copy().values
y_train_text = y_train_CNN.copy().values
X_val_text = transformations_df.loc[X_val_CNN["PetID"].values, "expanded"].copy().values
y_val_text = y_val_CNN.copy().values

In [None]:
# Size and first entries of the training texts.
print(len(X_train_text))
X_train_text[:5]

In [None]:
# Size and first entries of the training targets.
print(len(y_train_text))
y_train_text[:5]

In [None]:
# Size and first entries of the validation texts.
print(len(X_val_text))
X_val_text[:5]

In [None]:
# Size and first entries of the validation targets.
print(len(y_val_text))
y_val_text[:5]

In [None]:
# Whitespace-separated token count of every training text, summarised to help
# choose the sequence length used by the vectorizer.
num_tokens_train_text = list(map(lambda x: len(x.split()), X_train_text))
print(f"Mean number of tokens: {np.mean(num_tokens_train_text)}")
print(f"Max number of tokens: {np.max(num_tokens_train_text)}")
print(f"Median number of tokens: {np.median(num_tokens_train_text)}")

In [None]:
# Text -> integer-sequence layer, configured with at most 30000 tokens and an
# output length of 200; its vocabulary is learned from the training texts only.
vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(X_train_text).batch(128)
vectorizer.adapt(text_ds)
vectorizer.get_vocabulary()[:5]

In [None]:
# Map every vocabulary token to its integer index in the vectorizer.
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

# Load the pre-trained 100-d GloVe Twitter vectors: each line holds a token
# followed by its space-separated coefficients.
embeddings_index = {}
# The encoding is stated explicitly so that reading the (non-ASCII) tokens does
# not depend on the platform's default encoding.
with open("../input/glove-twitter/glove.twitter.27B.100d.txt", encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
# The matrix has two more rows than the vocabulary size.
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Number of missing words printed so far (only the first 20 are shown).
count_print = 0
# Prepare embedding matrix: row i holds the pre-trained vector of the word with
# index i. Words not found in the embedding index keep an all-zeros row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
        if count_print < 20:
            print(word)
            count_print += 1

print(f"Converted {hits} words ({misses} misses)")

In [None]:
# Embedding layer initialised with the pre-trained matrix and frozen
# (trainable=False), so the vectors are not updated during training.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

Let's try the same FC network we used in image features extraction at the end of the CNN:

In [None]:
# 1D-CNN regressor on top of the frozen embeddings: three Conv1D layers (the
# first two followed by max pooling, the last by global max pooling), then a
# Dense(128) -> Dropout -> Dense(16) head and a single linear output.
int_sequences_input = Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = Conv1D(128, 5, activation="relu")(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation="relu")(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation="relu")(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(rate=0.5, seed=seed)(x)
x = Dense(16, activation="relu")(x)
preds = Dense(1, activation="linear")(x)

cnn_word_embeddings_model = Model(int_sequences_input, preds)

# `learning_rate` is the current name of the argument; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

cnn_word_embeddings_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

cnn_word_embeddings_model.summary()

In [None]:
# Convert every text into its integer sequence with the adapted vectorizer;
# each text is wrapped in a one-element list before being passed to the layer.
X_train_text_index = vectorizer(np.array([[s] for s in X_train_text])).numpy()
X_val_text_index = vectorizer(np.array([[s] for s in X_val_text])).numpy()

y_train_text_index = np.array(y_train_text)
y_val_text_index = np.array(y_val_text)

In [None]:
epochs = 15
batch_size = 128

# Per-epoch metrics are collected in this table after training.
cnn_word_embeddings_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

# Stop when val_loss has not improved for 5 epochs and restore the best
# weights; additionally checkpoint the model whenever val_loss improves.
early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'cnn_word_embeddings__{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

cnn_word_embeddings_history = cnn_word_embeddings_model.fit(
    x=X_train_text_index,
    y=y_train_text_index,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_text_index, y_val_text_index),
    callbacks=[model_checkpoint, early_stopping]
)

train_loss = cnn_word_embeddings_history.history["loss"]
train_mae = cnn_word_embeddings_history.history["mean_absolute_error"]
validation_loss = cnn_word_embeddings_history.history["val_loss"]
validation_mae = cnn_word_embeddings_history.history["val_mean_absolute_error"]

# One row per epoch actually run (can be fewer than `epochs` with early stopping).
for i in range(len(train_loss)):
    cnn_word_embeddings_training_results = cnn_word_embeddings_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

# Persist the training curves as CSV and as a LaTeX table.
cnn_word_embeddings_training_results.to_csv(f"cnn_word_embeddings__{epochs}-epochs.csv", index=False)
to_latex(cnn_word_embeddings_training_results, f"cnn_word_embeddings__{epochs}-epochs")

# Zipping the checkpoints
!mkdir cnn_word_embeddings
!mv ./cnn_word_embeddings*.h5 cnn_word_embeddings
# Pack the checkpoint directory into cnn_word_embeddings__checkpoints.zip.
shutil.make_archive("cnn_word_embeddings__checkpoints", "zip", "./cnn_word_embeddings")

In [None]:
# Reload the stored training curves (so the plot does not require retraining)
# and plot the loss and the mean absolute error.
cnn_word_embeddings_training_results = pd.read_csv("../input/tfg-pet-adoption-data/cnn_word_embeddings__15-epochs.csv")
plot_history(cnn_word_embeddings_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

The model overfits, let's apply the idea proposed here: https://realpython.com/python-keras-text-classification/#hyperparameters-optimization in order to check different configurations of each layer:

In [None]:
def create_cnn_word_emb_model(num_filters, kernel_size, num_intermediate_conv_max,
                              size_dense_1, dropout_rate, size_dense_2, lr):
    """Build and compile a 1D-CNN regressor on top of the module-level ``embedding_layer``.

    Architecture: embedding -> Conv1D + MaxPooling1D -> ``num_intermediate_conv_max``
    further Conv1D + MaxPooling1D pairs -> Conv1D -> GlobalMaxPooling1D ->
    Dense(``size_dense_1``) -> Dropout -> Dense(``size_dense_2``) -> Dense(1, linear).

    Parameters
    ----------
    num_filters : int
        Number of filters of every Conv1D layer.
    kernel_size : int
        Kernel size of every Conv1D layer; also used as the pool size of every
        MaxPooling1D layer.
    num_intermediate_conv_max : int
        Number of extra Conv1D + MaxPooling1D pairs between the first and the last
        convolution.
    size_dense_1 : int
        Units of the first dense layer.
    dropout_rate : float
        Dropout rate applied after the first dense layer (seeded with the global ``seed``).
    size_dense_2 : int
        Units of the second dense layer.
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    Model
        Model compiled with mean squared error as loss and mean absolute error as metric.
    """
    int_sequences_input = Input(shape=(None,), dtype="int64")
    # NOTE: every model built by this function reuses the same global embedding layer object.
    embedded_sequences = embedding_layer(int_sequences_input)
    x = Conv1D(num_filters, kernel_size, activation="relu")(embedded_sequences)
    x = MaxPooling1D(kernel_size)(x)
    for _ in range(num_intermediate_conv_max):
        x = Conv1D(num_filters, kernel_size, activation="relu")(x)
        x = MaxPooling1D(kernel_size)(x)
    x = Conv1D(num_filters, kernel_size, activation="relu")(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(size_dense_1, activation="relu")(x)
    x = Dropout(rate=dropout_rate, seed=seed)(x)
    x = Dense(size_dense_2, activation="relu")(x)
    preds = Dense(1, activation="linear")(x)

    model = Model(int_sequences_input, preds)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=lr)
    metrics = ["mean_absolute_error"]
    loss = "mean_squared_error"

    model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
    return model

In [None]:
# Hyper-parameter search space; every key matches an argument of
# create_cnn_word_emb_model (3 * 2 * 2 * 2 * 2 * 2 * 2 = 192 combinations)
param_grid = {
    "num_filters": [64, 128, 192],
    "kernel_size": [3, 5],
    "num_intermediate_conv_max": [0, 1],
    "size_dense_1": [64, 128],
    "dropout_rate": [0.25, 0.5],
    "size_dense_2": [16, 32],
    "lr": [0.001, 0.0001],
}

The previous grid yields 192 possible combinations, so let's use RandomizedSearchCV to sample just 50 of them, evaluating each one with a stratified 4-fold CV inside X_train_text:

In [None]:
epochs = 6
batch_size = 128

# scikit-learn wrapper around the model factory; the sampled hyper-parameters are
# forwarded to create_cnn_word_emb_model. The batch size comes from the variable
# declared above instead of a second hard-coded literal.
model = KerasRegressor(build_fn=create_cnn_word_emb_model,
                        epochs=epochs, batch_size=batch_size,
                        verbose=False)

# 50 random configurations out of the grid, each evaluated with a stratified 4-fold CV
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=StratifiedKFold(n_splits=4), verbose=1,
                          n_iter=50, random_state=seed)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_result = grid.fit(X_train_text_index, y_train_text_index)

# Evaluate testing set
# NOTE(review): despite its name, `test_accuracy` holds whatever grid.score returns for
# this regressor (no `scoring` was given), not a classification accuracy - confirm the
# sign convention of KerasRegressor.score before reading it as a loss.
test_accuracy = grid.score(X_val_text_index, y_val_text_index)

random_search_results = pd.DataFrame.from_dict(grid.cv_results_)
random_search_results.to_csv("random_search_cnn_word_emb_results.csv")

s = ('Best Loss: {:.4f}\n{}\nTest (X_val_text_index) Loss: {:.4f}\n\n')
output_string = s.format(
    grid_result.best_score_,
    grid_result.best_params_,
    test_accuracy)
print(output_string)

In [None]:
# Reload the stored random-search results (the first CSV column holds the row index)
# and show the three best-ranked configurations.
random_search_results = pd.read_csv(
    "../input/tfg-pet-adoption-data/random_search_cnn_word_emb_results.csv",
    index_col=0,
)
random_search_results.sort_values(by="rank_test_score").iloc[:3]

In [None]:
# CNN with fixed hyper-parameters: 64 filters, kernel size 3, one intermediate
# Conv1D + MaxPooling1D pair, dense layers of 128 and 16 units, dropout 0.5.
int_sequences_input = Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = Conv1D(64, 3, activation="relu")(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation="relu")(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation="relu")(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(rate=0.5, seed=seed)(x)
x = Dense(16, activation="relu")(x)
preds = Dense(1, activation="linear")(x)

cnn_word_embeddings_model_hyper_params_tuned = Model(int_sequences_input, preds)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
metrics = ["mean_absolute_error"]
loss = "mean_squared_error"

cnn_word_embeddings_model_hyper_params_tuned.compile(loss=loss, metrics=metrics, optimizer=optimizer)

cnn_word_embeddings_model_hyper_params_tuned.summary()

In [None]:
epochs = 30
batch_size = 128

cnn_word_embeddings_training_results = pd.DataFrame([], columns=["loss",
                        "mean_absolute_error", "val_loss",
                        "val_mean_absolute_error"])

early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
model_checkpoint = ModelCheckpoint(
    'cnn_word_embeddings__hyper-params-tuned__{epoch:02d}-epochs_val_loss-{val_loss:02f}.h5',
    monitor='val_loss', save_best_only=True)

cnn_word_embeddings_history = cnn_word_embeddings_model_hyper_params_tuned.fit(
    x=X_train_text_index,
    y=y_train_text_index,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_text_index, y_val_text_index),
    callbacks=[model_checkpoint, early_stopping]
)

train_loss = cnn_word_embeddings_history.history["loss"]
train_mae = cnn_word_embeddings_history.history["mean_absolute_error"]
validation_loss = cnn_word_embeddings_history.history["val_loss"]
validation_mae = cnn_word_embeddings_history.history["val_mean_absolute_error"]

for i in range(len(train_loss)):
    cnn_word_embeddings_training_results = cnn_word_embeddings_training_results.append({
        "loss": train_loss[i],
        "mean_absolute_error": train_mae[i],
        "val_loss": validation_loss[i],
        "val_mean_absolute_error": validation_mae[i]
    }, ignore_index=True)

cnn_word_embeddings_training_results.to_csv(f"cnn_word_embeddings__hyper-params-tuned__{epochs}-epochs.csv", index=False)
to_latex(cnn_word_embeddings_training_results, f"cnn_word_embeddings__hyper-params-tuned__{epochs}-epochs")

# Zipping the checkpoints
!mkdir cnn_word_embeddings_hyper_params_tuned
!mv ./cnn_word_embeddings__hyper*.h5 cnn_word_embeddings_hyper_params_tuned
shutil.make_archive("cnn_word_embeddings__hyper-params-tuned__checkpoints", "zip", "./cnn_word_embeddings_hyper_params_tuned")

In [None]:
# Reload stored per-epoch training metrics of the tuned model (presumably written by
# the training cell with epochs=30 - confirm) and plot the "loss" and
# "mean_absolute_error" curves.
# NOTE(review): 1, 2, (12,3) look like rows, columns and figure size of the plot grid;
# verify against plot_history's signature.
cnn_word_embeddings_training_results = pd.read_csv("../input/tfg-pet-adoption-data/cnn_word_embeddings__hyper-params-tuned__30-epochs.csv")
plot_history(cnn_word_embeddings_training_results, ["loss", "mean_absolute_error"], 1, 2, (12,3))

The validation loss is still very high, and its best value did not really improve on that of the original model. Let's check how much information the extracted features can give:

In [None]:
# End-to-end feature extractor: raw string input -> `vectorizer` -> truncated CNN
string_input = Input(shape=(1,), dtype="string")
x = vectorizer(string_input)

# Truncated view of the tuned CNN that shares its layers: the output is taken from
# layers[-2], i.e. the layer right before the final output layer (the 16-unit Dense
# layer in the architecture this model was built with).
cnn_word_embeddings_model_hyper_params_tuned_output_16 = Model(
    inputs=cnn_word_embeddings_model_hyper_params_tuned.input,
    outputs=cnn_word_embeddings_model_hyper_params_tuned.layers[-2].output
)

preds = cnn_word_embeddings_model_hyper_params_tuned_output_16(x)
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_16 = Model(string_input, preds)
# NOTE(review): the "05-epochs_val_loss-1.356145" part of the name is hard-coded;
# presumably it identifies the best checkpoint of the training run - verify it still
# matches the weights currently held by the model.
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_16.save(
    "end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-16__05-epochs_val_loss-1.356145.tf",
    save_format='tf')
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_16.summary()

In [None]:
# The last layer of the end-to-end model is the nested (truncated) CNN model;
# list the layers it wraps.
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_16.layers[-1].layers

In [None]:
# End-to-end feature extractor: raw string input -> `vectorizer` -> truncated CNN
string_input = Input(shape=(1,), dtype="string")
x = vectorizer(string_input)

# Truncated view of the tuned CNN that shares its layers: the output is taken from
# layers[-4] (the 128-unit Dense layer in the architecture this model was built with,
# which is followed by Dropout, Dense(16) and the output layer).
cnn_word_embeddings_model_hyper_params_tuned_output_128 = Model(
    inputs=cnn_word_embeddings_model_hyper_params_tuned.input,
    outputs=cnn_word_embeddings_model_hyper_params_tuned.layers[-4].output
)

preds = cnn_word_embeddings_model_hyper_params_tuned_output_128(x)
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_128 = Model(string_input, preds)
# NOTE(review): the "05-epochs_val_loss-1.356145" part of the name is hard-coded;
# presumably it identifies the best checkpoint of the training run - verify it still
# matches the weights currently held by the model.
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_128.save(
    "end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-128__05-epochs_val_loss-1.356145.tf",
    save_format='tf')
end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_128.summary()

In [None]:
# A model saved with save_format='tf' is a directory (several files and
# subdirectories), so it is zipped into a single archive.
shutil.make_archive(
    base_name="end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-16__05-epochs_val_loss-1.356145.tf",
    format="zip",
    root_dir="./end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-16__05-epochs_val_loss-1.356145.tf",
)

In [None]:
# Zip the saved-model directory of the 128-feature extractor into a single archive.
shutil.make_archive(
    base_name="end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-128__05-epochs_val_loss-1.356145.tf",
    format="zip",
    root_dir="./end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-128__05-epochs_val_loss-1.356145.tf",
)

In [None]:
# Sanity check: reload the saved 16-feature end-to-end model from disk and inspect it
end_to_end_copy = load_model("./end_to_end_cnn_word_embeddings__hyper-params-tuned__layer-16__05-epochs_val_loss-1.356145.tf")
end_to_end_copy.summary()

**Note: when loading the model, the embedding layer is left as trainable, so we have to revert that:**

In [None]:
# layers[-1] is the nested CNN model; its layer at index 1 is presumably the embedding
# layer (index 0 being the input layer) - confirm with the summary. Mark it as
# non-trainable again, check the parameter counts, then free the reloaded copy.
end_to_end_copy.layers[-1].layers[1].trainable = False
end_to_end_copy.summary()
del end_to_end_copy

In [None]:
class DescriptionFeatureExtractor(BaseEstimator, TransformerMixin):
    """Add (or just export) per-pet feature vectors obtained from the ``Description`` text.

    The features come either from ``loaded_features`` (a DataFrame indexed by PetID,
    which takes precedence when given) or from calling ``model.predict`` on every
    description.

    Parameters
    ----------
    model : object with a ``predict`` method, optional
        Model fed with ``[[description]]``; the first element of its prediction is the
        feature vector of that pet.
    model_name : str, optional
        Suffix of the CSV file written by ``save_features``; required when ``model``
        is given.
    loaded_features : pandas.DataFrame, optional
        Precomputed features indexed by PetID.
    save : bool
        Whether ``transform`` writes the features to a CSV file.
    include_feats : bool
        Whether ``transform`` merges the features (renamed to ``desc_<column>``) into X.
    debug : bool
        Show a progress bar and print/display intermediate information.
    """

    def __init__(self, model=None, model_name=None, loaded_features=None,
                 save=False, include_feats=True, debug=False):
        self.model = model
        self.model_name = model_name
        self.save = save
        self.include_feats = include_feats
        self.loaded_features = loaded_features
        self.debug = debug


    def extract(self, X):
        """Return a DataFrame indexed by PetID with the model output for each description.

        The model is called once per row; rows sharing a PetID overwrite each other.
        """
        features = {}
        iterable = X.iterrows()
        if self.debug:
            iterable = tqdm(iterable, total=X.shape[0])
        for _, row in iterable:
            features[row["PetID"]] = self.model.predict([[row["Description"]]])[0]
        return pd.DataFrame.from_dict(features, orient='index')


    def fit(self, X, y=None):
        """Validate the configuration; nothing is learnt from the data.

        Raises
        ------
        ValueError
            If neither ``model`` nor ``loaded_features`` is given, or if ``model`` is
            given without ``model_name``.
        """
        if self.model is None and self.loaded_features is None:
            raise ValueError("'model' and 'loaded_features' cannot be None at the same time")
        if self.model is not None and self.model_name is None:
            raise ValueError("'model_name' cannot be None if 'model is not None'")

        return self


    def save_features(self, features_df):
        """Write ``features_df`` to ``description_features_<model_name>.csv`` (best effort).

        The file name is derived from the model, so nothing is written when no model
        was given (the features were loaded, not extracted). Previously that case
        failed on an undefined file name and the error was swallowed.
        """
        if self.model is None:
            if self.debug:
                print("No model given: the features were loaded, nothing is saved")
            return

        new_filename = f'description_features_{self.model_name}.csv'
        try:
            features_df.to_csv(new_filename)
        except Exception as e:
            # Saving is a convenience: report the problem but keep transforming
            print(e)
        else:
            if self.debug:
                print(f"File {new_filename} succesfully saved")


    def transform(self, X, y=None):
        """Return a copy of X, optionally left-joined on PetID with the description features.

        Missing descriptions are replaced by the empty string in the returned copy.

        Raises
        ------
        ValueError
            If neither ``loaded_features`` nor ``model`` is available.
        """
        X = X.copy()
        # Plain assignment instead of an in-place replace on the selected column, which
        # is not guaranteed to write back into X
        X["Description"] = X["Description"].fillna('')
        if self.loaded_features is not None:
            features_df = self.loaded_features.copy()
        elif self.model is not None:
            features_df = self.extract(X)
        else:
            raise ValueError("'model' and 'loaded_features' cannot be None at the same time")

        if self.debug:
            print(f"Number of description features: {features_df.shape[1]}\n")
            display(features_df.head())

        if self.save:
            # Saved before renaming, so the file keeps the original column names
            self.save_features(features_df)

        if self.include_feats:
            features_df = features_df.rename(columns=lambda x: f"desc_{x}")
            # PetID is kept in the result on purpose; it is not dropped here
            X = X.merge(features_df, left_index=False, right_index=True,
                        left_on="PetID", how="left")

        return X

In [None]:
# Compute the 16 description features for every row of X and write them to CSV;
# the transformed frame itself is not needed (include_feats=False), hence the `_`.
dfe = DescriptionFeatureExtractor(
    model=end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_16,
    model_name="cnn_word_embeddings_model__hyper-params-tuned__layer-16_05-epochs",
    include_feats=False,
    save=True,
    debug=True,
)

_ = dfe.fit_transform(X, y)

In [None]:
# Reload the stored 16 description features (PetID as index), preview the first rows
# and count how often each standard deviation value occurs across the columns.
description_features_cnn_tuned_layer_16 = pd.read_csv(
    "../input/tfg-pet-adoption-data/description_features_cnn_word_embeddings_model__hyper-params-tuned__layer-16_05-epochs.csv",
    index_col=0,
)
display(description_features_cnn_tuned_layer_16.head())
description_features_cnn_tuned_layer_16.describe().loc["std"].value_counts()

In [None]:
# Compute the 128 description features for every row of X and write them to CSV;
# the transformed frame itself is not needed (include_feats=False), hence the `_`.
dfe = DescriptionFeatureExtractor(
    model=end_to_end_cnn_word_embeddings_model_hyper_params_tuned_output_128,
    model_name="cnn_word_embeddings_model__hyper-params-tuned__layer-128_05-epochs",
    include_feats=False,
    save=True,
    debug=True,
)

_ = dfe.fit_transform(X, y)

In [None]:
# Reload the stored 128 description features (PetID as index), preview the first rows
# and count how often each standard deviation value occurs across the columns.
description_features_cnn_tuned_layer_128 = pd.read_csv(
    "../input/tfg-pet-adoption-data/description_features_cnn_word_embeddings_model__hyper-params-tuned__layer-128_05-epochs.csv",
    index_col=0,
)
display(description_features_cnn_tuned_layer_128.head())
description_features_cnn_tuned_layer_128.describe().loc["std"].value_counts()

In [None]:
ife = ImageFeatureExtractor(construct_from_cnn_backbone=False,
        loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features)

pipeline_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    ('description_transformer', DescriptionTransformer(transformations_df=transformations_df)),
    # Placeholder: filled in the loop below with the description features under evaluation
    None,
    ('drop_petid', ColumnRemover(columns=["PetID", "DescriptionLanguage", "Description"])),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval)),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer())
]

# Position of the placeholder, looked up once (before it is overwritten) instead of
# relying on a hard-coded negative index
description_features_step = pipeline_transformers.index(None)

evaluation_results_columns = ["Model description",
                        "Average fit time", "Average accuracy", "Average QWK",
                        "Single split accuracy", "Single split QWK"]
# One dict per evaluated feature set; the frame is built once after the loop
# (DataFrame.append was removed in pandas 2.0)
evaluation_rows = []


features_dict = {
    "cnn_n_filters-64_kernel_size-3_128-16_layer-16": description_features_cnn_tuned_layer_16,
    "cnn_n_filters-64_kernel_size-3_128-16_layer-128": description_features_cnn_tuned_layer_128
}

for features_desc, features in features_dict.items():
    dfe = DescriptionFeatureExtractor(loaded_features=features)
    pipeline_transformers[description_features_step] = ('DescriptionFeatureExtractor', dfe)

    model = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)
    model_description = f"XGBClassifier, word embeddings {features_desc}"
    print(f"\n\n*************** {model_description} ***************")
    avg_fit_time, avg_accuracy, avg_QWK = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=True,
                display_plots=False)

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows.append({
        "Model description": model_description,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_accuracy,
        "Average QWK": avg_QWK,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

evaluation_results = pd.DataFrame(evaluation_rows, columns=evaluation_results_columns)

# Show the full model descriptions, then restore the default column width
pd.set_option('display.max_colwidth', None)
display(evaluation_results)
pd.set_option('display.max_colwidth', 50)
to_latex(evaluation_results, "evaluation_results_XGBClassifier_cnn_word_embeddings_hyper-params-tuned")

Given the bad results of the CNNs with word embeddings in the single split validation, we will definitely use TF-IDF to extract information from the Description text.

In [None]:
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

tfidf_vectorizer = CustomTfidfVectorizer(ngram_range=(1,2),
                                         svd_n_components=16,
                                         seed=seed)

# Pipeline 6: description text represented through TF-IDF (uni- and bigrams) + SVD
pipeline_6_transformers = [
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    ('description_transformer', DescriptionTransformer(transformations_df=transformations_df)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('drop_petid_desc', ColumnRemover(columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval))
]


xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

# One dict per evaluated model; the frame is built once after the loop
# (DataFrame.append was removed in pandas 2.0)
evaluation_rows_6 = []

for model_desc, model in models.items():
    print(f"--------------------- MODEL: {model_desc} ---------------------")
    avg_fit_time, avg_acc, avg_qwk = evaluate_model(
        Pipeline(steps=pipeline_6_transformers + [('model', model)]),
        cv, X, y, model_type="classification")

    single_accuracy, single_QWK = evaluate_model_single_split(
                Pipeline(steps=pipeline_6_transformers + [('model', model)]),
                X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                model_type="classification", display_results=True)

    evaluation_rows_6.append({
        "Pipeline": 6,
        "Model": model_desc,
        "Average fit time": avg_fit_time,
        "Average accuracy": avg_acc,
        "Average QWK": avg_qwk,
        "Single split accuracy": single_accuracy,
        "Single split QWK": single_QWK
    })

global_evaluation_results_6 = pd.DataFrame(
    evaluation_rows_6,
    columns=["Pipeline", "Model", "Average fit time", "Average accuracy",
             "Average QWK", "Single split accuracy", "Single split QWK"]
)

In [None]:
# Stack the results of the six pipelines into a single table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
global_evaluation_results = pd.concat(
    [global_evaluation_results_1, global_evaluation_results_2,
     global_evaluation_results_3, global_evaluation_results_4,
     global_evaluation_results_5, global_evaluation_results_6],
    ignore_index=True
)
global_evaluation_results.to_csv("global_evaluation_results.csv", index=False)
global_evaluation_results

## Feature importance and subset selection

In [None]:
# Copy the bundled `rdc` sources into the working directory and install them from
# there, so that the `rdc` function can be imported.
!cp -r ../input/tfg-pet-adoption-data/rdc-master/rdc-master/* ./
!python setup.py install
from rdc import rdc

Implemented strategies: ranker (mutual information between each predictor and AdoptionSpeed, without interaction between predictor variables), rf_gini and rf_permutation (importance when fitting a Random Forest, with default parameters, using the Gini split criterion to compute the importances or the OOB importances estimation with permutations, respectively) and mrmr_fcq and mrmr_frq (two versions of MRMR, the first one with Pearson Correlation and the second one with the Randomized Dependence Coefficient, both to measure interaction between predictor variables).

https://scikit-learn.org/stable/modules/feature_selection.html#sequential-feature-selection

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html#sklearn.feature_selection.mutual_info_regression

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif

https://github.com/scikit-learn/scikit-learn/blob/15a949460/sklearn/feature_selection/_sequential.py#L15

https://towardsdatascience.com/feature-selection-how-to-throw-away-95-of-your-data-and-get-95-accuracy-ad41ca016877

https://towardsdatascience.com/top-7-feature-selection-techniques-in-machine-learning-94e08730cd09

https://es.wikipedia.org/wiki/Maldici%C3%B3n_de_la_dimensi%C3%B3n

(Didn't use this, as it was not done on OOB: https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py, https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-importance)

**!!** https://explained.ai/rf-importance/index.html#3, https://github.com/parrt/random-forest-importances

 https://www.stat.berkeley.edu/~breiman/OOBestimation.pdf
 
**(When doing OOB feature importance, some of them are negative: the mean error is smaller after a random permutation! So they are far from being important features)**

https://towardsdatascience.com/mrmr-explained-exactly-how-you-wished-someone-explained-to-you-9cf4ed27458b

https://arxiv.org/pdf/1908.05376.pdf

https://arxiv.org/pdf/1304.7717.pdf

https://github.com/garydoranjr/rdc

**RDC summary in question -->** https://stats.stackexchange.com/questions/161397/about-the-randomized-dependence-coefficient

In [None]:
class FeatureSubsetSelection(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` (or a fraction ``frac`` of the) best-scored columns of a DataFrame.

    Strategies:

    * ``"ranker"``: mutual information between each column and the target.
    * ``"rf_gini"``: ``feature_importances_`` of a RandomForestClassifier.
    * ``"rf_permutation"``: ``rfpimp.oob_importances`` of a RandomForestClassifier
      fitted with ``oob_score=True``.
    * ``"mrmr_fcq"``: greedy selection maximising the F-statistic divided by the mean
      absolute Pearson correlation with the features already selected.
    * ``"mrmr_frq"``: same as ``"mrmr_fcq"`` but using ``rdc`` instead of the correlation.

    Parameters
    ----------
    strategy : str
        One of the strategies above.
    base_numeric_columns : list
        Columns always treated as numeric by the ``"ranker"`` strategy.
    seed : int
        Random state of the mutual information estimation and of the random forests.
    k : int, optional
        Number of features to keep.
    frac : float, optional
        Fraction of the features to keep; when given, it determines ``k`` on every fit.
    debug : bool
        Display rankings / print the selected features.

    Attributes set by ``fit``
    -------------------------
    selected : list
        Names of the selected columns.
    ranking : list of (column, score)
        Sorted by decreasing score (ranker and random forest strategies only).
    k : int
        Number of features actually selected (overwritten when ``frac`` is given).
    """

    _STRATEGIES = ("ranker", "rf_gini", "rf_permutation", "mrmr_fcq", "mrmr_frq")
    # Substrings marking a column as numeric for the "ranker" strategy
    _NUMERIC_NAME_MARKERS = ("_ordinal", "img_", "desc_", "_num", "_mean", "_sum", "_var")
    # Lower bound of the dependence values, so that MRMR never divides by zero
    _MIN_DEPENDENCE = 0.000001

    def __init__(self, strategy, base_numeric_columns, seed, k=None, frac=None,
                 debug=False):
        self.strategy = strategy
        self.base_numeric_columns = base_numeric_columns
        self.seed = seed
        self.k = k
        self.frac = frac
        self.debug = debug


    def _is_numeric_by_name(self, column):
        """Tell whether the column name matches one of the numeric naming patterns."""
        name = str(column)
        if "Breed" in name and "Fur" not in name:
            return True
        return any(marker in name for marker in self._NUMERIC_NAME_MARKERS)


    def _select_top_k(self, scored_features):
        """Store the ranking (decreasing score) and select its first ``k`` features."""
        self.ranking = sorted(scored_features, key=lambda pair: pair[1], reverse=True)

        if self.debug:
            display(self.ranking)

        self.selected = [name for name, _ in self.ranking[:self.k]]


    def ranker(self, X, y):
        """Rank the features by their mutual information with the target."""
        mutual_info = mutual_info_classif(
            X, y,
            discrete_features=self.index_discrete_columns,
            random_state=self.seed)
        self._select_top_k(zip(X.columns, mutual_info))


    def rf_gini(self, X, y):
        """Rank the features by the ``feature_importances_`` of a random forest."""
        rf = RandomForestClassifier(n_jobs=-1, random_state=self.seed)
        rf.fit(X, y)
        self._select_top_k(zip(X.columns, rf.feature_importances_))


    def rf_permutation(self, X, y):
        """Rank the features by the out-of-bag importances computed by rfpimp."""
        rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=self.seed)
        rf.fit(X, y)
        imp = rfpimp.oob_importances(rf, X, y, n_samples=-1)
        # Sorted explicitly (stable sort) rather than relying on the order returned by rfpimp
        self._select_top_k(zip(imp.index, imp["Importance"]))


    def mrmr(self, X, y, correlation):
        """Greedy MRMR selection of ``k`` features.

        The first feature is the one with the highest F-statistic; every following one
        maximises F-statistic / mean dependence with the features already selected,
        where the dependence is the absolute Pearson correlation (``correlation=True``)
        or the value returned by ``rdc`` (``correlation=False``).
        """
        f_test = pd.Series(f_classif(X, y)[0], index=X.columns)
        if self.debug:
            print("F-values:")
            display(f_test)
        # Pearson correlation or Randomized Dependence Coefficient; only the columns of
        # the selected features get filled in
        self.dependence_predictors = pd.DataFrame(
            self._MIN_DEPENDENCE, index=X.columns, columns=X.columns)
        self.selected = []
        not_selected = list(X.columns)

        for i in range(self.k):
            if i == 0:
                feature_best_f_value = f_test.idxmax()
                self.selected.append(feature_best_f_value)
                not_selected.remove(feature_best_f_value)
                if self.debug:
                    print(f"Selected feature {feature_best_f_value} (F-value: {f_test.max()})")
                continue

            # Only the dependence with the feature added last is still missing
            last_added = self.selected[-1]
            if correlation:
                self.dependence_predictors.loc[not_selected, last_added] = \
                        X[not_selected].corrwith(X[last_added]).abs().clip(self._MIN_DEPENDENCE)
            else:
                for var in not_selected:
                    self.dependence_predictors.loc[var, last_added] = \
                            rdc(X[var].to_numpy(), X[last_added].to_numpy())
                self.dependence_predictors.loc[not_selected, last_added] = \
                    self.dependence_predictors.loc[not_selected, last_added].abs().clip(
                        self._MIN_DEPENDENCE)

            redundancy = self.dependence_predictors.loc[not_selected, self.selected].mean(axis=1)
            scores = f_test.loc[not_selected] / redundancy

            best_feature_i = scores.idxmax()
            self.selected.append(best_feature_i)
            not_selected.remove(best_feature_i)

            if self.debug:
                print(f"Selected feature {best_feature_i} ({scores.max()})")


    def fit(self, X, y):
        """Select the features of X according to the configured strategy.

        Raises
        ------
        ValueError
            If both ``k`` and ``frac`` are None, if either is 0, if the strategy is
            unknown, or if the resulting number of features is lower than 1 or greater
            than the number of columns of X.
        """
        if self.k is None and self.frac is None:
            raise ValueError(f"Parameters 'k' and 'frac' cannot be None at the same time")
        if self.k == 0 or self.frac == 0:
            raise ValueError(f"Neither 'k' nor 'frac' can be 0")
        if self.strategy not in self._STRATEGIES:
            raise ValueError(f"{self.strategy} is not a valid FSS strategy")

        n_features = len(X.columns)
        # Resolve the number of features BEFORE validating it: `frac` takes precedence,
        # so a `k` left over from a previous fit must not be checked against this X
        k = round(n_features * self.frac) if self.frac is not None else self.k
        if k < 1:
            raise ValueError(f"No feature would be selected out of {n_features}")
        if k > n_features:
            raise ValueError(f"Parameter 'k' cannot be greater than the total number of features")
        self.k = k

        if self.strategy == "ranker":
            self.numeric_columns = self.base_numeric_columns + \
                [column for column in X.columns if self._is_numeric_by_name(column)]
            numeric_lookup = set(self.numeric_columns)
            self.discrete_columns = [column for column in X.columns
                                     if column not in numeric_lookup]
            discrete_lookup = set(self.discrete_columns)
            self.index_discrete_columns = [i for i, column in enumerate(X.columns)
                                           if column in discrete_lookup]

        if self.k == n_features:
            # Nothing to discard: skip the (possibly expensive) scoring
            self.selected = list(X.columns)
        else:
            select = {
                "ranker": self.ranker,
                "rf_gini": self.rf_gini,
                "rf_permutation": self.rf_permutation,
                "mrmr_fcq": lambda X_, y_: self.mrmr(X_, y_, correlation=True),
                "mrmr_frq": lambda X_, y_: self.mrmr(X_, y_, correlation=False),
            }[self.strategy]
            select(X, y)

        return self


    def transform(self, X, y=None):
        """Return a new DataFrame holding only the selected columns, in selection order."""
        # Selecting with a list already yields a new frame; no need to copy X first
        return X[list(self.selected)]

In [None]:
# Image feature step: configured with already-loaded features
# (construct_from_cnn_backbone=False), so no CNN backbone is requested here.
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

# Text step: configured with uni- and bi-grams and 16 SVD components.
tfidf_vectorizer = CustomTfidfVectorizer(ngram_range=(1,2),
                                         svd_n_components=16,
                                         seed=seed)

# Ordered (name, transformer) steps used to build the Pipelines evaluated in
# this cell. The group headings below are inferred from the step names only.
pipeline_transformers = [
    # --- Replacement of Breed/Color/State columns using the lookup dicts ---
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    # --- Name / breed derived features ---
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    # --- Encoding of categorical / ordinal variables ---
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    # NOTE(review): 46450 is the value passed as impute_nan_value; its origin is not shown here.
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    # --- Description-related steps ---
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    # --- Profile image properties and image features ---
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    # --- Description text vectorisation ---
    ('description_transformer', DescriptionTransformer(transformations_df=transformations_df)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    # --- Clean-up, imputation and scaling ---
    ('drop_petid_desc', ColumnRemover(columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval)),
    # Placeholder slot: the evaluation loop in this cell assigns ('fss', fss) to
    # pipeline_transformers[-1] before each Pipeline is built.
    None
]


# Classifiers compared in the feature-subset-selection (FSS) evaluation.
xgb_classifier = xgb.XGBClassifier(eval_metric='mlogloss', random_state=seed, n_jobs=-1,
                          use_label_encoder=False)

random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)

svc_clf = SVC(break_ties=True, probability=True, random_state=seed)

logistic_regression = LogisticRegression(n_jobs=-1, random_state=seed)

# Display name -> estimator; the display name is what gets stored in the
# "Model" column of the results.
models = {
    "XGBClassifier": xgb_classifier,
    "RandomForestClassifier": random_forest_clf,
    "SVC (rbf kernel)": svc_clf,
    "Logistic Regression": logistic_regression
}

fss_result_columns = ["FSS strategy", "Model", "Fraction of features", "Average fit time",
                      "Average accuracy", "Average QWK", "Single split accuracy",
                      "Single split QWK"]

# One dict per (model, strategy, fraction) combination. Rows are accumulated in
# a plain list because DataFrame.append was removed in pandas 2.0; the frame is
# rebuilt from the list after every combination so that partial results remain
# available in `fss_evaluation_results` if the (long) loop is interrupted.
fss_evaluation_rows = []
fss_evaluation_results = pd.DataFrame(fss_evaluation_rows, columns=fss_result_columns)

fss_strategies = ["ranker", "rf_gini", "rf_permutation", "mrmr_fcq",
                  "mrmr_frq"]

for model_desc, model in models.items():
    for fss_strategy in fss_strategies:
        for frac in [0.2, 0.4, 0.6, 0.8, 1]:
            print(f"--------------------- MODEL: {model_desc}, FSS STRATEGY: {fss_strategy}, FRACTION: {frac} ---------------------")

            fss = FeatureSubsetSelection(
                strategy=fss_strategy,
                base_numeric_columns=numeric_columns_desc_feats_eval,
                seed=seed,
                frac=frac
            )
            # The last slot of the transformer list is reserved for the FSS step.
            pipeline_transformers[-1] = ('fss', fss)

            # Cross-validated evaluation on the full data.
            avg_fit_time, avg_acc, avg_qwk = evaluate_model(
                Pipeline(steps=pipeline_transformers + [('model', model)]),
                cv, X, y, model_type="classification", display_results=False)

            # Evaluation on the fixed train/validation split.
            single_accuracy, single_QWK = evaluate_model_single_split(
                    Pipeline(steps=pipeline_transformers + [('model', model)]),
                    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
                    model_type="classification", display_results=False)

            fss_evaluation_rows.append({
                "Model": model_desc,
                "FSS strategy": fss_strategy,
                "Fraction of features": frac,
                "Average fit time": avg_fit_time,
                "Average accuracy": avg_acc,
                "Average QWK": avg_qwk,
                "Single split accuracy": single_accuracy,
                "Single split QWK": single_QWK
            })
            fss_evaluation_results = pd.DataFrame(fss_evaluation_rows,
                                                  columns=fss_result_columns)

In [None]:
# Replace the in-memory results with a stored copy read from the input dataset.
# The commented-out line is presumably how that copy was exported.
# NOTE(review): to_csv without index=False also writes the index, so the loaded
# frame may carry an extra "Unnamed: 0" column — confirm against the stored file.
# fss_evaluation_results.to_csv("fss_evaluation_results.csv")
fss_evaluation_results = pd.read_csv("../input/tfg-pet-adoption-data/fss_evaluation_results.csv")
fss_evaluation_results

In [None]:
# One row of subplots per model and one column per metric; every subplot shows
# the metric against the fraction of kept features, one line per FSS strategy.
metric_columns = ["Average fit time", "Average accuracy", "Average QWK",
                  "Single split accuracy", "Single split QWK"]

# The grid shape follows the data instead of a hard-coded 4x5; squeeze=False
# keeps `ax` two-dimensional even when there is a single model or metric.
_, ax = plt.subplots(nrows=len(models), ncols=len(metric_columns),
                     figsize=(25,25), squeeze=False)
for i, model_desc in enumerate(models):
    data = fss_evaluation_results.loc[
        fss_evaluation_results["Model"] == model_desc]
    for j, metric in enumerate(metric_columns):
        sns.lineplot(data=data, x="Fraction of features", y=metric,
                     hue="FSS strategy", ax=ax[i,j])
    # The model name is shown once per row, above the middle column.
    ax[i,len(metric_columns) // 2].set_title(model_desc, fontsize=16)
plt.suptitle("FSS evaluation results", fontsize=20, y=0.92)
plt.subplots_adjust(hspace=0.5, wspace=0.3)
plt.show()

## Classification or ordinal regression?

In [None]:
# Image feature step: configured with already-loaded features
# (construct_from_cnn_backbone=False), so no CNN backbone is requested here.
ife = ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

# Text step: configured with uni- and bi-grams and 16 SVD components.
tfidf_vectorizer = CustomTfidfVectorizer(ngram_range=(1,2),
                                         svd_n_components=16,
                                         seed=seed)

# Ordered (name, transformer) steps shared by the regression experiments in
# this section; a ('model', ...) step is appended when each Pipeline is built.
# The group headings below are inferred from the step names only.
pipeline_6_transformers = [
    # --- Replacement of Breed/Color/State columns using the lookup dicts ---
    ('replace_breeds', LeftJoinReplace(values_dict=breeds_dict,
                                      variables=["Breed1", "Breed2"])),
    ('replace_colors', LeftJoinReplace(values_dict=colors_dict,
                                      variables=["Color1", "Color2", "Color3"])),
    ('replace_states', LeftJoinReplace(values_dict=states_dict,
                                     variables=["State"])),
    ('replace_by_strings', FunctionTransformer(func=replace_integers_by_strings)),
    # --- Name / breed derived features ---
    ('has_name', FunctionTransformer(func=has_significant_name)),
    ('pure_breed', FunctionTransformer(func=has_pure_breed)),
    ('breed_matches_fur_length', FunctionTransformer(func=breed_matches_fur_length)),
    ('impute_breed', BreedImputer()),
    ('include_prof_im_metadata', IncludeProfileImageMetadata(profile_image_metadata)),
    ('correct_wrong_type', CorrectWrongType(breeds)),
    # --- Encoding of categorical / ordinal variables ---
    ('encode_breed', BreedEncoding(enc_type="target_and_frequency")),
    ('ordinal_vars_encoder', OrdinalVariableEncoder(columns=["MaturitySize", "FurLength", "Health"],
                                enc_type="ordinal", mapping=ordinal_vars_mapping)),
    # NOTE(review): 46450 is the value passed as impute_nan_value; its origin is not shown here.
    ('state_gdp', ReplaceState(gdp_per_capita=gdp_per_capita, impute_nan_value=46450)),
    ('rescuer_count', ReplaceRescuerID()),
    ('discretizer', CustomDiscretizer(bins_age=-1, quantity=False, fee=False, video_amt=False,
                                     photo_amt=False)),
    # --- Description-related steps ---
    ('description_length', FunctionTransformer(func=include_description_length)),
    ('include_desc_metadata', IncludeDescriptionMetadata(description_metadata=description_metadata)),
    ('correct_desc_language', CorrectDescriptionLanguage()),
    ('one_hot_encoder', CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                            "Color3", "Vaccinated", "Dewormed", "Sterilized", "DescriptionLanguage"])),
    # --- Profile image properties and image features ---
    ('include_prof_im_properties', IncludeProfileImageProperties(profile_image_properties)),
    ('drop_columns', ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio', FunctionTransformer(func=include_aspect_ratio)),
    ('image_features_extractor', ife),
    # --- Description text vectorisation ---
    ('description_transformer', DescriptionTransformer(transformations_df=transformations_df)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    # --- Clean-up, imputation and scaling ---
    ('drop_petid_desc', ColumnRemover(columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover', UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props', CustomIterativeImputer()),
    ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval))
]

Spot the difference between this:

In [None]:
# XGBoost regressor evaluated with the midpoints between consecutive classes
# passed as `coefficients`.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=seed,
    seed=seed,
    n_jobs=-1,
)

_ = evaluate_model(
    Pipeline(steps=[*pipeline_6_transformers, ('model', model)]),
    cv,
    X,
    y,
    model_type="regression",
    coefficients=[0.5, 1.5, 2.5, 3.5],
)

And this?:

In [None]:
# Same regressor and pipeline as the previous cell; only the `coefficients`
# differ (hand-adjusted values instead of the class midpoints).
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=seed,
    seed=seed,
    n_jobs=-1,
)

_ = evaluate_model(
    Pipeline(steps=[*pipeline_6_transformers, ('model', model)]),
    cv,
    X,
    y,
    model_type="regression",
    coefficients=[1.15, 2.15, 2.45, 2.85],
)

Well, in both cases we are training XGBoost with regression as the objective function; then, in order to estimate how accurate we have been, we round the obtained predictions to one of the original class values: 0, 1, 2, 3 or 4. The main difference is that in the first case we round the values in the usual way:

* If it is smaller than or equal to 0.5, then we round to 0
* Between 0.5 and 1.5 (included), 1
* Between 1.5 and 2.5 (included), 2
* Between 2.5 and 3.5 (included), 3
* Greater than 3.5, 4

However, we can see that the results are not very good: as we might expect, the most common outcome is between 2 and 3. If we look at the misses, we can manually adjust the coefficients (for example, few instances that are actually a '3' are predicted as '4', but many that are actually a '4' are predicted as '3'; solution: instead of 3.5, the last coefficient could be smaller, for example 2.85), and this is what we have done in the second execution. **However: manually adjusting coefficients is not a good way to achieve better performance, we can overfit to this data.**

Thus, we can try the following code, which is extracted from this post: https://www.kaggle.com/c/petfinder-adoption-prediction/discussion/76107. This solution is based on the Nelder-Mead optimization: https://codesachin.wordpress.com/2016/01/16/nelder-mead-optimization/

In [None]:
from functools import partial
import scipy as sp

class OptimizedRounder:
    """Map continuous predictions to the classes 0-4 using four tuned cut points.

    ``fit`` searches, with Nelder-Mead, for the four thresholds that maximise
    the quadratic weighted kappa between the true labels and the thresholded
    predictions; ``predict`` applies the thresholds found by ``fit``.

    Attributes set by ``fit``:
        coefficients: array with the four thresholds returned by the optimizer.
    """

    @staticmethod
    def _apply_thresholds(coef, y_pred):
        """Return a copy of ``y_pred`` with every value replaced by its class.

        The conditions are evaluated in the same order as an if/elif chain
        (first match wins), so the result is well defined even if the
        optimizer proposes thresholds that are not sorted. Values matching no
        condition (including NaN) fall into class 4. The output keeps the
        dtype of the copied input.
        """
        preds = np.copy(y_pred)
        conditions = [
            preds < coef[0],
            (preds >= coef[0]) & (preds < coef[1]),
            (preds >= coef[1]) & (preds < coef[2]),
            (preds >= coef[2]) & (preds < coef[3]),
        ]
        labels = np.select(conditions, [0, 1, 2, 3], default=4)
        return labels.astype(preds.dtype)

    def _kappa_loss(self, coef, y, y_pred):
        """Return the negated quadratic weighted kappa for thresholds ``coef``.

        The sign is flipped because the optimizer minimises its objective.
        """
        rounded = self._apply_thresholds(coef, y_pred)
        ll = cohen_kappa_score(y, rounded, weights='quadratic')
        return -ll

    def fit(self, y, y_pred):
        """Find the thresholds that maximise QWK between ``y`` and ``y_pred``.

        Args:
            y: true class labels.
            y_pred: continuous predictions for the same samples.

        The search starts from the midpoints between consecutive classes and
        stores the optimizer's solution in ``self.coefficients``.
        """
        # Imported explicitly: a bare `import scipy` does not guarantee that
        # the `optimize` submodule has been loaded.
        from scipy.optimize import minimize

        initial_coef = [0.5, 1.5, 2.5, 3.5]
        result = minimize(self._kappa_loss, initial_coef, args=(y, y_pred),
                          method='nelder-mead')
        self.coefficients = result['x']

    def predict(self, y_pred):
        """Return ``y_pred`` converted to classes with the fitted thresholds.

        ``fit`` must have been called first so that ``self.coefficients``
        exists. The input is not modified.
        """
        return self._apply_thresholds(self.coefficients, y_pred)

In [None]:
def evaluate_regression_model(model, cv, X, y, display_results=True,
                              display_plot=True):
    """Cross-validate a regressor whose outputs are rounded by ``OptimizedRounder``.

    For every split yielded by ``cv.split(X, y)`` a fresh clone of ``model`` is
    fitted on the training part, an ``OptimizedRounder`` is fitted on the
    training labels and the in-sample predictions, and the held-out part is
    scored: RMSE on the raw predictions; accuracy, quadratic weighted kappa and
    a row-normalised confusion matrix on the rounded predictions.

    Parameters
    ----------
    model : estimator
        Must support ``sklearn.base.clone``, ``fit`` and ``predict``. It is
        cloned for every split, so the passed instance is never fitted.
    cv : splitter
        Object whose ``split(X, y)`` yields positional train/test indices.
    X, y : pandas objects
        Indexed with ``.iloc`` using the indices produced by ``cv``.
    display_results : bool
        Print the per-fold and average metrics plus a classification report
        computed on the predictions pooled over all folds.
    display_plot : bool
        Plot the mean confusion matrix. Only honoured when
        ``display_results`` is true.

    Returns
    -------
    tuple
        ``(mean fit time, mean thresholds, mean RMSE, mean accuracy, mean QWK)``.
        The fit time of a fold covers both the model fit and the rounder fit.
    """
    fit_times = []
    coefficients = []
    rmse_values = []
    accuracy_scores = []
    kappa_scores = []
    confusion_matrices = []

    # Labels pooled over all folds, used for the overall classification report
    original_class = []
    predicted_class = []

    for fold_number, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
        print(f"CV Iteration {fold_number}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        fold_model = clone(model)
        start = time.time()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fold_model.fit(X_train, y_train)

        # Fitting rounder on the in-sample predictions of this fold
        optR = OptimizedRounder()
        optR.fit(y_train, fold_model.predict(X_train))
        coefficients.append(optR.coefficients)

        fit_times.append(time.time() - start)

        raw_pred = fold_model.predict(X_test)

        # sqrt(MSE) rather than `squared=False`, which newer scikit-learn
        # releases no longer accept.
        rmse_values.append(np.sqrt(mean_squared_error(y_test, raw_pred)))
        y_pred = optR.predict(raw_pred)

        original_class.extend(y_test)
        predicted_class.extend(y_pred)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        kappa_scores.append(cohen_kappa_score(y_test, y_pred, weights='quadratic'))
        confusion_matrices.append(confusion_matrix(y_test, y_pred, normalize='true'))

    coefficients = np.array(coefficients)
    mean_coefficients = np.mean(coefficients, axis=0)

    if display_results:
        print("-----------------RESULTS-----------------")
        print(f"Mean fit time: {np.mean(fit_times)} s")
        print("RMSE:", rmse_values)
        print("Average RMSE:", np.mean(rmse_values))
        print("\nCoefficients:", coefficients)
        print("Average Coefficients:", mean_coefficients)
        print("\nAccuracy:", accuracy_scores)
        print("QWK:", kappa_scores)
        print("\nAverage accuracy:", np.mean(accuracy_scores))
        print("Average QWK:", np.mean(kappa_scores))
        print("\nAverage classification report:")
        print(classification_report(original_class, predicted_class))

        if display_plot:
            # Element-wise mean of the per-fold normalised confusion matrices
            disp = ConfusionMatrixDisplay(confusion_matrix=np.mean(confusion_matrices, axis=0))
            plt.style.use('default')
            _, ax = plt.subplots(nrows=1, ncols=1, figsize=(6,5))
            ax.set_title("\nAverage confusion matrix", fontsize='16')
            disp.plot(ax=ax)
            ax.grid(False)

            plt.show()

    return (np.mean(fit_times), mean_coefficients, np.mean(rmse_values),
            np.mean(accuracy_scores), np.mean(kappa_scores))

In [None]:
# XGBoost regressor evaluated with evaluate_regression_model, which fits the
# rounding thresholds per fold instead of taking them as an argument.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=seed,
    seed=seed,
    n_jobs=-1,
)

_ = evaluate_regression_model(
    Pipeline(steps=[*pipeline_6_transformers, ('model', model)]),
    cv,
    X,
    y,
)

In [None]:
# Random forest regressor evaluated the same way as the XGBoost regressor.
model = RandomForestRegressor(
    random_state=seed,
    n_jobs=-1,
)

_ = evaluate_regression_model(
    Pipeline(steps=[*pipeline_6_transformers, ('model', model)]),
    cv,
    X,
    y,
)

As we can see, this is a good way to convert a regression problem into an (ordinal) classification one. However, if we compare these results with the ones obtained with XGBClassifier and RandomForestClassifier, respectively, we can see that both classifiers give a better average accuracy and also a better f1-score for every class than the corresponding regressors. Moreover, since after fitting the regressors we have to search for the optimal coefficients (which are only locally optimal, a disadvantage of the Nelder-Mead optimization), the average fit time increases considerably. It is true that the average QWK is better with the regressors, and this is the main metric, but if we do not have the competition 'mindset', there are sufficient reasons to continue working with classifiers for hyperparameter tuning.