In [1]:
import pandas as pd
from glob import glob
import os
from PIL import Image
import numpy as np
import random
import time
from tqdm.notebook import tqdm

In [2]:
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import log_loss
from sklearn import preprocessing

random.seed(34)

In [3]:
# Create model based on ResNet50
model = ResNet50(weights='imagenet')

In [4]:
def read_dataset(path, sample_size, spliting):
    X = []
    y = []   
    
    image_paths_list = glob(os.path.join(path, 'train', '*.jpg'))
    
    for i in tqdm(range(spliting)):  
        image_paths_sample = random.sample(image_paths_list, sample_size)     
        image_paths_list = list(set(image_paths_list) - set(image_paths_sample))          
        for image_path in image_paths_sample:
            image_name = os.path.basename(image_path)
            image_name_parts = image_name.split('.')
            label = image_name_parts[0] if len(image_name_parts) == 3 else None

            if label:
                y.append(int(label == 'cat'))

            x = image.img_to_array(image.load_img(image_path, target_size=(224, 224)))
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)

            resnet_pred_features = model.predict(x)
            X.append(resnet_pred_features.flatten())
               
    return np.array(X), y

In [5]:
# Get Data
features, target = read_dataset("./data/", 1000, 25)
# Quantity of images
print(len(features), len(target))

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


25000 25000


In [6]:
# Divide the sample into train and test with 70/30 ratio
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.3, train_size = 0.7)

print(np.sum(y_train) / len(y_train), np.sum(y_test) / len(y_test))
len(x_train), len(y_train), len(x_test), len(y_test)

0.4975428571428571 0.5057333333333334


(17500, 17500, 7500, 7500)

In [7]:
# Create XGBClassifier with default parameters
from xgboost import XGBClassifier

clf = XGBClassifier(use_label_encoder=False, verbosity=0)
clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=0)

In [8]:
# Get predictions on the Training and Testing set
train_pred = clf.predict(x_train)
test_pred = clf.predict(x_test)

In [9]:
# Evaluating the results
train_score = accuracy_score(y_train, train_pred)
test_score = accuracy_score(y_test, test_pred)
print('Train score :', train_score, 'Test score :', test_score)

Train score : 1.0 Test score : 0.9854666666666667


In [10]:
parameters = {
    "n_estimators": 65,  
    "learning_rate": 0.35,
}

clf = XGBClassifier(**parameters, use_label_encoder=False)
clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.35, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=65, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [11]:
from sklearn.model_selection import cross_val_score
cross_score = cross_val_score(clf, x_test, y_test, cv=5)
print(cross_score)
print('Avg is: ', cross_score.mean())

[0.98466667 0.982      0.98733333 0.98133333 0.98466667]
Avg is:  0.984


In [12]:
# Making tuning of hyperparameters for XGBoost:
from hyperopt import hp
from hyperopt import fmin, tpe

X = x_train
y = y_train

def xgb_score(params):
    clf = XGBClassifier(**params, use_label_encoder=False)
    # averaging over 3 folds, to reduce the effect of stachosticity
    current_score = cross_val_score(clf, X, y, cv=3).mean()
    #print(current_score, params)
    return -current_score
 

space_params = {
            #constant params   
            'n_estimators': 65,  
            'learning_rate': 0.35,
   
            'booster': 'gbtree',             
            'objective': 'binary:logistic',
            'tree_method': 'auto',
            'verbosity': None, 
            'subsample': 1,
    
            #changable                     
            'colsample_bytree': hp.quniform('colsample_bytree', 0.05, 1, 0.05),
            'colsample_bylevel': hp.quniform('colsample_bylevel', 0.05, 1, 0.05),
            'colsample_bynode': hp.quniform('colsample_bynode', 0.05, 1, 0.05),   
            'gamma': hp.quniform('gamma', 0.5, 1, 0.05),    
            'max_delta_step':  hp.choice('max_delta_step', np.arange(1, 10, dtype=int)),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
            'min_child_weight': hp.choice('min_child_weight', np.arange(1, 10, dtype=int)), 
}

 
best = fmin(xgb_score, space_params, tpe.suggest, 50)
print('best:', best)

100%|███████████████████████████████████████████████| 50/50 [11:37<00:00, 13.95s/trial, best loss: -0.9861142170560978]
best: {'colsample_bylevel': 0.5, 'colsample_bynode': 0.75, 'colsample_bytree': 0.35000000000000003, 'gamma': 0.65, 'max_delta_step': 2, 'max_depth': 11, 'min_child_weight': 4}


In [13]:
const_params = {
            #constant params   
            'n_estimators': 65,  
            'learning_rate': 0.35,
    
            'booster': 'gbtree',             
            'objective': 'binary:logistic',
            'tree_method': 'auto',
            'verbosity': None, 
            'subsample': 1,
}


clf = XGBClassifier(**best, **const_params, use_label_encoder=False)
clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=0.75, colsample_bytree=0.35000000000000003,
              gamma=0.65, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.35, max_delta_step=2,
              max_depth=11, min_child_weight=4, missing=nan,
              monotone_constraints='()', n_estimators=65, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [14]:
# After tuning
cross_score_after_tuning = cross_val_score(clf, x_test, y_test, cv=5)
print(cross_score_after_tuning)
print('Avg is: ', cross_score_after_tuning.mean())

[0.98533333 0.982      0.98866667 0.97866667 0.98466667]
Avg is:  0.9838666666666667


In [15]:
# Feature Importances
with open("./data/features_names.txt", "r") as file:
    contents = file.readlines()

for i in range(len(contents)):
    contents[i] = contents[i].strip().replace('\n', '').replace('\r', '')
    
important_feature = list(zip(contents, clf.feature_importances_))
important_feature.sort(key=lambda i: i[1], reverse=True)

In [16]:
# Top 20
for i in range(20):
    print(important_feature[i])

('lynx, catamount', 0.12864493)
('tabby, tabby cat', 0.07782079)
('plastic bag', 0.06644847)
('radiator', 0.039156254)
('Egyptian cat', 0.037676707)
('snow leopard, ounce, Panthera uncia', 0.03181163)
('Siamese cat, Siamese', 0.022429878)
('Greater Swiss Mountain dog', 0.022037087)
('Labrador retriever', 0.021235107)
('Irish wolfhound', 0.020156775)
('hamper', 0.01725069)
('bloodhound, sleuthhound', 0.013627452)
('boxer', 0.011964951)
('Persian cat', 0.010890979)
('Irish terrier', 0.0107342275)
('basenji', 0.010054439)
('Border terrier', 0.0080370195)
('carton', 0.0077191396)
('skunk, polecat, wood pussy', 0.0066036815)
('Gila monster, Heloderma suspectum', 0.006286415)
