In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
import time
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [36]:
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
label = pd.read_pickle('label_full.pkl')

In [105]:
photos = []
folder = 'train_set/images/'
# enumerate files in the directory
for file in os.listdir(folder):
    photo = tf.keras.preprocessing.image.load_img(folder + file, target_size = (96,96))
    photo = tf.keras.preprocessing.image.img_to_array(photo)
    photos.append(photo)
# convert to numpy arrays
photos = np.asarray(photos)
labels = np.asarray(label)
print(photos.shape, labels.shape)
# save the reshaped photos
# save('resized_image.npy',photos)

(3000, 96, 96, 3) (3000,)


In [106]:
# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(photos, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=66)

bool_train_labels = y_train != 0

# form np arrays of labels
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

In [126]:
# Create convolutional base
prior = keras.applications.VGG16(
    include_top=False, 
    weights='imagenet',
    input_shape=(96,96, 3)
)
model = Sequential()
model.add(prior)
model.add(layers.Conv2D(64, kernel_size=(3, 3), input_shape=(96, 96, 3), activation='relu', padding='same'))
model.add(layers.Conv2D(64, kernel_size=(3, 3), input_shape=(96, 96, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu', name='Dense_Intermediate'))
model.add(layers.Dropout(0.1, name='Dropout_Regularization'))
model.add(layers.Dense(1, activation='sigmoid', name='Output'))

model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 3, 3, 512)         14714688  
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 3, 3, 64)          294976    
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 1, 64)          0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 64)                0         
_________________________________________________________________
Dense_Intermediate (Dense)   (None, 32)                2080      
_________________________________________________________________
Dropout_Regularization (Drop (None, 32)              

In [127]:
# Freeze the VGG16 model, e.g. do not train any of its weights.
# We will just use it as-is.
for cnn_block_layer in (model.layers)[:19]:
    cnn_block_layer.trainable = False


METRICS = [
           keras.metrics.BinaryAccuracy(name='accuracy'),
           keras.metrics.AUC(name='auc'),
           keras.metrics.Precision(name='precision'),
           keras.metrics.Recall(name='recall')
]

model.compile(
    optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics= METRICS
)

In [128]:
# Finally we fit the model. I use two callbacks here: EarlyStopping,
# which stops the model short of its full 20 epochs if validation 
# performance consistently gets worse; and ReduceLROnPlateau, which 
# reduces the learning rate 10x at a time when it detects model 
# performance is no longer improving between epochs.
#

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', 
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True)

# Recall that our dataset is highly imbalanced. We deal with this
# problem by generating class weights and passing them to the model
# at training time. The model will use the class weights to adjust
# how it trains so that each class is considered equally important to
# get right, even if the actual distribution of images is highly 
# variable.
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train) 
class_weight = {0: class_weights[0], 1: class_weights[1]}

# I found that a batch size of 128 offers the best trade-off between
# model training time and batch volatility.

batch_size = 128


In [129]:
# fit the model.

model.fit(X_train, y_train, 
          steps_per_epoch=len(X_train) // batch_size,
          validation_data=(X_val, y_val),
          epochs=20,
          validation_steps=len(X_val) // batch_size,
          class_weight=class_weight,
          callbacks=[early_stopping,
          tf.keras.callbacks.ReduceLROnPlateau(patience=2)]
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f9e41473390>

In [130]:
# on test set
pred_bal = model.evaluate(X_test, y_test)
print('Loss on balanced test set is {:.2f}'.format(pred_bal[0]))
print('Accuracy on balanced test set is {:.2f}'.format(pred_bal[1]))
print('AUC on balanced test set is {:.2f}'.format(pred_bal[2]))

Loss on balanced test set is 2.09
Accuracy on balanced test set is 0.75
AUC on balanced test set is 0.53
