In [38]:
#Baseline small CNN for project Milestone
import pandas as pd
import cv2
import os
import numpy as np
from tqdm import tqdm
import os
import gc
from glob import glob
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Keras libraries
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization

from keras.callbacks import Callback, EarlyStopping
from keras import backend

from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint
import h5py


In [39]:
# Load the training CSV. Later cells read its 'tags' column (space-separated
# tag strings) and unpack every row as a (file name, tags) pair.
df_train = pd.read_csv('train_v2.csv')

In [43]:
# referred to https://www.kaggle.com/anokas/simple-keras-starter for help reading data and setting up basic Keras model
x = []
x_test = []
y = []


flatten = lambda l: [item for sublist in l for item in sublist]
# Sort the unique tags so the label -> index mapping is the same on every
# kernel restart. Plain set iteration order for strings depends on hash
# randomization, which would silently change the meaning of each output unit
# (and of any weights saved from an earlier session).
labels = sorted(set(flatten([l.split(' ') for l in df_train['tags'].values])))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

for f, tags in tqdm(df_train.values, miniters=1000):
    img_path = '/home/joerj/train-jpg/train-jpg/{}.jpg'.format(f)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # fail here with the path instead of crashing inside cv2.resize.
        raise FileNotFoundError('Could not read training image: {}'.format(img_path))
    # Multi-hot target vector: one slot per known tag (not a hard-coded 17).
    targets = np.zeros(len(labels))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x.append(cv2.resize(img, (40, 40)))
    y.append(targets)

100%|██████████| 40479/40479 [03:43<00:00, 180.76it/s]


In [41]:
#Flip images - but need to keep separate from validation and training 
#for i in range(len(x)):
#    x.append(np.fliplr(x[i]))

In [44]:
# Stack the per-image target vectors into one unsigned 8-bit array.
y_train = np.array(y, dtype=np.uint8)
# Stack the resized images as float16 and divide by 255
# (presumably scaling 8-bit pixel values into [0, 1] - confirm image dtype).
x_train = np.array(x, dtype=np.float16) / 255.

In [45]:
#Create model class - model outline sourced from here: https://github.com/EKami/planet-amazon-deforestation

class LossHistory(Callback):
    """Keras callback that records the loss values reported at each epoch end.

    Attributes are plain lists that grow by one entry per epoch:
    ``train_losses`` holds ``logs['loss']`` and ``val_losses`` holds
    ``logs['val_loss']`` (``None`` is stored when a key is absent).
    """

    def __init__(self):
        super().__init__()
        self.train_losses = []
        self.val_losses = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's 'loss' and 'val_loss' entries from ``logs``."""
        # Avoid a shared mutable default argument; treat a missing logs dict
        # as empty so .get() simply yields None.
        logs = logs or {}
        self.train_losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

class AmazonClassifier:
    """Builder/trainer wrapper around a Keras ``Sequential`` model.

    Layers are appended through the ``add_*`` methods in the order they are
    called; ``train_model`` compiles and fits the resulting network and
    reports an F-beta (beta=2) score on a held-out validation split.
    """

    def __init__(self):
        self.losses = []
        self.classifier = Sequential()

    def _add_conv_block(self, f, p):
        """Append conv(3x3, same) -> conv(3x3) -> 2x2 max-pool -> dropout(p)."""
        self.classifier.add(Conv2D(f, kernel_size=(3, 3),
                                   padding='same',
                                   activation='relu'))
        self.classifier.add(Conv2D(f, (3, 3), activation='relu'))
        self.classifier.add(MaxPooling2D(pool_size=(2, 2)))
        self.classifier.add(Dropout(p))

    def add_conv_layer_init(self, img_size=(32, 32), c=3, f=32, p=.25):
        """Append the input stage: batch-norm on the raw input, then a conv block.

        :param img_size: two-element image size used for the input shape
        :param c: number of channels appended to ``img_size`` in the input shape
        :param f: number of filters for both convolutions
        :param p: dropout rate applied after pooling
        """
        self.classifier.add(BatchNormalization(input_shape=(*img_size, c)))
        self._add_conv_block(f, p)

    def add_conv_layer_mid(self, img_size=(32, 32), c=3, f=32, p=.25):
        """Append a further conv block with ``f`` filters and dropout rate ``p``.

        ``img_size`` and ``c`` are not used here; they are kept only so the
        signature stays parallel to ``add_conv_layer_init``.
        """
        self._add_conv_block(f, p)

    def _get_fbeta_score(self, classifier, X_valid, y_valid, threshold=0.2):
        """Return the sample-averaged F-beta (beta=2) score on the given data.

        Predictions strictly greater than ``threshold`` count as positive.
        """
        p_valid = classifier.predict(X_valid)
        return fbeta_score(y_valid, np.array(p_valid) > threshold, beta=2, average='samples')

    def add_flatten_layer(self):
        """Append a ``Flatten`` layer."""
        self.classifier.add(Flatten())

    def add_dense_layer(self, output_size=17, p=0.5):
        """Append the classification head.

        Dense(512, relu) -> batch-norm -> dropout(p) -> Dense(output_size, sigmoid).

        :param output_size: number of sigmoid output units
        :param p: dropout rate applied before the output layer
        """
        self.classifier.add(Dense(512, activation='relu'))
        self.classifier.add(BatchNormalization())
        # Honour the caller-supplied rate (it used to be fixed at 0.5, so the
        # ``p`` argument had no effect).
        self.classifier.add(Dropout(p))
        self.classifier.add(Dense(output_size, activation='sigmoid'))

    def train_model(self, x_train, y_train, learn_rate=0.001, epoch=5, batch_size=128, validation_split_size=0.2, train_callbacks=()):
        """Compile and fit the model, holding out part of the data for validation.

        :param x_train: input samples
        :param y_train: target vectors aligned with ``x_train``
        :param learn_rate: learning rate handed to the Adam optimizer
        :param epoch: maximum number of epochs passed to ``fit``
        :param batch_size: batch size passed to ``fit``
        :param validation_split_size: forwarded as ``test_size`` to ``train_test_split``
        :param train_callbacks: extra Keras callbacks run during ``fit``
        :return: ``[train_losses, val_losses, fbeta]`` where the two loss lists
                 hold one entry per completed epoch and ``fbeta`` is the score
                 computed on the validation split
        """
        history = LossHistory()

        X_train, X_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                              test_size=validation_split_size)

        opt = Adam(lr=learn_rate)

        # Compiling on every call installs a fresh optimizer; the layer
        # weights of self.classifier are whatever earlier calls left behind.
        self.classifier.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

        # early stopping will auto-stop training process if model stops learning after 3 epochs
        earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

        self.classifier.fit(X_train, y_train,
                            batch_size=batch_size,
                            epochs=epoch,
                            verbose=1,
                            validation_data=(X_valid, y_valid),
                            callbacks=[history, *train_callbacks, earlyStopping])
        # Named ``score`` so the imported sklearn ``fbeta_score`` is not shadowed.
        score = self._get_fbeta_score(self.classifier, X_valid, y_valid)
        return [history.train_losses, history.val_losses, score]

    def save_weights(self, weight_file_path):
        """Write the model weights to ``weight_file_path``."""
        self.classifier.save_weights(weight_file_path)

    def load_weights(self, weight_file_path):
        """Load model weights from ``weight_file_path``."""
        self.classifier.load_weights(weight_file_path)

    def predict(self, x_test):
        """Return the model's raw predictions for ``x_test``."""
        predictions = self.classifier.predict(x_test)
        return predictions

    def map_predictions(self, predictions, labels_map, thresholds):
        """
        Return the predictions mapped to their labels
        :param predictions: the predictions from the predict() method
        :param labels_map: mapping from output index to label
        :param thresholds: per-class thresholds; a class is kept when its
                           predicted value is strictly greater than its threshold
        :return: one list of labels per prediction
        """
        return [
            [labels_map[i] for i, value in enumerate(prediction) if value > thresholds[i]]
            for prediction in predictions
        ]

    def close(self):
        """Clear the Keras backend session."""
        backend.clear_session()

In [46]:
# Training schedule adapted from ekami - works pretty well.
# NOTE: one `classifier` instance is reused for every (learn_rate, epochs)
# pair, so each train_model call continues from the weights left by the
# previous call. The scores are therefore cumulative, not independent
# grid-search results.

filepath = "weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)

batch_size = 16
# Passed through to train_test_split as test_size; an integer there is an
# absolute number of validation samples.
validation_split_size = 5000
classifier = AmazonClassifier()
# Take the input size from the data so the model always matches x_train.
classifier.add_conv_layer_init(img_size=x_train.shape[1:3])
classifier.add_conv_layer_mid(f=64)
classifier.add_conv_layer_mid(f=128)
classifier.add_flatten_layer()
classifier.add_dense_layer()

train_losses, val_losses, scores_list = [], [], []

epochs_arr = [5, 10]
# A wider sweep of [0.001, 0.0001, 0.00001] was assigned here and then
# immediately overwritten; only the rate actually in effect is kept.
learn_rates = [0.00001]
for learn_rate in learn_rates:
    for epochs in epochs_arr:
        tmp_train_losses, tmp_val_losses, score = classifier.train_model(
            x_train, y_train,
            learn_rate=learn_rate,
            epoch=epochs,
            batch_size=batch_size,
            validation_split_size=validation_split_size,
            train_callbacks=[checkpoint])
        train_losses += tmp_train_losses
        val_losses += tmp_val_losses
        scores_list.append(score)

Train on 35479 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 35479 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 35479 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 35479 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Train on 35479 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 35479 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [47]:
# Display the scores collected from each train_model call (F-beta, beta=2,
# computed on that call's validation split).
scores_list

[0.88234807134939619,
 0.90041064609876198,
 0.91140191131703252,
 0.91251613410072496,
 0.91906292405795087,
 0.91561545143151457]

In [12]:
# Load the test set for prediction
# TODO: implement best-threshold selection?
test_jpeg_dir = '/home/joerj/test-jpg/test-jpg'
# Must match the size the training images were resized to.
img_resize = (40, 40)
x_test = []
x_test_filename = []

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

# Enumerate the files that actually exist in the test directory. Iterating
# over df_train here would look up *training* file names under the test path.
for img_path in tqdm(sorted(glob(os.path.join(test_jpeg_dir, '*.jpg'))), miniters=1000):
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None when the file cannot be decoded.
        raise IOError('Could not read test image: {}'.format(img_path))
    x_test.append(cv2.resize(img, img_resize))
    # Keep the bare file name (no directory, no extension) aligned with x_test.
    x_test_filename.append(os.path.splitext(os.path.basename(img_path))[0])


[0.88278592403758316,
 0.89380226479423941,
 0.91335860719395501,
 0.91403794224670998,
 0.91967140438118244,
 0.91611982708544204]

In [14]:
# Try to improve via random hyperparameter search
validation_split_size = 5000
num_experiments = 3

# Accumulate across ALL experiments. Resetting these inside the loop would
# discard every result except the last one.
train_losses, val_losses, scores_list = [], [], []

for i in range(num_experiments):
    p_conv = np.random.uniform(low=0.2, high=0.3)
    p_all = np.random.uniform(low=0.4, high=0.6)
    # np.random.choice returns a numpy integer; hand Keras a plain int.
    batch_size = int(np.random.choice((64, 128)))

    # Fresh model per experiment so configurations are evaluated independently.
    classifier = AmazonClassifier()
    # Input size comes from the data; the 32x32 default does not match x_train.
    classifier.add_conv_layer_init(img_size=x_train.shape[1:3], f=32, p=p_conv)
    classifier.add_conv_layer_mid(f=64, p=p_conv)
    classifier.add_conv_layer_mid(f=128, p=p_conv)
    classifier.add_flatten_layer()
    classifier.add_dense_layer(p=p_all)

    epochs = int(np.random.choice((5, 10)))
    # Sample the learning rate log-uniformly between 1e-5 and 1e-3.
    learn_rate = 10**(np.random.uniform(low=-5, high=-3))

    print(p_conv, p_all, batch_size, epochs, learn_rate)
    tmp_train_losses, tmp_val_losses, score = classifier.train_model(
        x_train, y_train,
        learn_rate=learn_rate,
        epoch=epochs,
        batch_size=batch_size,
        validation_split_size=validation_split_size,
        train_callbacks=[checkpoint])
    train_losses += tmp_train_losses
    val_losses += tmp_val_losses
    scores_list.append(score)

0.21665956302257908 0.592042238360719 128 5 1.9527295540425954e-06
Train on 35479 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.29499934314735166 0.5882289849203507 64 10 9.520549465625149e-05
Train on 35479 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.20420354768817378 0.5333605950445897 64 10 6.586971472684133e-06
Train on 35479 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
