In [3]:
#Baseline small CNN for project Milestone
import pandas as pd
import cv2
import os
import numpy as np
from tqdm import tqdm
import os
import gc
from glob import glob
from sklearn.metrics import fbeta_score
import sklearn.metrics 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import csv

# Keras libraries
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization

from keras.callbacks import Callback, EarlyStopping
from keras import backend

from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint
import h5py


In [4]:
df_train = pd.read_csv('/home/joerj/train_v2.csv')

In [5]:
# referred to https://www.kaggle.com/anokas/simple-keras-starter for help reading data and setting up basic Keras model
x = []
x_test = []
y = []


flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))
labels.sort()

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('/home/joerj/train-jpg/train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x.append(cv2.resize(img, (32, 32)))
    y.append(targets)

100%|██████████| 40479/40479 [07:56<00:00, 84.87it/s]


In [7]:
split = 35000

x_train, x_valid, y_train, y_valid = x[:split], x[split:], y[:split], y[split:]

mean_image = np.mean(x_train, axis=0)
x_train -= mean_image
x_valid -= mean_image
x_train /= 128.
x_valid /= 128.

y_train = np.array(y_train, np.uint8)
x_train = np.array(x_train, np.float16)
y_valid = np.array(y_valid, np.uint8)
x_valid = np.array(x_valid, np.float16)


In [54]:
x = [1, 2, 3, 4, 5]
x[:4]

[1, 2, 3, 4]

In [8]:
#Create model class - model outline sourced from here: https://github.com/EKami/planet-amazon-deforestation

class LossHistory(Callback):
    def __init__(self):
        super().__init__()
        self.train_losses = []
        self.val_losses = []

    def on_epoch_end(self, epoch, logs={}):
        self.train_losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

class AmazonClassifier:
    def __init__(self):
        self.losses = []
        self.classifier = Sequential()
    def add_conv_layer_init(self, img_size=(32, 32), c=3, f = 32, p = .25):
        self.classifier.add(BatchNormalization(input_shape=(*img_size, c)))
        self.classifier.add(Conv2D(f, kernel_size=(3, 3),
                         padding = 'same',
                         activation='relu'))
        self.classifier.add(Conv2D(f, (3, 3), activation='relu', padding = 'same'))        
        self.classifier.add(MaxPooling2D(pool_size=(2, 2)))
        self.classifier.add(Dropout(p))
        
    def add_conv_layer_mid(self, img_size=(32, 32), c=3, f = 32, p = .25):
        self.classifier.add(Conv2D(f, kernel_size=(3, 3),
                         padding = 'same',
                         activation='relu'))
        self.classifier.add(Conv2D(f, (3, 3), activation='relu', padding = 'same' ))        
        self.classifier.add(MaxPooling2D(pool_size=(2, 2)))
        self.classifier.add(Dropout(p))

    def _get_fbeta_score(self, classifier, X_valid, y_valid):
        p_valid = classifier.predict(X_valid)
        return fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2, average='samples')

    def add_flatten_layer(self):
        self.classifier.add(Flatten())

    def add_dense_layer(self, output_size = 17, p = 0.5):
        self.classifier.add(Dense(512, activation='relu'))
        self.classifier.add(BatchNormalization())
        self.classifier.add(Dropout(0.5))
        self.classifier.add(Dense(output_size, activation='sigmoid'))
        
    def train_model(self, x_train, y_train, learn_rate=0.001, epoch=5, batch_size=128, validation_split_size=0.2, train_callbacks=()):
        history = LossHistory()

        X_train, X_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                              test_size=validation_split_size, random_state = 1234)

        opt = Adam(lr=learn_rate)

        self.classifier.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])


        # early stopping will auto-stop training process if model stops learning after 3 epochs
        earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

        self.classifier.fit(X_train, y_train,
                            batch_size=batch_size,
                            epochs=epoch,
                            verbose=1,
                            validation_data=(X_valid, y_valid),
                            callbacks=[history, *train_callbacks, earlyStopping])
        fbeta_score = self._get_fbeta_score(self.classifier, X_valid, y_valid)
        return [history.train_losses, history.val_losses, fbeta_score]
    
    def save_weights(self, weight_file_path):
        self.classifier.save_weights(weight_file_path)

    def load_weights(self, weight_file_path):
        self.classifier.load_weights(weight_file_path)

    def predict(self, x_test):
        predictions = self.classifier.predict(x_test)
        return predictions

    def map_predictions(self, predictions, labels_map, thresholds):
        """
        Return the predictions mapped to their labels
        :param predictions: the predictions from the predict() method
        :param labels_map: the map
        :param thresholds: The threshold of each class to be considered as existing or not existing
        :return: the predictions list mapped to their labels
        """
        predictions_labels = []
        for prediction in predictions:
            labels = [labels_map[i] for i, value in enumerate(prediction) if value > thresholds[i]]
            predictions_labels.append(labels)

        return predictions_labels

    def close(self):
        backend.clear_session()

In [9]:
#Grid search from ekami - works pretty well

filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)


p_conv = 0.29592314649019363
p_all = 0.5619647177889551
scale = 1.134238146597395

#Use best parameters
batch_size = 128
validation_split_size = .20
classifier = AmazonClassifier()
classifier.add_conv_layer_init(f = 32, p = p_conv)
classifier.add_conv_layer_mid(f = 64, p = p_conv)
classifier.add_conv_layer_mid(f = 128, p = p_conv)
classifier.add_flatten_layer()
classifier.add_dense_layer(p = p_all)

train_losses, val_losses, scores_list = [], [], []

epochs_arr = [5, 10]
learn_rates = [0.001 * scale, 0.0001 * scale, 0.00001 * scale]

for learn_rate in learn_rates:
    for epochs in epochs_arr:
        tmp_train_losses, tmp_val_losses, score = classifier.train_model(x_train, y_train, learn_rate, epochs, 
                                                                               batch_size, validation_split_size=validation_split_size, 
                                                                               train_callbacks=[checkpoint])
        train_losses += tmp_train_losses
        val_losses += tmp_val_losses
        scores_list.append(score)

Train on 28000 samples, validate on 7000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 28000 samples, validate on 7000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 28000 samples, validate on 7000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 28000 samples, validate on 7000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Train on 28000 samples, validate on 7000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 28000 samples, validate on 7000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [10]:
#best: loss: 0.1026 - acc: 0.9598Epoch 00001: val_acc improved from 0.95800 to 0.9581
# best training: 0.9597
#save error over epochs 
np.savetxt("error_CNN.csv", np.vstack((val_losses, train_losses)), fmt='%.18e', delimiter=',')
np.savetxt("scores_CNN.csv", scores_list, fmt='%.18e', delimiter=',')

In [11]:
# Save model predictions for ensemble with CNN-8
p_valid = classifier.predict(x_valid)

#np.save("CNN_predict.npy", p_valid, allow_pickle=True, fix_imports=True)
#np.save("target_validation.npy", y_valid, allow_pickle=True, fix_imports=True)

In [24]:
score = fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2, average='samples')
predicted = np.array(p_valid) > 0.2

(5479, 17)

In [71]:
for i in range(46 ,50):
    if sum(y_valid[i,:] != predicted[i,:]) > 3:
        print(35000 + i, sum(y_valid[i,:] != predicted[i,:]))
        print(y_valid[i,:])
        print(predicted[i,:])
        print(labels)
        
#35000 is labelled agriculture, clear, cultivation, and primary
#labelled - agriculture, cultivation, habitation, partly cloudy, primary, road

35049 4
[1 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1]
[ True False False False False False False False  True  True False  True
  True  True False False False]
['agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze', 'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water']


In [42]:
for i in range(17):
    print(labels[i], " & ", np.round_( np.sum(y_valid[:,i] == predicted[:,i]) / y_valid.shape[0], 3), "&", 
           np.round_( fbeta_score(y_valid[:, i], predicted[:,i], beta=2), 4), "\\\\")

agriculture  &  0.862 & 0.8827 \\
artisinal_mine  &  0.995 & 0.6904 \\
bare_ground  &  0.969 & 0.3053 \\
blooming  &  0.993 & 0.0633 \\
blow_down  &  0.998 & 0.0 \\
clear  &  0.939 & 0.975 \\
cloudy  &  0.976 & 0.8705 \\
conventional_mine  &  0.998 & 0.0 \\
cultivation  &  0.873 & 0.6543 \\
habitation  &  0.901 & 0.5886 \\
haze  &  0.951 & 0.781 \\
partly_cloudy  &  0.962 & 0.9293 \\
primary  &  0.964 & 0.9903 \\
road  &  0.86 & 0.7995 \\
selective_logging  &  0.991 & 0.1269 \\
slash_burn  &  0.993 & 0.0 \\
water  &  0.865 & 0.7144 \\


  'precision', 'predicted', average, warn_for)


In [51]:
#Predict on the test set 
# Implemenet best threshold selection? 
x = []
x_test = []
y = []
df_test = pd.read_csv('/home/joerj/sample_submission_v2.csv')

for f in tqdm(df_test.image_name, miniters=1000):
    if('test' in f):
        img = cv2.imread('/home/joerj/test-jpg/test-jpg/' + f + '.jpg')
    else: 
        img = cv2.imread('/home/joerj/test-jpg-additional/test-jpg-additional/' + f + '.jpg')
    x.append(cv2.resize(img, (32, 32)))

100%|██████████| 61191/61191 [08:17<00:00, 123.00it/s]


In [64]:
x_test = x[:30000] - mean_image
x_test /= 128
x_test = np.array(x_test, np.float16)
#Predict on the test set 
p_test = classifier.predict(x_test)
test_pred = np.array(p_test) > 0.2
test_pred = pd.DataFrame(test_pred, columns = labels)

preds = []
for i in tqdm(range(test_pred.shape[0]), miniters=1000):
    a = test_pred.ix[[i]]
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
100%|██████████| 30000/30000 [00:31<00:00, 939.88it/s]


In [65]:
x_test = x[30000:] - mean_image
x_test /= 128
x_test = np.array(x_test, np.float16)
#Predict on the test set 
p_test = classifier.predict(x_test)
test_pred = np.array(p_test) > 0.2
test_pred = pd.DataFrame(test_pred, columns = labels)

for i in tqdm(range(test_pred.shape[0]), miniters=1000):
    a = test_pred.ix[[i]]
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))
    

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
100%|██████████| 31191/31191 [00:32<00:00, 946.24it/s]


In [66]:
df_test = pd.read_csv('/home/joerj/sample_submission_v2.csv')
df_test['tags'] = preds
df_test.to_csv('submission.csv', index=False)

In [15]:
# Try to improve via Random hyperparameter search 
validation_split_size = 5000
num_experiments = 7

best_p_conv = -1
best_batch_size = -1
best_lr = -1 
best_batch = -1
best_s = -1
for i in range(num_experiments):         
    p_conv = np.random.uniform(low = 0.2, high = 0.3)
    p_all = np.random.uniform(low = 0.4, high = 0.6)
    batch_size = np.random.choice((64, 128))
    scale = np.random.uniform(low = 0.5, high = 1.5)
    learn_rates = [0.001 * scale, 0.0001 * scale, 0.00001 * scale]
    
    classifier = AmazonClassifier()
    classifier.add_conv_layer_init(f = 32, p = p_conv)
    classifier.add_conv_layer_mid(f = 64, p = p_conv)
    classifier.add_conv_layer_mid(f = 128, p = p_conv)
    classifier.add_flatten_layer()
    classifier.add_dense_layer(p = p_all)

    filepath="weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
    train_losses, val_losses, scores_list = [], [], []

    epochs_arr = [5, 10]

    for learn_rate in learn_rates:
        for epochs in epochs_arr:
            tmp_train_losses, tmp_val_losses, score = classifier.train_model(x_train, y_train, learn_rate, epochs, 
                                                                                   batch_size, validation_split_size=validation_split_size, 
                                                                                   train_callbacks=[checkpoint])
            train_losses += tmp_train_losses
            val_losses += tmp_val_losses
            scores_list.append(score)
    
    s = max(scores_list)
    if(s > best_s):
        best_p_conv = p_conv
        best_p_all = p_all        
        best_lr_scale = scale
        best_batch_size = batch_size
        best_s = s

Train on 30000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 30000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Train on 30000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 30000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Train on 30000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 30000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 30000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 30000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Train on 30000 samples

In [17]:
print(best_s)
print(best_p_conv)
print(best_p_all  )
print(best_lr_scale )
print(best_batch_size )


0.909633109922
0.29592314649019363
0.5619647177889551
1.134238146597395
128
