# Deep Learning: Dogs vs Cats Analysis

In [55]:
%matplotlib inline
import math
import numpy as np
import utils; reload(utils)
from utils import *

from keras.models import Sequential
from keras.layers import Lambda, Dense
from keras import backend as K
from matplotlib import pyplot as plt

## Workflow for each analysis type (e.g. Default Parameters, Backpropagation...):

1. Create model
2. Train it with the default "Learning Rate" of 0.01 for just 1 epoch, so we can see how quickly the accuracy increases.
3. Increase the "Learning Rate" to 0.1 and train the model between 4 and 12 epochs.
4. Decrease the "Learning Rate" to 0.01 and train the model 4 epochs.
5. Decrease the "Learning Rate" to 0.001 and train the model 2 epochs.
6. ...

All of these steps are approximate. You should play around with the output to see how the accuracy is reacting.

# Run the following cells in order to set up the environment

In [56]:
# Seed NumPy's global random number generator so that NumPy-driven randomness
# is repeatable between runs.
# NOTE(review): this seeds NumPy only; whether the Keras backend draws from a
# separate generator is not visible here -- confirm before expecting identical runs.
np.random.seed(1)

In [57]:
# Root directory of the dataset; it is prepended to 'train', 'valid', 'test',
# 'files/' and 'models/' below. Type 'sample/' if you want to work on a smaller dataset.
path = ''
# Number of images per batch, used by the generators and by model.fit.
# Depending on your GPU you should change this. For a GTX 970 this is a good value.
batch_size = 64

In [58]:
# Suffix appended to every saved weights file and submission file, so that the
# files of one run can be told apart from those of another.
timestamp = '010108012017'

In [59]:
# Directories used for saved files: weights go to files_path, pre-computed
# convolutional features go to models_path.
files_path = path + 'files/'
models_path = path + 'models/'

In [60]:
def load_batches(path, augmentation=False):
    """
    Build the train, validation and test image generators.

    Args:
        path: Dataset root; images are read from path + 'train',
            path + 'valid' and path + 'test'.
        augmentation: When True the *training* images are randomly rotated,
            shifted, sheared and zoomed. Validation and test images are never
            augmented, so metrics and predictions describe the real images.

    Returns:
        A (train_batches, val_batches, test_batches) tuple of directory
        iterators yielding 224x224 images in batches of the global
        `batch_size`. Only the training iterator shuffles; the validation and
        test iterators do not, so rows returned by predict_generator stay
        aligned with the iterator's `.filenames` and `.classes`.
    """
    plain_gen = image.ImageDataGenerator()
    if augmentation:
        # NOTE(review): shift and zoom ranges of 0.8 are unusually aggressive
        # (a shift of up to 80% of the image); confirm they are not meant to be 0.08.
        train_gen = image.ImageDataGenerator(rotation_range=8, width_shift_range=0.8, shear_range=0.3,
                                             height_shift_range=0.8, zoom_range=0.8)
    else:
        train_gen = plain_gen

    def flow(gen, subdir, shuffle):
        # All three iterators share the same image size, label mode and batch size.
        return gen.flow_from_directory(path + subdir, target_size=(224,224),
                                       class_mode='categorical', shuffle=shuffle,
                                       batch_size=batch_size)

    train_batches = flow(train_gen, 'train', True)
    val_batches = flow(plain_gen, 'valid', False)
    test_batches = flow(plain_gen, 'test', False)
    return train_batches, val_batches, test_batches

In [61]:
def finetune(model, num_classes=None):
    """
    Replace the last layer of `model` with a fresh softmax Dense layer.

    The last layer is popped, every remaining layer is frozen
    (trainable=False) and a new Dense softmax layer is appended, so only the
    new layer is trained. This is useful when using a pre-trained model like VGG.

    Args:
        model: Model to modify in place.
        num_classes: Size of the new output layer. When omitted, the class
            count of the global `train_batches` is used, which therefore has
            to be loaded before calling this function.
    """
    if num_classes is None:
        # Backward-compatible default: fall back to the notebook-level generator.
        num_classes = train_batches.nb_class
    model.pop()
    for layer in model.layers:
        layer.trainable = False
    model.add(Dense(num_classes, activation='softmax'))

In [62]:
def backpropagation(model):
    """
    Make the dense part of `model` trainable and freeze everything before it.

    Every layer is frozen first; then the first Dense layer and all layers
    after it are switched back to trainable. Layers located before the first
    Dense layer (the convolutional part) therefore stay frozen.
    """
    all_layers = model.layers
    for frozen in all_layers:
        frozen.trainable = False
    # Positions of all Dense layers; the first one marks where training starts.
    dense_positions = [pos for pos, candidate in enumerate(all_layers)
                       if type(candidate) is Dense]
    start = dense_positions[0]
    for unfrozen in all_layers[start:]:
        unfrozen.trainable = True

In [63]:
def save_weights(model, path, name, timestamp):
    """
    Save the weights of `model` to '<path><name>_<timestamp>.h5'.

    Args:
        model: Object exposing save_weights(filename).
        path: Directory prefix, including the trailing separator.
        name: Base name of the weights file.
        timestamp: Suffix that identifies the run.
    """
    # Build the file name once so the message and the file written cannot diverge.
    weights_file = '{}{}_{}.h5'.format(path, name, timestamp)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Saving weights: {}'.format(weights_file))
    model.save_weights(weights_file)

In [64]:
def load_weights(model, path, name, timestamp):
    """
    Load the weights stored in '<path><name>_<timestamp>.h5' into `model`.

    Args:
        model: Object exposing load_weights(filename).
        path: Directory prefix, including the trailing separator.
        name: Base name of the weights file.
        timestamp: Suffix that identifies the run.
    """
    # Build the file name once so the message and the file read cannot diverge.
    weights_file = '{}{}_{}.h5'.format(path, name, timestamp)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Loading weights: {}'.format(weights_file))
    model.load_weights(weights_file)

In [65]:
def train_model(model, train_batches, val_batches, rules, name, timestamp, optimizer_algorithm=Adam):
    """
    Train `model` following a learning-rate schedule, saving weights after every epoch.

    Args:
        model: Model to compile and train.
        train_batches: Generator with the training data.
        val_batches: Generator with the validation data.
        rules: Schedule as a sequence of (learning_rate, epochs) pairs, e.g.
                (
                    (0.01, 3),
                    (0.1, 2),
                    ...
                )
            A single (learning_rate, epochs) pair is accepted as well.
        name: Base name for the weights files; the learning rate and the
            epoch number are appended to it.
        timestamp: Suffix passed on to save_weights.
        optimizer_algorithm: Optimizer class, instantiated as
            optimizer_algorithm(lr=learning_rate) for every schedule entry.
    """
    # ((0.01, 1)) is just the pair (0.01, 1) -- the outer parentheses do not
    # create a tuple -- so a lone pair is treated as a one-stage schedule
    # instead of failing when the loop tries to unpack a number.
    if isinstance(rules, (tuple, list)) and len(rules) == 2 and isinstance(rules[0], (int, float)):
        rules = (rules,)

    for lr, epochs in rules:
        # Re-compiling is how the new learning rate is applied for this stage.
        model.compile(optimizer=optimizer_algorithm(lr=lr),
                      loss='categorical_crossentropy', metrics=['accuracy'])

        # One fit_generator call per epoch, so weights can be saved after each epoch.
        for epoch in range(1, epochs + 1):
            print('Lr: {}, Epoch: {}'.format(lr, epoch))
            model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample,
                                nb_epoch=1, validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
            save_weights(model, files_path, '{}_lr{}_epoch{}'.format(name, lr, epoch), timestamp)

In [66]:
def split_conv_fc(model):
    """
    Cut the layer list of `model` right after its last Convolution2D layer.

    Returns:
        (conv_layers, fc_layers): the layers up to and including the last
        Convolution2D layer, and all the layers that follow it.
    """
    all_layers = model.layers
    conv_positions = [pos for pos, candidate in enumerate(all_layers)
                      if type(candidate) is Convolution2D]
    # First index that belongs to the second (non-convolutional) part.
    cut = conv_positions[-1] + 1
    return all_layers[:cut], all_layers[cut:]

# Default Parameters (VGG16)

In [67]:
# Base name for the weights files written in this section.
name = 'default_parameter_vgg16'

In [68]:
# Fresh Vgg16 wrapper (defined outside this notebook); its model is vgg.model.
vgg = Vgg16()

In [69]:
# Train/validation/test generators without data augmentation.
train_batches, val_batches, test_batches = load_batches(path)

Found 23000 images belonging to 2 classes.
Found 2000 images belonging to 2 classes.
Found 12500 images belonging to 1 classes.


In [70]:
# Swap the last layer for a softmax layer sized for our classes and freeze the rest.
finetune(vgg.model)

In [None]:
# Learning-rate schedule as (learning rate, epochs) pairs; weights are saved
# after every epoch under name + '_lastlayer'.
train_model(vgg.model, train_batches, val_batches, ((0.01, 2), (0.1, 1), (0.001, 4), (0.0001, 2)), name + '_lastlayer', timestamp)

In [None]:
# Save the final weights of this section as files_path + name + '_' + timestamp + '.h5'.
save_weights(vgg.model, files_path, name, timestamp)

In [None]:
# Reload those weights; lets you resume from here without re-training.
load_weights(vgg.model, files_path, name, timestamp)

# Data Augmentation (VGG16)

In [None]:
# Base name for the weights files written in this section.
name = 'data_augmentation_vgg16'

In [None]:
# Start again from a fresh Vgg16 wrapper so earlier sections do not leak in.
vgg = Vgg16()

In [None]:
# Same generators as before, but built with augmentation switched on.
train_batches, val_batches, test_batches = load_batches(path, augmentation=True)

In [None]:
# Swap the last layer for a softmax layer sized for our classes and freeze the rest.
finetune(vgg.model)

In [None]:
# One epoch per learning rate; weights are saved after every epoch under
# name + '_lastlayer'.
train_model(vgg.model, train_batches, val_batches, ((0.01, 1), (0.1, 1), (0.001, 1), (0.0001, 1)), name + '_lastlayer', timestamp)

In [None]:
# Save the final weights of this section as files_path + name + '_' + timestamp + '.h5'.
save_weights(vgg.model, files_path, name, timestamp)

In [None]:
# Reload those weights; lets you resume from here without re-training.
load_weights(vgg.model, files_path, name, timestamp)

# Backpropagation - Only Dense Layers (VGG16)

In [None]:
# Base name for the weights files written in this section.
name = 'backpropagation_vgg16'

In [None]:
# Start again from a fresh Vgg16 wrapper so earlier sections do not leak in.
vgg = Vgg16()

In [None]:
# Train/validation/test generators without data augmentation.
train_batches, val_batches, test_batches = load_batches(path)

In [None]:
# Swap the last layer for a softmax layer sized for our classes and freeze the rest.
finetune(vgg.model)

In [None]:
# We first need to train the last layer.
# The trailing comma matters: ((0.01, 1)) is just the pair (0.01, 1), whereas
# train_model expects a sequence of (learning rate, epochs) pairs.
train_model(vgg.model, train_batches, val_batches, ((0.01, 1),), name + '_lastlayer', timestamp)

In [None]:
# Make the first Dense layer and everything after it trainable; earlier layers stay frozen.
backpropagation(vgg.model)

In [None]:
# For the Backpropagation the Learning rate should be quite small.
# Weights are saved after every epoch under name + '_denselayers'.
train_model(vgg.model, train_batches, val_batches, ((0.001, 1), (0.0001, 1)), name + '_denselayers', timestamp)

In [None]:
# Save the final weights of this section as files_path + name + '_' + timestamp + '.h5'.
save_weights(vgg.model, files_path, name, timestamp)

In [None]:
# Reload those weights; lets you resume from here without re-training.
load_weights(vgg.model, files_path, name, timestamp)

# Data Augmentation + Backpropagation (VGG16)

In [None]:
# Base name for the weights files written in this section.
name = 'data_augmentation_backpropagation_vgg16'

In [None]:
# Start again from a fresh Vgg16 wrapper so earlier sections do not leak in.
vgg = Vgg16()

In [None]:
# Generators built with augmentation switched on.
train_batches, val_batches, test_batches = load_batches(path, augmentation=True)

In [None]:
# Swap the last layer for a softmax layer sized for our classes and freeze the rest.
finetune(vgg.model)

In [None]:
# Train the new last layer first (one epoch at learning rate 0.01).
train_model(vgg.model, train_batches, val_batches, ((0.01, 1),), name + '_lastlayer', timestamp)

In [None]:
# Make the first Dense layer and everything after it trainable; earlier layers stay frozen.
backpropagation(vgg.model)

In [None]:
# NOTE(review): the previous section states that the learning rate for
# backpropagation should be quite small, yet this schedule starts at 0.01 and
# goes up to 0.1 -- confirm that this is intended.
train_model(vgg.model, train_batches, val_batches, ((0.01, 1), (0.1, 1), (0.001, 1), (0.0001, 1)), name + '_denselayers', timestamp)

In [None]:
# Save the final weights of this section as files_path + name + '_' + timestamp + '.h5'.
save_weights(vgg.model, files_path, name, timestamp)

In [None]:
# Reload those weights; lets you resume from here without re-training.
load_weights(vgg.model, files_path, name, timestamp)

# Remove Dropout (VGG16)

In [None]:
# Base name for the weights files written in this section.
name = 'remove_dropout_vgg16'

In [None]:
# Start again from a fresh Vgg16 wrapper so earlier sections do not leak in.
vgg = Vgg16()

In [None]:
# NOTE(review): finetune reads the global train_batches, which at this point
# still comes from an earlier section -- on a fresh kernel run the
# load_batches cell further down first.
finetune(vgg.model)

In [None]:
# Start from the weights saved by the "Default Parameters" section.
load_weights(vgg.model, files_path, 'default_parameter_vgg16', timestamp)

In [None]:
# conv_layers: everything up to and including the last Convolution2D layer;
# fc_layers: the layers after it.
conv_layers, fc_layers = split_conv_fc(vgg.model)

In [None]:
# Model made of the convolutional layers only, used to pre-compute features.
conv_model = Sequential(conv_layers)

In [None]:
# Train/validation/test generators without data augmentation.
train_batches, val_batches, test_batches = load_batches(path)

In [47]:
# predict_generator returns one row per image in the order the generator yields
# them, while `.classes` lists the labels in a fixed order. The training
# generator returned by load_batches shuffles, so dedicated generators with
# shuffle=False are used here to keep every feature row aligned with its label.
feature_gen = image.ImageDataGenerator()
ordered_train_batches = feature_gen.flow_from_directory(path + 'train', target_size=(224,224),
                        class_mode='categorical', shuffle=False, batch_size=batch_size)
ordered_val_batches = feature_gen.flow_from_directory(path + 'valid', target_size=(224,224),
                      class_mode='categorical', shuffle=False, batch_size=batch_size)

# Output of the convolutional layers for every training / validation image.
train_features = conv_model.predict_generator(ordered_train_batches, ordered_train_batches.nb_sample)
val_features = conv_model.predict_generator(ordered_val_batches, ordered_val_batches.nb_sample)

# Integer class ids and their one-hot encoding, in the same order as the features.
val_classes = ordered_val_batches.classes
train_classes = ordered_train_batches.classes
val_labels = onehot(val_classes)
train_labels = onehot(train_classes)

In [48]:
# Store the pre-computed convolutional features under models_path so they do
# not have to be computed again.
save_array(models_path + 'train_convlayer_features.bc', train_features)
save_array(models_path + 'valid_convlayer_features.bc', val_features)

In [49]:
# Read the stored features back; run this cell instead of the two above when
# the files already exist.
train_features = load_array(models_path+'train_convlayer_features.bc')
val_features = load_array(models_path+'valid_convlayer_features.bc')

In [50]:
# Used when copying the weights from the pre-trained model.
# NB: Since we're removing dropout, we want to halve the weights.
def proc_wgts(layer):
    """Return the weights of `layer` as a list, each entry divided by 2."""
    halved = []
    for weights in layer.get_weights():
        halved.append(weights / 2)
    return halved

In [51]:
def get_fc_model():
    """
    Build the fully connected part of the network with dropout set to 0.

    The model takes the output of the last layer in the global `conv_layers`
    as input. Its layers are paired with the global `fc_layers`, and every
    layer receives the corresponding weights after they went through
    proc_wgts. The model is returned compiled with RMSprop and categorical
    cross-entropy.
    """
    stack = [MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]), Flatten()]
    # Two identical Dense + Dropout pairs; a dropout rate of 0. disables dropout.
    for _ in range(2):
        stack.append(Dense(4096, activation='relu'))
        stack.append(Dropout(0.))
    stack.append(Dense(2, activation='softmax'))
    fc_model = Sequential(stack)

    # Copy the processed weights layer by layer from the original dense part.
    for target, source in zip(fc_model.layers, fc_layers):
        target.set_weights(proc_wgts(source))

    fc_model.compile(optimizer=RMSprop(lr=0.00001, rho=0.7),
                     loss='categorical_crossentropy', metrics=['accuracy'])
    return fc_model

In [52]:
# Train the dropout-free dense model directly on the pre-computed convolutional
# features, so the convolutional layers do not have to be run again each epoch.
model = get_fc_model()
model.fit(train_features, train_labels, nb_epoch=8, 
          batch_size=batch_size, validation_data=(val_features, val_labels))

Train on 40 samples, validate on 20 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fa0e3644050>

In [None]:
save_weights(vgg.model, files_path, name, timestamp)

In [None]:
load_weights(vgg.model, files_path, name, timestamp)

# Viewing model prediction examples

* A few correct labels at random
* A few incorrect labels at random
* The most correct labels of each class (ie those with highest probability that are correct)
* The most incorrect labels of each class (ie those with highest probability that are incorrect)
* The most uncertain labels (ie those with probability closest to 0.5).

In [None]:
# Run the model on the validation set; vgg.test hands back the batches it used
# together with one row of class probabilities per image.
val_batches, probs = vgg.test(path + 'valid', batch_size = batch_size)

# File names and true class ids of the validation images.
# NOTE(review): everything below assumes the rows of `probs` follow the same
# order as these two arrays -- confirm that vgg.test does not shuffle.
filenames = val_batches.filenames
expected_labels = val_batches.classes #0 or 1

#Round our predictions to 0/1 to generate labels
# Column 0 is treated as the probability of class 0 (cats in the cells below),
# so 1 - p rounds to label 0 when p > 0.5 and to label 1 otherwise.
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)

In [None]:
from keras.preprocessing import image

# Helper to plot images by their index in the validation set.
# `plots` is a helper function in utils.py.
def plots_idx(idx, titles=None):
    """Load the validation images at positions `idx` of `filenames` and plot them."""
    selected = []
    for i in idx:
        selected.append(image.load_img(path + 'valid/' + filenames[i]))
    plots(selected, titles=titles)

# How many images each visualization task shows
n_view = 4

In [None]:
#1. A few correct labels at random
# Indices where the predicted label matches the true label.
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
# Random order, then keep the first n_view; titles show the class-0 probability.
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
#2. A few incorrect labels at random
# Indices where the predicted label differs from the true label.
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

In [None]:
#3a. The images we were most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print "Found %d confident correct cats labels" % len(correct_cats)
# Descending sort on the class-0 probability: highest cat confidence first.
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

In [None]:
#3b. The images we were most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print "Found %d confident correct dogs labels" % len(correct_dogs)
# Ascending sort on the class-0 probability: lowest cat probability first.
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

In [None]:
#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
# Only plot when there is at least one such image.
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

In [None]:
#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
# Only plot when there is at least one such image.
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

In [None]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
# Sort by distance from 0.5, smallest distance (most uncertain) first.
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
# Pass exactly one title per plotted image, as the other visualization cells do.
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain[:n_view]])

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Counts of true label (first argument) versus predicted label (second argument)
# on the validation set.
cm = confusion_matrix(expected_labels, our_labels)

In [None]:
# plot_confusion_matrix is not defined in this notebook (presumably it comes
# from utils); class_indices supplies the class names for the plot.
plot_confusion_matrix(cm, val_batches.class_indices)

# Predict Test set + create Kaggle submission file

In [None]:
# predict_generator returns one row per image in the order the generator yields
# them. The ids are read from test_batches.filenames further down, so the
# generator must not shuffle or every prediction ends up next to the wrong id.
test_batches = image.ImageDataGenerator().flow_from_directory(path + 'test', target_size=(224,224),
               class_mode='categorical', shuffle=False, batch_size=batch_size)
predictions = vgg.model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
# Column 1 is used as the probability that the image is a dog.
isdog = predictions[:,1]
# A quick look at the predictions: the first five, those close to 0.5, and
# those that are exactly 0 or 1.
print "Raw Predictions: " + str(isdog[:5])
print "Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)])
print "Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)])

In [None]:
# Keep every probability inside [0.05, 0.95], because log loss penalizes
# predictions that are confident and wrong.
isdog = isdog.clip(min=0.05, max=0.95)

In [None]:
#Extract imageIds from the filenames in our test/unknown directory
import os

# A dedicated name keeps the global `filenames` (the validation file names that
# plots_idx reads) intact, so the visualization cells still work afterwards.
test_filenames = test_batches.filenames

# The id is the base file name without its extension; this does not depend on
# the length of the sub-folder name or of the extension.
ids = np.array([int(os.path.splitext(os.path.basename(f))[0]) for f in test_filenames])

In [None]:
# Two columns per row: image id and clipped dog probability.
subm = np.stack([ids,isdog], axis=1)
# Show the first rows as a sanity check.
subm[:5]

In [None]:
# NOTE(review): the alternative section below writes to the same file name, so
# it overwrites this file.
submission_file_name = 'submission_{}_5.csv'.format(timestamp)
# Integer id, probability with 5 decimals; comments='' keeps the header line
# free of a leading comment marker.
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
from IPython.display import FileLink
# Render a link to the submission file.
FileLink(submission_file_name)

# Alternative way to generate Submission file (it has a better score!)

In [None]:
def write_submission_csv(submission_file_name, data, columns):
    """
    Write data according to the Kaggle submission format.

    Args:
        submission_file_name: Path of the CSV file to create (overwritten if it exists).
        data: Mapping of image id to predicted value.
        columns: Header row, e.g. ['id', 'label'].

    The rows are written in ascending key order, so the same data always
    produces the same file.
    """
    # Imported here so the function does not depend on a later cell having
    # imported these modules first.
    import csv
    import sys

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f = open(submission_file_name, 'wb')
    else:
        f = open(submission_file_name, 'w', newline='')
    with f:
        w = csv.writer(f)
        w.writerow(columns)
        for key in sorted(data):
            w.writerow([key, data[key]])

In [None]:
test_batches, predictions = vgg.test(path+'test', batch_size=batch_size)

In [None]:
import csv
d = {}
submission_file_name = 'submission_{}_5.csv'.format(timestamp)
for idx, filename in enumerate(test_batches.filenames):
    # We only want the ID, so remove the folder name and file extension.
    result = int(filename[8:-4])
    # We use a trick to never show 0 or 1, but 0.05 and 0.95.
    # This is required becase log loss penalizes predictions that are confident and wrong.
    d[result] = predictions[idx][1].clip(min=0.05, max=0.95)
write_submission_csv(submission_file_name, d, ['id', 'label'])

In [None]:
from IPython.display import FileLink
FileLink(submission_file_name)