# Statefarm Kaggle submission (fast.ai homework3)
What I'll need to do:
* set up data structure into sample, train, valid, test
* Import VGG16
* pop the top layer, train it
* set all fully connected layers to trainable
* Improvements:
    * play with dropout parameter
    * add data augmentation
    * stack multiple versions of the classifier
    * apply batch norm
    * have a setup that adjusts learning rate
    
These are general imports, always make sure to run these

In [1]:
import os
import zipfile
import shutil
import csv
import bcolz
os.environ["KERAS_BACKEND"] = "theano"
import keras
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

# Where the pretrained VGG16 weights come from and where they get cached:
# the file name is appended to the base URL when downloading, and the
# cache sub-directory is handed to Keras' `get_file`.
model_name = "vgg16.h5"
model_url = "http://files.fast.ai/models/"
cache_dir = "models"

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


# Data structure
First we set up the data structure, with proper:
* sample (about 10% of the data) 
* train
* validation (about 15% of the data)
* test 
directories in the `processed` directory

In [None]:
raw_path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
processed_path = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')

# One sub-directory per class: c0 ... c9.
class_dirs = ['c' + str(class_id) for class_id in range(10)]


def _extract_zip(zip_path, dest_dir):
    """Extract every member of the archive at `zip_path` into `dest_dir`."""
    # The context manager closes the archive even when extraction fails.
    with zipfile.ZipFile(zip_path, 'r') as zip_handle:
        zip_handle.extractall(dest_dir)


def _transfer_driver_images(rows, driver_names, src_root, dest_root, transfer):
    """Apply `transfer(src, dest)` to every image belonging to the given drivers.

    rows         -- 2-D array of CSV rows, read as (driver, class, filename)
    driver_names -- drivers whose images should be transferred
    src_root     -- directory holding one sub-directory per class to read from
    dest_root    -- directory holding one sub-directory per class to write to
    transfer     -- callable taking (src, dest), e.g. shutil.copyfile / shutil.move
    """
    for driver_name in driver_names:
        driver_rows = rows[rows[:, 0] == driver_name]
        for class_dir in class_dirs:
            class_rows = driver_rows[driver_rows[:, 1] == class_dir]
            for filename in class_rows[:, 2]:
                transfer(os.path.join(src_root, class_dir, filename),
                         os.path.join(dest_root, class_dir, filename))


# The existence of the sample directory is used as the "already done" marker.
if os.path.exists(os.path.join(processed_path, 'sample')):
    print('Sample directory already exists, no need to do data structuring!')
else:
    full_train_dir = os.path.join(processed_path, 'train')
    full_valid_dir = os.path.join(processed_path, 'valid')
    sample_train_dir = os.path.join(processed_path, 'sample', 'train')
    sample_valid_dir = os.path.join(processed_path, 'sample', 'valid')

    # Extract the Kaggle zipfiles BEFORE creating the marker directory, so a
    # failed extraction (e.g. a missing zip) does not make the next run skip
    # this whole step. The code below expects the images to end up in
    # <processed>/train/<class>/<filename>.
    print('Extracting zips, this may take a while...')
    _extract_zip(os.path.join(raw_path, 'imgs.zip'), processed_path)
    _extract_zip(os.path.join(raw_path, 'driver_imgs_list.csv.zip'), processed_path)
    print('Done extracting zips!')

    # Create sample/train, sample/valid and valid, each with one folder per class.
    for new_root in (sample_train_dir, sample_valid_dir, full_valid_dir):
        for class_dir in class_dirs:
            os.makedirs(os.path.join(new_root, class_dir))

    # Read the image list and drop the header row.
    data = np.genfromtxt(os.path.join(processed_path, 'driver_imgs_list.csv'), delimiter=',', dtype=None)
    data = data[1:, :]
    drivers = np.unique(data[:, 0])
    num_drivers = drivers.shape[0]

    # The splits are made per driver, not per image: all images of a chosen
    # driver go to the same set. NOTE: the draws are unseeded, so every fresh
    # run produces a different split.
    # 15% of the drivers are copied into the sample set.
    sample_drivers_amount = int(np.floor(num_drivers*0.15))
    sample_drivers = np.random.choice(drivers, sample_drivers_amount, replace=False)

    # 20% of the drivers are moved into the validation set.
    validation_drivers_amount = int(np.floor(num_drivers*0.2))
    validation_drivers = np.random.choice(drivers, validation_drivers_amount, replace=False)

    # Set up the sample set. This must happen before the validation move
    # below: the two driver selections are independent and may overlap, and
    # the sample images are copied out of the full train directory.
    _transfer_driver_images(data, sample_drivers, full_train_dir, sample_train_dir, shutil.copyfile)

    # Now move ~40% of the sample drivers from sample/train to sample/valid.
    sample_drivers_validation_amount = int(np.floor(sample_drivers_amount*0.4))
    sample_drivers_validation = np.random.choice(sample_drivers,
                                                 sample_drivers_validation_amount,
                                                 replace=False)
    _transfer_driver_images(data, sample_drivers_validation, sample_train_dir, sample_valid_dir, shutil.move)

    # Set up the validation set by moving its drivers out of the train directory.
    _transfer_driver_images(data, validation_drivers, full_train_dir, full_valid_dir, shutil.move)

# VGG16() setup boilerplate

In [2]:
def add_conv_block(model, layers, filters):
    """Append one VGG-style convolutional block to `model` and return it.

    The block is `layers` repetitions of (1-pixel zero padding, 3x3 ReLU
    convolution with `filters` filters), followed by a single 2x2 max-pooling
    layer with stride 2. `model` is modified in place.
    """
    for _ in range(layers):
        for new_layer in (ZeroPadding2D((1,1)),
                          Convolution2D(filters, 3, 3, activation='relu')):
            model.add(new_layer)
    pooling = MaxPooling2D((2,2), strides=(2,2))
    model.add(pooling)
    return model
    
def add_fc_block(model, dropout):
    """Append a 4096-unit ReLU dense layer plus dropout to `model` and return it.

    `dropout` is passed straight to the `Dropout` layer. `model` is modified
    in place.
    """
    for new_layer in (Dense(4096, activation='relu'), Dropout(dropout)):
        model.add(new_layer)
    return model

In [3]:
class vgg16():
    """VGG16 network built layer by layer and initialised with pretrained weights.

    After construction the Keras `Sequential` model is available as
    `self.model`; it ends in a 1000-way softmax layer.

    dropout -- dropout rate used after each of the two 4096-unit dense layers
    """
    def __init__(self, dropout=0.5):
        # Per-channel means subtracted from the input, shaped (3, 1, 1) so they
        # broadcast over the height and width of channels-first images.
        # (These are the usual VGG means, listed in R, G, B order.)
        self.vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])
        self.create(dropout)

    @staticmethod
    def vgg_preprocess(x, mean):
        """Subtract the per-channel `mean` and reverse the channel order.

        x    -- batch of channels-first images, i.e. (batch, channel, height, width)
        mean -- nested list (or array) broadcastable against `x`

        `mean` is received as a plain list and converted here, and only `np`
        is referenced, so the function stays self-contained when Keras
        serialises the Lambda layer.
        """
        mean = np.array(mean)
        x = x - mean
        # Reverse axis 1 (channels): RGB -> BGR. The previous `x[:,:,::-1]`
        # reversed axis 2 (height), i.e. flipped every image upside down and
        # left the channels in RGB order.
        return x[:, ::-1]

    def create(self, dropout):
        """Build the network into `self.model` and load the pretrained weights."""
        model = self.model = Sequential()

        # NOTE(review): VGG16 is normally fed 224x224 images. 244 also ends up
        # as a 7x7 map after the five 2x2 poolings, so the pretrained dense
        # weights still fit, and the generators in this notebook use 244 too.
        # Confirm whether 224 was intended before changing it anywhere.
        model.add(Lambda(self.vgg_preprocess,
                         input_shape=(3, 244, 244),
                         output_shape=(3, 244, 244),
                         arguments = {'mean': self.vgg_mean.tolist()}
                        ))

        model = add_conv_block(model, 2, 64)
        model = add_conv_block(model, 2, 128)
        model = add_conv_block(model, 3, 256)
        model = add_conv_block(model, 3, 512)
        model = add_conv_block(model, 3, 512)

        model.add(Flatten())

        model = add_fc_block(model, dropout)
        model = add_fc_block(model, dropout)
        model.add(Dense(1000, activation='softmax'))

        # `load_weights` works in place; its return value is not a model, so
        # it must not be assigned back to `model`.
        model.load_weights(get_file(model_name, model_url+model_name, cache_subdir=cache_dir))

# Load in data with generators
Here I set up the generators for the training and validation work

In [4]:
# Set DEBUG to False to train on the full data set instead of the sample.
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
# Plain truthiness test with an `else`, so path/batch_size/epochs are always
# bound whatever value DEBUG holds.
if DEBUG:
    # Small sample set, tiny batches, few epochs: quick end-to-end check.
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5

train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
# Both generators read class-per-folder directories and resize to 244x244,
# the input size the model in this notebook is built with.
train_batches = ImageDataGenerator().flow_from_directory(train_path,
                                                         target_size=(244,244),
                                                         batch_size=batch_size,
                                                         shuffle=True)
val_batches = ImageDataGenerator().flow_from_directory(val_path,
                                                       target_size=(244,244),
                                                       batch_size=batch_size,
                                                       shuffle=True)

Found 1419 images belonging to 10 classes.
Found 651 images belonging to 10 classes.


# Finetuning the model
* Now the top layer must be popped and replaced with a 10-output softmax layer, matching the one-hot encoding of our ten classes
* Then retrain model with new dense layer, which will be a good starting point for later fine tuning
* Save the model, so that we can start toying with it in the next section

In [5]:
lr = 0.001

# Build the pretrained network and remove its final 1000-way softmax layer.
model = vgg16(dropout=0.5).model
model.pop()

# Freeze every remaining layer; the layer added next is the only one that
# is left trainable.
for layer in model.layers:
    layer.trainable = False

model.add(Dense(10, activation='softmax'))
model.compile(optimizer=Adam(lr),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# One pass over every training image per epoch, validated on the full
# validation generator.
model.fit_generator(train_batches,
                    samples_per_epoch=train_batches.nb_sample,
                    nb_epoch=epochs,
                    validation_data=val_batches,
                    nb_val_samples=val_batches.nb_sample)

# Persist the result so later cells can start from it.
new_top_model_path = os.path.join(model_dir, 'model_with_new_top.h5')
model.save(new_top_model_path)

Epoch 1/2
Epoch 2/2


# New model architecture
Now that we have the trained model, we should probably make all the FC layers trainable. Additionally, we can start playing with:
* learning rate schedule
* batchnorm
* data augmentation
* setting different epochs
* some other kind of regularisation?

First, import the model from when we saved it. Then:
* Separate convolutional layers from fully connected ones
* Make a new fully connected architecture with whatever we want to implement
* Put them together
* Train

In [7]:
# Reload the network that was saved after training the new 10-way top layer.
old_model_path = os.path.join(os.getcwd(), os.pardir, 'models', 'model_with_new_top.h5')
old_model = load_model(old_model_path)

## Batch normalisation
Let's implement batch normalisation first; it should speed up the search for a suitable learning rate. Following [this link](https://github.com/fchollet/keras/issues/1802), `BatchNormalization()` is applied after the activation.

In [8]:
# Position of the first Flatten layer in the saved model: everything before
# it belongs to the convolutional part.
flatten_positions = [position
                     for position, layer in enumerate(old_model.layers)
                     if type(layer).__name__ == 'Flatten']
flatten_index = flatten_positions[0]

# Skip layer 0 and stop one layer short of the Flatten, so the layer directly
# in front of it (the last max-pooling) is left out; `fc_model` starts with
# its own MaxPooling2D instead.
conv_stop = flatten_index - 1
conv_model_layers = old_model.layers[1:conv_stop]
conv_model = Sequential(conv_model_layers)

def fc_model(dropout):
    """Build the fully connected head that sits on top of `conv_model`.

    The head takes the output of the last `conv_model` layer, applies max
    pooling and flattening, then two (Dense 128 ReLU -> BatchNormalization ->
    Dropout) stages, and finishes with a 10-way softmax.

    dropout -- dropout rate used in both stages
    Returns the (uncompiled) Sequential model.
    """
    # Input shape without the batch dimension.
    conv_output_shape = conv_model.layers[-1].output_shape[1:]

    head = Sequential()
    # If Keras complains about a duplicate layer name here, giving this
    # pooling layer an explicit name (e.g. 'maxpool_appended') was the
    # workaround noted earlier.
    head.add(MaxPooling2D(input_shape=conv_output_shape))
    head.add(Flatten())

    # Two identical stages; batch norm comes after the activation.
    for _ in range(2):
        head.add(Dense(128, activation='relu'))
        head.add(BatchNormalization())
        head.add(Dropout(dropout))

    head.add(Dense(10, activation='softmax'))
    return head


## Data augmentation
Let's set up new batch generators, this time making use of augmented data. Remember, we only seek to augment our **training** input; there is no need to augment the validation input (no learning takes place on it).
`shuffle` is set to `False` on every generator because we're going to save their outputs to disk, so the order must be reproducible and line up with the labels.

### Some minor debug settings

In [28]:
# Set DEBUG to True to work on the small sample set instead of the full data.
DEBUG = False
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
# Plain truthiness test with an `else`, so path/batch_size/epochs are always
# bound whatever value DEBUG holds.
if DEBUG:
    # Small sample set, tiny batches, few epochs: quick end-to-end check.
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5

### Conv stack output
Using only the convolutional part of VGG16, I generate the predictions, based on some augmented data, and save it to disk.

In [None]:
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')

# Number of passes made over the augmented training generator. The labels
# further down are repeated once per augmented pass plus once for the
# un-augmented pass, so the two have to stay in step.
n_augmented_passes = 2

# Augmentation is applied to training images only.
train_image_gen = ImageDataGenerator(rotation_range=15,
                                     height_shift_range=0.05,
                                     width_shift_range=0.1,
                                     shear_range = 0.1,
                                     channel_shift_range=20,
                                    )

# shuffle=False everywhere: the generators' output order has to match
# `train_batches.classes` / `val_batches.classes` for the labels to line up.
aug_train_batches = train_image_gen.flow_from_directory(
    train_path,
    target_size=(244,244),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)

train_batches = ImageDataGenerator().flow_from_directory(
    train_path,
    target_size=(244,244),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)

val_batches = ImageDataGenerator().flow_from_directory(
    val_path,
    target_size=(244,244),
    batch_size=batch_size,
    shuffle=False)

print('Predicting, this may take a while...')
# Augmented training images: two full passes through the generator.
conv_model_predictions_augmented = conv_model.predict_generator(
    aug_train_batches,
    aug_train_batches.nb_sample * n_augmented_passes)
# Un-augmented training images: one pass.
conv_model_predictions_plain = conv_model.predict_generator(
    train_batches,
    train_batches.nb_sample)
val_predictions = conv_model.predict_generator(
    val_batches,
    val_batches.nb_sample)

print('Done predicting!')
# Concatenating augmented and non-augmented predictions
conv_model_predictions = np.concatenate([conv_model_predictions_augmented,
                                         conv_model_predictions_plain])

# One-hot labels for a single pass, repeated for every pass stacked above.
labels_single_pass = to_categorical(train_batches.classes)
prediction_labels = np.concatenate([labels_single_pass] * (n_augmented_passes + 1))

Found 18873 images belonging to 10 classes.
Found 18873 images belonging to 10 classes.
Found 3551 images belonging to 10 classes.
Predicting, this may take a while...


### Save everything to disk
Saving everything to disk so I don't need to generate it every time

In [22]:
def save_array(location, array):
    """Store `array` as a bcolz carray rooted at `location` and flush it.

    The carray is opened with mode 'w'.
    """
    bcolz.carray(array, rootdir=location, mode='w').flush()
    
def load_array(location):
    """Open the bcolz container at `location` and return its full contents."""
    stored = bcolz.open(location)
    # A full slice pulls out everything that is stored.
    return stored[:]

In [23]:
# Conv-stack outputs and their one-hot labels, for training and validation,
# each written to its own bcolz directory under `model_dir`.
arrays_to_save = [
    ('conv_predictions.bc', conv_model_predictions),
    ('conv_labels.bc', prediction_labels),
    ('val_predictions.bc', val_predictions),
    ('val_labels.bc', to_categorical(val_batches.classes)),
]
for bc_name, bc_array in arrays_to_save:
    save_array(os.path.join(model_dir, bc_name), bc_array)

## Train fully connected layers only

### Import data from disk

In [24]:
# Read back the four arrays written by the save cell, in the same order.
conv_predictions, conv_labels, conv_val_predictions, conv_val_labels = [
    load_array(os.path.join(model_dir, bc_name))
    for bc_name in ('conv_predictions.bc',
                    'conv_labels.bc',
                    'val_predictions.bc',
                    'val_labels.bc')
]

### Use data to train model

In [27]:
# Hyper-parameters for the first training run of the fully connected head.
dropout = 0.4
epochs = 10
lr = 0.0001

model = fc_model(dropout)
model.compile(optimizer=Adam(lr),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Set the learning rate on the compiled optimizer explicitly as well.
model.optimizer.lr.set_value(lr)

# Train on the precomputed conv-stack outputs rather than on images.
validation_pair = (conv_val_predictions, conv_val_labels)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=validation_pair)

Train on 4257 samples, validate on 651 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6235930810>

In [None]:
# Continue training the same model with a higher learning rate and for
# more epochs.
epochs = 15
lr = 0.001
model.optimizer.lr.set_value(lr)

validation_pair = (conv_val_predictions, conv_val_labels)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=validation_pair)