# Statefarm Kaggle submission (fast.ai homework3)
What I'll need to do:
* set up data structure into sample, train, valid, test
* Import VGG16
* pop the top layer, train it
* set all fully connected layers to trainable
* Improvements:
    * play with dropout parameter
    * add data augmentation
    * stack multiple versions of the classifier
    * apply batch norm
    * have a setup that adjusts learning rate
    
These are the general imports; always make sure to run this cell first.

In [22]:
import os
import zipfile
import shutil
import csv
os.environ["KERAS_BACKEND"] = "theano"
import keras
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam

# Base URL the pretrained weight file is fetched from (model_name is appended to it).
model_url = "http://files.fast.ai/models/"
# File name of the pretrained VGG16 weights.
model_name = "vgg16.h5"
# Passed to get_file() as cache_subdir when the weights are fetched.
cache_dir = "models"

# Data structure
First we set up the data structure, creating the following directories inside the `processed` directory:
* sample
* train
* validation
* test

In [None]:
# Locations of the raw Kaggle downloads and of the restructured data set.
raw_path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
processed_path = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')

# The ten class directory names: c0 ... c9.
class_names = ['c' + str(class_index) for class_index in range(10)]


def _extract_zip(zip_name):
    """Extract raw_path/zip_name into processed_path, closing the archive even on error."""
    with zipfile.ZipFile(os.path.join(raw_path, zip_name), 'r') as zip_handle:
        zip_handle.extractall(processed_path)


def _copy_sample_images(data, driver_name, subset, per_class=3):
    """Copy random training images of one driver into sample/<subset>.

    data        -- 2-D array whose columns are used as driver, class, file name
    driver_name -- value of column 0 selecting the driver
    subset      -- 'train' or 'valid'; sub-directory of 'sample' to fill
    per_class   -- number of images copied per class (fewer if the driver
                   does not have that many images in a class)
    """
    driver_rows = data[data[:, 0] == driver_name]
    for driver_class in class_names:
        dest = os.path.join(processed_path, 'sample', subset, driver_class)
        class_rows = driver_rows[driver_rows[:, 1] == driver_class]

        # np.random.choice(..., replace=False) raises when asked for more
        # items than are available, so cap the request.
        sample_size = min(per_class, class_rows.shape[0])
        if sample_size == 0:
            continue

        filenames = np.random.choice(class_rows[:, 2], sample_size, replace=False)
        for filename in filenames:
            src = os.path.join(processed_path, 'train', driver_class, filename)
            shutil.copyfile(src, os.path.join(dest, filename))


def _move_driver_to_valid(data, driver_name):
    """Move every training image of one driver from train/ to valid/."""
    driver_rows = data[data[:, 0] == driver_name]
    for driver_class in class_names:
        class_rows = driver_rows[driver_rows[:, 1] == driver_class]
        for filename in class_rows[:, 2]:
            src = os.path.join(processed_path, 'train', driver_class, filename)
            dest = os.path.join(processed_path, 'valid', driver_class, filename)
            shutil.move(src, dest)


# The existence of the sample directory is the marker that this whole step
# has already been done.
if os.path.exists(os.path.join(processed_path, 'sample')):
    print('Sample directory already exists, no need to do data structuring!')
else:
    # Extract before creating any directory: if a zip file is missing the
    # 'sample' marker directory is not left behind, so a rerun retries
    # instead of silently skipping the step.
    print('Extracting zips, this may take a while...')
    _extract_zip('imgs.zip')
    _extract_zip('driver_imgs_list.csv.zip')
    print('Done extracting zips!')

    # One directory per class in sample/train, sample/valid and valid.
    # makedirs also creates the missing parents.
    for class_name in class_names:
        os.makedirs(os.path.join(processed_path, 'sample', 'train', class_name))
        os.makedirs(os.path.join(processed_path, 'sample', 'valid', class_name))
        os.makedirs(os.path.join(processed_path, 'valid', class_name))

    data = np.genfromtxt(os.path.join(processed_path, 'driver_imgs_list.csv'), delimiter=',', dtype=None)
    data = data[1:, :]  # skip the first row (presumably the CSV header)
    drivers = np.unique(data[:, 0])

    # Three distinct drivers: the first two feed the sample training set,
    # the third one the sample validation set.
    sample_drivers = np.random.choice(drivers, 3, replace=False)
    for driver_name in sample_drivers[:2]:
        _copy_sample_images(data, driver_name, 'train')
    _copy_sample_images(data, sample_drivers[2], 'valid')

    # Move all images of 30% of the drivers (rounded down) from train to valid.
    num_drivers = drivers.shape[0]
    validation_drivers_amount = int(np.floor(num_drivers * 0.3))
    validation_drivers = np.random.choice(drivers, validation_drivers_amount, replace=False)
    for driver_name in validation_drivers:
        _move_driver_to_valid(data, driver_name)

# VGG16() setup boilerplate

In [42]:
# Per-channel mean values subtracted from every image. Shaped (3, 1, 1) so
# they broadcast over the height and width axes of channels-first images.
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])


def vgg_preprocess(x):
    """Subtract the per-channel mean and reverse the channel order.

    x is a batch of channels-first images, i.e. shaped
    (batch, 3, height, width) as fed to the Lambda layer of the network.
    Returns a tensor of the same shape with vgg_mean subtracted and the
    channel axis (axis 1) reversed, e.g. RGB becomes BGR.
    """
    x = x - vgg_mean
    # Axis 0 is the batch and axis 1 the channels, so the reversal has to be
    # applied to axis 1. Reversing axis 2 would flip the image rows instead.
    return x[:, ::-1]

def add_conv_block(model, layers, filters):
    """Append one convolutional block to `model` and return the model.

    The block consists of `layers` repetitions of a 1-pixel zero padding
    followed by a 3x3 ReLU convolution with `filters` filters, closed by a
    2x2 max pooling with stride 2.
    """
    for _ in range(layers):
        padding = ZeroPadding2D((1, 1))
        model.add(padding)
        convolution = Convolution2D(filters, 3, 3, activation='relu')
        model.add(convolution)

    pooling = MaxPooling2D((2, 2), strides=(2, 2))
    model.add(pooling)
    return model
    
def add_fc_block(model, dropout):
    """Append a 4096-unit ReLU Dense layer plus Dropout(dropout); return the model."""
    for fc_layer in (Dense(4096, activation='relu'), Dropout(dropout)):
        model.add(fc_layer)
    return model

In [43]:
class vgg16():
    """Sequential VGG16 network with the pretrained weights loaded.

    The finished Keras model is available as the `model` attribute.
    """

    def __init__(self, dropout=0.5):
        """Build the network; `dropout` is the rate used in both FC blocks."""
        self.create(dropout)

    def create(self, dropout):
        """Assemble the layers, then load the weight file obtained via get_file()."""
        network = Sequential()
        self.model = network

        # Preprocessing happens inside the network, so raw images can be fed.
        # NOTE(review): the input is 244x244 here (and in the generators);
        # the canonical VGG16 input size is 224x224 - confirm this is intended.
        preprocessing = Lambda(vgg_preprocess,
                               input_shape=(3, 244, 244),
                               output_shape=(3, 244, 244))
        network.add(preprocessing)

        # (number of convolutions, number of filters) for each conv block.
        conv_blocks = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
        for conv_count, filter_count in conv_blocks:
            network = add_conv_block(network, conv_count, filter_count)

        network.add(Flatten())

        # Two fully connected blocks followed by the 1000-way softmax output.
        for _ in range(2):
            network = add_fc_block(network, dropout)
        network.add(Dense(1000, activation='softmax'))

        weights_file = get_file(model_name, model_url + model_name, cache_subdir=cache_dir)
        network.load_weights(weights_file)

# Load in data with generators
Here I set up the generators for the training and validation work

In [44]:
# DEBUG switches between a quick run on the small sample set and a run on
# the full processed data set.
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size, epochs = 4, 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size, epochs = 64, 5

train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')

# Plain generators without augmentation; images are resized to the size the
# network input expects.
train_batches = ImageDataGenerator().flow_from_directory(
    train_path,
    target_size=(244,244),
    batch_size=batch_size,
    shuffle=True)
val_batches = ImageDataGenerator().flow_from_directory(
    val_path,
    target_size=(244,244),
    batch_size=batch_size,
    shuffle=True)

Found 60 images belonging to 10 classes.
Found 30 images belonging to 10 classes.


# Finetuning the model
* Now the top layer must be popped and replaced with a 10-output softmax layer, which corresponds to our one-hot encoded classes.
* Then retrain model with new dense layer, which will be a good starting point for later fine tuning
* Save the model, so that we can start toying with it in the next section

In [45]:
lr = 0.001

model = vgg16(dropout=0.5).model

# Remove the 1000-way output layer, freeze everything that is left and put a
# fresh 10-way softmax on top, so only the new layer gets trained.
model.pop()
for layer in model.layers:
    layer.trainable = False
model.add(Dense(10, activation='softmax'))

optimizer = Adam(lr)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=epochs,
    validation_data=val_batches,
    nb_val_samples=val_batches.nb_sample)
# Saving is currently disabled. NOTE(review): a later cell loads
# 'model_with_new_top.h5' from the models directory; to write it from here,
# call model.save(os.path.join(model_dir, 'model_with_new_top.h5')) - `model`
# already is the Sequential model, so no extra `.model` attribute is needed.

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11fa41dd0>

# New model architecture
Now that we have the trained model, we should probably make all the FC layers trainable. Additionally, we can start playing with:
* learning rate schedule
* batchnorm
* data augmentation
* setting different epochs
* some other kind of regularisation?

First, load the model we saved earlier. Then:
* Separate convolutional layers from fully connected ones
* Make a new convolutional architecture with whatever we want to implement
* Put them together
* Train

In [46]:
# The saved model contains the preprocessing Lambda layer, which refers to
# vgg_mean; it is handed to load_model through custom_objects.
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])

saved_model_path = os.path.join(os.getcwd(), os.pardir, 'models', 'model_with_new_top.h5')
old_model = load_model(saved_model_path, custom_objects={'vgg_mean': vgg_mean})

## Batch normalisation
Let's implement batch normalisation first. It will speed up our search for an adequate learning rate. From [this link](https://github.com/fchollet/keras/issues/1802) we know that `BatchNormalization()` needs to be applied after the activation.

In [54]:
# Position of the first Flatten layer in the loaded model.
flatten_positions = [
    position
    for position, layer in enumerate(old_model.layers)
    if type(layer).__name__ == 'Flatten'
]
flatten_index = flatten_positions[0]

# Layers reused for the new model: everything from index 1 up to, but not
# including, the layer directly in front of Flatten.
# NOTE(review): presumably layer 0 is the preprocessing Lambda and the layer
# in front of Flatten is the last max pooling - verify against the saved model.
conv_model = old_model.layers[1:flatten_index - 1]

def add_fc_layers(conv_layers, dropout):
    """Build a new Sequential model from `conv_layers` plus a small classifier.

    After the given layers the model gets a 2x2 max pooling, a Flatten, two
    128-unit ReLU Dense layers each followed by Dropout(dropout), and a
    10-way softmax output. Batch normalisation is not applied at the moment.
    """
    network = Sequential()
    for conv_layer in conv_layers:
        network.add(conv_layer)

    network.add(MaxPooling2D((2, 2), strides=(2, 2)))
    network.add(Flatten())

    # Two identical hidden blocks: Dense(128, relu) followed by dropout.
    for _ in range(2):
        network.add(Dense(128, activation='relu'))
        network.add(Dropout(dropout))

    network.add(Dense(10, activation='softmax'))
    return network

# Assemble the new model on top of the reused layers and compile it.
dropout = 0.4
lr = 0.001
model = add_fc_layers(conv_model, dropout)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr),
              metrics=['accuracy'])

## Data augmentation
Let's set up new batch generators, this time making use of augmented data. Remember, we only want to augment our **training** input; there is no need to augment the validation input, since no learning takes place on it.

In [52]:
# DEBUG switches between a quick run on the small sample set and a run on
# the full processed data set.
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5

train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')

# Augmentation is applied to the training images only.
# A float zoom_range z means zoom factors are drawn from [1 - z, 1 + z].
# The previous value 1.2 therefore gave [-0.2, 2.2], which includes zero and
# negative zoom factors; 0.2 gives the range [0.8, 1.2].
train_image_gen = ImageDataGenerator(rotation_range=5,
                                     shear_range=1,
                                     zoom_range=0.2,
                                     channel_shift_range=0.3)
train_batches = train_image_gen.flow_from_directory(train_path,
                                                    target_size=(244,244),
                                                    batch_size=batch_size,
                                                    shuffle=True)

# Validation images are only resized, never augmented.
val_batches = ImageDataGenerator().flow_from_directory(val_path,
                                                       target_size=(244,244),
                                                       batch_size=batch_size,
                                                       shuffle=True)

Found 60 images belonging to 10 classes.
Found 30 images belonging to 10 classes.


In [55]:
epochs, lr = 3, 0.001

# Update the learning rate of the already compiled optimizer in place
# instead of compiling the model again.
model.optimizer.lr.set_value(lr)
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=epochs,
    validation_data=val_batches,
    nb_val_samples=val_batches.nb_sample)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x113c99690>

In [56]:
# Print the layer-by-layer overview of the newly assembled model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 3, 244, 244)   0           lambda_input_3[0][0]             
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 246, 246)   0           lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 244, 244)  1792        zeropadding2d_1[3][0]            
____________________________________________________________________________________________________
zeropadding2d_2 (ZeroPadding2D)  (None, 64, 246, 246)  0           convolution2d_1[3][0]            
___________________________________________________________________________________________