# Create Validation Set
First, we move 2 drivers out of the training folder so that we can use them in validation.

In [3]:
path = 'data/statefarm'
trainpath = path + '/train'
validpath = path + '/valid'
models_path = path+'/models/'

In [4]:
import os

if not os.path.exists(models_path):
    os.makedirs(models_path)

for i in range(0, 10):
    cpath = validpath + '/c%d' % i
    if not os.path.exists(cpath):
        os.makedirs(cpath)

In [29]:
import csv
drivers = dict()
with open(path + '/driver_imgs_list.csv') as driverlist:
    reader = csv.reader(driverlist)
    for driver, classname, image in reader:
        if not drivers.has_key(driver):
            drivers[driver] = list()
        drivers[driver].append(os.path.join(classname, image))

['p022', 'p049']


In [35]:
validdrivers = 2
driverimages = drivers.values()[0:validdrivers]
for images in driverimages:
    for image in images:
        if os.path.exists(os.path.join(trainpath, image)):
            os.rename(os.path.join(trainpath, image), os.path.join(validpath, image))

In [10]:
samplepath = path+'/sample'
import os, shutil

def copy_samples(subpath, sample_subpath):
    if not os.path.exists(sample_subpath):
        os.makedirs(sample_subpath)
    for i in range(0, 10):
        path_i = '/c%d' % i
        subpath_i = subpath + path_i

        sample_subpath_i = sample_subpath + path_i
        if not os.path.exists(sample_subpath_i):
            os.makedirs(sample_subpath_i)

        samplefiles = os.listdir(subpath_i)
        for image_path in samplefiles[0:10]:
            shutil.copyfile(os.path.join(subpath_i, image_path), os.path.join(sample_subpath_i, image_path))
            
sample_train_path = samplepath + '/train'
copy_samples(trainpath, sample_train_path)

sample_valid_path = samplepath + '/valid'
copy_samples(validpath, sample_valid_path)

# Build the VGG16 Model

In [5]:
download_path = 'http://files.fast.ai/models/'

In [6]:
from keras import backend as K
from keras.models import Sequential
from keras.utils.data_utils import get_file
from keras.layers.core import Lambda, Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D
import numpy as np

vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape((3, 1, 1))
def vgg_preprocess(x):
    # subtracts the mean so that we get a 0-centered value
    x = x - vgg_mean
    
    # reverses the axis, since most pretrained data comes from OpenCV, which uses BGR rather than RGB
    return x[:, ::-1]

def vgg_convblock(model, layers, filters):
    for _ in xrange(0, layers):
        model.add(Convolution2D(filters, 3, 3, border_mode='same', activation='relu'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

model = Sequential()
model.add(Lambda(vgg_preprocess, input_shape=(3, 224, 224), output_shape=(3,224,224)))

vgg_convblock(model, 2, 64)
vgg_convblock(model, 2, 128)
vgg_convblock(model, 3, 256)
vgg_convblock(model, 3, 512)
vgg_convblock(model, 3, 512)

model.add(Flatten())
model.add(Dense(4096, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4096, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1000, activation='softmax'))

weights_file = get_file('vgg16.h5', download_path+'vgg16.h5', cache_subdir='models')
model.load_weights(weights_file)
model.summary()

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1080 Ti (CNMeM is disabled, cuDNN 5110)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 3, 224, 224)   0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  1792        lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 64, 224, 224)  36928       convolution2d_1[0][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 64, 112, 112)  0           convolution2d_2[0][0]            
___________________________________________________________________________________________

# Finetune the model

In [7]:
from keras.layers.normalization import BatchNormalization

for layer in model.layers[0:-5]:
    layer.trainable = False

model.pop()
model.add(BatchNormalization())
model.add(Dense(10, activation='softmax'))
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 3, 224, 224)   0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  1792        lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 64, 224, 224)  36928       convolution2d_1[0][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 64, 112, 112)  0           convolution2d_2[0][0]            
___________________________________________________________________________________________

In [8]:
if os.path.exists(models_path+'vgg16_bn.h5'):
    model.load_weights(models_path+'vgg16_bn.h5')

In [11]:
from keras.optimizers import Adam
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [47]:
from keras.preprocessing.image import ImageDataGenerator
def batch_gen(path, class_mode='categorical', shuffle=True):
    return ImageDataGenerator().flow_from_directory(path, target_size=(224, 224), batch_size=8, class_mode=class_mode, shuffle=shuffle)

In [13]:
train_gen = batch_gen(trainpath)
valid_gen = batch_gen(validpath)

Found 20180 images belonging to 10 classes.
Found 2244 images belonging to 10 classes.


In [29]:
model.fit_generator(train_gen, samples_per_epoch=train_gen.nb_sample, nb_epoch=1, 
                    validation_data=valid_gen, nb_val_samples=valid_gen.nb_sample)

Epoch 1/1

KeyboardInterrupt: 

In [22]:
model.save_weights(models_path+'vgg16_bn.h5')

# Generate submission

In [48]:
testpath = path + '/test'
test_gen = batch_gen(testpath, class_mode=None, shuffle=False)

Found 79726 images belonging to 1 classes.


In [49]:
predictions = model.predict_generator(test_gen, test_gen.nb_sample)

In [51]:
import csv
with open(path+'/submission.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['img','c0','c1','c2','c3','c4','c5','c6','c7','c8','c9'])
    for filename, file_predictions in zip(test_gen.filenames, predictions):
        writer.writerow([os.path.basename(filename)] + file_predictions.tolist())

In [79]:
?? model

In [80]:
from sklearn.metrics import confusion_matrix

In [96]:
?? confusion_matrix

In [95]:
len(valid_gen.classes)

2244