    Wayne Nixalo  -  20 May 2017
FAI1 - Practical Deep Learning I - Week 3 HW: Kaggle StateFarm Distracted Driver Competition

#### Imports

In [1]:
import keras
import os, sys
import numpy as np
from glob import glob
from keras.optimizers import Adam
from keras.layers.core import Dense
from keras.preprocessing import image

Using Theano backend.
Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX 870M (0000:01:00.0)


In [2]:
# will need this to access any libraries in superdirectories
sys.path.insert(1, os.path.join(os.getcwd(), '../utils'))
import utils
from vgg16 import Vgg16

In [3]:
import bcolz
def save_array(fname, arr):
    """Store `arr` as a bcolz carray rooted at directory `fname` (opened with mode='w') and flush it."""
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()
def load_array(fname):
    """Open the bcolz array stored at `fname` and return the result of slicing it with `[:]`."""
    stored = bcolz.open(fname)
    return stored[:]

#### 1. Run this the First Time Only

Download the data and get it into the right directories. Fortunately State Farm already organized the data, so it's just a matter of assigning path variables. NOTE: kaggle-cli needs to be set up beforehand. Also, the path variables must be assigned each time this notebook is run.

In [4]:
# All paths hang off the directory the notebook is started from; the dataset
# is expected under ./data with train / valid / test sub-folders.
HOME_DIR = os.getcwd()
DATA_DIR = '{}/data'.format(HOME_DIR)
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    '{}/{}'.format(DATA_DIR, split) for split in ('train', 'valid', 'test')
)

In [None]:
# create the validation directories: VAL_DIR itself plus one sub-folder per
# class (c0 .. c9). Existing directories are skipped so the cell can be re-run.
# NOTE: the path constant defined in this notebook is VAL_DIR (not VAL_PATH).
if not os.path.isdir(VAL_DIR):
    os.mkdir(VAL_DIR)
for i in range(10):
    class_dir = VAL_DIR + '/c' + str(i)
    if not os.path.isdir(class_dir):
        os.mkdir(class_dir)

# # another way to do this:
# %mkdir $VAL_DIR
# for i in range(10):
#     %mkdir $VAL_DIR/c"$i"

#### 2. Run this if you don't have an Accurate Validation Set

Grab a random permutation from the training data for validation. Do this until validation accuracy matches test accuracy. Also see: http://stackoverflow.com/questions/2632205/how-to-count-the-number-of-files-in-a-directory-using-python

In [None]:
# %cd $TRAIN_PATH

# VAL_PORTION = 0.2
# for i in xrange(10):
#     %cd c"$i"
#     g = glob('*.jpg')
#     number = len(g)
#     shuff = np.random.permutation(g)
#     for n in xrange(int(number * VAL_PORTION)):
#         os.rename(shuff[n], VAL_PATH + '/c' + str(i) + '/' + shuff[n])
#     % cd ..

def reset_valid(valid_dir=None, train_dir=None, n_classes=10):
    """Move all .jpg images in the validation set back to their
    respective class folders in the training set.

    Args:
        valid_dir: root of the validation set; defaults to the notebook's VAL_DIR.
        train_dir: root of the training set; defaults to the notebook's TRAIN_DIR.
        n_classes: number of class folders, named c0 .. c<n_classes-1>.

    Files that are not *.jpg are left where they are. The class folders are
    expected to exist already under both roots.
    """
    # Resolve the defaults at call time so the function can be defined before
    # (or re-used independently of) the notebook's path constants.
    if valid_dir is None:
        valid_dir = VAL_DIR
    if train_dir is None:
        train_dir = TRAIN_DIR
    for i in range(n_classes):
        class_name = 'c' + str(i)
        dest_dir = os.path.join(train_dir, class_name)
        # The destination must be the class directory itself, not a glob pattern.
        for src in glob(os.path.join(valid_dir, class_name, '*.jpg')):
            os.rename(src, os.path.join(dest_dir, os.path.basename(src)))

# modified from: http://forums.fast.ai/t/statefarm-kaggle-comp/183/20
def set_valid(number=1, data_dir=None):
    """Move all training images of `number` randomly chosen drivers into the
    validation set.

    Args:
        number: how many distinct subjects to move. Values <= 0 move nothing;
            values larger than the number of subjects are clamped.
        data_dir: directory holding driver_imgs_list.csv and the train/ and
            valid/ folders; defaults to the notebook's `data_path`.

    Images listed in the CSV that are no longer present under train/ (e.g.
    moved by an earlier call) are skipped. The valid/<class> folders are
    expected to exist already.
    """
    # Local import: pandas is only imported further down in this notebook.
    import pandas as pd

    if data_dir is None:
        data_dir = data_path
    if number <= 0:
        return

    # read the CSV once and group the rows by the subject (driver) in the image
    dil = pd.read_csv(os.path.join(data_dir, 'driver_imgs_list.csv'))
    grouped_subjects = dil.groupby('subject')
    subjects = sorted(grouped_subjects.groups)
    number = min(number, len(subjects))

    # pick <number> distinct subjects at random
    picks = np.random.permutation(len(subjects))[:number]
    for idx in picks:
        # get the group assoc w/ subject
        group = grouped_subjects.get_group(subjects[int(idx)])
        # loop over group & move imgs to validation dir
        for (_, clssnm, img) in group.values:
            source = os.path.join(data_dir, 'train', clssnm, img)
            # Build the target explicitly rather than string-replacing 'train',
            # which could also rewrite other parts of the path.
            target = os.path.join(data_dir, 'valid', clssnm, img)
            if not os.path.exists(source):
                continue
            print('mv {} {}'.format(source, target))
            os.rename(source, target)

In [5]:
# some more setup: trailing-slash versions of the directory constants, so file
# names can be appended with plain string concatenation
data_path, train_path, valid_path, test_path = (
    base + '/' for base in (DATA_DIR, TRAIN_DIR, VAL_DIR, TEST_DIR)
)
results_path = '{}/results/'.format(DATA_DIR)

In [6]:
# A batch size of 64 looks to be just past what this GPU can handle, so stay below it.
# Open question: would using bcolz to save precomputed arrays help?
batch_size = 48
# image size requested from gen.flow_from_directory(..)
target_size = (224, 224)

In [7]:
# batch generators to feed data into the model: training batches are shuffled,
# validation batches keep directory order
gen = image.ImageDataGenerator()
trn_batches = gen.flow_from_directory(
    train_path,
    batch_size=batch_size,
    shuffle=True,
    class_mode='categorical',
    target_size=target_size,
)
val_batches = gen.flow_from_directory(
    valid_path,
    batch_size=batch_size,
    shuffle=False,
    class_mode='categorical',
    target_size=target_size,
)

Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.


In [8]:
# sanity check: size reported by the training generator
trn_batches.n

17943

NOTE: I'll want a way to clear GPU memory in the future. Right now all I know is restarting the kernel.

In [9]:
# Build the classifier on top of VGG16: instantiate the model (which, per the
# original note, also fetches its weights), drop the last layer, freeze every
# remaining layer, and append a new 10-way softmax layer to train on this data.
VGG = Vgg16()
VGG.model.pop()
for layer in VGG.model.layers:
    layer.trainable = False
VGG.model.add(Dense(10, activation='softmax'))
VGG.model.compile(optimizer=Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

In [9]:
# run the model until it overfits
# Update the optimizer's learning-rate variable in place instead of replacing
# the attribute with a Python float: the compiled training function keeps
# referring to the original backend variable, so only an in-place update lets
# later learning-rate changes take effect.
keras.backend.set_value(VGG.model.optimizer.lr, 0.001)
VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=1, verbose=1,
                       validation_data=val_batches, nb_val_samples=val_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f5febebd3d0>

In [10]:
def train_model(lr=0.001, epochs=1, verbose=0):
    """Train the global `VGG` model on `trn_batches`, validating on `val_batches`.

    Args:
        lr: learning rate to apply to the model's optimizer before training.
        epochs: number of passes over the training generator.
        verbose: verbosity flag forwarded to `fit_generator`.
    """
    current_lr = VGG.model.optimizer.lr
    if isinstance(current_lr, (int, float)):
        # The attribute was already overwritten with a plain number elsewhere;
        # there is no backend variable left to update, so fall back to assignment.
        VGG.model.optimizer.lr = lr
    else:
        # Update the backend variable in place so an already-built training
        # function actually sees the new learning rate.
        keras.backend.set_value(current_lr, lr)
    VGG.model.fit_generator(trn_batches, trn_batches.n, nb_epoch=epochs, verbose=verbose,
                           validation_data=val_batches, nb_val_samples=val_batches.n)

In [None]:
# one more epoch with a much larger learning rate (0.1), with progress output
train_model(lr=0.1, epochs=1, verbose=1)

In [11]:
# saving weights: write the current model weights to finetune01.h5 in the data directory
VGG.model.save_weights(data_path + 'finetune01.h5')

In [10]:
# restore the weights previously written to finetune01.h5
# (the VGG model must already be built with the same layers before loading)
VGG.model.load_weights(data_path + 'finetune01.h5')

In [11]:
def get_batches(dirname, gen=None, shuffle=True, batch_size=4, class_mode='categorical',
                target_size=(224,224)):
    """Return a directory iterator over the images under `dirname`.

    Args:
        dirname: directory passed to `flow_from_directory`.
        gen: generator object providing `flow_from_directory`; when None a
            fresh `image.ImageDataGenerator()` is created for this call.
        shuffle, batch_size, class_mode, target_size: forwarded unchanged to
            `flow_from_directory`.
    """
    # Create the default generator per call rather than once at definition
    # time, so calls never share a single default instance.
    if gen is None:
        gen = image.ImageDataGenerator()
    return gen.flow_from_directory(dirname, target_size=target_size,
            class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

def get_data(path, target_size=(224,224)):
    """Read every image under `path` one at a time, unshuffled and without
    labels, and return all of them concatenated into a single array."""
    batches = get_batches(path, shuffle=False, batch_size=1, class_mode=None, target_size=target_size)
    chunks = []
    for _ in xrange(batches.nb_sample):
        chunks.append(batches.next())
    return np.concatenate(chunks)

In [12]:
# output test data predictions: unshuffled generator over the test directory,
# using twice the training batch size
gen = image.ImageDataGenerator()
tst_batches = gen.flow_from_directory(
    test_path,
    batch_size=batch_size*2,
    shuffle=False,
    class_mode='categorical',
    target_size=target_size,
)
# earlier attempts, kept for reference:
# predictions = VGG.model.predict_on_batch(tst_batches)
# predictions = VGG.model.predict(tst_batches, batch_size=batch_size*2, verbose=1)



Found 79726 images belonging to 1 classes.


In [13]:
# two size attributes of the test generator; only the last expression of a
# cell is displayed, so the output below is the value of nb_sample
tst_batches.n
tst_batches.nb_sample

79726

In [15]:
# run the model over the test directory; the result is indexed below, with
# predictions[1] being the per-image class-probability array
# NOTE(review): predictions[0] is presumably the batch iterator used by VGG.test — confirm in vgg16.py
predictions = VGG.test(test_path, batch_size=32)

Found 79726 images belonging to 1 classes.


In [16]:
# persist the raw prediction array with bcolz under the results directory
save_array(results_path + 'raw_predictions01.bc', predictions[1])

In [17]:
# number of prediction rows (should equal the number of test images found)
len(predictions[1])

79726

In [18]:
# shape check: one row per test image, one column per class
predictions[1].shape

(79726, 10)

In [19]:
# shorthand for the prediction array used when building the submission
preds = predictions[1]

In [20]:
filenames = tst_batches.filenames
# Keep only the image file name. os.path.basename drops the sub-directory
# prefix whatever its length, instead of slicing off a fixed 8 characters.
ids = np.array([os.path.basename(f) for f in filenames])

In [21]:
# the number of ids must match the number of prediction rows
print(ids.shape)
print(preds.shape)

(79726,)
(79726, 10)


In [22]:
import pandas as pd

In [23]:
# Build the submission table with pandas (np.stack on ids + preds did not work):
# one probability column per class, ordered by class index, preceded by an
# 'img' column holding the image file name.
classes = sorted(trn_batches.class_indices, key=trn_batches.class_indices.get)
submission = pd.DataFrame(preds, columns=classes)
# os.path.basename strips the sub-directory prefix regardless of its length
# (a fixed slice such as f[8:] only works for one particular folder name).
submission.insert(0, 'img', [os.path.basename(f) for f in filenames])
submission.to_csv(results_path + 'submission.csv', index=False, compression=None)
# last expression in the cell, so the preview is actually displayed
submission.head()

In [24]:
# ??pd.DataFrame.to_csv

In [25]:
# show a link to the generated submission file in the notebook output
from IPython.display import FileLink
FileLink(results_path + 'submission.csv')

In [1]:
# ??submission.insert  # interactive source lookup; needs `submission` to exist in the kernel

Object `submission.insert` not found.
