## Imports

In [1]:
# adds parent directory to python path so we can access code located there
import os, sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: sys.path.append(nb_dir)
    
# core imports
from ohmeow_ml.keras_tf_util import *

# configure matplotlib
%matplotlib inline
    
# configure autoreload to re-load changed modules
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


## Setup

In [2]:
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir + '/data/'

rebuild_from_data_download = True

## Data Preparation

Copy a small amount of our data into a `sample` directory, with the exact same structure as our `train` directory -- this is *always* a good idea in *all* machine learning since we should do all of our initial testing using a dataset small enough that we never have to wait for it.

### Unzip training and test files

In [3]:
if (rebuild_from_data_download == True):
    # unzip kaggle imgs.zip -> /train and /test
    with zipfile.ZipFile(DATA_HOME_DIR + 'imgs.zip', 'r') as zip_ref:
        zip_ref.extractall(DATA_HOME_DIR)

Lets see how many examples we have for each class in the training dataset and the overall number of images we have in test.

** NOTE: ** There is a huge difference between the number of test images (~ 80k) and training images (~ 20k)

In [4]:
DATA_CLASSES = [ dir for dir in os.listdir(DATA_HOME_DIR+'train') ]

print('# of training images per class:')
for c in DATA_CLASSES:
    print(c, len(glob(DATA_HOME_DIR+'train/' + c + '/*')))
    
print('')

print('# of test images:',len(glob(DATA_HOME_DIR+'test/*')))

# of training images per class:
c4 2326
c5 2312
c9 2129
c8 1911
c0 2489
c1 2267
c6 2325
c3 2346
c2 2317
c7 2002

# of test images: 79726


### Set aside the images of 30% of the subjects (drivers) as our validation set

IMPORTANT: Per the competition rules ... "The train and test data are split on the drivers, such that one driver can only appear on either train or test set" ***AND SO*** we must do the same thing with our validatin set or we will get overly optimistic results

In [5]:
# lets take a look at our drivers
df = pd.read_csv(DATA_HOME_DIR+'driver_imgs_list.csv')
df.head()

Unnamed: 0,subject,classname,img
0,p002,c0,img_44733.jpg
1,p002,c0,img_72999.jpg
2,p002,c0,img_25094.jpg
3,p002,c0,img_69092.jpg
4,p002,c0,img_92629.jpg


In [6]:
# find individual subjects (drivers)
df_subjects = df.groupby(by='subject')['img'].count()
drivers = df_subjects.index.tolist()

# split the train/val 70/30 (so 30% of the subjects will make up our validation set)
val_n = math.floor(len(drivers) * 0.3)
val_drivers = np.random.choice(drivers, val_n)
val_drivers

array(['p015', 'p056', 'p026', 'p052', 'p050', 'p050', 'p016'], 
      dtype='<U4')

In [7]:
df_val = df[df['subject'].isin(val_drivers)]
df_val.head()

Unnamed: 0,subject,classname,img
2424,p015,c0,img_48693.jpg
2425,p015,c0,img_44903.jpg
2426,p015,c0,img_58514.jpg
2427,p015,c0,img_62307.jpg
2428,p015,c0,img_83984.jpg


### Create train, validation, test, and sample directories

In [8]:
if (rebuild_from_data_download == True):
    # create the directory structure required for machine learning with keras
    for c in DATA_CLASSES:
        make_dir(DATA_HOME_DIR + 'sample/train/' + c)
        make_dir(DATA_HOME_DIR + 'sample/valid/' + c)
        make_dir(DATA_HOME_DIR + 'sample/test/unknown')
        make_dir(DATA_HOME_DIR + 'valid/' + c)
        
    make_dir(DATA_HOME_DIR + 'test/unknown')

### Move training, validation, and test data into appropriate sub-directories

In [9]:
if (rebuild_from_data_download == True):
    # move VALIDATION datasets
    for idx, row in df_val.iterrows():
        f = "{0}/{1}/{2}".format(DATA_HOME_DIR+'train', row['classname'], row['img'])
        shutil.move(f, "{0}/{1}".format(DATA_HOME_DIR+'valid', row['classname']))
            
    # move TEST images into /unknown subdirectory
    for file in glob(DATA_HOME_DIR + 'test/*'):
        shutil.move(file, DATA_HOME_DIR + 'test/unknown')

### Copy a subset of training, validation, and test data into approprite sub-directories under /sample folder

In [10]:
if (rebuild_from_data_download == True):
    # create SAMPLE train and validation datasets
    for c in DATA_CLASSES:
        for file in py_random.sample(os.listdir(DATA_HOME_DIR + 'train/' + c), 150):
            shutil.copy(DATA_HOME_DIR + 'train/' + c + '/' + file, DATA_HOME_DIR + 'sample/train/' + c)
            
    for c in DATA_CLASSES:
        for file in py_random.sample(os.listdir(DATA_HOME_DIR + 'valid/' + c), 75):
            shutil.copy(DATA_HOME_DIR + 'valid/' + c + '/' + file, DATA_HOME_DIR + 'sample/valid/' + c)
            
    for file in py_random.sample(os.listdir(DATA_HOME_DIR + 'test/unknown'), 500):
        shutil.copy(DATA_HOME_DIR + 'test/unknown/' + file, DATA_HOME_DIR + 'sample/test/unknown')