## Imports

In [14]:
# Add the parent of the current working directory to sys.path so the project
# package imported below (ohmeow_ml, located there) can be found.
import os, sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: sys.path.append(nb_dir)
    
# core imports
# NOTE(review): later cells use glob, zipfile, shutil, py_random and make_dir without
# importing them here, so they presumably come from this star import -- confirm
# against ohmeow_ml/keras_tf_util.py.
from ohmeow_ml.keras_tf_util import *

# configure matplotlib (render figures inline in the notebook)
%matplotlib inline
    
# configure autoreload to re-load changed modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup

In [31]:
# Root of the dataset. Kept as a plain string ending in '/' because the cells
# below build every path by string concatenation onto it.
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir + '/data/'

# One class per sub-directory of data/train.
#  - only real directories are kept, so stray files (e.g. .DS_Store) are not
#    mistaken for classes
#  - sorted, because os.listdir returns entries in arbitrary order
# NOTE(review): this needs data/train to exist already. On a completely fresh
# download that folder only appears after imgs.zip has been extracted, so this
# cell has to be (re-)run after the unzip step.
_train_root = DATA_HOME_DIR + 'train'
DATA_CLASSES = sorted(
    entry for entry in os.listdir(_train_root)
    if os.path.isdir(os.path.join(_train_root, entry))
)

# Gate for the data-preparation cells (unzip / create dirs / move / copy).
rebuild_from_data_download = True

Let's see how many examples we have for each class in the training dataset and the overall number of images we have in test.

**NOTE:** There is a huge difference between the number of test images (~80k) and training images (~22k).

In [32]:
# Report how many images each training class holds, followed by the number of
# entries directly under the test folder.
train_root = DATA_HOME_DIR + 'train/'

print('# of training images per class:')
for class_name in DATA_CLASSES:
    class_images = glob(train_root + class_name + '/*')
    print(class_name, len(class_images))

print('')

test_images = glob(DATA_HOME_DIR + 'test/*')
print('# of test images:', len(test_images))

# of training images per class:
c0 2489
c1 2267
c2 2317
c3 2346
c4 2326
c5 2312
c6 2325
c7 2002
c8 1911
c9 2129

# of test images: 79727


## Data Preparation

Copy a small amount of our data into a `sample` directory, with the exact same structure as our `train` directory -- this is *always* a good idea in *all* machine learning since we should do all of our initial testing using a dataset small enough that we never have to wait for it.

### Unzip training and test files

In [36]:
# Only unpack the archive when rebuilding the dataset from the raw download.
if rebuild_from_data_download == True:
    archive_path = DATA_HOME_DIR + 'imgs.zip'
    # the kaggle imgs.zip unpacks into /train and /test under the data folder
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=DATA_HOME_DIR)

### Create train, validation, test, and sample directories

In [37]:
if rebuild_from_data_download == True:
    # Folders that need one sub-directory per class (keras expects
    # <split>/<class>/<image> layouts).
    per_class_roots = ('sample/train/', 'sample/valid/', 'valid/')

    for class_name in DATA_CLASSES:
        for root in per_class_roots:
            make_dir(DATA_HOME_DIR + root + class_name)

    # test images carry no labels, so they all go under a single class folder
    make_dir(DATA_HOME_DIR + 'test/unknown')

### Move training, validation, and test data into appropriate sub-directories

In [38]:
if rebuild_from_data_download:
    # Fraction of each class's images that should live in the validation set.
    valid_fraction = 0.30

    # move VALIDATION datasets
    for c in DATA_CLASSES:
        train_dir = DATA_HOME_DIR + 'train/' + c + '/'
        valid_dir = DATA_HOME_DIR + 'valid/' + c + '/'

        # Make sure the destination is a directory. If it were missing,
        # shutil.move would rename the first file to that path and later
        # moves would overwrite it.
        os.makedirs(valid_dir, exist_ok=True)

        # Sorted so the draw depends only on the random state, not on the
        # arbitrary order os.listdir returns.
        train_files = sorted(os.listdir(train_dir))
        num_valid_existing = len(os.listdir(valid_dir))

        # Size the validation set from the class total (train + already moved)
        # and move only the shortfall. On a fresh split this is 30% of train;
        # on a re-run it is 0, so running the cell twice no longer drains
        # another 30% out of the training set.
        num_class_files = len(train_files) + num_valid_existing
        num_validation_files = int(num_class_files * valid_fraction)
        num_to_move = max(0, num_validation_files - num_valid_existing)

        for file in py_random.sample(train_files, num_to_move):
            shutil.move(train_dir + file, valid_dir)

    # move TEST images into /unknown subdirectory
    test_dir = DATA_HOME_DIR + 'test/'
    unknown_dir = test_dir + 'unknown/'
    os.makedirs(unknown_dir, exist_ok=True)

    for file in glob(test_dir + '*'):
        # the glob also matches the 'unknown' folder itself; move plain files only
        if os.path.isfile(file):
            shutil.move(file, unknown_dir)

### Copy a subset of training data into appropriate sub-directories under /sample folder

In [39]:
if rebuild_from_data_download:
    # create SAMPLE train and validation datasets
    # (source split, number of images wanted per class in sample/<split>/<class>)
    sample_sizes = [('train', 100), ('valid', 30)]

    for split, num_wanted in sample_sizes:
        for c in DATA_CLASSES:
            src_dir = DATA_HOME_DIR + split + '/' + c + '/'
            dst_dir = DATA_HOME_DIR + 'sample/' + split + '/' + c + '/'
            os.makedirs(dst_dir, exist_ok=True)

            # Top up to the target instead of copying a fresh random draw each
            # time, so re-running the cell does not grow the sample set.
            already_copied = set(os.listdir(dst_dir))
            candidates = sorted(f for f in os.listdir(src_dir) if f not in already_copied)

            # Never ask for more files than exist: random.sample raises
            # ValueError when k exceeds the population size.
            num_to_copy = min(max(0, num_wanted - len(already_copied)), len(candidates))

            for file in py_random.sample(candidates, num_to_copy):
                shutil.copy(src_dir + file, dst_dir)