## Setup

Basic config steps.  

In [1]:
import os, json, shutil, zipfile, random as py_random
from glob import glob

In [2]:
# All data lives under <current working directory>/data/.
# The trailing slash is required: later cells build paths by plain string concatenation.
current_dir = os.getcwd()
DATA_HOME_DIR = '%s/data/' % current_dir

# Class labels; each one becomes a sub-directory name.
DATA_CLASSES = ['cats', 'dogs']

# Number of images per class moved out of train/ into valid/.
num_validation_files = 1500

# Number of images per class copied into sample/train/ and sample/valid/ respectively.
num_sample_train_files, num_sample_val_files = 150, 15

# Gate for every data-preparation cell below: they only run when this is True.
rebuildFromZipFiles = True

In [3]:
def make_dir(dir):
    """Create `dir` as a fresh, empty directory.

    Anything already present at `dir` is removed first: an existing directory
    is deleted together with its contents, and a regular file or symbolic link
    occupying the path is unlinked. Missing parent directories are created.

    Args:
        dir: Path of the directory to (re)create.

    Raises:
        OSError: If the existing entry cannot be removed or the directory
            cannot be created.
    """
    # shutil.rmtree refuses symlinks and fails on regular files, so those
    # are unlinked instead. islink is checked first so that a link pointing
    # at a directory is removed itself rather than followed.
    if os.path.islink(dir) or os.path.isfile(dir):
        os.remove(dir)
    elif os.path.isdir(dir):
        shutil.rmtree(dir)
    os.makedirs(dir)

## Data Preparation

Copy a small amount of our data into a `sample` directory, with the exact same structure as our `train` directory -- this is *always* a good idea in *all* machine learning since we should do all of our initial testing using a dataset small enough that we never have to wait for it.

### Unzip training and test files

In [4]:
if rebuildFromZipFiles == True:
    # Unpack the kaggle archives into the data directory:
    # train.zip -> /train first, then test.zip -> /test.
    for archive_name in ('train.zip', 'test.zip'):
        with zipfile.ZipFile(DATA_HOME_DIR + archive_name, 'r') as archive:
            archive.extractall(DATA_HOME_DIR)

### Create train, validation, test, and sample directories

In [5]:
if rebuildFromZipFiles == True:
    # Lay out the per-class folder structure used for machine learning with keras.
    # make_dir wipes a folder that already exists, so each one starts out empty.
    for class_name in DATA_CLASSES:
        for split_prefix in ('sample/train/', 'sample/valid/', 'train/', 'valid/'):
            make_dir(DATA_HOME_DIR + split_prefix + class_name)

    # Test images all go under a single 'unknown' folder.
    make_dir(DATA_HOME_DIR + 'test/unknown')

### Move training, validation, and test data into appropriate sub-directories

In [6]:
if rebuildFromZipFiles == True:
    # Move TRAINING images into their class sub-directory.
    for c in DATA_CLASSES:
        class_dir = DATA_HOME_DIR + 'train/' + c
        # c[:-1] drops the trailing 's' ('cats' -> 'cat'), giving the pattern train/cat*.
        # That pattern also matches the class directory itself (train/cats),
        # so only regular files are moved.
        for file in glob(DATA_HOME_DIR + 'train/' + c[:-1] + '*'):
            if os.path.isfile(file):
                shutil.move(file, class_dir)

    # Move a random selection of num_validation_files images per class
    # from train/<class> into valid/<class>.
    # random.sample raises ValueError if a class holds fewer files than requested.
    for c in DATA_CLASSES:
        for file in py_random.sample(os.listdir(DATA_HOME_DIR + 'train/' + c + '/'), num_validation_files):
            shutil.move(DATA_HOME_DIR + 'train/' + c + '/' + file, DATA_HOME_DIR + 'valid/' + c)

    # Move TEST images into the /unknown sub-directory.
    # test/* also matches test/unknown itself, so directories are skipped.
    for file in glob(DATA_HOME_DIR + 'test/*'):
        if os.path.isfile(file):
            shutil.move(file, DATA_HOME_DIR + 'test/unknown')

### Copy a subset of training data into appropriate sub-directories under the /sample folder

In [7]:
if rebuildFromZipFiles == True:
    # Build the SAMPLE datasets: for each split (train first, then valid) and each
    # class, copy a random selection of files into the mirrored sample/ folder.
    sample_plan = (
        ('train/', num_sample_train_files),
        ('valid/', num_sample_val_files),
    )
    for split_prefix, how_many in sample_plan:
        for class_name in DATA_CLASSES:
            source_dir = DATA_HOME_DIR + split_prefix + class_name
            target_dir = DATA_HOME_DIR + 'sample/' + split_prefix + class_name
            for picked in py_random.sample(os.listdir(source_dir), how_many):
                shutil.copy(source_dir + '/' + picked, target_dir)