## Imports

In [1]:
# adds parent directory to python path so we can access code located there
import os, sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: sys.path.append(nb_dir)
    
# core imports
from ohmeow_ml.keras_tf_util import *

# other imports
from IPython.display import FileLink

# configure matplotlib
%matplotlib inline
    
# configure autoreload to re-load changed modules
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


## Define paths and global variables

In [2]:
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir + '/data/'
DATA_CLASSES = [ dir for dir in os.listdir(DATA_HOME_DIR+'train') ]

path = DATA_HOME_DIR
# path = DATA_HOME_DIR + 'sample/'
sample_path = DATA_HOME_DIR + 'sample/'

train_path = path + 'train/'
val_path = path + 'valid/'
test_path = path + 'test/'

models_path = path + 'models/'                      # save weights here
results_path = path + 'results/'                    # save predictions here
processed_data_path = path + 'preprocesed_data/'    # save preprocessed data used for training here

if not os.path.exists(models_path): os.makedirs(models_path)
if not os.path.exists(results_path): os.makedirs(results_path)
if not os.path.exists(processed_data_path): os.makedirs(processed_data_path)

In [3]:
batch_size = 64
# batch_size = 4

## Submission

In [4]:
def do_clip(arr, mx):
    return np.clip(arr, (1-mx)/9, mx)

def create_submission(preds, filename='subm.gz'):
    subm = do_clip(preds, 0.93)
    subm_file = results_path+filename
    
    batches = get_batches(train_path, batch_size=1, shuffle=False)
    classes = sorted(batches.class_indices, key=batches.class_indices.get)
    
    df_subm = pd.DataFrame(subm, columns=classes)
    df_subm.insert(0, 'img', [a[8:] for a in test_filenames])
    #print(df_subm.head())
    
    df_subm.to_csv(subm_file, index=False, compression='gzip')
    return subm_file

## Preprocess the data

We can save time by pre-processing the images (e.g., converting them to jpegs, resizing to 224x224) and saving them as a numpy array on the file system.  We can do the same for the train, validation, and test image class designations, filenames, and one-hot encoded labels

In [5]:
# get classes, one-hot encoded labels, and filenames
train_classes, train_labels, train_filenames = get_batch_info(train_path)
val_classes, val_labels, val_filenames = get_batch_info(val_path)
test_filenames = get_batch_info(test_path)[2]

Found 16951 images belonging to 10 classes.
Found 5473 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


In [None]:
 # get image data
if not os.path.exists(processed_data_path+'train_data.bc'):
    train_data = get_data(train_path)
    save_array(processed_data_path+'train_data.bc', train_data)
else:
    train_data = load_array(processed_data_path+'train_data.bc')
    print('training data loaded ...')

if not os.path.exists(processed_data_path+'val_data.bc'):
    val_data = get_data(val_path)
    save_array(processed_data_path+'val_data.bc', val_data)
else:
    val_data = load_array(processed_data_path+'val_data.bc')
    print('validation data loaded ...')

# NOTE: with almost 80k records, trying to serialize the test set results in a memory error
# if not os.path.exists(processed_data_path+'test_data.bc'):
#     test_data = get_data(test_path)
#     save_array(processed_data_path+'test_data.bc', test_data)
# else:
#     test_data = load_array(processed_data_path+'test_data.bc')
#     print('test data loaded ...')

Create training/validation batches and also define "steps per epoch" for each ... defines the # of batches per epoch (see `model.fit_generator()`).

***ONLY RUN THIS CODE IF YOU NEED TO USE BATCHES INSTEAD OF PERSISTED IMAGE ARRAYS***

In [None]:
# OPTION 1: BUILD BATCHES FROM FILE SYSTEM
# train_batches = get_batches(train_path, batch_size=batch_size)
# val_batches = get_batches(val_path, batch_size=batch_size*2, shuffle=False)

# OPTION 2: BUILD BATCHES FROM IMAGE ARRAYS
# gen = image.ImageDataGenerator()
# train_batches = gen.flow(train_data, train_labels, batch_size=batch_size, shuffle=True)
# val_batches = gen.flow(val_data, val_labels, batch_size=batch_size*2, shuffle=False)

# DEFINE # OF STEPS TO TAKE IN FITTING BATCHES FOR BOTH TRAINING AND VALIDATION EXAMPLES
# epoch_steps = math.ceil(train_batches.n/train_batches.batch_size)
# val_steps = math.ceil(val_batches.n/val_batches.batch_size)

## Simple CNN

2 conv layers with max pooling + a simple dense network is a good simple CNN to start with

In [None]:
def simple_cnn():
    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224,224,3)),
        Conv2D(32, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Conv2D(64, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Flatten(),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])

    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### Without Data Augmentation

In [None]:
limit_mem()
model = simple_cnn()

In [None]:
model.fit(train_data, train_labels, batch_size=batch_size, epochs=2, shuffle=True, 
          validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.001
model.fit(train_data, train_labels, batch_size=batch_size, epochs=5, shuffle=True, 
          validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.save_weights(models_path+'simple_cnn_weights.h5') # val_acc = 0.4931

### With Data Augmentation

In [None]:
# get the best values
df_augs = pd.read_csv(sample_path+'data_augmentation_results.csv')
df_augs.sort_values('val_acc', ascending=False).groupby('aug').first()

In [None]:
gen_aug = image.ImageDataGenerator(channel_shift_range=10.0, height_shift_range=0.0, rotation_range=10.0, 
                                   shear_range=0.0, width_shift_range=0.05, zoom_range=0.0)

aug_batches = gen_aug.flow(train_data, train_labels, batch_size=batch_size, shuffle=True)

In [None]:
limit_mem()
model = simple_cnn()

In [None]:
epoch_steps = math.ceil(aug_batches.n/aug_batches.batch_size)
model.fit_generator(aug_batches, epoch_steps, epochs=2, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.001
model.fit_generator(aug_batches, epoch_steps, epochs=4, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.0001
model.fit_generator(aug_batches, epoch_steps, epochs=4, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.save_weights(models_path+'simple_cnn_da_weights.h5') # val_acc = 0.5620

## Complex CNN Architecutre

We are adding in regularization via Dropout so this will work better on full data set

In [None]:
def complex_cnn(p_do=0.5, n_dense_outputs=256):
    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224,224,3)),
        Conv2D(32, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Conv2D(64, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Conv2D(128, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        
        Flatten(),
        Dense(n_dense_outputs, activation='relu'),
        BatchNormalization(),
        Dropout(p_do/4),
        Dense(n_dense_outputs, activation='relu'),
        BatchNormalization(),
        Dropout(p_do),
        Dense(10, activation='softmax')
    ])

    model.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
limit_mem()
model = complex_cnn()

In [None]:
gen_aug = image.ImageDataGenerator(channel_shift_range=10.0, height_shift_range=0.0, rotation_range=10.0, 
                                   shear_range=0.0, width_shift_range=0.05, zoom_range=0.0)

aug_batches = gen_aug.flow(train_data, train_labels, batch_size=batch_size, shuffle=True)
epoch_steps = math.ceil(aug_batches.n/aug_batches.batch_size)

In [None]:
model.fit_generator(aug_batches, epoch_steps, epochs=2, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.001
model.fit_generator(aug_batches, epoch_steps, epochs=4, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.0001
model.fit_generator(aug_batches, epoch_steps, epochs=6, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.optimizer.lr = 0.00001
model.fit_generator(aug_batches, epoch_steps, epochs=8, validation_data=(val_data, val_labels), verbose=2)

In [None]:
model.save_weights(models_path+'complex_cnn_da_weights.h5') # val_acc = ~0.63

## Finetune Pre-Trained Models

### 1. Train a linear classifier using the pre-computed output from 2nd to last layer

In [None]:
limit_mem()
model = VGG19(weights='imagenet', include_top=True)

In [None]:
# pop last layer and set model.outputs = to that of the now last layer
model.layers.pop()

# model.layers[-1].outbound_nodes = [] ... this is not needed
model.outputs = [model.layers[-1].output]

In [None]:
# model.summary()

#### Pre-compute output for train, validation, test data

In [None]:
# A. precompute the 2nd to last layer for training and validation data sets
if not os.path.exists(processed_data_path+'train_features_ll.bc'):
    train_features_ll = model.predict(train_data, 4)
    val_features_ll = model.predict(val_data, 4)
    
    save_array(processed_data_path+'train_features_ll.bc', train_features_ll)
    save_array(processed_data_path+'val_features_ll.bc', val_features_ll)
else:
    train_features_ll = load_array(processed_data_path+'train_features_ll.bc')
    val_features_ll = load_array(processed_data_path+'val_features_ll.bc')
    
print('training data:', train_features_ll.shape)
print('validation data:', val_features_ll.shape)

# B. do the same for augmented training data ... make this 5-10x larger
if not os.path.exists(processed_data_path+'da_train_features_ll.bc'):
    da_gen = image.ImageDataGenerator(channel_shift_range=10.0, height_shift_range=0.0, rotation_range=10.0, 
                                   shear_range=0.0, width_shift_range=0.05, zoom_range=0.0)

    # shuffle=False because we are going to have to add labels later for however many 
    # augmented sets of the training data
    da_batches = get_batches(train_path, da_gen, batch_size=4, shuffle=False) 
    da_train_features_ll = np.concatenate(
        [ model.predict_generator(da_batches, (da_batches.n/da_batches.batch_size), verbose=2) for i in range(5)])
    
    save_array(processed_data_path+'da_train_features_ll.bc', da_train_features_ll)
else:
    da_train_features_ll = load_array(processed_data_path+'da_train_features_ll.bc')
    
print('augmented data:', da_train_features_ll.shape)

# C. do the same for test data
if not os.path.exists(processed_data_path+'test_features_ll.bc'):
    test_batches = get_batches(test_path, batch_size=4, shuffle=False)
    test_features_ll = model.predict_generator(test_batches, (test_batches.n/test_batches.batch_size), verbose=2)
    save_array(processed_data_path+'test_features_ll.bc', test_features_ll)
else:
    test_features_ll = load_array(processed_data_path+'test_features_ll.bc')
    
print('test data:', test_features_ll.shape)

In [None]:
all_train_features_ll = np.concatenate([da_train_features_ll, train_features_ll])
all_train_labels_ll = np.concatenate([train_labels]*6)

print('all training features shape:', all_train_features_ll.shape)
print('all training labels shape:', all_train_labels_ll.shape)

#### Finetune

In [None]:
for layer in model.layers: layer.trainable = False
ft_ll_model = Sequential([ Dense(10, activation='softmax', input_shape=model.layers[-1].output_shape[1:]) ])

In [None]:
ft_ll_model.compile(optimizer=Adam(lr=1e-05), loss='categorical_crossentropy', metrics=['accuracy'])
ft_ll_model.fit(all_train_features_ll, all_train_labels_ll, batch_size=batch_size, epochs=15, 
          validation_data=(val_features_ll, val_labels), verbose=2)

In [None]:
ft_ll_model.optimizer.lr = 0.001
ft_ll_model.fit(all_train_features_ll, all_train_labels_ll, batch_size=batch_size, epochs=5, 
          validation_data=(val_features_ll, val_labels), verbose=2)

In [None]:
ft_ll_model.save_weights(models_path+'ft_ll_model_weights.h5') # val_acc = 0.33

### 2. Precompute the convolutional and use in FC NN

In [10]:
limit_mem()
model = VGG19(include_top=False, weights='imagenet', input_shape=(224,224,3)) # must include input_shape if include_top=False

In [None]:
# model.summary()

#### Precompute output for train, validation, test data

In [6]:
# A. precompute the 2nd to last layer for training and validation data sets
if not os.path.exists(processed_data_path+'train_features_conv.bc'):
    train_features_conv = model.predict(train_data, 4)
    val_features_conv = model.predict(val_data, 4)
    
    save_array(processed_data_path+'train_features_conv.bc', train_features_conv)
    save_array(processed_data_path+'val_features_conv.bc', val_features_conv)
else:
    train_features_conv = load_array(processed_data_path+'train_features_conv.bc')
    val_features_conv = load_array(processed_data_path+'val_features_conv.bc')
    
print('training data:', train_features_conv.shape)
print('validation data:', val_features_conv.shape)

# B. do the same for augmented training data ... make this 5-10x larger
if not os.path.exists(processed_data_path+'da_train_features_conv.bc'):
    da_gen = image.ImageDataGenerator(channel_shift_range=10.0, height_shift_range=0.0, rotation_range=10.0, 
                                   shear_range=0.0, width_shift_range=0.05, zoom_range=0.0)

    # shuffle=False because we are going to have to add labels later for however many 
    # augmented sets of the training data
    da_batches_conv = get_batches(train_path, da_gen, batch_size=4, shuffle=False) 
    da_train_features_conv = np.concatenate(
        [ model.predict_generator(da_batches_conv, (da_batches_conv.n/da_batches_conv.batch_size), verbose=2) for i in range(5) ])
    
    save_array(processed_data_path+'da_train_features_conv.bc', da_train_features_conv)
else:
    da_train_features_conv = load_array(processed_data_path+'da_train_features_conv.bc')
    
print('augmented data:', da_train_features_conv.shape)

# C. do the same for test data
if not os.path.exists(processed_data_path+'test_features_conv.bc'):
    test_batches_conv = get_batches(test_path, batch_size=4, shuffle=False)
    test_features_conv = model.predict_generator(test_batches_conv, (test_batches_conv.n/test_batches_conv.batch_size), verbose=2)
    save_array(processed_data_path+'test_features_conv.bc', test_features_conv)
else:
    test_features_conv = load_array(processed_data_path+'test_features_conv.bc')
    
print('test data:', test_features_conv.shape)

training data: (16951, 7, 7, 512)
validation data: (5473, 7, 7, 512)
augmented data: (84755, 7, 7, 512)
test data: (79726, 7, 7, 512)


In [7]:
all_train_features_conv = np.concatenate([da_train_features_conv, train_features_conv])
all_train_labels_conv = np.concatenate([train_labels]*6)

print('all training features shape:', all_train_features_conv.shape)
print('all training labels shape:', all_train_labels_conv.shape)

all training features shape: (101706, 7, 7, 512)
all training labels shape: (101706, 10)


#### Finetune

In [11]:
def build_fc_layers(dropout_p=0.5, dense_output=256):
    return [
        Flatten(input_shape=model.layers[-1].output_shape[1:]),
        Dropout(dropout_p),
        Dense(dense_output, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_p),
        Dense(dense_output, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_p),
        Dense(10, activation='softmax')
    ]

In [None]:
fc_model = Sequential(build_fc_layers(0.6, 512))
fc_model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
fc_model.fit(all_train_features_conv, all_train_labels_conv, batch_size=batch_size, epochs=2,
            validation_data=(val_features_conv, val_labels), verbose=2)

In [None]:
fc_model.optimizer.lr = 0.01
fc_model.fit(all_train_features_conv, all_train_labels_conv, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

In [None]:
fc_model.optimizer.lr = 0.001
fc_model.fit(all_train_features_conv, all_train_labels_conv, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

In [None]:
fc_model.optimizer.lr = 0.0001
fc_model.fit(all_train_features_conv, all_train_labels_conv, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

In [34]:
fc_model.save_weights(models_path+'ft_fc_model_weights_do-pt6_d-512.h5') # val_acc = ???

#### Evaluate and create submission

In [None]:
fc_model.evaluate(val_features_conv, val_labels, batch_size=batch_size, verbose=2)

In [32]:
preds = fc_model.predict(test_features_conv, batch_size=batch_size, verbose=2)

In [33]:
subm_file = create_submission(preds, 'ft_fc_model_subm01.gz')
FileLink(subm_file)

Found 16951 images belonging to 10 classes.


#### Add pseudo-labeling

In [25]:
limit_mem()
fc_model = Sequential(build_fc_layers(0.6, 512))
fc_model.load_weights(models_path+'ft_fc_model_weights_do-pt6_d-512.h5')

In [15]:
pseudo_val_labels = fc_model.predict(val_features_conv, batch_size=4, verbose=2)
pseudo_test_labels = fc_model.predict(test_features_conv, batch_size=4, verbose=2)

combo_train_val_labels = np.concatenate([pseudo_val_labels, all_train_labels_conv])
combo_train_val_feat = np.concatenate([val_features_conv, all_train_features_conv])

combo_all_labels = np.concatenate([pseudo_test_labels, combo_train_val_labels])
combo_all_feat = np.concatenate([test_features_conv, combo_train_val_feat])

In [26]:
fc_model = Sequential(build_fc_layers(0.6, 512))
fc_model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [27]:
fc_model.fit(combo_all_feat, combo_all_labels, batch_size=batch_size, epochs=2,
            validation_data=(val_features_conv, val_labels), verbose=2)

Train on 186905 samples, validate on 5473 samples
Epoch 1/2
83s - loss: 0.6293 - acc: 0.8384 - val_loss: 0.7780 - val_acc: 0.7818
Epoch 2/2
83s - loss: 0.4503 - acc: 0.8920 - val_loss: 0.8102 - val_acc: 0.7711


<keras.callbacks.History at 0x7fc7a62f3860>

In [28]:
fc_model.optimizer.lr = 0.01
fc_model.fit(combo_all_feat, combo_all_labels, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

Train on 186905 samples, validate on 5473 samples
Epoch 1/4
83s - loss: 0.4250 - acc: 0.8992 - val_loss: 0.8661 - val_acc: 0.7749
Epoch 2/4
83s - loss: 0.4076 - acc: 0.9036 - val_loss: 0.8042 - val_acc: 0.7780
Epoch 3/4
83s - loss: 0.3945 - acc: 0.9080 - val_loss: 0.8524 - val_acc: 0.7687
Epoch 4/4
82s - loss: 0.3845 - acc: 0.9116 - val_loss: 0.8475 - val_acc: 0.7680


<keras.callbacks.History at 0x7fc7a75cbcf8>

In [30]:
fc_model.optimizer.lr = 0.001
fc_model.fit(combo_all_feat, combo_all_labels, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

Train on 186905 samples, validate on 5473 samples
Epoch 1/4
83s - loss: 0.3748 - acc: 0.9139 - val_loss: 0.8413 - val_acc: 0.7696
Epoch 2/4
83s - loss: 0.3702 - acc: 0.9150 - val_loss: 0.8931 - val_acc: 0.7570
Epoch 3/4
83s - loss: 0.3637 - acc: 0.9168 - val_loss: 0.8303 - val_acc: 0.7749
Epoch 4/4
83s - loss: 0.3599 - acc: 0.9181 - val_loss: 0.8637 - val_acc: 0.7736


<keras.callbacks.History at 0x7fc7a6171b70>

In [31]:
fc_model.optimizer.lr = 0.0001
fc_model.fit(combo_all_feat, combo_all_labels, batch_size=batch_size, epochs=4,
            validation_data=(val_features_conv, val_labels), verbose=2)

Train on 186905 samples, validate on 5473 samples
Epoch 1/4
83s - loss: 0.3571 - acc: 0.9186 - val_loss: 0.8393 - val_acc: 0.7680
Epoch 2/4
83s - loss: 0.3533 - acc: 0.9195 - val_loss: 0.8596 - val_acc: 0.7654
Epoch 3/4
83s - loss: 0.3497 - acc: 0.9207 - val_loss: 0.8876 - val_acc: 0.7700
Epoch 4/4
83s - loss: 0.3464 - acc: 0.9215 - val_loss: 0.8367 - val_acc: 0.7707


<keras.callbacks.History at 0x7fc7a60da470>