In [4]:
from os import listdir
from numpy import array
from keras import initializers
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential, model_from_json
from keras.utils import to_categorical, Sequence
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import RMSprop, Adamax
from keras.layers.convolutional import Conv2D
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense, GlobalAveragePooling2D

from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_v3 import InceptionV3

from keras.applications.mobilenet import MobileNet
import numpy as np

# import the capsule layers stuff
from capsulelayers import *

from keras import backend as K
from nltk.translate.bleu_score import corpus_bleu

In [5]:
# Reset the Keras backend session; intended to free memory held by previous runs in this kernel (unverified).
K.clear_session()
# Ask the TensorFlow backend which GPUs it can see; the returned list is displayed as the cell output.
# NOTE(review): `_get_available_gpus` is a private backend helper and may not exist in other Keras versions.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [6]:
# Directory with the training files. File names are appended by plain string
# concatenation further down, so the trailing slash is required.
dir_name = 'resources/train/'

def load_doc(filename):
    """Read a text file and return its whole content as a single string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The complete file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

def load_data(data_dir):
    """Load every image array and every token file found in ``data_dir``.

    Files are visited in sorted name order. A file whose name ends in ``npz``
    is read as a NumPy archive and its ``features`` entry is collected as an
    image; every other file is read as text, wrapped in ``<START>``/``<END>``
    tags and whitespace-normalised.

    Args:
        data_dir: Directory path. File names are appended by string
            concatenation, so it must end with a path separator.

    Returns:
        Tuple ``(images, text)``: ``images`` is a float array stacking all
        ``features`` entries, ``text`` is the list of normalised token strings.
    """
    text = []
    images = []
    # Sorted order keeps images and token files aligned by name
    all_filenames = listdir(data_dir)
    all_filenames.sort()
    for filename in all_filenames:
        if filename.endswith("npz"):
            # Load the image already prepared as an array; the context manager
            # closes the archive so no file handle is leaked per image
            with np.load(data_dir + filename) as image:
                images.append(image['features'])
        else:
            # Load the bootstrap tokens and wrap them in a start and end tag
            syntax = '<START> ' + load_doc(data_dir + filename) + ' <END>'
            # Separate all the words with a single space
            syntax = ' '.join(syntax.split())
            # Put a space in front of each comma so it becomes its own token
            syntax = syntax.replace(',', ' ,')
            text.append(syntax)
    images = np.array(images, dtype=float)
    return images, text

#train_features, texts = load_data(dir_name)

In [7]:
# Create the tokenizer used to build the vocabulary: no character filtering,
# tokens split on a single space, case preserved
tokenizer = Tokenizer(filters='', split=" ", lower=False)
# Build the vocabulary from the contents of the bootstrap vocabulary file
tokenizer.fit_on_texts([load_doc('resources/bootstrap.vocab')])

# Add one spot for the empty word in the vocabulary
vocab_size = len(tokenizer.word_index) + 1
# Map the input sentences into the vocabulary indexes
#train_sequences = tokenizer.texts_to_sequences(texts)
# The longest set of bootstrap tokens
#max_sequence = max(len(s) for s in train_sequences)
# Specify how many tokens to have in each input sentence
max_length = 48

def preprocess_data(sequences, features):
    """Expand token sequences into next-token training samples.

    For every sequence and every position ``i >= 1`` one sample is produced:
    the input is the prefix ``seq[:i]`` padded/truncated to ``max_length``
    tokens, the target is the one-hot encoding of ``seq[i]`` over
    ``vocab_size`` classes, and the image is the feature array of the
    sequence's image.

    Args:
        sequences: Iterable of integer token sequences, one per image.
        features: Indexable collection of image features aligned with
            ``sequences``.

    Returns:
        Tuple ``(X, y, image_data)`` of arrays with one row per sample.
    """
    X, y, image_data = list(), list(), list()
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Prefix up to position i is the input, token i is the target
            in_seq, out_seq = seq[:i], seq[i]
            # Pad (or truncate, keeping the most recent tokens) to max_length.
            # The original padded to `max_sequence`, a name that is only
            # assigned in commented-out code, and then sliced to a hard-coded 48.
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # Turn the output into one-hot encoding
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Pair the sample with the image its token file belongs to
            image_data.append(features[img_no])
            # Defensive cap to the model's input length
            X.append(in_seq[-max_length:])
            y.append(out_seq)
    return np.array(X), np.array(y), np.array(image_data)

#X, y, image_data = preprocess_data(train_sequences, train_features)

In [8]:
# X, y, image_data
# X.shape = (*,48)
# y.shape = (*,18)
# image_data.shape = (*,256,256,3)
# [image_data, X], y

class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads image/token-file pairs from disk per batch.

    Every ID in ``list_IDs`` names two files inside ``dir_name``:
    ``<ID>.npz`` (an archive holding a ``features`` array) and ``<ID>.gui``
    (the token text). A batch covers ``batch_size`` IDs, and each ID is
    expanded into one sample per next-token prediction step, so the number of
    samples in a batch depends on the length of the token files.
    """

    def __init__(self, dir_name, list_IDs, max_length, tokenizer, batch_size=1, shuffle=True):
        """Store the configuration and build the first epoch's index order.

        Args:
            dir_name: Directory with the ``.npz``/``.gui`` files; file names
                are appended by string concatenation, so it must end with a
                path separator.
            list_IDs: File base names (without extension) served by this
                generator.
            max_length: Number of tokens in every input sequence.
            tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
                ``word_index``.
            batch_size: Number of IDs (files), not samples, per batch.
            shuffle: Whether to reshuffle the ID order after every epoch.
        """
        self.dir_name = dir_name
        self.list_IDs = list_IDs
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.tokenizer = tokenizer
        # Same formula as the notebook-level vocab_size (one extra slot for the
        # empty word), derived from the tokenizer this instance was given
        self.vocab_size = len(tokenizer.word_index) + 1
        # Creates self.indexes (and shuffles it when requested)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch.

        A trailing partial batch is dropped, so fewer IDs than ``batch_size``
        yields zero batches.
        """
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Build batch number ``index`` as ``([image_data, X], y)``."""
        # Positions of this batch inside the (possibly shuffled) index order
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into file base names
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Reset the index order and reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __load_data_batch(self, data_dir, list_IDs_temp):
        """Load the image features and normalised token text for the given IDs.

        Returns:
            Tuple ``(images, text)``: a float array of stacked ``features``
            entries and the list of tagged, whitespace-normalised strings.
        """
        text = []
        images = []
        for filename in list_IDs_temp:
            image_file = filename + ".npz"
            # Load the image already prepared as an array; the context manager
            # closes the archive so no file handle is leaked per sample
            with np.load(data_dir + image_file) as image:
                images.append(image['features'])

            text_file = filename + ".gui"
            # Load the bootstrap tokens and wrap them in a start and end tag
            syntax = '<START> ' + load_doc(data_dir + text_file) + ' <END>'
            # Separate all the words with a single space
            syntax = ' '.join(syntax.split())
            # Put a space in front of each comma so it becomes its own token
            syntax = syntax.replace(',', ' ,')
            text.append(syntax)
        images = np.array(images, dtype=float)
        return images, text

    def __preprocess_data(self, sequences, features):
        """Expand token sequences into next-token samples.

        For every sequence and position ``i >= 1`` the prefix ``seq[:i]``
        (padded/truncated to ``self.max_length``) becomes an input, the
        one-hot encoded ``seq[i]`` the target, and the sequence's image
        features the image input.

        Returns:
            Tuple ``(X, y, image_data)`` of arrays with one row per sample.
        """
        X, y, image_data = list(), list(), list()
        for img_no, seq in enumerate(sequences):
            for i in range(1, len(seq)):
                # Prefix up to position i is the input, token i is the target
                in_seq, out_seq = seq[:i], seq[i]
                # Pad all the input token sentences to the configured length
                in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                # Turn the output into one-hot encoding
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                # Pair the sample with the image its token file belongs to
                image_data.append(features[img_no])
                # Cap to the configured length (was a hard-coded 48)
                X.append(in_seq[-self.max_length:])
                y.append(out_seq)
        return np.array(X), np.array(y), np.array(image_data)

    def __data_generation(self, list_IDs_temp):
        """Load, tokenise and expand the files for one batch of IDs.

        Returns:
            ``([image_data, X], y)`` in the layout expected by the two-input
            model.
        """
        # Use this instance's directory and tokenizer rather than the
        # notebook globals of the same name
        train_features, texts = self.__load_data_batch(self.dir_name, list_IDs_temp)

        # Map the input sentences into the vocabulary indexes
        train_sequences = self.tokenizer.texts_to_sequences(texts)

        X, y, image_data = self.__preprocess_data(train_sequences, train_features)

        return [image_data, X], y

In [9]:
# Parameters shared by the training and validation generators
params = {'batch_size': 2,
          'max_length': max_length,
          'shuffle': True,
          'tokenizer': tokenizer}

# Datasets
# Collect the base name (extension stripped) of every .npz file, in sorted order
all_filenames = listdir(dir_name)
all_filenames.sort()
filenames = [filename[:-4] for filename in all_filenames if filename.endswith("npz")]

# 90/10 train/validation split.
# A single permutation is cut in two so that every file lands in exactly one
# partition. The previous np.random.randint draws sampled WITH replacement and
# independently per partition, which duplicated training files and let
# validation files overlap the training set.
num_files = len(filenames)
train_size = int(num_files * 0.9)
val_size = num_files - train_size
shuffled_idx = np.random.permutation(num_files)
train_idx = shuffled_idx[:train_size]
val_idx = shuffled_idx[train_size:]
train = [filenames[i] for i in train_idx]
val = [filenames[i] for i in val_idx]
partition = {}
partition['train'] = train
partition['validation'] = val

# Generators
training_generator = DataGenerator(dir_name, partition['train'], **params)
validation_generator = DataGenerator(dir_name, partition['validation'], **params)

In [10]:

############### Utility Functions to get models for transfer learning (or just other architectures) ################


# can also do fine-tuning if you identify which layer you want to freeze from
# see: https://deeplearningsandbox.com/how-to-use-transfer-learning-and-fine-tuning-in-keras-and-tensorflow-to-build-an-image-recognition-94b0b02444f2
# TA suggested using an aggressive learning rate with fine-tuning?


def get_resnetV2(visual_input, debug=False, pretrained=True, frozen=True):
    """Build a headless InceptionResNetV2 on top of ``visual_input``.

    The network is created without its classification top, for 256x256x3
    inputs, with global average pooling on the output to keep it small.

    Args:
        visual_input: Input tensor the network is attached to.
        debug: Accepted for signature parity; not used by this function.
        pretrained: Use the ImageNet weights when true, otherwise pass
            ``weights=None``.
        frozen: When true, mark every layer as non-trainable.

    Returns:
        The constructed base model.
    """
    initial_weights = 'imagenet' if pretrained else None
    base_model = InceptionResNetV2(weights=initial_weights, include_top=False,
                                   input_shape=(256, 256, 3), pooling='avg',
                                   input_tensor=visual_input)

    if not frozen:
        return base_model

    # Frozen: no layer of the base network may be trained
    for base_layer in base_model.layers:
        base_layer.trainable = False
    return base_model

def get_inceptionV3(visual_input, debug=False, pretrained=True, frozen=True, pooling_type='avg'):
    """Build a headless InceptionV3 on top of ``visual_input``.

    Args:
        visual_input: Input tensor the network is attached to.
        debug: When true, print the layer count followed by every layer's
            index and name.
        pretrained: Use the ImageNet weights when true; otherwise the network
            is created with ``weights=None`` (random initialisation).
        frozen: When true, mark every layer as non-trainable.
        pooling_type: Forwarded as the ``pooling`` argument; per the original
            note, average pooling turns an MxNxC output into 1xC.

    Returns:
        The constructed base model.
    """
    base_model = InceptionV3(weights='imagenet' if pretrained else None,
                             include_top=False,
                             input_shape=(256, 256, 3),
                             pooling=pooling_type,
                             input_tensor=visual_input)

    # Make the whole base network unable to train
    if frozen:
        for base_layer in base_model.layers:
            base_layer.trainable = False

    if debug:
        print("Number of total layers: " + str(len(base_model.layers)))
        for position, base_layer in enumerate(base_model.layers):
            print(position, base_layer.name)

    return base_model

def modify_model_for_finetuning(base_model, num_layers_to_train_cutoff=249):
    """Freeze the lower layers of ``base_model`` and unfreeze the rest, in place.

    Layers before index ``num_layers_to_train_cutoff`` become non-trainable,
    layers from that index on become trainable. According to the original
    author's note, 249 corresponds to the last two inception modules of
    InceptionV3 and 101 could be tried for the first three modules.

    Args:
        base_model: Model exposing a ``layers`` list.
        num_layers_to_train_cutoff: Index of the first trainable layer.
    """
    cutoff = num_layers_to_train_cutoff
    layer_groups = (
        (False, base_model.layers[:cutoff]),
        (True, base_model.layers[cutoff:]),
    )
    for is_trainable, group in layer_groups:
        for base_layer in group:
            base_layer.trainable = is_trainable


def image_network_from_base_model(base_model,flatten=False):
    """Put a two-layer fully connected head on a base CNN and repeat it per token.

    The base model's output (optionally flattened) goes through two
    ``Dense(1024)`` + ``Dropout(0.3)`` stages and is then repeated
    ``max_length`` times so it can be paired with every token position.
    The intermediate shapes are printed for inspection.

    Open question from the original author: 1024 may be too large for the
    image embedding; shrinking to 512/256 and expanding again was considered.

    Args:
        base_model: Model whose ``output`` tensor is used as image features.
        flatten: Flatten the base output first; needed when the base model
            does NOT end in global average pooling.

    Returns:
        The repeated image feature tensor.
    """
    features = base_model.output

    print('output shape of base network is: ')
    print(features.shape)

    if flatten:
        features = Flatten()(features)
        print('output shape of base model after flattening is: ')
        print(features.shape)

    # Two identical fully connected stages, each followed by dropout
    for _ in range(2):
        features = Dense(1024)(features)
        features = Dropout(0.3)(features)

    # TODO (original author): repeating could move out of this function
    return RepeatVector(max_length)(features)

def image_network_concat_final_output(visual_input, base_model, custom_model, flatten=False):
    """Merge the custom CNN's output with the base CNN's final output.

    ``custom_model`` is applied to ``visual_input``, optionally flattened,
    and concatenated with ``base_model.output``. The merged vector passes
    through two ``Dense(1024)`` + ``Dropout(0.3)`` stages and is repeated
    ``max_length`` times.

    Args:
        visual_input: Image input tensor.
        base_model: Base CNN whose ``output`` is used (described by the
            original author as pretrained and frozen).
        custom_model: Callable model applied to ``visual_input``.
        flatten: Flatten the custom model's output before concatenating.

    Returns:
        The repeated image feature tensor.
    """
    custom_features = custom_model(visual_input)
    if flatten:
        custom_features = Flatten()(custom_features)

    merged = concatenate([custom_features, base_model.output])

    # Two identical fully connected stages, each followed by dropout
    for _ in range(2):
        merged = Dense(1024)(merged)
        merged = Dropout(0.3)(merged)

    # TODO (original author): repeating could move out of this function
    return RepeatVector(max_length)(merged)

def image_network_concat_intermediate_features(visual_input, base_model, custom_model):
    """Merge the custom CNN's output with pooled intermediate base-model features.

    ``custom_model`` is applied to ``visual_input`` and concatenated with the
    result of ``get_intermediate_output_concat_model(base_model)``. The merged
    vector passes through two ``Dense(1024)`` + ``Dropout(0.3)`` stages and is
    repeated ``max_length`` times.

    Args:
        visual_input: Image input tensor.
        base_model: Base CNN supplying the intermediate feature maps
            (described by the original author as pretrained and frozen).
        custom_model: Callable model applied to ``visual_input``.

    Returns:
        The repeated image feature tensor.
    """
    custom_features = custom_model(visual_input)
    pooled_intermediates = get_intermediate_output_concat_model(base_model)
    merged = concatenate([custom_features, pooled_intermediates])

    # Two identical fully connected stages, each followed by dropout
    for _ in range(2):
        merged = Dense(1024)(merged)
        merged = Dropout(0.3)(merged)

    # TODO (original author): repeating could move out of this function
    return RepeatVector(max_length)(merged)
    
def get_intermediate_output_concat_model(base_model):
    """Concatenate globally average-pooled outputs of three intermediate layers.

    The outputs of the layers at indices 64, 87 and 101 of ``base_model`` are
    each reduced with global average pooling (rather than flattening, which
    the original author expected to be far too large) and concatenated. The
    shape of the first tapped output is printed before and after pooling.

    Args:
        base_model: Model supporting ``get_layer(index=...)``.

    Returns:
        The concatenated pooled feature tensor.
    """
    # Tap the intermediate layers, then pool each one
    tapped_outputs = [base_model.get_layer(index=layer_index).output
                      for layer_index in (64, 87, 101)]
    pooled_outputs = [GlobalAveragePooling2D()(feature_map)
                      for feature_map in tapped_outputs]

    print('printing the shapes of intermediate layers before pooling')
    print(tapped_outputs[0].shape)
    print('printing shape after pooling..')
    print(pooled_outputs[0].shape)

    return concatenate(pooled_outputs)


def create_custom_cnn_model():
    """Build the CNN used in the original bootstrap notebook.

    The model is a ``Sequential`` stack of seven 3x3 ReLU convolutions for
    256x256x3 inputs (filters 16, 16, 32, 32, 64, 64, 128; every second layer
    uses stride 2) followed by ``Flatten``.

    Open question from the original author: the last layer had 128 filters in
    the bootstrap code; a smaller value might reduce the parameter count,
    especially when this output is concatenated with other features.

    Returns:
        The assembled ``Sequential`` model.
    """
    image_model = Sequential()
    # Only the first layer is 'valid'-padded and declares the input shape
    image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(256, 256, 3,)))

    # (filters, downsample) for the remaining 'same'-padded layers
    remaining_layers = ((16, True), (32, False), (32, True), (64, False), (64, True), (128, False))
    for filters, downsample in remaining_layers:
        stride_args = {'strides': 2} if downsample else {}
        image_model.add(Conv2D(filters, (3, 3), activation='relu', padding='same', **stride_args))

    # Flatten the output afterwards
    image_model.add(Flatten())
    return image_model
    


In [11]:
# Create the encoder: image branch
# The pretrained, frozen InceptionV3 (average-pooled) and the custom CNN both
# read the same image input; their outputs are concatenated and repeated
# max_length times by image_network_concat_final_output.
visual_input = Input(shape=(256, 256, 3,))
base_cnn_model = get_inceptionV3(visual_input, debug=False, pretrained=True, frozen=True, pooling_type='avg')
custom_cnn_model = create_custom_cnn_model()
# modify_model_for_finetuning(base_cnn_model, num_layers_to_train_cutoff=164)
encoded_image = image_network_concat_final_output(visual_input, base_cnn_model, custom_cnn_model, flatten=False)

# Create the encoder: language branch (token embedding followed by two LSTMs
# that return the full sequence so it can be paired with the repeated image)
language_input = Input(shape=(max_length,))
language_model = Embedding(vocab_size, 50, input_length=max_length, mask_zero=True)(language_input)
language_model = LSTM(128, return_sequences=True)(language_model)
language_model = LSTM(128, return_sequences=True)(language_model)

# Create the decoder: merge both branches per token position, reduce to a
# single vector and predict a distribution over the vocabulary
decoder = concatenate([encoded_image, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = LSTM(512, return_sequences=False)(decoder)
decoder = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
# original lr is 0.0001, try 0.001 when it plateaus (10x increase)
optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 256, 256, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 127, 127, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 127, 127, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 127, 127, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (

In [12]:
# Serialize the model to JSON via model.to_json() and write it to "<model_name>.json".
# The weights are not part of this file; they are written separately by the
# ModelCheckpoint callback configured later in the notebook.
model_name = "emil-network-concat-final-layer-128"
model_json = model.to_json()
with open(model_name + ".json", "w") as json_file:
    json_file.write(model_json)

In [13]:
def word_for_id(integer, tokenizer):
    """Return the word whose vocabulary index equals ``integer``.

    Performs a linear scan over ``tokenizer.word_index`` and returns the first
    match, or ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

def generate_desc(loaded_model, tokenizer, photo, max_length):
    """Greedily generate the token string for one image.

    Starting from ``'<START> '``, the text so far is encoded, padded to
    ``max_length`` and fed to the model together with the image; the most
    probable next word is appended. Generation stops after 150 steps, when
    the predicted index has no word, or once ``<END>`` has been appended.

    Args:
        loaded_model: Model whose ``predict`` accepts ``[photo, sequence]``.
        tokenizer: Tokenizer used for encoding and for the index-to-word lookup.
        photo: Image features of a single sample (a batch axis is added here).
        max_length: Length the encoded text is padded to.

    Returns:
        The generated text, every word followed by a single space.
    """
    # Add the batch dimension expected by predict()
    photo = np.array([photo])
    # Seed the generation process
    in_text = '<START> '
    for _ in range(150):
        # Integer-encode and pad the text generated so far
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        # Predict the next word and pick the most probable index
        probabilities = loaded_model.predict([photo, padded], verbose=0)
        word = word_for_id(np.argmax(probabilities), tokenizer)
        # Stop if the index cannot be mapped to a word
        if word is None:
            break
        # The new word becomes part of the next step's input
        in_text += word + ' '
        # Stop once the end of the sequence has been predicted
        if word == '<END>':
            break
    return in_text

class BleuCallback(Callback):
    """Keras callback that prints the corpus BLEU score after every epoch.

    For each stored image a token string is generated with ``generate_desc``
    and compared with the corresponding reference text.
    """

    # Padding length used when generating descriptions
    max_length = 48

    def __init__(self, texts, train_features, tokenizer):
        """Store the evaluation data.

        Args:
            texts: Reference token strings, one per image.
            train_features: Image features aligned with ``texts``.
            tokenizer: Tokenizer used to encode/decode generated text.
        """
        # Let the Keras base class initialise its own attributes
        super(BleuCallback, self).__init__()
        self.texts = texts
        self.train_features = train_features
        self.tokenizer = tokenizer

    def evaluate_model(self, model, texts, photos, tokenizer, max_length):
        """Generate a description per image, then print and return corpus BLEU.

        Args:
            model: Model passed on to ``generate_desc``.
            texts: Reference token strings.
            photos: Image features aligned with ``texts``.
            tokenizer: Tokenizer passed on to ``generate_desc``.
            max_length: Padding length passed on to ``generate_desc``.

        Returns:
            The corpus BLEU score (the original printed it and returned None).
        """
        actual, predicted = list(), list()
        # Step over the whole set
        for i, reference_text in enumerate(texts):
            yhat = generate_desc(model, tokenizer, photos[i], max_length)
            # corpus_bleu expects a list of references per hypothesis
            actual.append([reference_text.split()])
            predicted.append(yhat.split())
        # Calculate BLEU score
        bleu = corpus_bleu(actual, predicted)
        print('\nBleu Score: {}\n'.format(bleu))
        return bleu

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate BLEU on the stored data at the end of each epoch."""
        # Use the callback's own max_length rather than the notebook global of
        # the same name; `logs` no longer uses a shared mutable default
        self.evaluate_model(self.model, self.texts, self.train_features, self.tokenizer, self.max_length)

In [16]:
# Save the model weights (save_weights_only=True) on every 2nd epoch (period=2).
# The file name embeds the epoch number, the validation loss and the training loss.
# NOTE(review): the name advertises "lr-1e-3" while the optimizer compiled in
# this notebook uses lr=0.0001 -- confirm which learning rate produced these files.
filepath="emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, period=2)
# Tensorboard
# board = TensorBoard(log_dir='./logs', histogram_freq=10, batch_size=10, write_graph=True, write_grads=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
# Bleu Score
#bleu = BleuCallback(texts, train_features, tokenizer)

In [17]:
# only used this to restore weights b/c my training crashed
# model.load_weights('emilnetwork-128-concat-InceptionV3-final-resumev2-lr-5e-4-0016--val_loss-0.0363--loss-0.0225.hdf5')

In [None]:
# Train the model
# (earlier in-memory variant, kept for reference)
#callbacks_list = [checkpoint, board, bleu]
#history = model.fit([image_data, X], y, batch_size=10, shuffle=False, validation_split=0.1, callbacks=callbacks_list, verbose=1, epochs=2)


# if this doesn't work can also try keeping more of the earlier layers fixed but training the later ones...
# 164 works with batch size 6. 101 doesn't. We can maybe try 101 with batch size 4 ?

# Number of epochs for the initial training run (base CNN frozen)
num_epochs_original = 20

# Stream batches from disk through the generators; only the checkpoint callback is active
callbacks_list = [checkpoint]
history = model.fit_generator(generator=training_generator,
                    validation_data=validation_generator, 
                    callbacks=callbacks_list,
                    use_multiprocessing=False,
                    workers=1,
                    shuffle=True,
                    verbose=1,
                    epochs=num_epochs_original)

# can we try batch_size = 4 ? Nope, doesn't work...
# what about batch_size = 3 ?

Epoch 1/20
Epoch 2/20

Epoch 00002: saving model to emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-0002--val_loss-0.0136--loss-0.0252.hdf5
Epoch 3/20
Epoch 4/20

Epoch 00004: saving model to emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-0004--val_loss-0.0119--loss-0.0180.hdf5
Epoch 5/20
Epoch 6/20

Epoch 00006: saving model to emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-0006--val_loss-0.0135--loss-0.0143.hdf5
Epoch 7/20
Epoch 8/20

Epoch 00008: saving model to emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-0008--val_loss-0.0107--loss-0.0132.hdf5
Epoch 9/20
Epoch 10/20

Epoch 00010: saving model to emilnetwork-128-concat-InceptionV3-final-resumev3-lr-1e-3-0010--val_loss-0.0116--loss-0.0121.hdf5
Epoch 11/20
Epoch 12/20

In [None]:
import matplotlib.pyplot as plt

# Overlay the per-epoch training and validation loss of the most recent fit
for metric_name in ('loss', 'val_loss'):
    plt.plot(history.history[metric_name])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['training loss', 'validation loss'], loc='upper left')
plt.show()

In [None]:
# Now, unfreeze the layers corresponding to the last two inception modules and continue training the model...
# Weights are checkpointed after every epoch (period=1).
# NOTE(review): the file name advertises "lr-5e-3" and "droput-0.5", but the
# optimizer below uses lr=0.0005 and the image heads in this notebook use
# Dropout(0.3) -- confirm before relying on the names.
filepath="fine-tuning-weights-lr-5e-3-droput-0.5-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, period=1)
callbacks_list = [checkpoint]
num_epochs_to_finetune = 20

# Freeze the lower layers of the base CNN and make the upper layers trainable
# (default cutoff of modify_model_for_finetuning)
modify_model_for_finetuning(base_cnn_model)

# use a more aggressive learning rate? original was 0.0001, increase by 5x
# I don't know...aggressive learning rate seems unstable ? Maybe just use a way smaller one ? or the same ?
# Both names are kept bound to the new optimizer, as before
fine_tuning_optimizer = optimizer = RMSprop(lr=0.0005, clipvalue=1.0)

# Load the weights produced by the transfer-learning run
model.load_weights("transferlearing-weights-epoch-0006--val_loss-0.1990--loss-0.1912.hdf5")
print('Loaded weights for transfer learning model!')


# Recompile so the new optimizer is used (presumably also required for the
# changed trainable flags to take effect -- confirm against the Keras docs)
model.compile(loss='categorical_crossentropy', optimizer=fine_tuning_optimizer)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line
model.summary()
#print(model.weights)

In [None]:
# Continue training with fine-tuning ...
# Validation loss seems to be diverging from training loss? Do we need to increase
# regularization for the fine-tuning step ...
# Could increase the dropout?
# Number of epochs for the fine-tuning run
num_epochs_finetune = 20

# Same generator-based training call as the initial run, now with the fine-tuning checkpoint
callbacks_list = [checkpoint]
history = model.fit_generator(generator=training_generator,
                    validation_data=validation_generator, 
                    callbacks=callbacks_list,
                    use_multiprocessing=False,
                    workers=1,
                    shuffle=True,
                    verbose=1,
                    epochs=num_epochs_finetune)


In [None]:
import matplotlib.pyplot as plt

# Overlay the per-epoch training and validation loss of the fine-tuning fit
for metric_name in ('loss', 'val_loss'):
    plt.plot(history.history[metric_name])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['training loss', 'validation loss'], loc='upper left')
plt.show()