In [1]:
# Emit a marker line at the top of the notebook output
print("start")

start


In [2]:
from os import listdir
# from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Flatten
# from keras.optimizers import RMSprop
# from keras.layers import Conv2D
# from keras.callbacks import ModelCheckpoint
# from keras.layers import Embedding, RepeatVector, LSTM, concatenate , Input, Dense
# from keras.preprocessing.image import array_to_img, img_to_array, load_img
import numpy as np




In [3]:
# Directory scanned by load_data() for the .npz feature archives and markup files
dir_name = "../resources/1190Data/"

def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as a ``str``.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

def load_data(data_dir):
    """Load image feature arrays and their markup token strings from a directory.

    Files whose name ends in "npz" are opened as NumPy archives and their
    'features' entry is collected. Every other file is read as text, wrapped
    in <START>/<END> tags, whitespace-normalised and collected as a string.

    Args:
        data_dir: Directory containing the files (a trailing path separator
            is optional).

    Returns:
        A tuple ``(images, text)``: a float NumPy array stacking all
        'features' arrays, and a list with one token string per text file.
    """
    from os.path import join, splitext

    text = []
    images = []
    # listdir() returns entries in arbitrary order. Sorting on (stem, extension)
    # keeps the i-th image aligned with the i-th token string.
    # NOTE(review): assumes an image and its markup share a base name - confirm.
    for filename in sorted(listdir(data_dir), key=splitext):
        path = join(data_dir, filename)
        if filename.endswith("npz"):
            # Image features already prepared as arrays; close the archive
            # as soon as the array has been read out of it.
            with np.load(path) as archive:
                images.append(archive['features'])
        else:
            # Wrap the bootstrap tokens in a start and an end tag
            syntax = '<START> ' + load_doc(path) + ' <END>'
            # Collapse every run of whitespace into a single space
            syntax = ' '.join(syntax.split())
            # Put a space before each comma so it becomes a token of its own
            syntax = syntax.replace(',', ' ,')
            text.append(syntax)
    images = np.array(images, dtype=float)
    return images, text

# Read every image feature array and markup string found under dir_name
train_features, texts = load_data(data_dir=dir_name)

In [4]:
# Show the image array shape next to the number of token strings
(train_features.shape, len(texts))

((595, 128, 128, 3), 595)

In [5]:
# Tokenizer that splits on single spaces only: no character filtering and
# no lower-casing, so every markup token is kept exactly as written
tokenizer = Tokenizer(lower=False, split=" ", filters='')
# Build the vocabulary from the bootstrap vocabulary file
tokenizer.fit_on_texts([load_doc('../resources/bootstrap.vocab')])

# Reserve one extra slot in the vocabulary for the empty word
vocab_size = 1 + len(tokenizer.word_index)
# Encode each markup string as a list of vocabulary indexes
train_sequences = tokenizer.texts_to_sequences(texts)
# Length of the longest encoded set of bootstrap tokens
max_sequence = max(map(len, train_sequences))
# Number of tokens fed to the model in each input sentence
max_length = 48

def preprocess_data(sequences, features, maxlen=None, num_classes=None):
    """Expand token sequences into next-token prediction samples.

    For every sequence and every position ``i >= 1`` one sample is produced:
    the prefix ``seq[:i]`` as input, the token ``seq[i]`` as one-hot target,
    and the image features belonging to that sequence.

    Args:
        sequences: List of integer token sequences, one per image.
        features: Image features, indexable in the same order as `sequences`.
        maxlen: Length of every input window. Defaults to the notebook-level
            `max_length`.
        num_classes: Width of the one-hot target. Defaults to the
            notebook-level `vocab_size`.

    Returns:
        Tuple ``(X, y, image_data)`` of float16 NumPy arrays: the padded
        input windows, the one-hot targets and the per-sample image features.
    """
    window = max_length if maxlen is None else maxlen
    classes = vocab_size if num_classes is None else num_classes

    prefixes, next_tokens, image_data = [], [], []
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Input is everything before position i, target is the token at i
            prefixes.append(seq[:i])
            next_tokens.append(seq[i])
            # The image is repeated for every sample derived from its markup
            image_data.append(features[img_no])

    # pad_sequences pads and truncates at the front by default, so each row is
    # exactly `window` long and keeps the most recent tokens. Padding to the
    # longest sequence and then slicing a literal 48 gave rows shorter than
    # the model input whenever the longest sequence was under 48 tokens.
    X = pad_sequences(prefixes, maxlen=window)
    # One batched call replaces one to_categorical call per sample
    y = to_categorical(next_tokens, num_classes=classes)
    return X.astype('float16'), y.astype('float16'), np.array(image_data, dtype='float16')

# Build the training arrays: token windows, one-hot targets, per-sample images
X, y, image_data = preprocess_data(sequences=train_sequences, features=train_features)

In [6]:
# Show the vocabulary size and the input window length
(vocab_size, max_length)

(18, 48)

In [7]:
# Show the shapes of the three training arrays
(image_data.shape, X.shape, y.shape)

((39690, 128, 128, 3), (39690, 48), (39690, 18))

In [8]:
from keras.models import Sequential, Model
from keras.layers import Conv2D, Flatten, Dense, Dropout, Input, LSTM, concatenate, RepeatVector, Embedding
from keras.optimizers import RMSprop

# # Define max_length and vocab_size
# max_length = 43
# vocab_size = 18

# Image encoder: a stack of 3x3 convolutions (three of them with stride 2),
# then two dense layers with dropout. The resulting vector is repeated
# max_length times so it can be concatenated with the per-token language
# features further down.
image_model = Sequential([
    Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(128, 128, 3,)),
    Conv2D(16, (3, 3), padding='same', activation='relu', strides=2),
    Conv2D(32, (3, 3), padding='same', activation='relu'),
    Conv2D(32, (3, 3), padding='same', activation='relu', strides=2),
    Conv2D(64, (3, 3), padding='same', activation='relu'),
    Conv2D(64, (3, 3), padding='same', activation='relu', strides=2),
    Conv2D(128, (3, 3), padding='same', activation='relu'),
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.3),
    Dense(1024, activation='relu'),
    Dropout(0.3),
    RepeatVector(max_length),
])

# Run the image input through the encoder
visual_input = Input(shape=(128, 128, 3,))
encoded_image = image_model(visual_input)

# Language branch: token embedding followed by two sequence-returning LSTMs
language_input = Input(shape=(max_length,))
language_model = Embedding(
    vocab_size,
    50,
    input_length=max_length,
    mask_zero=True,
)(language_input)
language_model = LSTM(128, return_sequences=True)(language_model)
language_model = LSTM(128, return_sequences=True)(language_model)

# Decoder: join both branches, apply two LSTMs (only the second collapses the
# sequence) and predict a softmax distribution over the vocabulary
decoder = concatenate([encoded_image, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = LSTM(512, return_sequences=False)(decoder)
decoder = Dense(vocab_size, activation='softmax')(decoder)

# Assemble and compile the two-input model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
optimizer = RMSprop(clipvalue=1.0, learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')





## From Creator

In [9]:
# Reference only: everything after this string in the cell is commented-out
# model code, so the bare string is the cell's only executable statement.
# NOTE(review): the commented-out input shapes disagree with each other
# ((1,128,128,3) on the first Conv2D vs (1,126,126,3) on the Input) - confirm
# before reviving that code.
"""FROM CREATOR"""
# #Create the encoder
# image_model = Sequential()
# image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(1,128, 128, 3,)))
# image_model.add(Conv2D(16, (3,3), activation='relu', padding='same', strides=2))
# image_model.add(Conv2D(32, (3,3), activation='relu', padding='same'))
# image_model.add(Conv2D(32, (3,3), activation='relu', padding='same', strides=2))
# image_model.add(Conv2D(64, (3,3), activation='relu', padding='same'))
# image_model.add(Conv2D(64, (3,3), activation='relu', padding='same', strides=2))
# image_model.add(Conv2D(128, (3,3), activation='relu', padding='same'))

# image_model.add(Flatten())
# image_model.add(Dense(1024, activation='relu'))
# image_model.add(Dropout(0.3))
# image_model.add(Dense(1024, activation='relu'))
# image_model.add(Dropout(0.3))

# image_model.add(RepeatVector(max_length))

# visual_input = Input(shape=(1,126, 126, 3,))
# encoded_image = image_model(visual_input)

# language_input = Input(shape=(max_length,))
# language_model = Embedding(vocab_size, 50, input_length=max_length, mask_zero=True)(language_input)
# language_model = LSTM(128, return_sequences=True)(language_model)
# language_model = LSTM(128, return_sequences=True)(language_model)

# #Create the decoder
# decoder = concatenate([encoded_image, language_model])
# decoder = LSTM(512, return_sequences=True)(decoder)
# decoder = LSTM(512, return_sequences=False)(decoder)
# decoder = Dense(vocab_size, activation='softmax')(decoder)

# # Compile the model
# model = Model(inputs=[visual_input, language_input], outputs=decoder)
# optimizer = RMSprop(learning_rate=0.0001, clipvalue=1.0)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer)

'FROM CREATOR'

In [10]:
from keras.callbacks import ModelCheckpoint
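# Save the model weights periodically.
# NOTE: an integer save_freq counts batches, not epochs; use save_freq='epoch' to checkpoint once per epoch.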
# filepath="org-weights-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, save_freq=2)
# callbacks_list = [checkpoint]


# Same checkpoint setup, writing .h5 files into weights/ (an integer save_freq counts batches, not epochs)
# filepath="weights/org-weights-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, save_freq=2)
# callbacks_list = [checkpoint]

In [11]:
# Fit the model on (image, token window) -> next token, one sample per step
# and without shuffling; 10% of the samples are held out for validation.
model.fit(
    x=[image_data, X],
    y=y,
    epochs=10,
    batch_size=1,
    validation_split=0.1,
    shuffle=False,
    verbose=1,
)
# Persist the trained model to disk
model.save("model.h5")

Epoch 1/10

 6871/35721 [====>.........................] - ETA: 5:20:50 - loss: 2.3682