In [1]:
import os
import string
import numpy as np
import cv2
from pickle import dump, load
import matplotlib.pyplot as plt
%matplotlib inline

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# All dataset locations are resolved relative to the directory the notebook
# is started from.
curr_path = os.getcwd()

# NOTE: despite the `_dir` suffix, the next four names are paths to text
# *files*, not directories. Each path component is passed to os.path.join
# separately so the platform separator is used consistently (embedding "/"
# in a component produced mixed separators on Windows).
descriptions_file_dir = os.path.join(curr_path, "Descriptions", "Flickr8k.token.txt")
train_images_dir = os.path.join(curr_path, "Descriptions", "Flickr_8k.trainImages.txt")
test_images_dir = os.path.join(curr_path, "Descriptions", "Flickr_8k.testImages.txt")
dev_images_dir = os.path.join(curr_path, "Descriptions", "Flickr_8k.devImages.txt")
images_dir = os.path.join(curr_path, "Flicker8k_Dataset")

# Pickled artefacts read back by the loader functions in this notebook.
save_dir = os.path.join(curr_path, "preprocessed_data")
features_dict_path = os.path.join(save_dir, 'features_dict')
descriptions_dict_path = os.path.join(save_dir, 'descriptions_dict')
descriptions_text_path = os.path.join(save_dir, 'descriptions_text')

print("Image Directory: {}\nDescriptions Directory: {}".format(images_dir, descriptions_file_dir))

Image Directory: D:\Datasets\Flickr-8k\Flicker8k_Dataset
Descriptions Directory: D:\Datasets\Flickr-8k\Descriptions/Flickr8k.token.txt


In [3]:
def load_document(doc_path):
    """Read a text file and return its entire contents as one string.

    Args:
        doc_path: Path of the text file to read.

    Returns:
        The full file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(doc_path, 'r') as file:
        return file.read()

def load_data(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Args:
        file_path: Path of a file written with ``pickle.dump``.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code; only use this on files
    # produced by this project's own preprocessing step.
    # The context manager closes the handle, which the bare open() did not.
    with open(file_path, 'rb') as file:
        return load(file)

def load_image_ids(file_path):
    """Return the image ids listed in a split file, in file order.

    The file is read line by line; lines shorter than two characters are
    skipped, and for every remaining line the text before the first '.' is
    kept as the id.

    Args:
        file_path: Path of the text file listing one entry per line.

    Returns:
        A list of id strings.
    """
    return [
        row.split('.')[0]
        for row in load_document(file_path).split('\n')
        if len(row) >= 2
    ]

def load_train_image_ids():
    """Return the image ids read from the training split file."""
    train_ids = load_image_ids(train_images_dir)
    return train_ids

def load_dev_image_ids():
    """Return the image ids read from the dev split file."""
    dev_ids = load_image_ids(dev_images_dir)
    return dev_ids

def load_image_descriptions(desc_dict_file_path, img_ids):
    """Load the pickled descriptions mapping and keep only the given ids.

    Args:
        desc_dict_file_path: Path of the pickled mapping, indexed by image id.
        img_ids: Iterable of image ids to keep.

    Returns:
        A new dict with one entry per id in ``img_ids``.

    Raises:
        KeyError: If an id is not present in the pickled mapping.
    """
    all_descriptions = load_data(desc_dict_file_path)
    return {img_id: all_descriptions[img_id] for img_id in img_ids}

def load_train_image_descriptions(train_image_ids):
    """Return the stored descriptions restricted to ``train_image_ids``."""
    train_descriptions = load_image_descriptions(descriptions_dict_path, train_image_ids)
    return train_descriptions

def load_dev_image_descriptions(dev_image_ids):
    """Return the stored descriptions restricted to ``dev_image_ids``."""
    dev_descriptions = load_image_descriptions(descriptions_dict_path, dev_image_ids)
    return dev_descriptions

def load_image_features(features_file_path, img_ids):
    """Load the pickled features mapping and keep only the given ids.

    Args:
        features_file_path: Path of the pickled mapping, indexed by image id.
        img_ids: Iterable of image ids to keep.

    Returns:
        A new dict with one entry per id in ``img_ids``.

    Raises:
        KeyError: If an id is not present in the pickled mapping.
    """
    all_features = load_data(features_file_path)
    return {img_id: all_features[img_id] for img_id in img_ids}

def load_train_image_features(train_image_ids):
    """Return the stored image features restricted to ``train_image_ids``."""
    train_features = load_image_features(features_dict_path, train_image_ids)
    return train_features

def load_dev_image_features(dev_image_ids):
    """Return the stored image features restricted to ``dev_image_ids``."""
    dev_features = load_image_features(features_dict_path, dev_image_ids)
    return dev_features

In [4]:
# Training split: ids, then the descriptions and features for those ids.
train_image_ids = load_train_image_ids()
train_descriptions_dict = load_train_image_descriptions(train_image_ids)
train_features_dict = load_train_image_features(train_image_ids)

# Dev split, loaded the same way.
dev_image_ids = load_dev_image_ids()
dev_descriptions_dict = load_dev_image_descriptions(dev_image_ids)
dev_features_dict = load_dev_image_features(dev_image_ids)

# Sanity check: within each split the three sizes should agree.
print(*map(len, (train_image_ids, train_descriptions_dict, train_features_dict)), sep="\n")
print()
print(*map(len, (dev_image_ids, dev_descriptions_dict, dev_features_dict)), sep="\n")

6000
6000
6000

1000
1000
1000


In [5]:
def descriptions_to_list(descriptions_dict):
    """Flatten a mapping of description collections into a single list.

    Args:
        descriptions_dict: Mapping whose values are iterables of descriptions.

    Returns:
        A list with every description, in mapping order and then in the
        order given within each value.
    """
    return [
        caption
        for captions in descriptions_dict.values()
        for caption in captions
    ]

def create_tokenizer(descriptions_list):
    """Fit a Keras ``Tokenizer`` on the descriptions and report size statistics.

    Args:
        descriptions_list: List of description strings.

    Returns:
        A tuple ``(tokenizer, vocab_size, max_length)`` where ``vocab_size`` is
        ``len(tokenizer.word_index) + 1`` and ``max_length`` is the largest
        whitespace-separated word count of any description.

    Raises:
        ValueError: If ``descriptions_list`` is empty (``max`` of nothing).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(descriptions_list)

    word_counts = map(lambda text: len(text.split()), descriptions_list)
    return fitted, len(fitted.word_index) + 1, max(word_counts)

In [6]:
# Flatten the training descriptions and fit the tokenizer on them.
# `vocab_size` and `max_length` size the network inputs/outputs further down.
train_descriptions_list = descriptions_to_list(train_descriptions_dict)
train_tokenizer, vocab_size, max_length = create_tokenizer(train_descriptions_list)
# NOTE(review): the two lines below are disabled leftovers; they reference
# names (`test_img_desc`, `tokenizer`, `train_img_desc`, `train_img_features`)
# that are not defined anywhere in the visible notebook.
# test_tokenizer, _, _ = create_tokenizer(test_img_desc)
# X1, X2, Y = create_sequences(tokenizer, train_img_desc, train_img_features, vocab_size, max_length)

In [7]:
def model(max_length, vocab_size, feature_dim=1024):
    """Build and compile the two-input captioning network.

    One branch takes an image feature vector (dropout, then a 256-unit dense
    layer); the other takes a padded word-index sequence (embedding, dropout,
    256-unit LSTM). The branches are summed, passed through a dense layer and
    a softmax over the vocabulary.

    Args:
        max_length: Length of the padded input word sequences.
        vocab_size: Size of the embedding input and of the softmax output.
        feature_dim: Length of the image feature vector. Defaults to 1024,
            the value that was previously hard-coded.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy loss, Adam).
    """
    # Image-feature branch.
    image_input = Input(shape=(feature_dim,))
    image_branch = Dropout(0.25)(image_input)
    image_branch = Dense(units=256, activation='relu')(image_branch)

    # Word-sequence branch; mask_zero=True so index 0 is treated as padding.
    sequence_input = Input(shape=(max_length,))
    sequence_branch = Embedding(vocab_size, 256, mask_zero=True)(sequence_input)
    sequence_branch = Dropout(0.5)(sequence_branch)
    sequence_branch = LSTM(256)(sequence_branch)

    # Merge both 256-d representations and predict the next word.
    merged = add([image_branch, sequence_branch])
    merged = Dense(256, activation='relu')(merged)
    output = Dense(units=vocab_size, activation='softmax')(merged)

    # Named `caption_model` rather than `model` so the local does not shadow
    # this function's own name.
    caption_model = Model(inputs=[image_input, sequence_input], outputs=output)
    caption_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line.
    caption_model.summary()

    return caption_model

In [8]:
# Build and compile the captioning network.
# NOTE(review): this rebinds the name `model` from the builder function to the
# object it returns, so running this cell a second time without first
# re-running the cell that defines `model()` fails. The training cell relies
# on `model` being the built network.
model = model(max_length, vocab_size)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 36)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1024)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 36, 256)      1862656     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1024)         0           input_1[0][0]                    
__________________________________________________________________________________________________
dropout_2 

In [9]:
def create_sequences(tokenizer, descriptions, features, vocab_size, max_length):
    """Expand captions into (image feature, word prefix, next word) samples.

    Every caption is encoded with ``tokenizer``; for an encoded caption
    ``w0 .. wn`` one sample is produced per position ``i >= 1``: the prefix
    ``w0 .. w(i-1)`` as input and ``wi`` as target.

    Args:
        tokenizer: Object exposing ``texts_to_sequences(list_of_str)``.
        descriptions: Sequence where item ``j`` is the list of captions of
            image ``j``.
        features: Sequence where item ``j`` is the feature vector of image
            ``j`` (same order as ``descriptions``).
        vocab_size: Width of the one-hot target rows.
        max_length: Width of the prefix rows. Prefixes are left-padded with
            zeros and, if longer than ``max_length``, keep their last
            ``max_length`` entries.

    Returns:
        ``(X1, X2, Y)`` float64 arrays with one row per sample: image
        features, padded prefixes and one-hot targets. With no samples the
        arrays have zero rows.

    Raises:
        IndexError: If ``features`` has no entry for an image, or a word index
            is outside ``vocab_size``.
    """
    # Pass 1: encode every caption once so the exact row count is known
    # before anything is allocated. Captions with fewer than two tokens
    # contribute no samples.
    encoded = []
    for j, desc_list in enumerate(descriptions):
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            if len(seq) > 1:
                encoded.append((j, seq))
    n_rows = sum(len(seq) - 1 for _, seq in encoded)

    # Feature width comes from the data; 1024 is kept only as the fallback
    # for an empty batch, matching the width that used to be hard-coded.
    feat_dim = np.asarray(features[0]).size if len(features) else 1024

    # Allocate once instead of np.vstack per sample, which recopied every
    # accumulated array on each appended row (quadratic time and memory churn).
    X1 = np.zeros((n_rows, feat_dim))
    X2 = np.zeros((n_rows, max_length))
    Y = np.zeros((n_rows, vocab_size))

    # Pass 2: fill the rows in place.
    row = 0
    for j, seq in encoded:
        feat = np.ravel(features[j])
        for i in range(1, len(seq)):
            # Last `max_length` tokens of the prefix, right-aligned
            # (zero padding on the left).
            prefix = seq[max(0, i - max_length):i]
            X1[row] = feat
            X2[row, max_length - len(prefix):] = prefix
            Y[row, seq[i]] = 1.0
            row += 1

    return X1, X2, Y

In [10]:
def data_generator(tokenizer, train_descriptions_dict, train_features_dict, vocab_size, max_length, steps_per_epoch):
    """Endlessly yield training batches built from groups of images.

    Images are taken in dictionary order, ``batch_size`` at a time, where
    ``batch_size = len(train_descriptions_dict) // steps_per_epoch`` (at
    least 1). Each group is expanded with ``create_sequences`` and yielded;
    after the last image the iteration restarts from the first. Images left
    over at the end of a pass are carried into the first batch of the next.

    Args:
        tokenizer: Tokenizer forwarded to ``create_sequences``.
        train_descriptions_dict: Mapping of image id to its captions.
        train_features_dict: Mapping of image id to its stored features;
            element ``[0]`` of each value is used as the feature vector.
        vocab_size: Forwarded to ``create_sequences``.
        max_length: Forwarded to ``create_sequences``.
        steps_per_epoch: Number of batches that should make up one pass.

    Yields:
        ``([input1, input2], output)`` as returned by ``create_sequences``.

    Raises:
        ZeroDivisionError: If ``steps_per_epoch`` is 0.
    """
    # Clamp to 1: with steps_per_epoch > number of images the floor division
    # gives 0 and no batch would ever be emitted.
    batch_size = max(1, len(train_descriptions_dict) // steps_per_epoch)

    # Nothing to iterate over: stop instead of spinning forever in the loop.
    if not train_descriptions_dict:
        return

    features = []
    descriptions = []
    while True:
        for key, desc_list in train_descriptions_dict.items():
            features.append(train_features_dict[key][0])
            descriptions.append(desc_list)

            # Use the buffer length as the counter. The previous separate
            # counter was incremented right after being reset, so every batch
            # after the first held batch_size - 1 images, and with
            # batch_size == 1 the generator never yielded again while the
            # buffers grew without bound.
            if len(features) == batch_size:
                input1, input2, output = create_sequences(tokenizer, descriptions, features, vocab_size, max_length)
                features = []
                descriptions = []
                yield([input1, input2], output)

In [11]:
# Smoke-test generator. `data_generator` computes its group size as
# len(train_descriptions_dict) // steps_per_epoch, so with the 6000 training
# images reported above this asks for one image per batch.
steps_per_epoch = 6000
generator = data_generator(train_tokenizer, train_descriptions_dict, train_features_dict, vocab_size, max_length, steps_per_epoch)

In [12]:
# Pull one batch and show the shape of each array: image features,
# padded word prefixes, one-hot targets.
inputs, outputs = next(generator)
for batch_array in (inputs[0], inputs[1], outputs):
    print(batch_array.shape)

(60, 1024)
(60, 36)
(60, 7276)


In [14]:
# NOTE(review): checkpointing is currently disabled. The commented-out callback
# monitors `val_loss`; no validation data is passed to the training call below,
# so it would presumably need that wired in before being re-enabled — confirm.
# filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# `batch_size` here counts images per step, not training rows: the generator
# derives its group size back from len(train_descriptions_dict) // steps_per_epoch.
batch_size = 2
steps_per_epoch = len(train_descriptions_dict) // batch_size

# A fresh generator is created so training starts from the first image rather
# than continuing the one used for the shape check above.
generator = data_generator(train_tokenizer, train_descriptions_dict, train_features_dict, vocab_size, max_length, steps_per_epoch)
# NOTE(review): the recorded run of this cell ended in MemoryError during epoch 1.
model.fit_generator(generator, epochs=10, verbose=1, steps_per_epoch=steps_per_epoch)

Epoch 1/10

MemoryError: 