In [1]:
import os
import string
import numpy as np
import cv2
from pickle import dump, load
import matplotlib.pyplot as plt
%matplotlib inline

from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.layers import Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
curr_path = os.getcwd()
descriptions_file_dir = os.path.join(curr_path, "Descriptions/Flickr8k.token.txt")
train_images_dir = os.path.join(curr_path, "Descriptions/Flickr_8k.trainImages.txt")
test_images_dir = os.path.join(curr_path, "Descriptions/Flickr_8k.testImages.txt")
dev_images_dir = os.path.join(curr_path, "Descriptions/Flickr_8k.devImages.txt")
images_dir = os.path.join(curr_path, "Flicker8k_Dataset")

save_dir = os.path.join(curr_path, "preprocessed_data")
features_dict_path = os.path.join(save_dir, 'features_dict')
descriptions_dict_path = os.path.join(save_dir, 'descriptions_dict')
descriptions_text_path = os.path.join(save_dir, 'descriptions_text')

print("Image Directory: {}\nDescriptions Directory: {}".format(images_dir, descriptions_file_dir))

Image Directory: D:\Datasets\Flickr-8k\Flicker8k_Dataset
Descriptions Directory: D:\Datasets\Flickr-8k\Descriptions/Flickr8k.token.txt


In [5]:
def load_document(doc_path):
    file = open(doc_path, 'r')
    all_text = file.read()
    file.close()
    return all_text

def load_data(file_path):
    return load(open(file_path, 'rb'))

def load_image_ids(file_path):
    data = load_document(file_path)
    img_ids = []
    for line in data.split('\n'):
        if len(line) < 2:
            continue
        img_ids.append(line.split('.')[0])
    return img_ids

def load_train_image_ids():
    return load_image_ids(train_images_dir)

def load_dev_image_ids():
    return load_image_ids(dev_images_dir)

def load_image_descriptions(desc_dict_file_path, img_ids):
    descriptions_dict = load_data(desc_dict_file_path)
    desc_dict = dict()
    for id in img_ids:
        desc_dict[id] = descriptions_dict[id]
    return desc_dict

def load_train_image_descriptions(train_image_ids):
    return load_image_descriptions(descriptions_dict_path, train_image_ids)

def load_dev_image_descriptions(dev_image_ids):
    return load_image_descriptions(descriptions_dict_path, dev_image_ids)

def load_image_features(features_file_path, img_ids):
    features_dict = load_data(features_file_path)
    feat_dict = dict()
    for id in img_ids:
        feat_dict[id] = features_dict[id]
    return feat_dict

def load_train_image_features(train_image_ids):
    return load_image_features(features_dict_path, train_image_ids)

def load_dev_image_features(dev_image_ids):
    return load_image_features(features_dict_path, dev_image_ids)

In [10]:
train_image_ids = load_train_image_ids()
train_descriptions_dict = load_train_image_descriptions(train_image_ids)
train_features_dict = load_train_image_features(train_image_ids)

dev_image_ids = load_dev_image_ids()
dev_descriptions_dict = load_dev_image_descriptions(dev_image_ids)
dev_features_dict = load_dev_image_features(dev_image_ids)

print(len(train_image_ids))
print(len(train_descriptions_dict))
print(len(train_features_dict))
print()
print(len(dev_image_ids))
print(len(dev_descriptions_dict))
print(len(dev_features_dict))

6000
6000
6000

1000
1000
1000


In [11]:
def descriptions_to_list(descriptions_dict):
    desc_list = list()
    for key in descriptions_dict.keys():
        for desc in descriptions_dict[key]:
            desc_list.append(desc)
    return desc_list

def create_tokenizer(descriptions_list):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(descriptions_list)
    
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(desc.split()) for desc in descriptions_list)
    return tokenizer, vocab_size, max_length
    
def create_sequences(tokenizer, desc_list, feature, vocab_size, max_length):
    X1, X2, Y = list(), list(), list()
    
    for desc in desc_list:
        encoded_seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(encoded_seq)):
            inp_seq = encoded_seq[:i]
            out_seq = encoded_seq[i]

            inp_seq = pad_sequences([inp_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(feature)
            X2.append(inp_seq)
            Y.append(out_seq)
                
    return np.array(X1), np.array(X2), np.array(Y)

In [12]:
train_descriptions_list = descriptions_to_list(train_descriptions_dict)
train_tokenizer, vocab_size, max_length = create_tokenizer(train_descriptions_list)
# test_tokenizer, _, _ = create_tokenizer(test_img_desc)
# X1, X2, Y = create_sequences(tokenizer, train_img_desc, train_img_features, vocab_size, max_length)

In [13]:
len(train_descriptions_list)

30000

In [17]:
print(train_tokenizer.texts_to_sequences([train_descriptions_list[0]]))
print(train_descriptions_list[0])

[[4, 1, 2, 16, 10, 8, 33, 254, 2, 15, 10, 5, 6, 43, 3, 1]]
start_seq a black dog is running after a white dog in the snow end_seq


In [19]:
print(train_tokenizer.texts_to_sequences([train_descriptions_list[10]]))
print(train_descriptions_list[10])

[[4, 1, 2, 26, 10, 5, 6, 43, 108, 174, 868, 89, 5, 76, 68, 3, 1]]
start_seq a brown dog in the snow has something hot pink in its mouth end_seq


In [24]:
encoded_seq = train_tokenizer.texts_to_sequences([train_descriptions_list[0]])[0]
for i in range(1, len(encoded_seq)):
    inp_seq = encoded_seq[:i]
    out_seq = encoded_seq[i]
    
    print(inp_seq)
    print(out_seq)
    inp_seq = pad_sequences([inp_seq], maxlen=max_length)[0]
    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
    print(inp_seq)
    print(out_seq)
    print()

#     inp_seq = pad_sequences([inp_seq], maxlen=max_length)[0]
#     out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

#     X1.append(feature)
#     X2.append(inp_seq)
#     Y.append(out_seq)

[4]
1
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4]
[0. 1. 0. ... 0. 0. 0.]

[4, 1]
2
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 1]
[0. 0. 1. ... 0. 0. 0.]

[4, 1, 2]
16
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 1 2]
[0. 0. 0. ... 0. 0. 0.]

[4, 1, 2, 16]
10
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  4  1  2 16]
[0. 0. 0. ... 0. 0. 0.]

[4, 1, 2, 16, 10]
8
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  4  1  2 16 10]
[0. 0. 0. ... 0. 0. 0.]

[4, 1, 2, 16, 10, 8]
33
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  4  1  2 16 10  8]
[0. 0. 0. ... 0. 0. 0.]

[4, 1, 2, 16, 10, 8, 33]
254
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  4  1  2 16 10  8 33]
[0. 0. 0. ... 0. 0. 0.]

[4, 1, 2, 16, 10, 8, 33, 254]
2
[  0   0 