Deep Learning Personal Project: Guide Bot

Goal of this project: to build "Guide Bot", which can be connected to a camera (possibly a cell phone's) to interpret the surroundings (the camera input) and describe them aloud using text-to-speech conversion. It could serve as an aid for the blind.

About the original dataset:

Name: Flickr 30k Data
One folder of 30k images and a csv file of corresponding captions of the images
Acknowledgement -- this dataset is taken from University of Illinois at Urbana-Champaign Department of Computer Science (https://forms.illinois.edu/sec/229675)

Acknowledgement for Pretrained Model Used
1) Inception V3 (from keras.applications)
2) Glove: Global Vectors for Word Representation
    by Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014.
    source from https://nlp.stanford.edu/projects/glove/

Table of Contents:

In [None]:
# Make necessary folders for the data.
# Each folder is created only when it is missing, so the cell can be re-run
# without os.mkdir raising on an already-existing directory.
import os

data_dir = './data_folder'
train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')
test_dir = os.path.join(data_dir, 'test')

for folder in (data_dir, train_dir, validation_dir, test_dir):
    if not os.path.isdir(folder):
        os.makedirs(folder)

In [None]:
import shutil

# Copy the first 16000 images returned by os.listdir into the three splits:
# 12000 for training, the next 2000 for validation and the last 2000 for test.
# The name of every copied image is also recorded in the matching set.
original_data_dir = './flickr30k_images/flickr30k_images'
count = 0
all_imgs = set()
train_imgs = set()
val_imgs = set()
test_imgs = set()
for img in os.listdir(original_data_dir):
    if count == 16000:
        break
    src = os.path.join(original_data_dir, img)
    # Skip hidden entries (e.g. ".DS_Store") and sub-directories so that only
    # regular files are copied and counted towards the split sizes.
    if img.startswith('.') or not os.path.isfile(src):
        continue
    # The running count decides which split the image goes to.
    if count < 12000:
        dst_dir, split_imgs = train_dir, train_imgs
    elif count < 14000:
        dst_dir, split_imgs = validation_dir, val_imgs
    else:
        dst_dir, split_imgs = test_dir, test_imgs
    shutil.copyfile(src, os.path.join(dst_dir, img))
    all_imgs.add(img)
    split_imgs.add(img)
    count += 1

In [183]:
print('Trainset images : ', len(os.listdir(train_dir)))
print('Validationset images : ', len(os.listdir(validation_dir)))
print('Testset images : ', len(os.listdir(test_dir)))
# os.listdir also counts non-image entries such as ".DS_Store", so the folder counts
# can be slightly larger than the 12000/2000/2000 images that were copied
print(len(all_imgs))
print(len(train_imgs))
print(len(val_imgs))
print(len(test_imgs))

('Trainset images : ', 12002)
('Validationset images : ', 2002)
('Testset images : ', 2001)
16000
12000
2000
2000


In [None]:
# Read the caption csv and split every row into its "|"-separated fields
csv_path = './flickr30k_images/results.csv'
lines = []
with open(csv_path, "r") as handle:
    for row in handle:
        lines.append(row.split("|"))

In [None]:
# Convert csv into txt with only those that matter (that is, 16k imgs & 80k captions out of 30k & 150k)
# Every kept row is written as "<image_name>#<caption_number> <caption>".
# The with-block guarantees the file is flushed and closed, so no buffered
# captions are lost when the file is read back afterwards.
with open('./data_folder/all_captions.txt', "w") as txt_file:
    for line in lines:
        # Rows without all three "|"-separated fields cannot be formatted; skip them.
        if len(line) < 3:
            continue
        img_name = line[0].strip()
        if img_name in all_imgs:
            txt_file.write(img_name + "#" + str(line[1]).strip() + " " + line[2])

In [184]:
# Count the lines (one caption per line) stored in all_captions.txt.
# The with-block closes the file handle once counting is done.
with open('./data_folder/all_captions.txt', "r") as captions_file:
    count = sum(1 for _ in captions_file)
print(count)
# 16k images with 80k captions were expected; 286 captions are missing, but will ignore them and proceed to next steps

79714


In [185]:
# Create a dictionary "imgs_and_captions" where key : name_of_images and value : corresponding_captions
# The first space-separated field of a line is the image file name followed by
# "#<caption_number>"; everything after it is the caption.
with open('./data_folder/all_captions.txt', "r") as captions_file:
    all_captions = captions_file.read()
imgs_and_captions = dict()
for line in all_captions.split('\n'):
    # Skip blank lines (such as the one after the final newline) so that no
    # empty image name ends up as a key of the dictionary.
    if not line.strip():
        continue
    fields = line.split(' ')
    # Keep only the part of the file name before the first "."
    img_name = fields[0].split('.')[0]
    corresponding_caption = ' '.join(fields[1:])
    imgs_and_captions.setdefault(img_name, []).append(corresponding_caption)

In [186]:
print(len(imgs_and_captions))

16000


In [187]:
# list(...) instead of .keys()[...] so the lookup also works where keys() is a non-indexable view (Python 3)
print(list(imgs_and_captions)[12345])

2814037463


In [188]:
print(imgs_and_captions['2814037463'])
print("number of captions: " + str(len(imgs_and_captions['2814037463'])))

[' On the left , a hand points to a surprised looking woman with short hair sitting at a table in front of a window,,,,,,,,\r', ' A woman sits at a table near a window , with a plate and glass in front of her .,,,,,,,,\r', ' A woman wearing a floral dress sits as a finger is pointed at her .,,,,,,,,,\r', ' A woman sitting down to have a meal is being pointed at .,,,,,,,,,\r', ' A woman has a finger pointing in her face .,,,,,,,,,\r']
number of captions: 5


In [189]:
# list(...) instead of .keys()[...] so the lookup also works where keys() is a non-indexable view (Python 3)
print(list(imgs_and_captions)[36])

3030015033


In [190]:
print(imgs_and_captions['2860314714'])
print("number of captions: " + str(len(imgs_and_captions['2860314714'])))

[' This gentleman is standing outside trying to make something he has his tools on this makeshift table and a couple of men are watching him .,,,,,,,,,\r', ' A man is doing some work outdoors that requires some tools , like a mallet and chisel , while another man looks on smiling .,,,,,,,\r', ' A man in a red pullover is creating a work of art on a cement block in the city square while onlookers gleefully watch .,,,,,,,,,\r', ' A man in a red shirt works on a white sculpture .,,,,,,,,,\r', ' An artist crating some stone art for tourist .,,,,,,,,,\r']
number of captions: 5


In [191]:
# Clean each caption in the dictionary "imgs_and_captions"
import string

# Characters removed from every word. Filtering character by character works on
# both Python 2 and Python 3, unlike str.translate(None, ...), which only
# exists for Python 2 byte strings.
punctuation_chars = frozenset(string.punctuation)

for key, captions in imgs_and_captions.items():
    for i, caption in enumerate(captions):
        cleaned = []
        for word in caption.split():  # split into words
            word = ''.join(ch for ch in word if ch not in punctuation_chars)  # remove punctuation
            if word.isalpha():  # drop empty and non-alphabetic tokens
                cleaned.append(word.lower())  # convert to lower-case
        # NOTE: stop words are not removed (the NLTK stop-word filter was disabled).
        captions[i] = ' '.join(cleaned)

In [192]:
print(imgs_and_captions['2814037463'])
print("number of captions: " + str(len(imgs_and_captions['2814037463'])))

['on the left a hand points to a surprised looking woman with short hair sitting at a table in front of a window', 'a woman sits at a table near a window with a plate and glass in front of her', 'a woman wearing a floral dress sits as a finger is pointed at her', 'a woman sitting down to have a meal is being pointed at', 'a woman has a finger pointing in her face']
number of captions: 5


In [193]:
print(imgs_and_captions['2860314714'])
print("number of captions: " + str(len(imgs_and_captions['2860314714'])))

['this gentleman is standing outside trying to make something he has his tools on this makeshift table and a couple of men are watching him', 'a man is doing some work outdoors that requires some tools like a mallet and chisel while another man looks on smiling', 'a man in a red pullover is creating a work of art on a cement block in the city square while onlookers gleefully watch', 'a man in a red shirt works on a white sculpture', 'an artist crating some stone art for tourist']
number of captions: 5


In [194]:
print(imgs_and_captions['2514612680'])
print("number of captions: " + str(len(imgs_and_captions['2514612680'])))

['people walking through a short tunnel with where are you written on the wall', 'a man walks under a bridge and reads grafitti that reads where are you', 'a man with a cigarette walks past graffitti which says where are you', 'a man is walking next to a wall with where are you painted on it', 'a man walking reads a wall asking where are you']
number of captions: 5


In [195]:
# Dump the cleaned captions to disk, one "<image_name> <caption>" pair per line
with open('./data_folder/cleaned_captions.txt', "w") as txt_file:
    for img, captions in imgs_and_captions.items():
        txt_file.writelines(img + " " + caption + '\n' for caption in captions)

In [198]:
print(len(all_imgs))
print(len(train_imgs))
print(len(val_imgs))
print(len(test_imgs))

16000
12000
2000
2000


In [199]:
# Names of the training images with everything from the first "." onwards cut off
train_img_names = [img.split('.')[0] for img in train_imgs]
print(len(train_img_names))

12000


In [200]:
# Create captions for the training set
# Dictionary "training_captions" where key : name of image (without jpg extension) value : corresponding captions
# Every caption is wrapped in the 'startseq' / 'endseq' marker tokens.
with open('./data_folder/cleaned_captions.txt', "r") as cleaned_file:
    cleaned_captions = cleaned_file.read()
# Set copy of the training names: membership is tested once per caption line,
# and scanning the whole list every time is needlessly quadratic.
train_name_lookup = set(train_img_names)
training_captions = dict()
for line in cleaned_captions.split('\n'):
    tokens = line.split()
    # A usable line has an image name plus at least one caption word
    if len(tokens) < 2:
        continue
    img, caption = tokens[0], tokens[1:]
    if img in train_name_lookup:
        words = 'startseq ' + ' '.join(caption) + ' endseq'
        training_captions.setdefault(img, []).append(words)

In [201]:
print(len(training_captions))
# list(...) instead of .keys()[...] so the lookup also works where keys() is a non-indexable view (Python 3)
print(list(training_captions)[36])
print(training_captions['3178005751'])

12000
3178005751
['startseq a man and woman stand next two each other in front of a wood fence endseq', 'startseq two people posing for the picture with a few trees in the background endseq', 'startseq a boy and girl both in jeans are standing in front of a fence endseq', 'startseq a girl in gray and a guy in stripes pose against a fence endseq', 'startseq smiling woman with man in striped sweatshir in the park endseq']


In [202]:
# list(...) instead of .keys()[...] so the lookup also works where keys() is a non-indexable view (Python 3)
print(list(training_captions)[6789])
print(training_captions['12974441'])

12974441
['startseq young man with glasses and two small pigtails on front of his hair with bloody face appears to be in emergency room waiting area endseq', 'startseq an injured bloody person sitting with friends in a hospital waiting room endseq', 'startseq a gentleman in a waiting room of a hospital with burns to his face endseq', 'startseq a man who has been beaten up or has a bad rash on his face endseq', 'startseq person with wounds on their face in a waiting room endseq']


In [None]:
# TODO: decide whether captions should also be created for the validation set

In [None]:
# Load the Inception V3 model with ImageNet weights
# (its last layer is removed in the following cell, where the truncated model is built)
from keras.applications.inception_v3 import InceptionV3
model = InceptionV3(weights='imagenet')

In [None]:
from keras.models import Model
# New model with the same input as Inception V3, but whose output is the
# output of the second-to-last layer instead of the final layer
model_v3_without_output_layer = Model(model.input, model.layers[-2].output)

In [None]:
# Reference from: https://github.com/hlamba28/Automatic-Image-Captioning
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing import image
import numpy as np

def preprocess(image_path):
    """Load the image at `image_path` and return it as a model-ready batch of one.

    The image is resized to 299x299 (the size expected by Inception V3),
    converted to an array, given a leading batch axis and finally passed
    through the inception_v3 `preprocess_input` function.
    """
    pil_image = image.load_img(image_path, target_size=(299, 299))
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    return preprocess_input(batch)

# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Return the feature vector of the image stored at the path `image`.

    The image is preprocessed, run through `model_v3_without_output_layer`,
    and the leading batch axis of the prediction is dropped.
    """
    prediction = model_v3_without_output_layer.predict(preprocess(image))
    # Flatten the single-row prediction: (1, 2048) -> (2048, )
    return np.reshape(prediction, prediction.shape[1])

In [None]:
# Automated feature engineering
# "not to classify the image but just get fixed-length informative vector for each image"
# Dictionary "train_imgs_encoded" where key : img_name value : feature_vector_of_size_(2048, )
train_imgs_encoded = dict()
for img in os.listdir(train_dir):
    img_path = os.path.join(train_dir, img)
    # Only regular, non-hidden files are treated as images: this skips
    # ".DS_Store" and also sub-folders of train_dir (such as the "Pickle"
    # folder the features are saved into), which encode() cannot open.
    if img.startswith('.') or not os.path.isfile(img_path):
        continue
    img_name = img.split('.')[0]
    train_imgs_encoded[img_name] = encode(img_path)

In [None]:
# Save the bottleneck train images features
import pickle
pickle_dir_for_training = os.path.join(train_dir, 'Pickle')
# Create the folder only when it is missing so that re-running the cell does not fail
if not os.path.isdir(pickle_dir_for_training):
    os.makedirs(pickle_dir_for_training)
path_for_training = os.path.join(pickle_dir_for_training, 'train_imgs_encoded.pkl')
with open(path_for_training, 'wb') as encoded_pickle:
    pickle.dump(train_imgs_encoded, encoded_pickle)

In [None]:
# Automated feature engineering
# "not to classify the image but just get fixed-length informative vector for each image"
# Dictionary "validation_imgs_encoded" where key : img_name value : feature_vector_of_size_(2048, )
validation_imgs_encoded = dict()
# Collect the image files first: hidden entries (e.g. ".DS_Store") and
# sub-folders are left out, and the progress message can show the real total
# instead of a hard-coded 2000.
validation_img_files = [
    img for img in os.listdir(validation_dir)
    if not img.startswith('.') and os.path.isfile(os.path.join(validation_dir, img))
]
total = len(validation_img_files)
for count, img in enumerate(validation_img_files, 1):
    img_name = img.split('.')[0]
    img_path = os.path.join(validation_dir, img)
    print("" + str(count) + " / " + str(total) + " encoding...")
    validation_imgs_encoded[img_name] = encode(img_path)

In [None]:
# Save the bottleneck validation images features
import pickle
pickle_dir_for_validation = os.path.join(validation_dir, 'Pickle')
# Create the folder only when it is missing so that re-running the cell does not fail
if not os.path.isdir(pickle_dir_for_validation):
    os.makedirs(pickle_dir_for_validation)
path_for_validation = os.path.join(pickle_dir_for_validation, 'validation_imgs_encoded.pkl')
with open(path_for_validation, 'wb') as encoded_pickle:
    pickle.dump(validation_imgs_encoded, encoded_pickle)

In [None]:
# Reload the saved bottleneck features from disk.
# Both paths are spelled out in full: the pickle *file* has to be opened, not
# the "Pickle" folder that contains it.
path_for_training = os.path.join(train_dir, 'Pickle', 'train_imgs_encoded.pkl')
path_for_validation = os.path.join(validation_dir, 'Pickle', 'validation_imgs_encoded.pkl')
with open(path_for_training, 'rb') as encoded_pickle:
    train_img_features = pickle.load(encoded_pickle)
with open(path_for_validation, 'rb') as encoded_pickle:
    validation_img_features = pickle.load(encoded_pickle)
print('Train img features = %d' % len(train_img_features))
print('Validation img features = %d' % len(validation_img_features))

In [203]:
# Create a training vocabulary with only words that occur at least 10 times
from collections import Counter

# Count every token of every training caption. Counter replaces the manual
# "insert or increment" bookkeeping and no longer overwrites the global `count`.
word_freq = Counter(
    token
    for list_of_cap in training_captions.values()
    for caption in list_of_cap
    for token in caption.split()
)
training_vocabulary = [word for word in word_freq if word_freq[word] >= 10]
print('total number of words : %d & number of vocabs interested : %d' % (len(word_freq), len(training_vocabulary)))

total number of words : 12604 & number of vocabs interested : 3093


In [204]:
# Build the two lookup tables used to convert between words and indices.
# Numbering starts at 1, so index 0 is never assigned to a word.
index_to_word = dict()
word_to_index = dict()
for index, word in enumerate(training_vocabulary, 1):
    word_to_index[word] = index
    index_to_word[index] = word

In [205]:
print(len(training_vocabulary) == len(index_to_word))
print(len(training_vocabulary) == len(word_to_index))

True
True


In [206]:
# Find the length, in words, of the longest training caption (0 if there are none)
caption_lengths = [len(caption.split())
                   for captions in training_captions.values()
                   for caption in captions]
max_len_of_all_cap = max([0] + caption_lengths)
print("Max length of all captions : %d" % max_len_of_all_cap)

Max length of all captions : 80


In [207]:
# Reference from: https://github.com/hlamba28/Automatic-Image-Captioning

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def data_generator(train_captions, train_features, wordtoix, max_length, vocab_size, num_imgs_per_batch):
    """Yield training batches forever, one batch per `num_imgs_per_batch` images.

    Every caption of an image is mapped to word indices through `wordtoix`
    (words missing from it are dropped) and expanded into "prefix -> next word"
    samples: the prefix is padded to `max_length` with `pad_sequences` and the
    next word is one-hot encoded over `vocab_size` classes.

    Yields:
        [[image_features, padded_prefixes], one_hot_next_words], each built
        with np.asarray from the samples gathered since the previous yield.
    """
    img_feats, prefixes, targets = [], [], []
    imgs_in_batch = 0
    # Cycle over the images endlessly; the consumer decides when to stop.
    while True:
        for img, captions in train_captions.items():
            imgs_in_batch += 1
            feature = train_features[img]
            for caption in captions:
                # Caption as a list of word indices
                seq = [wordtoix[word] for word in caption.split(' ') if word in wordtoix]
                # One sample per position: the words before it predict the word at it
                for split_at in range(1, len(seq)):
                    prefix = pad_sequences([seq[:split_at]], maxlen=max_length)[0]
                    target = to_categorical([seq[split_at]], num_classes=vocab_size)[0]
                    img_feats.append(feature)
                    prefixes.append(prefix)
                    targets.append(target)
            # Hand out the batch once enough images were processed, then start over
            if imgs_in_batch == num_imgs_per_batch:
                yield [[np.asarray(img_feats), np.asarray(prefixes)], np.asarray(targets)]
                img_feats, prefixes, targets = [], [], []
                imgs_in_batch = 0

In [None]:
# Load the GloVe txt file into the dictionary "word_to_embedding_vectors"
# (key : word, value : embedding_vector as a float32 array)
import io
glove_dir = './data_folder/glove'

word_to_embedding_vectors = dict()

with io.open(os.path.join(glove_dir, 'glove.6B.200d.txt'), mode="r", encoding="utf-8") as glove_file:
    for line in glove_file:
        # First field is the word, the remaining fields are the vector components
        tokens = line.split()
        word_to_embedding_vectors[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

Source: https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html
Higher-dimensional embeddings can more accurately represent the relationships between input values,
but more dimensions increase the chance of overfitting and lead to slower training.
Empirical rule of thumb (a good starting point, but it should be tuned using the validation data): embedding_dimensions = number_of_categories**0.25

However, since the GloVe embedding vectors loaded above are 200-dimensional, I'll stick to 200 for this project.

In [208]:
print("dimension of embedding_vector: " + str(len(word_to_embedding_vectors['a'])))

dimension of embedding_vector: 200


In [209]:
# Create embedding_matrix where row i holds the embedding vector of the word with index i
# next(iter(...)) reads the dimension from one stored vector and, unlike
# .values()[0], also works where values() is a non-indexable view (Python 3)
embedding_dim = len(next(iter(word_to_embedding_vectors.values())))
# One extra row because word indices come from word_to_index and row 0 is kept as well
vocabulary_size = len(training_vocabulary) + 1
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))

# For words in our training vocabulary, extract embedding_vectors from glove if exist
# (rows of words without a glove vector stay all-zero)
for word, index in word_to_index.items():
    if word in word_to_embedding_vectors:
        embedding_matrix[index] = word_to_embedding_vectors[word]

Since the order of words in a caption is not that important for interpreting its meaning, I'm going to use Conv1D instead of RNN/LSTM/GRU. If order were important, as it usually is in problems involving time-series data, I would have used an RNN/LSTM. But in this case, which involves text data, Conv1D layers can be used because they are lightweight while giving almost the same performance.

In [None]:
# Build a model
from keras import Input, layers, Model

# Image branch: 2048-d image feature vector -> 256-unit dense layer -> dropout
input1 = Input(shape=(2048,))
x1 = layers.Dense(256, activation='relu')(input1)
x1 = layers.Dropout(0.5)(x1)

# Caption branch: sequence of max_len_of_all_cap word indices -> embedding ->
# two Conv1D layers with pooling, ending in a 256-d vector per sample
input2 = Input(shape=(max_len_of_all_cap,))
x2 = layers.Embedding(vocabulary_size, embedding_dim)(input2)
x2 = layers.Conv1D(128, 7, activation='relu')(x2)
x2 = layers.MaxPooling1D(5)(x2)
x2 = layers.Conv1D(256, 7, activation='relu')(x2)
x2 = layers.GlobalMaxPooling1D()(x2)

# Merge the two branches by element-wise addition, then predict a probability
# for every index of the vocabulary with a softmax layer
input_added = layers.add([x1, x2])
x3 = layers.Dense(256, activation='relu')(input_added)
output = layers.Dense(vocabulary_size, activation='softmax')(x3)

model = Model(inputs=[input1, input2], outputs=output)

In [None]:
model.summary()

In [None]:
model.layers[1]

In [None]:
# Load embedding_matrix into the embedding layer and freeze it so it is not trained
# NOTE(review): assumes model.layers[1] is the Embedding layer - confirm with the
# output of the `model.layers[1]` cell before relying on the index
model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False

In [None]:
from keras import optimizers
# Only the `optimizers` sub-module is imported, so it is referenced directly;
# the bare name `keras` is not imported by any cell shown here.
model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=0.0001))

In [218]:
# Reload the encoded image features; with-blocks close the files after loading
with open('./data_folder/train/Pickle/train_imgs_encoded.pkl', 'rb') as encoded_pickle:
    train_imgs_encoded = pickle.load(encoded_pickle)
with open('./data_folder/validation/Pickle/validation_imgs_encoded.pkl', 'rb') as encoded_pickle:
    validation_imgs_encoded = pickle.load(encoded_pickle)

In [None]:
# Training settings
epochs = 10
# Number of images per generated batch (handed to data_generator as num_imgs_per_batch)
batch_size = 3
# Number of batches needed to go through the training images once
steps = len(training_captions) // batch_size

In [None]:
# Train one epoch at a time so that a checkpoint can be saved after every epoch.
# The checkpoint folder is created up front, so model.save cannot fail on a
# missing directory after an epoch of training has already been spent.
if not os.path.isdir('./models'):
    os.makedirs('./models')
for i in range(epochs):
    # A fresh generator per epoch, so every epoch starts from the first image
    generator = data_generator(training_captions,
                               train_imgs_encoded,
                               word_to_index,
                               max_len_of_all_cap,
                               vocabulary_size,
                               batch_size)
    history = model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./models/model_' + str(i) + '.h5')

In [None]:
# `models` is not imported by any earlier cell shown here, so import it before use
from keras import models
model = models.load_model('./models/model_9.h5')

In [None]:
# Re-compile the reloaded model to continue training with RMSprop (learning rate 1e-4)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4))

In [None]:
# Training settings
epochs = 10
# Number of images per generated batch (handed to data_generator as num_imgs_per_batch)
batch_size = 3
# Number of batches needed to go through the training images once
steps = len(training_captions) // batch_size

In [None]:
# Train one epoch at a time so that a checkpoint can be saved after every epoch.
# The checkpoint folder is created up front, so model.save cannot fail on a
# missing directory after an epoch of training has already been spent.
if not os.path.isdir('./models'):
    os.makedirs('./models')
for i in range(epochs):
    # A fresh generator per epoch, so every epoch starts from the first image
    generator = data_generator(training_captions,
                               train_imgs_encoded,
                               word_to_index,
                               max_len_of_all_cap,
                               vocabulary_size,
                               batch_size)
    history = model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # Files are named model_10.h5 ... model_19.h5 for this second round
    model.save('./models/model_1' + str(i) + '.h5')

In [None]:
# `models` is not imported by any earlier cell shown here, so import it before use
from keras import models
model = models.load_model('./models/model_19.h5')

In [None]:
# Re-compile the reloaded model to continue training with RMSprop (learning rate 1e-5)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-5))

In [None]:
# Training settings
epochs = 10
# Number of images per generated batch (handed to data_generator as num_imgs_per_batch)
batch_size = 3
# Number of batches needed to go through the training images once
steps = len(training_captions) // batch_size

In [None]:
# Train one epoch at a time so that a checkpoint can be saved after every epoch.
# The checkpoint folder is created up front, so model.save cannot fail on a
# missing directory after an epoch of training has already been spent.
if not os.path.isdir('./models'):
    os.makedirs('./models')
for i in range(epochs):
    # A fresh generator per epoch, so every epoch starts from the first image
    generator = data_generator(training_captions,
                               train_imgs_encoded,
                               word_to_index,
                               max_len_of_all_cap,
                               vocabulary_size,
                               batch_size)
    history = model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # Files are named model_20.h5 ... model_29.h5 for this third round
    model.save('./models/model_2' + str(i) + '.h5')

In [212]:
# LSTM model Reference from: https://github.com/hlamba28/Automatic-Image-Captioning
# Image branch: dropout on the 2048-d image feature vector, then a 512-unit dense layer
input1 = Input(shape=(2048,))
x1 = layers.Dropout(0.5)(input1)
x1 = layers.Dense(512, activation='relu')(x1)

# Caption branch: sequence of max_len_of_all_cap word indices -> embedding
# (created with mask_zero=True) -> dropout -> 512-unit LSTM
input2 = Input(shape=(max_len_of_all_cap,))
x2 = layers.Embedding(vocabulary_size, embedding_dim, mask_zero=True)(input2)
x2 = layers.Dropout(0.5)(x2)
x2 = layers.LSTM(512)(x2)

# Merge the two branches by element-wise addition, then predict a probability
# for every index of the vocabulary with a softmax layer
input_added = layers.add([x1, x2])
x3 = layers.Dense(512, activation='relu')(input_added)
output = layers.Dense(vocabulary_size, activation='softmax')(x3)
model = Model(inputs=[input1, input2], outputs=output)

In [213]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_62 (InputLayer)           (None, 80)           0                                            
__________________________________________________________________________________________________
input_61 (InputLayer)           (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_29 (Embedding)        (None, 80, 200)      618800      input_62[0][0]                   
__________________________________________________________________________________________________
dropout_42 (Dropout)            (None, 2048)         0           input_61[0][0]                   
__________________________________________________________________________________________________
dropout_43

In [214]:
model.layers[2]

<keras.layers.embeddings.Embedding at 0x170d356d0>

In [215]:
# Load embedding_matrix into the embedding layer (model.layers[2], as shown by
# the cell above) and freeze it so it is not trained
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [216]:
# Compile the LSTM model with categorical cross-entropy loss and the 'adam' optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [217]:
# Training settings
epochs = 10
# Number of images per generated batch (handed to data_generator as num_imgs_per_batch)
batch_size = 3
# Number of batches needed to go through the training images once
steps = len(training_captions) // batch_size

In [219]:
# Train one epoch at a time so that a checkpoint can be saved after every epoch.
# The checkpoint folder is created up front, so model.save cannot fail on a
# missing directory after an epoch of training has already been spent.
if not os.path.isdir('./models'):
    os.makedirs('./models')
for i in range(epochs):
    # A fresh generator per epoch, so every epoch starts from the first image
    generator = data_generator(training_captions,
                               train_imgs_encoded,
                               word_to_index,
                               max_len_of_all_cap,
                               vocabulary_size,
                               batch_size)
    history = model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./models/lstm_model_' + str(i) + '.h5')

Epoch 1/1
   8/4000 [..............................] - ETA: 2:47:26 - loss: 6.9724

KeyboardInterrupt: 

In [None]:
# Load the weights of the last LSTM checkpoint. The path matches the location
# the LSTM training loop saves to ('./models/lstm_model_<epoch>.h5').
model.load_weights('./models/lstm_model_9.h5')

In [None]:
# Folder of the validation images; the trailing slash matters because file
# names are appended to it by plain string concatenation
validation_images_path = './data_folder/validation/'

In [None]:
# Load the encoded validation image features saved earlier
with open('./data_folder/validation/Pickle/validation_imgs_encoded.pkl', 'rb') as encoded_pickle:
    validation_imgs_encoded = pickle.load(encoded_pickle)

In [None]:
# Reference from: https://github.com/hlamba28/Automatic-Image-Captioning
def greedySearch(feature):
    """Generate a caption for one image by greedy decoding.

    Starting from 'startseq', the model is asked again and again for the most
    probable next word given the image feature and the words produced so far.
    Decoding stops when 'endseq' is predicted, when the predicted index has no
    word, or after max_len_of_all_cap words.

    Args:
        feature: image feature batch handed to model.predict as the first
            input (the caller in this notebook passes an array reshaped to
            (1, 2048)).

    Returns:
        The generated words joined by single spaces, without the
        'startseq' / 'endseq' markers.
    """
    generated = []
    in_text = 'startseq'
    for _ in range(max_len_of_all_cap):
        # Encode the words produced so far and pad them to the model's input length
        inputs = [word_to_index[w] for w in in_text.split() if w in word_to_index]
        inputs = pad_sequences([inputs], maxlen=max_len_of_all_cap)

        y_hat = model.predict([feature, inputs], verbose=0)
        y_hat = np.argmax(y_hat)
        # Word indices start at 1, so index 0 has no entry in index_to_word;
        # stop instead of raising KeyError if the model predicts it.
        word = index_to_word.get(y_hat)
        if word is None or word == 'endseq':
            break
        # Collecting the words directly (instead of slicing off the last token
        # afterwards) keeps the final word when 'endseq' is never predicted.
        generated.append(word)
        in_text += ' ' + word
    return ' '.join(generated)

In [None]:
# Reference from: https://github.com/hlamba28/Automatic-Image-Captioning
from random import randint
# `plt` is used below but no cell shown here imports it, so import it explicitly
import matplotlib.pyplot as plt

# Pick a random validation image; the upper bound follows the number of
# encoded images instead of a hard-coded 1999
validation_img_names = list(validation_imgs_encoded.keys())
index = randint(0, len(validation_img_names) - 1)
img_name = validation_img_names[index]
feature = validation_imgs_encoded[img_name].reshape((1,2048))
# Show the image, then print the caption generated for it
x = plt.imread(validation_images_path + img_name + '.jpg')
plt.imshow(x)
plt.show()
print("Greedy:",greedySearch(feature))