In [59]:
import numpy as np
import pandas as pd
from collections import Counter
import re


In [60]:
# Load the caption table from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects --
# only use it on files that come from a trusted source.
caption_array = np.load('image_to_caption.npy', allow_pickle=True)
# presumably an array of (image file name, caption) pairs, which dict() turns
# into a lookup table -- TODO confirm the array layout
caption_dict = dict(caption_array)
# Spot-check one entry by its image file name.
print(caption_dict['COCO_train2014_000000529411.jpg'])


A white dog laying in the grass playing with a stuffed animal.


In [35]:
# Special tokens; generate_vocabulary appends all four to the vocabulary.
# PAD: passed to pad_captions as the token whose index fills short captions.
PAD = "#PAD#"
# UNK: its index stands in for any token that is missing from the vocabulary.
UNK = "#UNK#"
# START: its index is placed first in every encoded caption.
START = "#START#"
# END: its index is placed last in every encoded caption.
END = "#END#"


In [36]:
def split_sentence(sentence):
    """Tokenize a caption.

    The text is lower-cased, every '.' and ',' character is removed, and the
    remainder is split on whitespace.

    Returns:
        list of token strings (empty when nothing is left).
    """
    strip_punctuation = str.maketrans('', '', '.,')
    lowered = sentence.lower()
    return lowered.translate(strip_punctuation).split()


def generate_vocabulary(file_name_to_longest_caption, min_count=5):
    """Build a token -> index vocabulary from the captions.

    Args:
        file_name_to_longest_caption: mapping whose values are caption
            strings; each one is tokenized with split_sentence.
        min_count: minimum number of occurrences (counted over all captions)
            a token needs in order to be kept. Defaults to 5, the threshold
            that used to be hard-coded.

    Returns:
        dict mapping every kept token, plus PAD, UNK, START and END, to its
        position in the sorted token list.
    """
    token_counts = Counter(
        token
        for caption in file_name_to_longest_caption.values()
        for token in split_sentence(caption)
    )
    kept_tokens = {
        token for token, count in token_counts.items() if count >= min_count
    }
    # A set keeps every entry unique, so the indices assigned below are
    # always contiguous even if a special token were also a counted token.
    kept_tokens.update((PAD, UNK, START, END))
    return {token: index for index, token in enumerate(sorted(kept_tokens))}


def caption_tokens_to_indices(file_name_to_longest_caption, vocab):
    """Encode every caption as a list of vocabulary indices.

    Each encoded caption is the START index, one index per token produced by
    split_sentence, and finally the END index. Any token that is not a key of
    ``vocab`` is replaced by the UNK index.

    Args:
        file_name_to_longest_caption: mapping whose values are caption strings.
        vocab: token -> index mapping; it must contain UNK.

    Returns:
        list of index lists, in the iteration order of the mapping's values.
    """
    def encode(caption):
        # Looked up per caption: a vocabulary without UNK raises KeyError as
        # soon as there is at least one caption to encode.
        unk_index = vocab[UNK]
        return [
            vocab.get(START, unk_index),
            *(vocab.get(token, unk_index) for token in split_sentence(caption)),
            vocab.get(END, unk_index),
        ]

    return [encode(caption) for caption in list(file_name_to_longest_caption.values())]


def pad_captions(captions_indices, vocab, max_length=20, pad_token='<PAD>'):
    """Truncate or right-pad every caption to exactly ``max_length`` indices.

    Args:
        captions_indices: iterable of index sequences, one per caption.
        vocab: token -> index mapping used to resolve the padding index.
        max_length: number of indices every returned row has.
        pad_token: token whose index fills the tail of short captions. When it
            is not a key of ``vocab`` the "#PAD#" entry is tried next, and
            index 0 is used only as a last resort.

    Returns:
        np.ndarray built from the fixed-length rows, one row per caption.
    """
    pad_index = vocab.get(pad_token)
    if pad_index is None:
        # The default '<PAD>' is not a key of vocabularies that spell the
        # padding token "#PAD#". Falling straight back to 0 would pad with
        # whichever token owns index 0 (e.g. "#END#" in a sorted vocabulary).
        pad_index = vocab.get("#PAD#", 0)

    padded_captions = [
        # Slicing truncates long captions; a negative repeat count yields [],
        # so captions that are already long enough get no padding.
        list(caption[:max_length]) + [pad_index] * (max_length - len(caption))
        for caption in captions_indices
    ]
    return np.array(padded_captions)


def map_padded_indices_to_file_names(padded_indices, file_name_to_longest_caption):
    """Pair each image file name with its row of padded indices.

    The keys of ``file_name_to_longest_caption`` are matched positionally with
    the rows of ``padded_indices`` (converted with ``.tolist()``); pairing
    stops at the shorter of the two.

    Returns:
        dict mapping file name -> list of indices.
    """
    names = file_name_to_longest_caption.keys()
    rows = padded_indices.tolist()
    return {name: row for name, row in zip(names, rows)}


In [37]:
# Build the token -> index vocabulary from every caption.
vocabulary = generate_vocabulary(caption_dict)
# NOTE(review): this bare expression displays the entire dict; consider
# showing len(vocabulary) or a small sample instead.
vocabulary


{'#END#': 0,
 '#PAD#': 1,
 '#START#': 2,
 '#UNK#': 3,
 'a': 4,
 'above': 5,
 'across': 6,
 'air': 7,
 'airplane': 8,
 'along': 9,
 'an': 10,
 'and': 11,
 'animal': 12,
 'another': 13,
 'are': 14,
 'area': 15,
 'around': 16,
 'as': 17,
 'at': 18,
 'back': 19,
 'background': 20,
 'ball': 21,
 'banana': 22,
 'bananas': 23,
 'baseball': 24,
 'bat': 25,
 'bathroom': 26,
 'be': 27,
 'beach': 28,
 'bear': 29,
 'bed': 30,
 'bedroom': 31,
 'behind': 32,
 'being': 33,
 'bench': 34,
 'beside': 35,
 'between': 36,
 'big': 37,
 'black': 38,
 'blue': 39,
 'board': 40,
 'boat': 41,
 'bottle': 42,
 'boy': 43,
 'brick': 44,
 'broccoli': 45,
 'brown': 46,
 'building': 47,
 'buildings': 48,
 'bus': 49,
 'busy': 50,
 'by': 51,
 'cake': 52,
 'car': 53,
 'carrying': 54,
 'cars': 55,
 'cat': 56,
 'catch': 57,
 'cell': 58,
 'chair': 59,
 'cheese': 60,
 'child': 61,
 'city': 62,
 'clear': 63,
 'clock': 64,
 'close': 65,
 'coffee': 66,
 'colored': 67,
 'computer': 68,
 'corner': 69,
 'couch': 70,
 'counter': 71

In [None]:
# Encode every caption as START index + token indices + END index.
train_captions_indexed = caption_tokens_to_indices(caption_dict, vocabulary)
# Sequence length: the longest encoded caption, capped at 20 indices.
MAX_LEN = min(20, max(map(len, train_captions_indexed)))


In [40]:
# Bring every encoded caption to exactly MAX_LEN indices, filling short ones
# with the index of the PAD token.
train_captions_pad_index = pad_captions(
    captions_indices=train_captions_indexed,
    vocab=vocabulary,
    max_length=MAX_LEN,
    pad_token=PAD,
)
# Show the first padded caption as a sanity check.
train_captions_pad_index[0]

array([  2,   4, 336,  87, 168, 151, 293, 126, 226, 340,   4, 278,  12,
         0,   1,   1,   1,   1,   1,   1])

In [42]:
# Re-attach the image file names to their rows of padded indices.
padded_caption_map = map_padded_indices_to_file_names(
    padded_indices=train_captions_pad_index,
    file_name_to_longest_caption=caption_dict,
)


In [57]:
# Store the mapping as an object array built from its (file name, index list)
# items.
# NOTE(review): object arrays are pickled by np.save, so reading this file
# back needs np.load(..., allow_pickle=True).
data_array = np.array(list(padded_caption_map.items()), dtype=object)
np.save('image_to_index_caption.npy', data_array)


In [56]:
# NOTE(review): np.save converts the dict to an object array that is pickled
# on disk; reading it back presumably needs
# np.load('vocabulary.npy', allow_pickle=True).item() -- confirm, or consider
# a plain-text format such as JSON for a str -> int mapping.
np.save('vocabulary.npy', vocabulary)
