In [55]:
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO


class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        """输入一个词, return该词的index  如: vocab('abc')   6 """
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

# 加载COCO数据集的Annotation

In [15]:
dataset_root = '/Volumes/SD/Dataset/coco/'
caption_path = dataset_root+'annotations/captions_train2014.json'  # path for train annotation file
vocab_path = dataset_root+'vocab.pkl'  #path for saving vocabulary wrapper
threshold=4  #'minimum word count threshold

In [16]:
# COCO加载 annotation_file.json 用于描述图片
# {"image_id": 190581, "id": 18234, "caption": "A gentleman, view in the mirror, looking at the bike in garage."}
coco = COCO(caption_path)

loading annotations into memory...
Done (t=0.95s)
creating index...
index created!


# 遍历id, 提取描述, 转为tokens

In [25]:
# 生成一个计数器, 后续用于去除频率过低的词
counter = Counter()
ids = coco.anns.keys()
for i, id in enumerate(ids):
    caption = str(coco.anns[id]['caption'])                           # 'A very clean and well decorated empty bathroom'
    tokens = nltk.tokenize.word_tokenize(caption.lower())     # ['a', 'very', 'clean', 'and', 'well', 'decorated', 'empty', 'bathroom']
    counter.update(tokens)
    if (i+1) % 1000 == 0:
        print("[{}/{}] Tokenized the captions.".format(i+1, len(ids)))
        break

[1000/414113] Tokenized the captions.


In [57]:
# If the word frequency is less than 'threshold', then the word is discarded. 丢弃频率过低的词
words = [word for word, cnt in counter.items() if cnt >= threshold]

# Create a vocab wrapper and add the words to the vocabulary.
vocab = Vocabulary()
[vocab.add_word(w) for w in ['<pad>', '<start>', '<end>', '<unk>']]
for i, word in enumerate(awords):
    vocab.add_word(word)

# Save

In [56]:
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

Total vocabulary size: 368
Saved the vocabulary wrapper to '/Volumes/SD/Dataset/coco/vocab.pkl'
