In [None]:
import json
import os
import pickle
import shutil
from collections import Counter

import nltk
from PIL import Image

from vocabulary import Vocabulary

In [None]:
def resize_images(input_path, output_path, new_size):
    """Resize every image file in ``input_path`` and save it under ``output_path``.

    Each resized image is written to ``output_path`` under its original file
    name, so the output format is the one Pillow infers from the file
    extension. Progress is printed every 100 images and after the last one.

    Args:
        input_path: Directory containing the source images.
        output_path: Directory that receives the resized copies; it is
            created (including missing parents) when it does not exist.
        new_size: Target size passed to ``Image.resize``; any two-item
            sequence is accepted and converted to a tuple.
    """
    os.makedirs(output_path, exist_ok=True)

    # os.listdir also returns sub-directories (for example a notebook's
    # ``.ipynb_checkpoints``); only regular files can be opened as images.
    image_files = [
        name for name in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, name))
    ]
    num_images = len(image_files)
    target_size = tuple(new_size)

    for done, name in enumerate(image_files, start=1):
        # Read-only binary mode is sufficient; 'r+b' would additionally
        # require write permission on the source file.
        with open(os.path.join(input_path, name), 'rb') as f:
            with Image.open(f) as image:
                # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is
                # the same filter under its current name.
                resized = image.resize(target_size, Image.LANCZOS)
                # A resized copy has no ``format`` of its own, so the output
                # format is inferred from the file name.
                resized.save(os.path.join(output_path, name))
        if done % 100 == 0 or done == num_images:
            print("Resized {} out of {} total images.".format(done, num_images))

input_path = './coco_data/images/'
output_path = './coco_data/resized_images/'
new_size = [256, 256]
resize_images(input_path, output_path, new_size)

# Swap the resized copies in for the originals. This uses shutil instead of
# `!rm -rf` / `!mv` so the cell does not depend on a POSIX shell, and so a
# failed delete raises and stops the cell before the move is attempted.
images_dir = os.path.normpath(input_path)    # strips the trailing separator
resized_dir = os.path.normpath(output_path)
shutil.rmtree(images_dir)
shutil.move(resized_dir, images_dir)

In [None]:
def build_vocabulary(json_path, threshold):
    """Build a ``Vocabulary`` from the captions stored in a JSON annotation file.

    Every caption is lower-cased and tokenized with
    ``nltk.tokenize.word_tokenize``. Tokens that occur at least ``threshold``
    times across all captions are added to the vocabulary after the four
    special tokens. Progress is printed every 1000 captions and after the
    last one.

    Args:
        json_path: Path to a JSON file with a top-level ``'annotations'`` list
            whose items each carry a ``'caption'`` string.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``Vocabulary`` to which ``<pad>``, ``<start>``, ``<end>`` and
        ``<unk>`` were added (in that order), followed by the kept tokens.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_path, encoding='utf-8') as json_file:
        annotations = json.load(json_file)['annotations']

    num_captions = len(annotations)
    counter = Counter()
    for count, annotation in enumerate(annotations, start=1):
        caption = annotation['caption']
        counter.update(nltk.tokenize.word_tokenize(caption.lower()))
        if count % 1000 == 0 or count == num_captions:
            print("Tokenized {} out of total {} captions.".format(count, num_captions))

    # Tokens seen fewer than `threshold` times are discarded.
    frequent_tokens = [token for token, freq in counter.items() if freq >= threshold]

    # Create the vocabulary wrapper; special tokens go in first.
    vocabulary = Vocabulary()
    for special_token in ('<pad>', '<start>', '<end>', '<unk>'):
        vocabulary.add_token(special_token)

    # Add the frequent words to the vocabulary.
    for token in frequent_tokens:
        vocabulary.add_token(token)
    return vocabulary

# Tokenize the caption file and keep the tokens that occur at least 4 times.
vocabulary_path = './coco_data/vocabulary.pkl'
vocabulary = build_vocabulary('coco_data/captions.json', 4)

# Serialize the vocabulary object to disk with pickle.
with open(vocabulary_path, mode='wb') as f:
    pickle.dump(vocabulary, f)

print("Total vocabulary size: {}".format(len(vocabulary)))