In [1]:
ROOT = 'datasets'
DATASET = 'coco'
ANNOTATIONS_PATH = 'annotations/captions_{0}2014.json'
IMAGES_PATH = 'images/{0}2014'

In [2]:
import os

In [45]:
import torchvision.datasets as dset
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.Resize((200, 200)),
    transforms.ToTensor()
     ])

cap = dset.CocoCaptions(root = os.path.join(ROOT, DATASET, IMAGES_PATH.format('train')),
                        annFile = os.path.join(ROOT, DATASET, ANNOTATIONS_PATH.format('train')),
                        transform = transform)

print('Number of samples: ', len(cap))

loading annotations into memory...
Done (t=0.53s)
creating index...
index created!
Number of samples:  82783


In [40]:
from IPython.display import display

display(cap[0][0])
print(cap[0][1])

tensor([[[0.3216, 0.3098, 0.3333,  ..., 0.4627, 0.5294, 0.7647],
         [0.3216, 0.2980, 0.3373,  ..., 0.4745, 0.5451, 0.7490],
         [0.3137, 0.3020, 0.3216,  ..., 0.4706, 0.4824, 0.5176],
         ...,
         [0.1922, 0.1647, 0.1490,  ..., 0.6235, 0.6118, 0.5961],
         [0.1922, 0.1843, 0.1765,  ..., 0.6078, 0.6039, 0.6039],
         [0.2471, 0.2431, 0.2275,  ..., 0.6000, 0.5882, 0.6000]],

        [[0.1686, 0.1765, 0.1804,  ..., 0.3529, 0.4118, 0.5765],
         [0.1490, 0.1647, 0.1490,  ..., 0.3686, 0.4353, 0.5922],
         [0.1529, 0.1569, 0.1451,  ..., 0.3608, 0.3843, 0.4118],
         ...,
         [0.0902, 0.0706, 0.0627,  ..., 0.6588, 0.6471, 0.6431],
         [0.0706, 0.0745, 0.0824,  ..., 0.6471, 0.6392, 0.6353],
         [0.1333, 0.1333, 0.1216,  ..., 0.6353, 0.6235, 0.6196]],

        [[0.1020, 0.1059, 0.1176,  ..., 0.3647, 0.4039, 0.5294],
         [0.0902, 0.0863, 0.0902,  ..., 0.3686, 0.4275, 0.5569],
         [0.0980, 0.1137, 0.0863,  ..., 0.3490, 0.3765, 0.

['A restaurant has modern wooden tables and chairs.', 'A long restaurant table with rattan rounded back chairs.', 'a long table with a plant on top of it surrounded with wooden chairs ', 'A long table with a flower arrangement in the middle for meetings', 'A table is adorned with wooden chairs with blue accents.']


# Create mini-dataset

In [91]:
import json

In [97]:
with open(os.path.join(ROOT, DATASET, ANNOTATIONS_PATH.format('train'))) as f:
    d = json.load(f)

In [99]:
image_ids = []
d['images'] = d['images'][:5]
for image in d['images']:
    image_ids.append(image['id'])

In [110]:
annotations = []
for annotation in d['annotations']:
    if annotation['image_id'] in image_ids:
        annotations.append(annotation)
d['annotations'] = annotations

In [111]:
os.mkdir(os.path.join(ROOT, 'mini_coco'))

In [113]:
image_filenames = []
for image in d['images']:
    image_filenames.append(image['file_name'])

In [114]:
os.mkdir(os.path.join(ROOT, 'mini_coco', 'images'))
os.mkdir(os.path.join(ROOT, 'mini_coco', 'annotations'))

In [117]:
os.mkdir(os.path.join(ROOT, 'mini_coco', IMAGES_PATH.format('train')))

FileExistsError: [Errno 17] File exists: 'datasets/mini_coco/images/train2014'

In [118]:
from shutil import copyfile

for filename in image_filenames:
    copyfile(os.path.join(ROOT, DATASET, IMAGES_PATH.format('train'), filename),
             os.path.join(ROOT, 'mini_coco', IMAGES_PATH.format('train'), filename))

In [123]:
with open(os.path.join(ROOT, 'mini_coco', ANNOTATIONS_PATH.format('train')), 'w') as f:
    json.dump(d, f)

# Prepare dictionary

In [124]:
cap = dset.CocoCaptions(root = os.path.join(ROOT, 'mini_coco', IMAGES_PATH.format('train')),
                        annFile = os.path.join(ROOT, 'mini_coco', ANNOTATIONS_PATH.format('train')))

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [125]:
from nltk.tokenize import word_tokenize
import string
from collections import Counter

c = {}

for image, texts in cap:
    for text in texts:
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.lower()
        text = word_tokenize(text)
        for word in text:
            c[word] += 1

In [135]:
c_filtered = [word for word in c if c[word] > 1]

In [138]:
UNK = '<UNK>'
END = '<END>'

c_filtered.append(UNK)
c_filtered.append(END)

In [150]:
i2w = {}
w2i = {}

for index, word in enumerate(c_filtered):
    i2w[index] = word
    w2i[word] = index

In [151]:
print(i2w)
print(w2i)

{0: 'a', 1: 'and', 2: 'with', 3: 'of', 4: 'in', 5: 'is', 6: 'kitchen', 7: '<UNK>', 8: '<END>'}
{'a': 0, 'and': 1, 'with': 2, 'of': 3, 'in': 4, 'is': 5, 'kitchen': 6, '<UNK>': 7, '<END>': 8}


# Transform text to index sequence

In [152]:
def transform_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = word_tokenize(text)
    
    sequence = []
    for word in text:
        if word in w2i:
            sequence.append(w2i[word])
        else:
            sequence.append(w2i[UNK])
    sequence.append(w2i[END])
    return sequence

In [154]:
transform_text('A pen with, a, kitchen!!.')

[0, 7, 2, 0, 6, 8]

# Train model

In [20]:
import torch

In [66]:
def collate_fn(batch):
    print(len(batch))
    return batch

In [67]:
trainloader = torch.utils.data.DataLoader(cap, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [68]:
b = None

for batch in trainloader:
    b = batch
    break

4


In [46]:
cap[0][0].size()

torch.Size([3, 200, 200])

In [47]:
cap[1][0].size()

torch.Size([3, 200, 200])

In [64]:
b[1]

[('A white bus is parked on a road near a small hill and the ocean.',
  'Grey storm clouds loom over a city street in a business district.',
  'A train car with four beds next to a window.',
  'a silver and black train engine and some people '),
 ('A van parked at the side of a graveled road.',
  'An empty sign stands near a city street.',
  'a room with two bunk beds in it next to a big window ',
  'A man showing some visitors a steam engine.'),
 ('A bus sitting next to a body of water on a gravel road.',
  'A sign that is standing in a parking lot.',
  'Beds are in a small room near a window and table with bottles on it. ',
  'An old fashion train engine is being observed by a group of people. '),
 ('A van is pulled up to a boat docking area while a cow stands alongside the signs.',
  'Empty commercial building on a city street corner.',
  'Two beers sit on a table between bunk beds.',
  'A train that is on some train tracks.'),
 ('There is a can pulled over on the side of the road n