In [None]:
# Location of the COCO captions data, expressed as relative path pieces.
ROOT = 'datasets'
DATASET = 'coco'
# '{0}' is filled in with the split name through str.format (e.g. 'train').
ANNOTATIONS_PATH = 'annotations/captions_{0}2014.json'
IMAGES_PATH = 'images/{0}2014'

In [None]:
import os

In [None]:
import torchvision.datasets as dset
import torchvision.transforms as transforms

In [None]:
# Resize every image to 200x200 and convert it to a tensor.
preprocessing_steps = [
    transforms.Resize((200, 200)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# Training split of the dataset: image folder plus its caption annotations.
train_images_dir = os.path.join(ROOT, DATASET, IMAGES_PATH.format('train'))
train_captions_file = os.path.join(ROOT, DATASET, ANNOTATIONS_PATH.format('train'))
cap = dset.CocoCaptions(
    root=train_images_dir,
    annFile=train_captions_file,
    transform=transform,
)

print('Number of samples: ', len(cap))

In [None]:
from IPython.display import display

# Show the first sample: the image part first, then its captions.
first_image, first_captions = cap[0]
display(first_image)
print(first_captions)

# Create mini-dataset

In [None]:
import json

In [None]:
# Read the training caption annotations into a plain dict.
train_annotations_file = os.path.join(ROOT, DATASET, ANNOTATIONS_PATH.format('train'))
with open(train_annotations_file) as annotations_fp:
    d = json.load(annotations_fp)

In [None]:
# Keep only the first image record and collect the ids that remain.
d['images'] = d['images'][:1]
image_ids = [image_record['id'] for image_record in d['images']]

In [None]:
# Keep only the annotations that belong to the selected images.
# A set gives constant-time membership tests instead of scanning the id
# list once per annotation, which matters as soon as more images are kept.
selected_image_ids = set(image_ids)
annotations = [
    annotation
    for annotation in d['annotations']
    if annotation['image_id'] in selected_image_ids
]
d['annotations'] = annotations

In [None]:
# exist_ok=True keeps this cell re-runnable once the folder already exists
# (os.mkdir raised FileExistsError on the second run).
os.makedirs(os.path.join(ROOT, 'mini_coco'), exist_ok=True)

In [None]:
# File names of the images that stay in the reduced dataset.
image_filenames = [image_record['file_name'] for image_record in d['images']]

In [None]:
# Sub-folders of the reduced dataset. makedirs(exist_ok=True) makes the cell
# safe to re-run and also creates the parent folder if it is missing.
mini_coco_dir = os.path.join(ROOT, 'mini_coco')
for subfolder in ('images', 'annotations'):
    os.makedirs(os.path.join(mini_coco_dir, subfolder), exist_ok=True)

In [None]:
# Folder that receives the copied training images; re-running is harmless
# because an existing folder is accepted.
os.makedirs(os.path.join(ROOT, 'mini_coco', IMAGES_PATH.format('train')), exist_ok=True)

In [None]:
from shutil import copyfile

# Copy each selected image from the full dataset into the mini dataset.
source_dir = os.path.join(ROOT, DATASET, IMAGES_PATH.format('train'))
target_dir = os.path.join(ROOT, 'mini_coco', IMAGES_PATH.format('train'))
for image_name in image_filenames:
    copyfile(os.path.join(source_dir, image_name),
             os.path.join(target_dir, image_name))

In [None]:
# Write the trimmed annotation dict next to the copied images.
mini_annotations_file = os.path.join(ROOT, 'mini_coco', ANNOTATIONS_PATH.format('train'))
with open(mini_annotations_file, 'w') as out_fp:
    json.dump(d, out_fp)

# Prepare dictionary

In [None]:
# Point `cap` at the mini dataset; note that no transform is passed here.
mini_coco_dir = os.path.join(ROOT, 'mini_coco')
cap = dset.CocoCaptions(
    root=os.path.join(mini_coco_dir, IMAGES_PATH.format('train')),
    annFile=os.path.join(mini_coco_dir, ANNOTATIONS_PATH.format('train')),
)

In [None]:
from nltk.tokenize import word_tokenize
import string
from collections import defaultdict

# Word -> number of occurrences over every caption in `cap`.
c = defaultdict(int)
punctuation_remover = str.maketrans('', '', string.punctuation)

for _image, captions in cap:
    for caption in captions:
        # Same normalisation order as before: strip punctuation, lower-case, tokenize.
        normalized = caption.translate(punctuation_remover).lower()
        for token in word_tokenize(normalized):
            c[token] += 1

In [None]:
# Vocabulary candidates: every counted word whose frequency is above 0.
c_filtered = [word for word, count in c.items() if count > 0]

In [None]:
# Special vocabulary entries: sequence start, unknown word, sequence end.
START = '<START>'
UNK = '<UNK>'
END = '<END>'

# Append only the tokens that are still missing. Unconditional appends made
# the cell non-idempotent: every re-run added duplicates to the vocabulary,
# after which the index->word and word->index tables no longer agreed.
for special_token in (START, UNK, END):
    if special_token not in c_filtered:
        c_filtered.append(special_token)

In [None]:
# Two-way lookup between vocabulary words and their integer indices.
i2w = dict(enumerate(c_filtered))
w2i = {word: index for index, word in i2w.items()}

In [None]:
print(i2w)
print(w2i)

# Transform text to index sequence

In [None]:
def transform_text(text):
    """Turn a raw caption string into a list of vocabulary indices.

    The text has its punctuation removed, is lower-cased and tokenized with
    ``word_tokenize``. The resulting list starts with the index of START,
    ends with the index of END, and uses the index of UNK for every token
    that is not a key of ``w2i``.
    """
    cleaned = text.translate(str.maketrans('', '', string.punctuation)).lower()

    sequence = [w2i[START]]
    sequence.extend(
        w2i[token] if token in w2i else w2i[UNK]
        for token in word_tokenize(cleaned)
    )
    sequence.append(w2i[END])
    return sequence

In [None]:
transform_text('A pen with, a, kitchen!!.')

# Train model

In [None]:
import torch

In [None]:
def collate_fn(batch):
    """Return the batch untouched after printing how many samples it holds."""
    batch_length = len(batch)
    print(batch_length)
    return batch

In [None]:
trainloader = torch.utils.data.DataLoader(cap, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [None]:
# Take just the first batch from the loader (None if it yields nothing).
b = next(iter(trainloader), None)

In [None]:
# `cap` was rebuilt without a transform, so samples are PIL images here.
# On a PIL image `size` is a (width, height) attribute; calling it failed.
cap[0][0].size

In [None]:
# Size of the second image, falling back to the last available one: the mini
# dataset built above keeps a single image, so index 1 would be out of range.
# `size` is an attribute of PIL images, not a method.
cap[min(1, len(cap) - 1)][0].size

In [None]:
# Second sample of the batch, or the last one when the batch is shorter
# (the one-image mini dataset yields a batch with a single sample).
b[min(1, len(b) - 1)]

In [None]:
import torch
from torch import nn

In [None]:
embed = nn.Embedding(18, 5)

In [None]:
# A tensor has to be rectangular: the ragged [[...5 items], [...2 items]]
# literal raised an error. The shorter sequence is right-padded with 0 so
# both rows have length 5.
inp = torch.LongTensor([[1, 2, 3, 4, 5], [5, 6, 0, 0, 0]])

In [None]:
embed(inp).shape

In [None]:
from torch.nn.utils.rnn import pack_sequence
# Three 1-D sequences of decreasing length, packed into one PackedSequence.
a, b, c = (torch.tensor(ids) for ids in ([1, 2, 3], [4, 5], [6]))
pack_sequence([a, b, c])

In [None]:
# Embed the flat token tensor of the packed sequences and show its shape.
# The previous version assigned a shape to `packed.data`, but `packed` does
# not exist yet at this point of a top-to-bottom run and PackedSequence
# fields cannot be reassigned anyway.
embed(pack_sequence([a, b, c]).data).shape

In [None]:
# Manual PackedSequence over the three elements of `a`. batch_sizes lists how
# many sequences are still active at each time step, so it can never grow:
# [2, 1] describes one sequence of length 2 and one of length 1.
nn.utils.rnn.PackedSequence(
    a, torch.tensor([2, 1]))

In [None]:
class SimpleModel(nn.Module):
    """Toy captioning network: conv image encoder feeding an RNN caption decoder.

    The layer sizes are fixed: the flattened encoder output must have
    196020 features, which is what ``Conv2d(3, 5, 3)`` produces for a single
    3x200x200 image (5 * 198 * 198). Token ids must be below 18, the size of
    the embedding table and of the output layer.
    """

    def __init__(self, *args, **kwargs):
        super(SimpleModel, self).__init__(*args, **kwargs)
        self.encoder = nn.Conv2d(3, 5, 3)
        self.embeddings = nn.Embedding(18, 7)
        self.decoder = nn.RNN(7, 3)
        self.linear = nn.Linear(196020, 3)
        self.last_linear = nn.Linear(3, 18)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, image, captions):
        """Score every token position of the packed captions for one image.

        Args:
            image: image tensor accepted by the encoder; its encoded form is
                flattened to rows of ``self.linear.in_features`` values.
            captions: ``PackedSequence`` of token ids (all captions of the image).

        Returns:
            ``PackedSequence`` whose ``data`` has one row per packed token and
            18 columns of softmax probabilities, with the same
            ``batch_sizes`` and sort/unsort indices as ``captions``.
        """
        h0 = self.linear(self.encoder(image).view(-1, self.linear.in_features))
        # Every caption starts from the same image-derived hidden state, so the
        # state is repeated once per packed sequence. The count comes from the
        # input instead of a hard-coded 5, which broke for any other number of
        # captions.
        num_captions = int(captions.batch_sizes[0])
        h0 = h0.repeat(1, num_captions, 1)
        # Carry the sort/unsort indices along so that unpacking the result
        # restores the caller's original caption order.
        packed_embeds = nn.utils.rnn.PackedSequence(
            self.embeddings(captions.data),
            captions.batch_sizes,
            captions.sorted_indices,
            captions.unsorted_indices,
        )
        decoded, _ = self.decoder(packed_embeds, h0)
        # NOTE(review): these are probabilities, not raw scores. Losses such as
        # nn.CrossEntropyLoss apply log-softmax themselves and expect raw scores.
        probs = self.softmax(self.last_linear(decoded.data))
        return nn.utils.rnn.PackedSequence(
            probs,
            decoded.batch_sizes,
            decoded.sorted_indices,
            decoded.unsorted_indices,
        )

In [None]:
def collate_fn(batch):
    """Identity collate: give the samples back exactly as they were received."""
    return batch

trainloader = torch.utils.data.DataLoader(cap, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [None]:
# First batch produced by the loader (None if it yields nothing).
sample_batch = next(iter(trainloader), None)

In [None]:
sample_batch

In [None]:
# Preprocessing pipeline: resize to 200x200, then convert to a tensor.
resize_to_square = transforms.Resize((200, 200))
transform = transforms.Compose([resize_to_square, transforms.ToTensor()])

# Image part of the first sample in the batch.
image = transform(sample_batch[0][0])

In [None]:
image = torch.unsqueeze(image, 0)

In [None]:
# One index tensor per caption of the first sample in the batch.
transformed_texts = [
    torch.tensor(transform_text(caption)) for caption in sample_batch[0][1]
]

In [None]:
transformed_texts

In [None]:
packed = pack_sequence(transformed_texts, enforce_sorted=False)

In [None]:
model = SimpleModel()
# Call the module itself rather than .forward(): only __call__ runs the
# hooks registered on the module.
ans = model(image, packed)

In [None]:
ans[0].data.shape

In [None]:
output, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(ans)

In [None]:
input_sizes

In [None]:
transformed_texts

In [None]:
# Standalone check of nn.CrossEntropyLoss on random data:
# 3 samples with 5 class scores each, and one target class index per sample.
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)  # NOTE: shadows the built-in input()
target = torch.empty(3, dtype=torch.long).random_(5)  # random class ids in [0, 5)
output = loss(input, target)
output.backward()  # fills input.grad

In [None]:
input.shape

In [None]:
target.shape

In [None]:
loss = nn.CrossEntropyLoss()

In [None]:
ans[0].shape

In [None]:
packed.data.shape

In [None]:
# `ans[0]` already holds softmax probabilities, whereas nn.CrossEntropyLoss
# applies log-softmax itself and expects raw scores. Taking the log of the
# probabilities and using the negative log-likelihood gives the intended
# cross entropy; the clamp guards against log(0).
nn.functional.nll_loss(torch.log(ans[0].clamp_min(1e-12)), packed.data)

In [None]:
# model.parameters() is a generator, so displaying it only showed an opaque
# object repr; list each parameter's name and shape instead.
[(name, tuple(parameter.shape)) for name, parameter in model.named_parameters()]