In [2]:
import torchvision.models as models
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from pathlib import Path
import spacy
spacy_eng = spacy.load("en_core_web_sm")
import nltk
from nltk import word_tokenize
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pad_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load VGG16 from torchvision with the IMAGENET1K_V1 pretrained weights.
model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)

In [None]:
# Load Inception v3 with ImageNet weights. The explicit `weights=` enum is used
# instead of the deprecated `pretrained=True` flag, matching the VGG16 cell.
# Note: this rebinds `model`, replacing whatever an earlier cell assigned to it.
model = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1)

In [None]:
# Echo `model` so the notebook renders its repr as the cell output.
model

In [3]:
# Read the Flickr8k caption table and keep handles on its two columns.
# NOTE(review): the path is an absolute, machine-specific location.
df = pd.read_csv("Z:/Master I/NLP - Foundations NLP/Image_Caption_Generator/datasets/flickr8k/captions.txt")
imgs = df["image"]
captions = df["caption"]
# Show the caption column as the cell output.
df["caption"]

0        A child in a pink dress is climbing up a set o...
1                    A girl going into a wooden building .
2         A little girl climbing into a wooden playhouse .
3        A little girl climbing the stairs to her playh...
4        A little girl in a pink dress going into a woo...
                               ...                        
40450             A man in a pink shirt climbs a rock face
40451             A man is rock climbing high in the air .
40452    A person in a red shirt climbing up a rock fac...
40453                      A rock climber in a red shirt .
40454    A rock climber practices on a rock climbing wa...
Name: caption, Length: 40455, dtype: object

In [27]:
def tokenizer_eng(text):
    """Lower-case ``text`` and split it into tokens with ``nltk.word_tokenize``."""
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return tokens


In [28]:
# Sanity check: tokenize the caption stored under label 1 of `captions`.
tokenizer_eng(captions[1])

['a', 'girl', 'going', 'into', 'a', 'wooden', 'building', '.']

In [21]:
# Same check as the previous cell (duplicate call; produces the same tokens).
tokenizer_eng(captions[1])

['a', 'girl', 'going', 'into', 'a', 'wooden', 'building', '.']

In [48]:
def build_vocabulary(sentence_list, freq_threshold=5, tokenizer=None):
    """Build id<->token lookup tables from an iterable of sentences.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. A word is assigned the next free id (starting at
    4) at the moment its running count reaches ``freq_threshold``, so ids
    follow the order in which words become frequent enough.

    Args:
        sentence_list: Iterable of sentence strings.
        freq_threshold: Number of occurrences a word needs before it enters
            the vocabulary. Defaults to 5, the previously hard-coded value.
        tokenizer: Callable mapping a sentence to a list of tokens. Defaults
            to ``tokenizer_eng``.

    Returns:
        Tuple ``(itos, stoi)``: id -> token dict and token -> id dict.
    """
    if tokenizer is None:
        tokenizer = tokenizer_eng

    frequencies = {}
    # "<UNK>" occupies id 3; it was previously missing although word ids
    # started at 4, which made unknown-token lookups raise KeyError.
    itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
    stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    idx = len(itos)

    for sentence in sentence_list:
        for word in tokenizer(sentence):
            frequencies[word] = frequencies.get(word, 0) + 1

            # Register the word exactly once: when it hits the threshold.
            if frequencies[word] == freq_threshold:
                stoi[word] = idx
                itos[idx] = word
                idx += 1

    return itos, stoi

In [None]:
# Build both lookup tables from all captions, then display the token -> id map.
itos, stoi = build_vocabulary(captions)
stoi

In [None]:
# Display the id -> token map.
itos

In [1]:
# NOTE(review): the recorded output is a NameError because this cell was run
# as In [1], before the cell that defines `captions`; re-run in order.
captions[1]

NameError: name 'captions' is not defined

In [65]:
def to_numerical(text):
    """Convert ``text`` into a list of vocabulary ids.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Each
    token is looked up in the module-level ``stoi`` table; tokens absent from
    it use the id stored under ``"<UNK>"``.
    """
    tokens = nltk.word_tokenize(text.lower())
    # The "<UNK>" lookup stays inside the conditional so it only happens for
    # tokens that are actually missing from `stoi`.
    return [stoi[tok] if tok in stoi else stoi["<UNK>"] for tok in tokens]

to_numerical(captions[1])

[4, 7, 313, 76, 4, 154, 74, 5]

In [62]:
# Repeat of the call at the end of the previous cell (same input, same ids).
to_numerical(captions[1])

[4, 7, 313, 76, 4, 154, 74, 5]

In [52]:
class Vocabulary:
    """Token <-> integer-id mapping built from caption text.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<START>``,
    ``<END>`` and ``<UNK>``; regular words receive ids from 4 upwards.
    """

    def __init__(self, freq_threshold):
        # freq_threshold: a word enters the vocabulary only once it has been
        # seen this many times in the sentences given to build_vocab; rarer
        # words are ignored and later map to "<UNK>" in to_numerical.
        self.itos = {0: "<PAD>", 1: "<START>", 2: "<END>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def build_vocab(self, sentence_list):
        """Add every word that reaches ``freq_threshold`` to the vocabulary.

        Sentences are lower-cased and tokenized with ``nltk.word_tokenize``.
        A word gets the next free id at the moment its running count (within
        this call) reaches the threshold.

        Args:
            sentence_list: Iterable of sentence strings.
        """
        frequencies = {}
        # Continue after the ids already handed out instead of restarting at
        # 4, so calling build_vocab again cannot overwrite existing entries.
        idx = len(self.itos)

        for sentence in sentence_list:
            for word in nltk.word_tokenize(sentence.lower()):
                frequencies[word] = frequencies.get(word, 0) + 1

                reached = frequencies[word] == self.freq_threshold
                # Skip words registered by an earlier call to keep ids unique.
                if reached and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def to_numerical(self, text):
        """Return the list of ids for the tokens of ``text``.

        The text is lower-cased and tokenized with ``nltk.word_tokenize``;
        tokens missing from ``stoi`` are replaced by the ``"<UNK>"`` id.
        """
        unk_id = self.stoi["<UNK>"]
        return [
            self.stoi.get(token, unk_id)
            for token in nltk.word_tokenize(text.lower())
        ]

In [53]:
class ImageCaptioningDataset(Dataset):
    """Dataset yielding ``(image, caption-id tensor)`` pairs.

    ``dataset_dir`` must contain a ``captions.txt`` CSV with ``image`` and
    ``caption`` columns, and an ``images`` sub-directory holding the files
    named in the ``image`` column.

    Args:
        dataset_dir: Root directory of the dataset (``Path`` or ``str``).
        transform: Optional callable applied to each loaded image.
        freq_threshold: Minimum word count passed to ``Vocabulary``.
        flag: Mode string passed to the image's ``convert`` call.
    """

    def __init__(self,
                 dataset_dir: Path,
                 transform=None,
                 freq_threshold: int=5,
                 flag: str="RGB"):

        # Accept plain strings as well as Path objects.
        dataset_dir = Path(dataset_dir)
        self.root_dir = dataset_dir/"images"
        self.df = pd.read_csv(dataset_dir/"captions.txt")
        self.transform = transform
        self.flag = flag

        # Get img, caption columns
        self.imgs,self.captions = self.df["image"],self.df["caption"]

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocab(self.captions.tolist())

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the image and numericalized caption at row ``index``.

        Returns:
            Tuple ``(img, caption_ids)`` where ``caption_ids`` is a 1-D tensor
            holding the ``<START>`` id, the caption's token ids and the
            ``<END>`` id.
        """
        # Positional access, so `index` always means "row number".
        caption = self.captions.iloc[index]
        img_id = self.imgs.iloc[index]
        # NOTE(review): `Image` is not imported in the visible cells; this
        # presumably needs `from PIL import Image` in the import cell - confirm.
        img = Image.open(self.root_dir/str(img_id)).convert(self.flag)

        if self.transform is not None:
            img = self.transform(img)

        # Vocabulary registers its markers as "<START>"/"<END>" and exposes
        # `to_numerical`; the previous "<SOS>"/"<EOS>"/`numericalize` names
        # do not exist on it and raised KeyError/AttributeError.
        numericalized_caption = [self.vocab.stoi["<START>"]]
        numericalized_caption += self.vocab.to_numerical(caption)
        numericalized_caption.append(self.vocab.stoi["<END>"])

        return img, torch.tensor(numericalized_caption)

In [67]:
class MyCollate:
    """Collate callable that batches images and pads caption tensors.

    Args:
        pad_idx: Value used to pad shorter caption tensors.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Turn a list of ``(image, caption)`` samples into two batch tensors.

        Images are stacked along a new leading dimension. Captions are padded
        with ``pad_idx`` using ``batch_first=False``.
        """
        images, captions = [], []
        for sample in batch:
            images.append(sample[0])
            captions.append(sample[1])

        # Stacking on a new dim 0 equals unsqueeze(0) followed by cat(dim=0).
        stacked_images = torch.stack(images, dim=0)
        padded_captions = pad_sequence(
            captions, batch_first=False, padding_value=self.pad_idx
        )
        return stacked_images, padded_captions

In [35]:
# Preprocessing: resize every image to 224x224, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Build the dataset from the local Flickr8k directory.
# NOTE(review): the path is an absolute, machine-specific location.
dataset = ImageCaptioningDataset(
    Path("Z:/Master I/NLP - Foundations NLP/Image_Caption_Generator/datasets/flickr8k"),
    transform=transform,
    flag="RGB",
)

In [37]:
# Inspect the id -> token table held by the dataset's vocabulary.
# NOTE(review): the recorded output lists '<SOS>'/'<EOS>', while the Vocabulary
# class shown in this notebook defines '<START>'/'<END>'; the output presumably
# predates the current class definition - re-run to refresh.
dataset.vocab.itos

{0: '<PAD>',
 1: '<SOS>',
 2: '<EOS>',
 3: '<UNK>',
 4: 'a',
 5: '.',
 6: 'dog',
 7: 'girl',
 8: 'in',
 9: 'little',
 10: 'the',
 11: 'of',
 12: 'with',
 13: 'on',
 14: 'man',
 15: 'bench',
 16: 'and',
 17: 'is',
 18: 'hat',
 19: 'at',
 20: 'black',
 21: 'white',
 22: 'grass',
 23: 'front',
 24: 'an',
 25: 'orange',
 26: 'red',
 27: 'to',
 28: 'child',
 29: 'running',
 30: 'brown',
 31: 'snow',
 32: 'climbing',
 33: 'rock',
 34: 'are',
 35: 'near',
 36: 'ball',
 37: 'through',
 38: 'yellow',
 39: 'catch',
 40: 'over',
 41: 'playing',
 42: 'field',
 43: 'boy',
 44: 'water',
 45: ',',
 46: 'sitting',
 47: 'next',
 48: 'by',
 49: 'lake',
 50: 'two',
 51: 'dogs',
 52: 'beach',
 53: 'large',
 54: 'ice',
 55: 'wall',
 56: 'person',
 57: 'blue',
 58: 'green',
 59: 'toy',
 60: 'his',
 61: 'young',
 62: 'mouth',
 63: 'lab',
 64: 'it',
 65: 'street',
 66: 'hockey',
 67: 'up',
 68: 'people',
 69: 'standing',
 70: 'outside',
 71: 'three',
 72: 'grassy',
 73: 'while',
 74: 'building',
 75: 'stands'