# Load Data

In [9]:
with open('captions.txt', 'r') as file:
    lines = file.readlines()[1:]
    
images = []
captions = []

for line in lines:
    image, caption = line.strip().split(',', 1)
    images.append(image)
    captions.append(caption)

In [10]:
images

['1000268201_693b08cb0e.jpg',
 '1000268201_693b08cb0e.jpg',
 '1000268201_693b08cb0e.jpg',
 '1000268201_693b08cb0e.jpg',
 '1000268201_693b08cb0e.jpg',
 '1001773457_577c3a7d70.jpg',
 '1001773457_577c3a7d70.jpg',
 '1001773457_577c3a7d70.jpg',
 '1001773457_577c3a7d70.jpg',
 '1001773457_577c3a7d70.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1002674143_1b742ab4b8.jpg',
 '1003163366_44323f5815.jpg',
 '1003163366_44323f5815.jpg',
 '1003163366_44323f5815.jpg',
 '1003163366_44323f5815.jpg',
 '1003163366_44323f5815.jpg']

In [11]:
captions

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .',
 'A black dog and a spotted dog are fighting',
 'A black dog and a tri-colored dog playing with each other on the road .',
 'A black dog and a white dog with brown spots are staring at each other in the street .',
 'Two dogs of different breeds looking at each other on the road .',
 'Two dogs on pavement moving toward each other .',
 'A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .',
 'A little girl is sitting in front of a large painted rainbow .',
 'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .',
 'There is a girl with pigtails sitting in front of a rainbow painting .',
 'Young girl with pigtails pain

# Image Preprocessing

Documentation for image transformations see: https://pytorch.org/vision/0.15/transforms.html

### Standard procedure: 

1. Convert image to a (3 x Height x Width) tensor.
2. Normalize values based on mean and std of each RGB channel.
3. Randomly rotate, flip, jitter, and crop the image.
4. Repeat step 3 to gain a list of augmented image tensors.
5. Convert each tensor to shape (1 x 3 x Height x Width) using `unsqueeze(0)`.

In [12]:
from PIL import Image
import torchvision.transforms as transforms

# Apply resizing, normalization, and random series of transformations to generate num augmented images
def preprocess(image, size, num):
    augmented_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        transforms.RandomRotation(degrees=15),  # Random rotation up to 15 degrees
        transforms.RandomHorizontalFlip(p=0.5),  # Random horizontal flip with a probability of 0.5
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Random color jitter
        transforms.RandomResizedCrop(299, scale=(0.8, 1.0), ratio=(1, 1))  # Random resized crop
    ])
    processed_images = []
    for _ in range(num):
        augmented_image = augmented_transforms(image)
        batch_input = augmented_image.unsqueeze(0)
        processed_images.append(batch_input)
    return processed_images

### Feature extraction using pretrained model Inception_V3
All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 299. The images have to be loaded in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].

In [1]:
import torch
model = torch.hub.load('pytorch/vision:v0.10.0', 'inception_v3', pretrained=True)
model.eval()

def encode_image(image):
    with torch.no_grad():
        output = model(image)
        # The output has unnormalized scores. To get probabilities, you can run a softmax on it.
        probabilities = torch.nn.functional.softmax(output)
    return output, probabilities

Using cache found in C:\Users\micha/.cache\torch\hub\pytorch_vision_v0.10.0


In [22]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

Tokenization: convert text sentences to list of dictionary index. 

In [23]:
def build_vocab(text_data):
    tokenizer = get_tokenizer('basic_english')
    # Tokenize the text data
    tokenized_data = [tokenizer(text) for text in text_data]
    # Build vocabulary from tokenized data
    vocab = build_vocab_from_iterator(tokenized_data, specials=["<unk>", "<pad>"])
    return tokenized_data, vocab

In [24]:
tokenized_data, vocab = build_vocab(captions)
numericalized_data = [torch.tensor([vocab[token] for token in tokens]) for tokens in tokenized_data]

print("Tokenized Data:")
print(tokenized_data)
print("Numericalized Data:")
print(numericalized_data)
print("Vocabulary Size:", len(vocab))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
Embedding

In [25]:
from torchtext.vocab import GloVe

# Define the vector dimension (50, 100, 200, or 300 are common choices)
vector_dim = 100

# Load pre-trained GloVe vectors
# glove = GloVe(name='6B', dim=vector_dim)

for word in vocab.stoi.items():
    word_vector = glove[word]
    print(f'Word vector for "{word}": {word_vector}')
    
    print(f'Vocabulary size: {len(glove.itos)}')
    print(f'Vector dimension: {glove.vectors.size(1)}')

# Glove vector model too large to import

AttributeError: 'Vocab' object has no attribute 'stoi'