In [1]:
import sys
import torch
from pathlib import Path

print('Python version:', sys.version)
sys.path.append('..')

from src.datasets import *

Python version: 3.10.5 (main, Dec 17 2022, 19:41:24) [Clang 14.0.0 (clang-1400.0.29.202)]


In [2]:
dumb_dataset = DumbImageCaptionDataset(dataset_path=Path('/tmp'), split='train', train_size=20)
dumb_dataset

[32m2023-12-11 16:08:50.549[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: /tmp | Number of samples: 20[0m


<src.datasets.dumb.DumbImageCaptionDataset at 0x10a84f9a0>

In [3]:
dumb_dataset[0][0].shape, dumb_dataset[0][1]

(torch.Size([3, 64, 64]), 'Lorem ipsum dolor sit amet, consect')

In [4]:
dataloader = torch.utils.data.DataLoader(dumb_dataset, batch_size=4, shuffle=True)
len(dataloader)

5

In [5]:
for images, captions in dataloader:
    print(images.shape, len(captions))

torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4


In [6]:
dataset = get_dataset(
    {
        "name": 'dumb_image_caption',
        "dataset_path": Path('/tmp'),
        "split": 'train',
        "train_size": 20
    }
)
dataset

[32m2023-12-11 16:08:50.705[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.factory[0m:[36mget_dataset[0m:[36m71[0m - [34m[1mTransforms should be provided as a dict with keys 'image' or 'caption'. Provided transforms image <class 'NoneType'> and caption <class 'NoneType'>[0m
[32m2023-12-11 16:08:50.758[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: /tmp | Number of samples: 20[0m


(<src.datasets.dumb.DumbImageCaptionDataset at 0x10a84f7f0>, None)

In [7]:
dumb_dataset[0][0].shape, dumb_dataset[0][1]

(torch.Size([3, 64, 64]), 'Lorem ipsum dolor sit am')

In [8]:
dataloader = torch.utils.data.DataLoader(dumb_dataset, batch_size=4, shuffle=True)
len(dataloader)

5

In [9]:
for images, captions in dataloader:
    print(images.shape, len(captions))

torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4
torch.Size([4, 3, 64, 64]) 4


## With transforms

In [10]:
import transformers
from torchvision import transforms
from functools import partial


In [11]:
MAX_LEN = 128
vision_transform = transforms.Compose([
    transforms.Resize(244), transforms.CenterCrop(224), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
original_caption_transform = transformers.AutoTokenizer.from_pretrained("roberta-base")
caption_transform = partial(original_caption_transform, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LEN)

In [12]:
dumb_dataset = DumbImageCaptionDataset(dataset_path=Path('/tmp'), split='train', train_size=20, image_transform=vision_transform)
dumb_dataset

[32m2023-12-11 16:08:53.500[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: /tmp | Number of samples: 20[0m


<src.datasets.dumb.DumbImageCaptionDataset at 0x122b9ebf0>

In [13]:
dumb_dataset[0][0].shape, dumb_dataset[0][1], caption_transform(dumb_dataset[0][1])



(torch.Size([3, 224, 224]),
 'Lorem ipsum dolor sit amet, consectetur adipisc',
 {'input_ids': tensor([[    0,   574, 43375,  1437,  7418,   783,   385,     2,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,   

In [14]:
dataloader = torch.utils.data.DataLoader(dumb_dataset, batch_size=4, shuffle=True)
len(dataloader)

5

In [15]:
for images, captions in dataloader:
    encoded_captions = caption_transform(captions)
    print(images.shape, len(captions), encoded_captions['input_ids'].shape, encoded_captions['attention_mask'].shape)

torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])


In [16]:
dataset = get_dataset(
    {
        "name": 'dumb_image_caption',
        "dataset_path": Path('/tmp'),
        "split": 'train',
        "train_size": 20,
        "text_max_length": 128,
    },
    {
        "image": vision_transform,
        "caption": original_caption_transform,
    }
)
dataset

[32m2023-12-11 16:08:53.614[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.factory[0m:[36mget_dataset[0m:[36m71[0m - [34m[1mTransforms should be provided as a dict with keys 'image' or 'caption'. Provided transforms image <class 'torchvision.transforms.transforms.Compose'> and caption <class 'functools.partial'> of <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>[0m
[32m2023-12-11 16:08:53.616[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: /tmp | Number of samples: 20[0m


(<src.datasets.dumb.DumbImageCaptionDataset at 0x12088abc0>, None)

In [17]:
dataloader = torch.utils.data.DataLoader(dumb_dataset, batch_size=4, shuffle=True)
len(dataloader)

5

In [18]:
for images, captions in dataloader:
    encoded_captions = caption_transform(captions)
    print(images.shape, len(captions), encoded_captions['input_ids'].shape, encoded_captions['attention_mask'].shape)

torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])


# CCM3

In [19]:
MAX_LEN = 128
vision_transform = transforms.Compose([
    transforms.ToTensor(), transforms.Resize(244), transforms.CenterCrop(224), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
original_caption_transform = transformers.AutoTokenizer.from_pretrained("roberta-base")
caption_transform = partial(original_caption_transform, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LEN)

In [21]:
ccm3_dataset = CC3MDataset(dataset_path=Path('../ccm3_data/extracted'), train_size=0.8, image_transform=vision_transform, caption_transform=caption_transform)
ccm3_dataset

[32m2023-12-11 16:12:24.358[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: ../ccm3_data/extracted | Number of samples: 16142[0m


<src.datasets.ccm.CC3MDataset at 0x12301d270>

In [22]:
ccm3_dataset[0][0].shape, ccm3_dataset[0][1]

(torch.Size([3, 224, 224]),
 {'input_ids': tensor([[    0,  7907,     9,    10, 45140,    24,   115,   185,    10,   251,
             86,     7,  1719, 16499,    11, 32657,   479,     2,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1

In [23]:
dataloader = torch.utils.data.DataLoader(ccm3_dataset, batch_size=4, shuffle=True)
len(dataloader)

3229

In [24]:
for images, captions in dataloader:
    print(images.shape, len(captions), encoded_captions['input_ids'].shape, encoded_captions['attention_mask'].shape)

torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 2 torch.Size([4, 128])

In [25]:
vision_transform = transforms.Compose([
    transforms.Resize(224), transforms.CenterCrop(224), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = get_dataset(
    {
        "name": 'cc3m',
        "dataset_path": Path('../ccm3_data/extracted'),
        "train_size": 0.8,
        "text_max_length": 128,
    },
    {
        "image": vision_transform,
        "caption": original_caption_transform,
    }
)
dataset

[32m2023-12-11 16:16:07.551[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.factory[0m:[36mget_dataset[0m:[36m71[0m - [34m[1mTransforms should be provided as a dict with keys 'image' or 'caption'. Provided transforms image <class 'torchvision.transforms.transforms.Compose'> and caption <class 'functools.partial'> of <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>[0m
[32m2023-12-11 16:16:07.621[0m | [34m[1mDEBUG   [0m | [36msrc.datasets.base[0m:[36m__init__[0m:[36m76[0m - [34m[1mDataset ImageCaption path: ../ccm3_data/extracted | Number of samples: 16142[0m


(<src.datasets.ccm.CC3MDataset at 0x12301bd90>, None)

In [26]:
dataloader = torch.utils.data.DataLoader(dumb_dataset, batch_size=4, shuffle=True)
len(dataloader)

5

In [27]:
for images, captions in dataloader:
    encoded_captions = caption_transform(captions)
    print(images.shape, len(captions), encoded_captions['input_ids'].shape, encoded_captions['attention_mask'].shape)

torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
torch.Size([4, 3, 224, 224]) 4 torch.Size([4, 128]) torch.Size([4, 128])
