<a href="https://colab.research.google.com/github/R12942159/NTU_DLCV/blob/Hw3/p2_Image_caption_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install timm

In [2]:
import os
import re
import math
import timm
import json
import torch
import collections
import numpy as np
import torch.nn.functional as F
import torchvision.transforms as tr

from PIL import Image
from tqdm import tqdm
from pathlib import Path
from torch import nn, Tensor
from torch.utils.data import DataLoader

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")

Using: cuda


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Download dataset and unzip zip file.

In [None]:
!gdown 1SUiRrG6zQVtyrVSVh9hOBq5_fX-oV2Lh -O hw3_data.zip # 11rP6KmR5Qwjhx0rfag0b5TZGBTRuPtQR
!unzip /content/hw3_data.zip

#### Tokenizer ('<|endoftext|>', 50256) -> 250dim

In [6]:
class BPETokenizer:

    def __init__(self, encoder_file, vocab_file):
        with open(encoder_file, 'r', encoding='utf-8') as f:
            self.encoder = json.load(f)
        self.decoder = {v:k for k,v in self.encoder.items()}
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = f.read().split('\n')[1:-1]
        self.bpe_ranks = {tuple(line.split()): i for i, line in enumerate(vocab)}
        assert len(self.encoder) == 50257 and len(self.bpe_ranks) == 49999 # len(self.bpe_ranks) == 50000
        bs = list(range(33, 127)) + list(range(161, 256))
        xs = list(range(0, 33)) + list(range(127, 161))
        cs = bs[:] + [2**8 + i for i in range(len(xs))]
        self.byte_encoder = dict(zip(bs + xs, [chr(n) for n in cs]))
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}

    def encode(self, text, allowed_special=None):
        tokens = re.findall(r"""<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d| ?""" +
                            r"""\w+| ?\d+| ?[^\s\w\d]+|\s+(?!\S)|\s+""", text, re.UNICODE)
        def translate(token):
            if token == '<|endoftext|>':
                assert allowed_special and token in allowed_special
                return [token]
            word = tuple(''.join(self.byte_encoder[byte] for byte in token.encode('utf-8')))
            while len(word) != 1:
                pairs = set((word[i], word[i+1]) for i in range(len(word)-1))
                bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
                if bigram not in self.bpe_ranks:
                    break
                a, b = bigram
                new_word = []
                i = 0
                while i < len(word):
                    j = word.index(a, i) if a in word[i:] else len(word)
                    new_word.extend(word[i:j])
                    i = j
                    if i < len(word):
                        j = 2 if i < len(word)-1 and word[i] == a and word[i+1] == b else 1
                        new_word.append(a+b if j == 2 else word[i])
                        i += j
                word = tuple(new_word)
            return word
        return [self.encoder[_] for token in tokens for _ in translate(token)]

    def decode(self, tokens):
        tokens = [self.decoder[token] for token in tokens]
        buffer = bytearray([self.byte_decoder[c] for c in ''.join(tokens)])
        return buffer.decode('utf-8', errors='replace')

In [7]:
encoding = BPETokenizer('/content/encoder.json', '/content/vocab.bpe')
prompt = 'a kitchen with a sink and many cooking machines and a pot of food'

text_embedding_len = 250

context = encoding.encode(prompt)
context = [50256] + context + [50256]*(text_embedding_len - len(context) - 1)
# context
encoding.decode(context)

'<|endoftext|>a kitchen with a sink and many cooking machines and a pot of food<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext

#### Define function

In [8]:
def json_load(json_path: str):
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [9]:
def caption_with_id(json_path: str) -> list:
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    data = [{'caption': row['caption'], 'image_id': row['image_id']} for row in json_data['annotations']]
    return data

In [10]:
def id2file_name(json_path: str) -> dict:
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    data = {row['id']: row['file_name'] for row in json_data['images']}
    return data

In [11]:
encoder_joson_path = '/content/encoder.json'
vocab_bpe_path = '/content/vocab.bpe'
def collate_fn(batch, tokenizer=BPETokenizer(encoder_joson_path, vocab_bpe_path)):
    # Get the individual elements of the batch
    images = [item['img'] for item in batch]
    captions = [item['caption'] for item in batch]
    filenames = [item['filename'] for item in batch]

    # Tokenize captions
    tokenized_captions = [tokenizer.encode(caption) for caption in captions]

    # Pad the vector length into stop token to dimension 250
    text_len = 250 # text_embedding_len
    tokenized_captions_train = [
        [50256] + caption + [50256] * (text_len - len(caption) - 1) for caption in tokenized_captions
    ]
    tokenized_captions_inf = [
        caption + [50256] + [-100] * (text_len - len(caption) - 1) for caption in tokenized_captions
    ]

    # Convert tokenized captions to PyTorch tensors
    tokenized_captions_train = [torch.tensor(caption) for caption in tokenized_captions_train]
    tokenized_captions_inf = [torch.tensor(caption) for caption in tokenized_captions_inf]

    # Create a new batch with tokenized captions
    tokenized_batch = {
        'img': torch.stack(images, dim=0),
        'tokenized_captions_train': torch.stack(tokenized_captions_train, dim=0),
        'filename': filenames,
        'tokenized_captions_inf': torch.stack(tokenized_captions_inf, dim=0),
    }

    return tokenized_batch

#### Build Dataset

In [12]:
class ImgCaptionDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, json_path, transform) -> None:
        super(ImgCaptionDataset, self).__init__()
        self.img_dir = img_dir
        self.transform = transform

        # Connect caption -> image_id -> file_name
        self.caption_with_id = caption_with_id(json_path)
        self.id2file_name = id2file_name(json_path)
    def __len__(self) -> int:
        return len(self.caption_with_id)

    def __getitem__(self, idx):
        caption_id = self.caption_with_id[idx]
        file_name = self.id2file_name[caption_id['image_id']]
        img = Image.open(os.path.join(self.img_dir, file_name)).convert('RGB')
        img = self.transform(img)
        return {'img': img, 'caption': caption_id['caption'], 'filename': os.path.splitext(file_name)[0]}

In [13]:
# class ImgDataset(torch.utils.data.Dataset):
#     def __init__(self, root: str, transform) -> None:
#         self.transform = transform
#         self.img_path = [i for i in Path(root).glob("*.jpg")]

#     def __len__(self) -> int:
#         return len(self.img_path)

#     def __getitem__(self, idx):
#         img = Image.open(self.img_path[idx]).convert('RGB')
#         img = self.transform(img)
#         return img, os.path.splitext(self.img_path[idx].name)[0]

#### Build Dataloader

In [13]:
train_ds = ImgCaptionDataset(
    img_dir='/content/hw3_data/p2_data/images/train',
    json_path='/content/hw3_data/p2_data/train.json',
    transform=tr.Compose([
        tr.Resize(224),
        tr.CenterCrop(224),
        tr.ToTensor(),
        tr.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)
val_ds = ImgCaptionDataset(
    img_dir='/content/hw3_data/p2_data/images/val',
    json_path='/content/hw3_data/p2_data/val.json',
    transform=tr.Compose([
        tr.Resize(224),
        tr.CenterCrop(224),
        tr.ToTensor(),
        tr.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

train_loader = DataLoader(
    train_ds,
    batch_size=16,
    collate_fn=collate_fn,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    val_ds,
    batch_size=1,
    collate_fn=collate_fn,
    shuffle=True,
    num_workers=4,
)



#### Config

In [14]:
class Config:

    def __init__(self, checkpoint=None):
        self.n_layer = 12
        self.n_head = 12
        self.n_embd = 768
        self.vocab_size = 50257
        self.block_size = 1024
        self.checkpoint = checkpoint

In [15]:
cfg = Config(checkpoint='/content/hw3_data/p2_data/decoder_model.bin')

#### timm's ViT encoder

In [40]:
# encoder = timm.create_model('vit_large_patch16_224_in21k', pretrained=True) # vit_base_patch16_224_in21k

In [41]:
# for batch in train_loader:
#     img = batch['img']
#     break

In [42]:
# with torch.no_grad():
#     encoder_out = encoder.forward_features(img)

In [43]:
# encoder_out.size()

#### decoder

In [16]:
class Attention(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.c_attn = nn.Linear(cfg.n_embd, 3 * cfg.n_embd)
        self.c_proj = nn.Linear(cfg.n_embd, cfg.n_embd)
        self.n_head = cfg.n_head
        self.n_embd = cfg.n_embd
        size = cfg.block_size
        self.register_buffer('bias', torch.tril(torch.ones(size, size)).view(1, 1, size, size))

    def forward(self, x):
        B, T, C = x.size() # batch, context, embedding
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        return self.c_proj((att @ v).transpose(1, 2).contiguous().view(B, T, C))

class CrossAttention(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(cfg.n_embd, cfg.n_head, batch_first=True)

    def forward(self, query, encoder_out):
        """
        Q is the source from the decoder, K, V are the sources from the encoder.
        Q: (N, L, Eq), where L is the target embedding dim, Eq is embed_dim and batch_first=True.
        {K, V}: (N, L, E{k,v}), where L is the source embedding dim, E{k,v} is {k,v}_dim and batch_first=True.
        """
        attn_output, attn_output_weights = self.multihead_attn(query, encoder_out, encoder_out)
        return attn_output, attn_output_weights

class Block(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.ln_1 = nn.LayerNorm(cfg.n_embd)
        self.ln_2 = nn.LayerNorm(cfg.n_embd) # add
        self.ln_3 = nn.LayerNorm(cfg.n_embd)
        self.attn = Attention(cfg)
        self.crs_attn = CrossAttention(cfg) # add
        self.mlp = nn.Sequential(collections.OrderedDict([
            ('c_fc', nn.Linear(cfg.n_embd, 4 * cfg.n_embd)),
            ('act', nn.GELU(approximate='tanh')),
            ('c_proj', nn.Linear(4 * cfg.n_embd, cfg.n_embd))
        ]))

    def forward(self, x, encoder_out) -> Tensor: # add
        x = x + self.attn(self.ln_1(x))
        cross_x, weights = self.crs_attn(self.ln_2(x), self.ln_2(encoder_out)) # add
        x = cross_x + x
        x = x + self.mlp(self.ln_3(x))
        return x, weights

class Decoder(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.block_size = cfg.block_size
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(cfg.vocab_size, cfg.n_embd), # 文字投影
            wpe = nn.Embedding(cfg.block_size, cfg.n_embd), # position
            h = nn.Sequential(*[Block(cfg) for _ in range(cfg.n_layer)]), # Nx
            ln_f = nn.LayerNorm(cfg.n_embd)
        ))
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        # timm's ViT encoder (vit_base_patch16_224_in21k, , vit_huge_patch14_224_in21k)
        self.encoder = timm.create_model('vit_large_patch16_224_in21k', pretrained=True)
        self.linear = nn.Linear(1024, cfg.n_embd) # [16, 197, 1024]
        # load checkpoint
        if self.cfg.checkpoint is not None:
            state_dict = torch.load(self.cfg.checkpoint)
            transposed = [ '.c_attn.weight', '.c_fc.weight', '.c_proj.weight' ]
            for key, value in state_dict.items():
                if any(key.endswith(w) for w in transposed):
                    state_dict[key] = value.t()
            self.transformer.load_state_dict(state_dict, strict=False)

    def forward(self, x: Tensor, img: Tensor) -> Tensor: # add
        x = torch.narrow(x, 1, 0, min(x.size(1), self.block_size))
        pos = torch.arange(x.size()[1], dtype=torch.long, device=x.device).unsqueeze(0)
        x = self.transformer.wte(x) + self.transformer.wpe(pos)
        with torch.no_grad():
            encoder_out = self.encoder.forward_features(img)
        for block in self.transformer.h:
            x, weights = block(x, self.linear(encoder_out))
        x = self.lm_head(self.transformer.ln_f(x)) # add
        return x, weights

#### Decoding test

In [64]:
# decoder = Decoder(cfg)

In [65]:
# for batch in train_loader:
#     img = batch['img']
#     tokenized_captions_train = batch['tokenized_captions_train']
#     tokenized_captions_inf = batch['tokenized_captions_inf']
#     break

In [66]:
# pred = decoder(tokenized_captions_train, img)

In [67]:
# pred.size(), tokenized_captions_train.size(), img.size(), tokenized_captions_inf.size()

In [68]:
# encoding.decode(pred[0].argmax(dim=1).tolist())

In [69]:
# loss_fn = nn.CrossEntropyLoss() # ignore_index=50256

# pred = pred.reshape(-1, 50257)
# tokenized_captions_inf = tokenized_captions_inf.reshape(-1)
# loss_fn(pred, tokenized_captions_inf)

#### Training

In [70]:
# def training(dataloader, model, loss_fn, optimizer):

#     size = len(dataloader.dataset) # number of samples
#     num_batches = len(dataloader) # batches per epoch
#     epoch_loss = 0

#     model.train() # to training mode
#     for batch_i, data in enumerate(tqdm(dataloader)):
#         data['img'] = data['img'].to(device, non_blocking=True)
#         data['tokenized_captions_train'] = data['tokenized_captions_train'].to(device, non_blocking=True)
#         data['tokenized_captions_inf'] = data['tokenized_captions_inf'].to(device, non_blocking=True)

#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # Compute prediction loss
#         pred = model(data['tokenized_captions_train'], data['img'])
#         # reshape to (B, C)
#         data['tokenized_captions_inf'] = data['tokenized_captions_inf'].reshape(-1)
#         pred = pred.reshape(-1, 50257)
#         loss = loss_fn(pred, data['tokenized_captions_inf']) # tokenized captions inf

#         # Optimization by gradients
#         loss.backward() # backpropagation to compute gradients
#         optimizer.step() # update model params

#         # write to logs
#         epoch_loss += loss.item() # tensor -> python value
#     return epoch_loss/num_batches

In [71]:
# def testing(dataloader, model, loss_fn):
#     size = len(dataloader.dataset) # number of samples
#     num_batches = len(dataloader) # batches per epoch

#     model.eval() # model to test mode.
#     epoch_loss, epoch_correct = 0, 0

#     # No gradient for test data
#     with torch.no_grad():
#         for batch_i, data in enumerate(tqdm(dataloader)):
#             data['img'] = data['img'].to(device, non_blocking=True)
#             data['tokenized_captions'] = data['tokenized_captions'].to(device, non_blocking=True)

#             # Compute prediction loss
#             pred = model(data['tokenized_captions'], data['img'])
#             # loss = loss_fn(pred, x)
#             epoch_batch_size = pred.size()[0]
#             loss = [loss_fn(pred[i], data['tokenized_captions'][i]) for i in range(epoch_batch_size)]
#             loss = sum(loss)

#             # write to logs
#             epoch_loss += loss.item()
#             # (B, 250, Class)
#             pred_correct = [(pred[i].argmax(dim=1) == data['tokenized_captions'][i]).sum().item() for i in range(epoch_batch_size)]
#             epoch_correct += sum(pred_correct)

#     return epoch_loss/num_batches, epoch_correct/size

##### Freeze parameters

In [72]:
# model = Decoder(cfg).to(device)

# # Freeze parameters
# for name, param in model.named_parameters():
#     param.requires_grad=False
#     # print(f"{name}: {param.requires_grad}")
# # Unfreeze some parameters
# for i in range(12):
#     model.transformer.h[i].ln_2.weight.requires_grad = True
#     model.transformer.h[i].ln_2.bias.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_bias.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.bias.requires_grad = True
# model.linear.weight.requires_grad = True
# model.linear.bias.requires_grad = True

# trainable_weights = [name for name, param in model.named_parameters() if param.requires_grad == True]
# # list for True
# # for name, param in model.named_parameters():
# #     print(f"{name}: {param.requires_grad}")

In [73]:
# EPOCHS = 7
# loss_fn = nn.CrossEntropyLoss() # ignore_index=50256
# optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# # logs
# logs = {
#     'train_loss': []
# }

# for epoch in tqdm(range(EPOCHS)):
#     train_loss = training(train_loader, model, loss_fn, optimizer)

#     print(f'EPOCH: {epoch:04d} \train_loss: {train_loss:.4f}')

#     logs['train_loss'].append(train_loss)

#     # Save model
#     save_weights = {k: v for k, v in model.state_dict().items() if k in trainable_weights}
#     torch.save(save_weights, f'/content/drive/MyDrive/NTU_DLCV/Hw3/p2_ckpt/trainable_weights_epoch{epoch}_{train_loss:.4f}.pth')
#     print('---------- Model Save ----------')

#### Check the model params less than 35M

In [74]:
# !gdown 1-obsrlcsth-FcJgr1SQ1QNJAq_yxBh9Z -O trainable_weights

Downloading...
From: https://drive.google.com/uc?id=1-obsrlcsth-FcJgr1SQ1QNJAq_yxBh9Z
To: /content/trainable_weights
100% 117M/117M [00:00<00:00, 283MB/s]


In [75]:
# model = Decoder(cfg)
# # Freeze parameters
# for name, param in model.named_parameters():
#     param.requires_grad=False
#     # print(f"{name}: {param.requires_grad}")
# # Unfreeze some parameters
# for i in range(12):
#     model.transformer.h[i].ln_2.weight.requires_grad = True
#     model.transformer.h[i].ln_2.bias.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_bias.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.bias.requires_grad = True
# model.linear.weight.requires_grad = True
# model.linear.bias.requires_grad = True

In [76]:
# model.load_state_dict(torch.load('/content/trainable_weights', map_location=device), strict=False)
# print('Total params: ', sum(params.numel() for params in model.parameters() if params.requires_grad))

Total params:  29154048


#### inference

    1-obsrlcsth-FcJgr1SQ1QNJAq_yxBh9Z, # epoch6
    1-nBgYCOFMbY5Wo55oPuVkfEN6z-2VIlu, # epoch5
    1-ibRnuyRXHyp-dAWoGJ36i3QZRhVrFh7, # epoch4
    1-TfQ8FcyMsZ_o_7YxQ1PKnbpC4n8wYie, # epoch3
    1-R828SOp9x1HGgsdhqd4PC7PeyjZrRFN, # epoch2
    1-QfCpRUdAHmp4lT6qwXjLIzp1h9Kf0PJ, # epoch1

In [None]:
!gdown 1-obsrlcsth-FcJgr1SQ1QNJAq_yxBh9Z -O trainable_weights6 # epoch6
!gdown 1-nBgYCOFMbY5Wo55oPuVkfEN6z-2VIlu -O trainable_weights5 # epoch5
!gdown 1-ibRnuyRXHyp-dAWoGJ36i3QZRhVrFh7 -O trainable_weights4 # epoch4
!gdown 1-TfQ8FcyMsZ_o_7YxQ1PKnbpC4n8wYie -O trainable_weights3 # epoch3
!gdown 1-R828SOp9x1HGgsdhqd4PC7PeyjZrRFN -O trainable_weights2 # epoch2
!gdown 1-QfCpRUdAHmp4lT6qwXjLIzp1h9Kf0PJ -O trainable_weights1 # epoch1

In [18]:
for i in range(6, 0, -1):
    model = Decoder(cfg).to(device)
    model.load_state_dict(torch.load(f'/content/trainable_weights{i}', map_location=device), strict=False)

    evaluation_dict = {}
    for data in tqdm(val_loader):
        img = data['img'].to(device)
        file_name = data['filename']
        start_token = torch.tensor([[50256]]).to(device)

        for i in range(250):
            with torch.no_grad():
                pred, weights = model(start_token, img)
                # print(weights.size())

            out_token = pred.argmax(dim=2)[0][-1]
            start_token = torch.cat((start_token, out_token.unsqueeze(0).unsqueeze(0)), dim=1)
            end_token = torch.sum(start_token[0] == 50256).item()
            if end_token == 2:
                pred_token = start_token[start_token != 50256]
                pred_token = pred_token.tolist()
                pred_caption = encoding.decode(pred_token)
                break

        evaluation_dict[file_name[0]] = pred_caption
        print('\n', 'file name: ', file_name[0], '\caption: ', evaluation_dict[file_name[0]])

    json_string = json.dumps(evaluation_dict, indent=2)  # The indent parameter is optional and adds indentation for better readability
    with open(f'/content/drive/MyDrive/NTU_DLCV/Hw3/p2_output/large_epoch{i}_output.json', 'w') as json_file:
        json_file.write(json_string)
    print(f'---------- Epoch{i} large params Saved ----------')
    with open('output.json', 'w') as json_file:
        json_file.write(json_string)

  model = create_fn(


model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

  0%|          | 0/8946 [00:00<?, ?it/s]

torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  0%|          | 1/8946 [00:08<20:40:35,  8.32s/it]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000346160 \caption:  a room with a bed, chair, and chair.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  0%|          | 2/8946 [00:09<9:34:58,  3.86s/it] 

torch.Size([1, 14, 197])

 file name:  000000116633 \caption:  a skateboarder is doing a trick on a skateboard.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 3/8946 [00:09<6:06:00,  2.46s/it]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  4672056076 \caption:   a young man in a black and white outfit is performing a stunt .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  0%|          | 4/8946 [00:10<4:33:59,  1.84s/it]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  000000322769 \caption:  a man is standing in a bathroom with a toilet top up in the bathroom.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 5/8946 [00:11<3:26:10,  1.38s/it]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000422185 \caption:   a horse is grazing on the ground with some grass
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  0%|          | 6/8946 [00:11<2:40:05,  1.07s/it]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  000000509364 \caption:  a train is pulling into a station.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 7/8946 [00:12<2:15:47,  1.10it/s]

torch.Size([1, 11, 197])

 file name:  000000522464 \caption:  a woman walking down a street holding an umbrella.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 8/8946 [00:12<2:02:32,  1.22it/s]

torch.Size([1, 12, 197])

 file name:  000000345160 \caption:  a woman and a child are riding on a motorcycle.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 9/8946 [00:13<1:53:40,  1.31it/s]

torch.Size([1, 12, 197])

 file name:  000000553852 \caption:  a small boy is skateboarding on a skateboard.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  0%|          | 10/8946 [00:14<1:42:36,  1.45it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000287886 \caption:  a bed is sitting in a small window.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 11/8946 [00:14<1:37:30,  1.53it/s]

torch.Size([1, 11, 197])

 file name:  000000098322 \caption:  a group of people are skiing in the snow.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])


  0%|          | 12/8946 [00:15<1:58:08,  1.26it/s]

torch.Size([1, 20, 197])
torch.Size([1, 21, 197])

 file name:  7030278443 \caption:   a group of people are hugging a large white ball with a large crowd of people in the background .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 13/8946 [00:16<1:55:55,  1.28it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000376362 \caption:  a dog is sitting on a blue boat with a dog on it
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  0%|          | 14/8946 [00:17<1:59:05,  1.25it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  3669472958 \caption:   a man is bending over a tent while a man watches from the sand .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 15/8946 [00:18<1:53:59,  1.31it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000429908 \caption:   a woman is holding a plate of food in her hands .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  0%|          | 16/8946 [00:18<1:55:29,  1.29it/s]

torch.Size([1, 15, 197])

 file name:  000000173997 \caption:   a man and a woman are sitting on a bench in a park.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])


  0%|          | 17/8946 [00:20<2:11:15,  1.13it/s]

torch.Size([1, 20, 197])
torch.Size([1, 21, 197])

 file name:  1362987900 \caption:   a man wearing a hat and a hat is sitting on a table with a guitar in his mouth .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 18/8946 [00:20<2:02:29,  1.21it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000285018 \caption:  a bathroom with a white bathtub and white tile floor.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 19/8946 [00:21<1:49:28,  1.36it/s]

torch.Size([1, 10, 197])

 file name:  000000361376 \caption:  a blue and white bowl with a salad.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 20/8946 [00:21<1:45:22,  1.41it/s]

torch.Size([1, 12, 197])

 file name:  000000422100 \caption:  a skateboarder doing a trick on a skate board
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 21/8946 [00:22<1:44:58,  1.42it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000476383 \caption:  a cat sitting on a desk with a glass of water.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 22/8946 [00:23<1:39:54,  1.49it/s]

torch.Size([1, 11, 197])

 file name:  000000201859 \caption:  a red car is stopped at a stop light.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  0%|          | 23/8946 [00:23<1:36:21,  1.54it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000215303 \caption:  a piece of wine and wine on a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 24/8946 [00:24<1:33:48,  1.59it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000581711 \caption:  a plate of food with a fork and fork.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  0%|          | 25/8946 [00:25<1:41:22,  1.47it/s]

torch.Size([1, 15, 197])

 file name:  000000034938 \caption:  a small bed is in a room with a backpack and a backpack.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 26/8946 [00:25<1:44:10,  1.43it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000099543 \caption:  a small black and black dog is holding a toy with a toy
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 27/8946 [00:26<1:39:03,  1.50it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000089253 \caption:  a group of men and women cutting a cake.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 28/8946 [00:27<1:40:25,  1.48it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000417303 \caption:  a group of people walking down a street in a city .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 29/8946 [00:27<1:41:06,  1.47it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000026501 \caption:  a red and red bus is traveling on a city street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 30/8946 [00:28<1:34:24,  1.57it/s]

torch.Size([1, 10, 197])

 file name:  000000509565 \caption:  a large elephant walking across a dirt field.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  0%|          | 31/8946 [00:29<1:39:27,  1.49it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000321679 \caption:  a car that is sitting on the side of a city street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 32/8946 [00:29<1:35:43,  1.55it/s]

torch.Size([1, 11, 197])

 file name:  280667538 \caption:   a man in a costume is riding a horse .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  0%|          | 33/8946 [00:30<1:30:56,  1.63it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000533889 \caption:  a large white bus driving down the street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 34/8946 [00:31<1:34:39,  1.57it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000369568 \caption:  a zebra stands next to a fence near a fence.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  0%|          | 35/8946 [00:31<1:39:27,  1.49it/s]

torch.Size([1, 14, 197])

 file name:  000000189351 \caption:  a parking meter is sitting on a sidewalk next to a tree.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  0%|          | 36/8946 [00:32<1:35:41,  1.55it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000418226 \caption:  a row of urinals are in a bathroom.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 37/8946 [00:33<1:38:09,  1.51it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000508202 \caption:  a plate of food with meat, vegetables, and vegetables.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 38/8946 [00:33<1:39:42,  1.49it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000571563 \caption:  a group of people on skis and a small building.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  0%|          | 39/8946 [00:34<1:45:46,  1.40it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  4589027891 \caption:   a man in a suit and a suit is walking down the street .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  0%|          | 40/8946 [00:35<1:47:55,  1.38it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000190334 \caption:  a teddy bear sitting on a desk next to a computer.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 41/8946 [00:35<1:44:22,  1.42it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000043404 \caption:  a clock tower in the middle of a city street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  0%|          | 42/8946 [00:36<1:42:04,  1.45it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000341219 \caption:  a little girl sitting at a table with a cake.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  0%|          | 43/8946 [00:37<1:45:01,  1.41it/s]

torch.Size([1, 14, 197])

 file name:  000000520655 \caption:  a black and white cat is sitting on a lush green field.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  0%|          | 44/8946 [00:38<1:42:30,  1.45it/s]

torch.Size([1, 12, 197])

 file name:  000000401528 \caption:  a large green vase is sitting on a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 45/8946 [00:38<1:43:18,  1.44it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000429160 \caption:  two beds in a room with a refrigerator and a shower.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 46/8946 [00:39<1:39:00,  1.50it/s]

torch.Size([1, 11, 197])

 file name:  000000128695 \caption:  a small sized cellular phone sitting on a sidewalk.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 47/8946 [00:39<1:36:02,  1.54it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000488069 \caption:  a man is standing next to a brick building .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 48/8946 [00:40<1:33:54,  1.58it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000344368 \caption:  a cat is sitting on a car in a car
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 49/8946 [00:41<1:37:17,  1.52it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000465495 \caption:  a cat is sitting on a couch looking at the camera.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 50/8946 [00:41<1:32:37,  1.60it/s]


 file name:  000000096643 \caption:  a bathroom has a toilet and a sink.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  1%|          | 51/8946 [00:42<1:33:45,  1.58it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000380128 \caption:  a laptop computer sitting on top of a wooden desk.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 52/8946 [00:43<1:36:51,  1.53it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000383923 \caption:  a bunch of luggage bags of luggage sitting in the street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 53/8946 [00:43<1:31:44,  1.62it/s]

torch.Size([1, 10, 197])

 file name:  000000163192 \caption:  a ceramic plate is displayed on a plate.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 54/8946 [00:44<1:35:21,  1.55it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000361638 \caption:  a bathroom with a toilet, sink, and a shower.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 55/8946 [00:45<1:35:33,  1.55it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000237394 \caption:  a herd of animals in a field with a fence.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 56/8946 [00:45<1:36:01,  1.54it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000248956 \caption:  a skateboarder performs a trick on a ramp.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  1%|          | 57/8946 [00:46<1:31:20,  1.62it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000283382 \caption:  a kitchen with a sink and a sink 
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 58/8946 [00:46<1:37:42,  1.52it/s]

torch.Size([1, 14, 197])

 file name:  000000293304 \caption:  a kitchen with a lot of space and a lot of things 
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 59/8946 [00:47<1:34:40,  1.56it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000203618 \caption:  a pile of fruit arranged together on a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 60/8946 [00:48<1:37:51,  1.51it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000373218 \caption:  a bear that is standing in a field with some trees.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 61/8946 [00:48<1:37:29,  1.52it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000324455 \caption:   a man is holding a skateboard and a yellow truck
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 62/8946 [00:49<1:39:36,  1.49it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000022478 \caption:  a room with a couch, television, and a television.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 63/8946 [00:50<1:38:39,  1.50it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000405334 \caption:  a pizza with cheese and cheese toppings on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 64/8946 [00:51<1:45:33,  1.40it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000564314 \caption:  a horse drawn with a blanket is seen as a man rides it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  1%|          | 65/8946 [00:51<1:38:02,  1.51it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  4510789820 \caption:   people walk on a street in a city .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 66/8946 [00:52<1:45:01,  1.41it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  4589027891 \caption:   a man in a suit and a suit is walking down the street .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  1%|          | 67/8946 [00:53<1:50:41,  1.34it/s]

torch.Size([1, 15, 197])

 file name:  14264287 \caption:  a young boy is sitting at a sink in front of a sink.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 68/8946 [00:54<1:51:40,  1.32it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000164810 \caption:  a person is skateboarding on a skate board in a park.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 69/8946 [00:54<1:47:44,  1.37it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000197408 \caption:  a view of a window that has a clock on it
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 70/8946 [00:55<1:49:52,  1.35it/s]

torch.Size([1, 14, 197])

 file name:  4878818161 \caption:   a man is walking on a street with a man behind him .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 71/8946 [00:56<1:51:03,  1.33it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000339815 \caption:  a dinner is sitting on a table with a dish of food.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 72/8946 [00:57<1:49:27,  1.35it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000580238 \caption:  a man is on a bench that is filled with people.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|          | 73/8946 [00:57<1:51:16,  1.33it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000252617 \caption:  a cat that is sitting on a table next to a mirror.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  1%|          | 74/8946 [00:58<1:42:55,  1.44it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000234749 \caption:  a plate of food with vegetables and vegetables.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 75/8946 [00:59<1:54:29,  1.29it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  000000006393 \caption:  a woman with a large pair of silver horses is looking into a large mirror.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 76/8946 [01:00<1:54:28,  1.29it/s]

torch.Size([1, 14, 197])

 file name:  000000264909 \caption:  a sheep stands on a hillside with a stone behind it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 77/8946 [01:00<1:47:01,  1.38it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000370819 \caption:  a close up of a pot filled with plants.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 78/8946 [01:01<1:44:21,  1.42it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000016520 \caption:  a sheep is standing in the middle of the grass.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 79/8946 [01:02<1:52:04,  1.32it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  000000171353 \caption:  a man standing next to a kitchen counter with a bunch of fresh foods.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 80/8946 [01:03<1:55:14,  1.28it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000342387 \caption:  a man in a kitchen preparing food for a customer in a kitchen.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  1%|          | 81/8946 [01:03<1:57:32,  1.26it/s]

torch.Size([1, 15, 197])

 file name:  000000151432 \caption:  a bird standing on a beach with a wave of water behind it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 82/8946 [01:04<1:59:03,  1.24it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000461883 \caption:  a man is holding a tennis racket on top of a tennis court.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  1%|          | 83/8946 [01:05<2:07:34,  1.16it/s]

torch.Size([1, 17, 197])
torch.Size([1, 18, 197])

 file name:  409001107 \caption:   a man in a white shirt is sitting on a street with a cat watching him .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 84/8946 [01:06<2:01:11,  1.22it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000366421 \caption:  a small bed with a guitar and a guitar on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 85/8946 [01:07<2:04:06,  1.19it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  000000042055 \caption:  a bunch of tables with colorful umbrellas in front of a store.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 86/8946 [01:08<1:56:27,  1.27it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000248956 \caption:  a skateboarder performs a trick on a ramp.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 87/8946 [01:08<1:53:20,  1.30it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000032176 \caption:  a vase with flowers in it is holding a flower.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 88/8946 [01:09<1:56:21,  1.27it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000266041 \caption:   a man in a black shirt is holding a basketball and a basketball .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 89/8946 [01:10<1:51:09,  1.33it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000138022 \caption:  a van is parked on the side of a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 90/8946 [01:10<1:44:48,  1.41it/s]

torch.Size([1, 11, 197])

 file name:  000000207142 \caption:  a close up of a banana and some bananas.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 91/8946 [01:11<1:40:37,  1.47it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  254169701 \caption:   a group of people are playing volleyball on the beach
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 92/8946 [01:12<1:47:42,  1.37it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000528047 \caption:   a man in a white shirt plays a game of frisbee.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|          | 93/8946 [01:12<1:43:10,  1.43it/s]

torch.Size([1, 11, 197])

 file name:  000000203618 \caption:  a pile of fruit arranged together on a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 94/8946 [01:13<1:42:06,  1.44it/s]

torch.Size([1, 12, 197])

 file name:  000000076081 \caption:  a group of people sitting at a table with food.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 95/8946 [01:14<1:36:30,  1.53it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  3564148252 \caption:  a man riding a motorcycle on a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|          | 96/8946 [01:14<1:39:51,  1.48it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000552870 \caption:  a sign sitting on a post in front of a house.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|          | 97/8946 [01:15<1:44:47,  1.41it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  3643971203 \caption:   two men are walking down the sidewalk in front of a building.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  1%|          | 98/8946 [01:16<1:50:39,  1.33it/s]

torch.Size([1, 15, 197])

 file name:  000000467646 \caption:  a line of beach chairs are lined up in a row of water.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  1%|          | 99/8946 [01:17<1:57:53,  1.25it/s]

torch.Size([1, 16, 197])

 file name:  000000192079 \caption:  a woman is making food in a kitchen while she sits in her hands.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 100/8946 [01:18<1:52:22,  1.31it/s]

torch.Size([1, 12, 197])

 file name:  000000419723 \caption:  a fork and a fork are sitting on a plate.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 101/8946 [01:18<1:55:44,  1.27it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000342387 \caption:  a man in a kitchen preparing food for a customer in a kitchen.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|          | 102/8946 [01:19<1:55:48,  1.27it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000001999 \caption:  a cat sitting on a table next to a bag of papers.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])
torch.Size([1, 20, 197])
torch.Size([1, 21, 197])
torch.Size([1, 22, 197])


  1%|          | 103/8946 [01:21<2:18:18,  1.07it/s]

torch.Size([1, 23, 197])

 file name:  3184738462 \caption:   man in a coat standing next to a sign standing next to a sign next to a sign on the wall.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|          | 104/8946 [01:21<2:09:01,  1.14it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000185360 \caption:  a small black and white dog is standing in a forest.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  1%|          | 105/8946 [01:22<2:07:35,  1.15it/s]

torch.Size([1, 15, 197])

 file name:  000000214475 \caption:  a close up of a cake with a floral floral floral floral floral.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  1%|          | 106/8946 [01:23<2:09:12,  1.14it/s]

torch.Size([1, 16, 197])

 file name:  2035511078 \caption:   three girls dressed in black and white are posing in a grassy field .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|          | 107/8946 [01:24<1:55:26,  1.28it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000446033 \caption:  a tall building that has a clock on it
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 108/8946 [01:25<2:03:14,  1.20it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  3317079939 \caption:  a group of people sitting on a couch with a large object on the floor.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  1%|          | 109/8946 [01:25<1:56:16,  1.27it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000256664 \caption:  a black and white plate with a burger and fries.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|          | 110/8946 [01:26<2:09:05,  1.14it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  2127566743 \caption:   a band of young men are playing guitar in front of a crowd .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|          | 111/8946 [01:28<2:30:33,  1.02s/it]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000253971 \caption:  a group of people on the beach with surfboards on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  1%|▏         | 112/8946 [01:29<2:30:26,  1.02s/it]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000452737 \caption:  a man and woman posing for a photo.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  1%|▏         | 113/8946 [01:29<2:20:51,  1.05it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000332623 \caption:  a bride and groom are kissing in a ceremony .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|▏         | 114/8946 [01:30<2:14:06,  1.10it/s]

torch.Size([1, 14, 197])

 file name:  1220027979 \caption:  two children sit on a couch with a baby and a baby.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|▏         | 115/8946 [01:31<2:01:22,  1.21it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000403464 \caption:  a lot of cars are on a city street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|▏         | 116/8946 [01:32<2:00:00,  1.23it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  4268234398 \caption:   a man in a suit is walking in front of a building .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|▏         | 117/8946 [01:32<1:56:54,  1.26it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000209292 \caption:  a small room with a large clock and a large fireplace.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|▏         | 118/8946 [01:33<1:54:48,  1.28it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000079831 \caption:  a small black and black cat is sitting on a porch.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  1%|▏         | 119/8946 [01:34<1:50:19,  1.33it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000396167 \caption:  a baseball player in a baseball uniform swinging a bat.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  1%|▏         | 120/8946 [01:35<1:50:13,  1.33it/s]

torch.Size([1, 13, 197])

 file name:  000000331907 \caption:  a man is walking in a field with a baseball bat.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  1%|▏         | 121/8946 [01:35<1:44:42,  1.40it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000141240 \caption:  a dog and cat are laying on a bed.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  1%|▏         | 122/8946 [01:36<1:58:29,  1.24it/s]

torch.Size([1, 17, 197])
torch.Size([1, 18, 197])

 file name:  330353975 \caption:   a man in a white shirt is sitting at a desk with a computer and a keyboard
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|▏         | 123/8946 [01:37<1:52:56,  1.30it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000411043 \caption:  a group of elephants standing next to a stone wall.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|▏         | 124/8946 [01:38<1:51:33,  1.32it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000361253 \caption:  a pair of scissors are sitting on a pile of paper.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|▏         | 125/8946 [01:38<1:48:15,  1.36it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000077750 \caption:  a plate topped with food and a plate with food.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|▏         | 126/8946 [01:39<1:45:40,  1.39it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000345590 \caption:  a sheep stands on a lush green field with grass.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|▏         | 127/8946 [01:40<1:43:44,  1.42it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000072794 \caption:  a person holding a video game controller in their hand.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  1%|▏         | 128/8946 [01:41<1:50:17,  1.33it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000370038 \caption:   a man and woman are sitting in a kitchen stove in a kitchen.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  1%|▏         | 129/8946 [01:42<1:59:53,  1.23it/s]

torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  3690431163 \caption:   a man with tattoos is in the middle of a woman in a police uniform .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  1%|▏         | 130/8946 [01:42<1:51:23,  1.32it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000323612 \caption:  a laptop computer sitting on top of a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|▏         | 131/8946 [01:43<1:48:11,  1.36it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000378778 \caption:  a baseball player is getting ready to hit a ball.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  1%|▏         | 132/8946 [01:43<1:43:05,  1.43it/s]

torch.Size([1, 11, 197])

 file name:  396179143 \caption:   a dog is jumping on a snow covered surface .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  1%|▏         | 133/8946 [01:44<1:52:24,  1.31it/s]

torch.Size([1, 16, 197])

 file name:  263216826 \caption:   a man and woman are sitting on a rock looking at the mountain face .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  1%|▏         | 134/8946 [01:45<1:53:48,  1.29it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000309222 \caption:  a close up view of a brush with a brush on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 135/8946 [01:46<1:52:40,  1.30it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000087113 \caption:  a man in a white shirt is walking down the street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  2%|▏         | 136/8946 [01:47<1:43:53,  1.41it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  3426789838 \caption:   a child jumping into a pool of water .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 137/8946 [01:47<1:40:48,  1.46it/s]

torch.Size([1, 11, 197])

 file name:  000000191893 \caption:  a train traveling on a track near a station.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 138/8946 [01:48<1:38:17,  1.49it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000060812 \caption:  a man riding a horse on a dirt track.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  2%|▏         | 139/8946 [01:48<1:39:25,  1.48it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000347648 \caption:  a person is holding a pot and holding a pot.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 140/8946 [01:49<1:43:17,  1.42it/s]

torch.Size([1, 13, 197])

 file name:  000000278287 \caption:  a large white bear swimming in the middle of a pool.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 141/8946 [01:50<1:47:32,  1.36it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000365833 \caption:   a person walks his cart to a horse cart on the beach .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 142/8946 [01:51<1:45:57,  1.38it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000056549 \caption:  an old fashioned truck is for sale at an event.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 143/8946 [01:52<1:57:44,  1.25it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  4690240999 \caption:   a woman is talking on her phone while another woman is talking on her phone.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])
torch.Size([1, 20, 197])


  2%|▏         | 144/8946 [01:53<2:21:26,  1.04it/s]

torch.Size([1, 21, 197])
torch.Size([1, 22, 197])
torch.Size([1, 23, 197])

 file name:  3909183873 \caption:   a man in a blue hat plays guitar and a guitar while another man in a blue hat plays a guitar .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 145/8946 [01:54<2:06:48,  1.16it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000429321 \caption:  a man riding a skateboard down a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 146/8946 [01:54<1:56:26,  1.26it/s]

torch.Size([1, 11, 197])

 file name:  000000113525 \caption:  a group of small boats sit on a beach.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  2%|▏         | 147/8946 [01:55<2:07:26,  1.15it/s]

torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])

 file name:  000000411177 \caption:  a living room with a couch, a couch, a couch, and a couch.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 148/8946 [01:56<2:09:44,  1.13it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  000000456574 \caption:  a man in a striped shirt is sitting on a bench reading a book.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 149/8946 [01:57<1:56:15,  1.26it/s]

torch.Size([1, 10, 197])

 file name:  000000160137 \caption:  a street sign with a street sign on it
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  2%|▏         | 150/8946 [01:58<2:02:18,  1.20it/s]

torch.Size([1, 16, 197])

 file name:  000000572063 \caption:  a black and white view of a building with a window in the windows.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 151/8946 [01:58<1:53:29,  1.29it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000382447 \caption:  a wooden table with a wooden table and chairs.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 152/8946 [01:59<1:57:42,  1.25it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000320203 \caption:  a red and white street sign with a street sign in the background.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 153/8946 [02:00<1:52:57,  1.30it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000392105 \caption:  a train is on the tracks next to a building.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 154/8946 [02:01<1:54:42,  1.28it/s]

torch.Size([1, 14, 197])

 file name:  000000085852 \caption:  a girl holding a blue umbrella while holding up a blue umbrella.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 155/8946 [02:02<1:53:37,  1.29it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000106206 \caption:  a young boy is riding a skateboard on a blue surface
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 156/8946 [02:02<1:52:43,  1.30it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000532580 \caption:  a train that is going down the tracks at the station.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 157/8946 [02:03<1:52:00,  1.31it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000088208 \caption:  a woman in a red jacket is standing in a field.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 158/8946 [02:04<1:51:46,  1.31it/s]

torch.Size([1, 13, 197])

 file name:  000000201220 \caption:  a man and woman are smiling while driving in a car.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 159/8946 [02:05<1:54:15,  1.28it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000003461 \caption:  a group of people on skis are on a ski slope.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 160/8946 [02:05<1:53:29,  1.29it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000382557 \caption:  a group of people standing in front of a large sign.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 161/8946 [02:06<2:01:07,  1.21it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  000000393777 \caption:  children are playing a game on a television while children watch on the couch.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 162/8946 [02:07<2:03:20,  1.19it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000053677 \caption:  a small black and white photo of a small black and white airplane.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 163/8946 [02:08<1:54:45,  1.28it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000013414 \caption:  a parking meter on the side of a road.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 164/8946 [02:09<1:56:14,  1.26it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000032566 \caption:  a giraffe standing next to a giraffe in the grass.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  2%|▏         | 165/8946 [02:09<1:52:36,  1.30it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  4871416563 \caption:  a woman holding an umbrella while holding a large umbrella.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 166/8946 [02:10<1:52:15,  1.30it/s]

torch.Size([1, 13, 197])

 file name:  000000030791 \caption:  a teddy bear with a teddy bear on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  2%|▏         | 167/8946 [02:11<1:57:22,  1.25it/s]


 file name:  4470113445 \caption:   a man in a red shirt and a red shirt plays a guitar .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 168/8946 [02:12<1:47:56,  1.36it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  4519904608 \caption:  a subway train is going down the tracks.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 169/8946 [02:12<1:49:05,  1.34it/s]


 file name:  000000175180 \caption:  a man riding a snowboard down a snow covered slope.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 170/8946 [02:13<1:47:10,  1.36it/s]

torch.Size([1, 12, 197])

 file name:  000000346160 \caption:  a room with a bed, chair, and chair.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 171/8946 [02:14<1:48:37,  1.35it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000253825 \caption:  a vase with a vase of flowers on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 172/8946 [02:15<1:47:03,  1.37it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  2617812188 \caption:   a man is carving a sculpture with a white shirt .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 173/8946 [02:15<1:45:45,  1.38it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000025516 \caption:  a black bird standing on top of a suspended rope.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 174/8946 [02:16<1:45:07,  1.39it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000244735 \caption:  a case with a tree in the christmas tree.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000480894 \caption:  two bears are drinking water from a stream.

  2%|▏         | 175/8946 [02:17<1:39:21,  1.47it/s]


torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 176/8946 [02:17<1:38:01,  1.49it/s]


 file name:  000000192394 \caption:  a group of people are riding horses on horses.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  2%|▏         | 177/8946 [02:18<1:55:43,  1.26it/s]

torch.Size([1, 17, 197])
torch.Size([1, 18, 197])

 file name:  203146155 \caption:  a woman is holding a wine glass while a man is drinking wine at a table.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 178/8946 [02:19<1:49:21,  1.34it/s]

torch.Size([1, 11, 197])

 file name:  000000549932 \caption:  a woman is getting her luggage from her luggage.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000031073 

  2%|▏         | 179/8946 [02:20<1:55:10,  1.27it/s]

\caption:  a kite is flying in the air while a beautiful blue sky.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000304584 \caption:  

  2%|▏         | 180/8946 [02:21<1:49:10,  1.34it/s]

a crowd of people eating food in a market.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 181/8946 [02:21<1:47:37,  1.36it/s]

torch.Size([1, 12, 197])

 file name:  000000033991 \caption:  a woman and a baby are talking with an elephant.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 182/8946 [02:22<1:41:23,  1.44it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000137836 \caption:  a giraffe standing next to a tree.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 183/8946 [02:23<1:45:31,  1.38it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000135356 \caption:  a man washing his son in the sink in a sink.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 184/8946 [02:24<1:56:22,  1.25it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  4444147335 \caption:  a man wearing a shirt and a cell phone talking on his phone.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 185/8946 [02:24<1:55:38,  1.26it/s]

torch.Size([1, 13, 197])

 file name:  000000267242 \caption:  a group of zebra standing in a grassy plain.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 186/8946 [02:25<1:52:09,  1.30it/s]

torch.Size([1, 12, 197])

 file name:  000000143696 \caption:  a riverboat is in the foreground of a river.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 187/8946 [02:26<1:52:20,  1.30it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000319781 \caption:  a toddler standing on a carpet playing with a cell phone.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 188/8946 [02:27<1:52:44,  1.29it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000135978 \caption:  a giraffe standing on a tree branch in a forest.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 189/8946 [02:27<1:53:22,  1.29it/s]

torch.Size([1, 13, 197])

 file name:  481054596 \caption:  people walk down a sidewalk in front of a large building.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 190/8946 [02:28<1:51:18,  1.31it/s]

torch.Size([1, 12, 197])

 file name:  000000026162 \caption:  a stop sign on a boat near a large river.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 191/8946 [02:29<1:46:50,  1.37it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000015797 \caption:  a man wearing a snow suit and a snowboard
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  2%|▏         | 192/8946 [02:30<1:59:28,  1.22it/s]

torch.Size([1, 17, 197])

 file name:  4357061908 \caption:   a young man wearing a blue shirt and black shirt is working on his meal .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 193/8946 [02:30<1:46:57,  1.36it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  000000298726 \caption:  a man playing tennis on a tennis court
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 194/8946 [02:31<1:48:53,  1.34it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  3701699584 \caption:  a man is swinging a tennis ball at a tennis ball.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 195/8946 [02:32<1:52:33,  1.30it/s]

torch.Size([1, 14, 197])

 file name:  000000329175 \caption:  a man is jumping over a ramp on a skate board ramp.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 196/8946 [02:33<1:45:05,  1.39it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000297200 \caption:  a city street with a car and some cars
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 197/8946 [02:33<1:44:52,  1.39it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000579326 \caption:  a vegetable plant is growing vegetables in a vegetable field.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 198/8946 [02:34<1:36:31,  1.51it/s]


 file name:  2215797676 \caption:   a person is walking down a street .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 199/8946 [02:34<1:33:33,  1.56it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000457453 \caption:  a stop sign is sitting on a pole.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 200/8946 [02:35<1:38:42,  1.48it/s]


 file name:  000000330356 \caption:  a herd of giraffes are standing in a field.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])
torch.Size([1, 20, 197])
torch.Size([1, 21, 197])
torch.Size([1, 22, 197])
torch.Size([1, 23, 197])


  2%|▏         | 201/8946 [02:37<2:14:29,  1.08it/s]

torch.Size([1, 24, 197])
torch.Size([1, 25, 197])

 file name:  000000473261 \caption:   a woman in a white shirt is sitting at a computer while another woman in a white shirt is sitting at a table .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  2%|▏         | 202/8946 [02:38<2:18:20,  1.05it/s]

torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  2659046789 \caption:  a man is eating a chocolate cake with a lot of people in the background.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 203/8946 [02:39<2:18:24,  1.05it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  000000126263 \caption:  a black and white photo of a dog standing in front of a house.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  2%|▏         | 204/8946 [02:39<2:10:18,  1.12it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000310558 \caption:  a man is looking at a laptop on a metal gate.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 205/8946 [02:40<2:07:38,  1.14it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  2582390123 \caption:  two cows are grazing in a field next to two brown cows.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  2%|▏         | 206/8946 [02:41<1:55:07,  1.27it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000302489 \caption:  a white and white structure with a large umbrella
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  2%|▏         | 207/8946 [02:41<1:46:49,  1.36it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000085960 \caption:  two small boats sitting on a wooden floor.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 208/8946 [02:42<1:51:29,  1.31it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  1220027979 \caption:  two children sit on a couch with a baby and a baby.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 209/8946 [02:43<1:51:34,  1.31it/s]

torch.Size([1, 13, 197])

 file name:  000000419678 \caption:  a young child is looking at the contents of a refrigerator.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  2%|▏         | 210/8946 [02:44<2:01:59,  1.19it/s]

torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  000000326853 \caption:   a young boy in a black jacket holding a teddy bear in a window .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 211/8946 [02:45<1:58:52,  1.22it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  817654759 \caption:   a man is running in the middle of a running race .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  2%|▏         | 212/8946 [02:46<1:59:34,  1.22it/s]

torch.Size([1, 14, 197])

 file name:  000000156282 \caption:  a man is flying a kite while flying a kite.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  2%|▏         | 213/8946 [02:46<1:52:11,  1.30it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000425522 \caption:  a suitcase that is packed up in a storage.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  2%|▏         | 214/8946 [02:47<1:54:59,  1.27it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000313280 \caption:  a skateboarder is jumping his skateboard down a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 215/8946 [02:48<1:56:37,  1.25it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000248391 \caption:  a group of people jumping a tree with a frisbee.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  2%|▏         | 216/8946 [02:49<1:47:22,  1.36it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000490908 \caption:  a vase full of flowers on a table
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 217/8946 [02:49<1:43:37,  1.40it/s]

torch.Size([1, 11, 197])

 file name:  000000265971 \caption:  a woman is playing tennis on a tennis court.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  2%|▏         | 218/8946 [02:50<1:56:42,  1.25it/s]

torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  4563139415 \caption:   a man is standing in front of a building that has a sign on it .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 219/8946 [02:51<1:55:08,  1.26it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000054924 \caption:  a variety of different types of different electronics including a phone.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])


  2%|▏         | 220/8946 [02:52<2:09:32,  1.12it/s]

torch.Size([1, 18, 197])
torch.Size([1, 19, 197])

 file name:  3182509597 \caption:  a man is sitting on a white boat in the water with a man in the background.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  2%|▏         | 221/8946 [02:53<1:58:34,  1.23it/s]

torch.Size([1, 11, 197])

 file name:  000000484494 \caption:  two cows are grazing in a field with cows.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])


  2%|▏         | 222/8946 [02:54<2:14:42,  1.08it/s]

torch.Size([1, 20, 197])

 file name:  000000533522 \caption:  a woman is sitting in front of a wine-covered wall with a wine rack on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  2%|▏         | 223/8946 [02:55<2:04:58,  1.16it/s]

torch.Size([1, 12, 197])

 file name:  000000040100 \caption:  a man is standing in the grass with a baseball.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 224/8946 [02:55<1:58:04,  1.23it/s]

torch.Size([1, 12, 197])

 file name:  000000068442 \caption:  a room with a bed and a backpack on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 225/8946 [02:56<1:55:49,  1.25it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000306627 \caption:  a man with a mustache and glasses is cutting a pie.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])


  3%|▎         | 226/8946 [02:57<2:04:32,  1.17it/s]

torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])

 file name:  000000393480 \caption:  a man wearing a red coat and coat is sitting on a snow covered slope.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 227/8946 [02:58<2:00:41,  1.20it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000176392 \caption:  a group of people are sitting around a table with food.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  3%|▎         | 228/8946 [02:59<2:08:05,  1.13it/s]

torch.Size([1, 17, 197])

 file name:  000000281855 \caption:   a man and woman are standing on a beach watching the sunset unfold the clouds .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 229/8946 [03:00<2:05:47,  1.15it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000146640 \caption:  a young man is holding a tennis racket and a tennis racket.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 230/8946 [03:00<2:01:25,  1.20it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000165064 \caption:  a young girl in a red dress standing in a water.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 231/8946 [03:01<1:58:13,  1.23it/s]

torch.Size([1, 13, 197])

 file name:  000000166047 \caption:  a man playing tennis on a tennis court with a ball.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 232/8946 [03:02<1:48:29,  1.34it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000033717 \caption:  a small child is playing with a toy.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 233/8946 [03:02<1:46:35,  1.36it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000330916 \caption:  a refrigerator with a microwave and a stainless steel refrigerator.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 234/8946 [03:03<1:48:23,  1.34it/s]

torch.Size([1, 13, 197])

 file name:  000000479531 \caption:  a sign that is on the tracks of a railroad track.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 235/8946 [03:04<1:44:12,  1.39it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000032960 \caption:  a red and red train traveling down a track.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  3%|▎         | 236/8946 [03:04<1:36:17,  1.51it/s]

torch.Size([1, 9, 197])

 file name:  000000311773 \caption:  a child is playing with a toy.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 237/8946 [03:05<1:35:21,  1.52it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000420610 \caption:  a table has a selection of food and vegetables.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])
torch.Size([1, 20, 197])


  3%|▎         | 238/8946 [03:06<2:05:45,  1.15it/s]

torch.Size([1, 21, 197])
torch.Size([1, 22, 197])
torch.Size([1, 23, 197])

 file name:  3184738462 \caption:   man in a coat standing next to a sign standing next to a sign next to a sign on the wall.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 239/8946 [03:07<1:58:33,  1.22it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000021374 \caption:  a red bus is on the side of a road.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  3%|▎         | 240/8946 [03:08<1:56:07,  1.25it/s]


 file name:  000000567997 \caption:   a black and white dog standing in a dirt covered spot .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  3%|▎         | 241/8946 [03:09<2:02:25,  1.19it/s]

torch.Size([1, 16, 197])

 file name:  000000291321 \caption:  a sign that says "the new" on the side of a subway.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 242/8946 [03:10<1:56:07,  1.25it/s]

torch.Size([1, 12, 197])

 file name:  000000126540 \caption:  a grey and white cat is looking out a window.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 243/8946 [03:10<1:51:46,  1.30it/s]

torch.Size([1, 12, 197])

 file name:  000000069501 \caption:  a train engine with a man standing on the tracks.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 244/8946 [03:11<1:48:47,  1.33it/s]

torch.Size([1, 12, 197])

 file name:  000000027675 \caption:  a bus is parked on the side of a road.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 245/8946 [03:11<1:38:48,  1.47it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  000000573184 \caption:  a plane is parked at a runway.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000042818 \caption:  

  3%|▎         | 246/8946 [03:12<1:34:42,  1.53it/s]

a man is skiing on a snowy surface.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 247/8946 [03:13<1:42:03,  1.42it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000342060 \caption:  a bench in front of a bench with a dog in it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 248/8946 [03:14<1:41:45,  1.42it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000565443 \caption:  a baseball player is swinging a bat during a game.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 249/8946 [03:14<1:41:50,  1.42it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000121503 \caption:  a green and white bus is parked on a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 250/8946 [03:15<1:44:24,  1.39it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000267802 \caption:  a lot of people on a motorcycle on a busy street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 251/8946 [03:16<1:49:33,  1.32it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  4268234398 \caption:   a man in a suit is walking in front of a building .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 252/8946 [03:17<1:44:53,  1.38it/s]

torch.Size([1, 11, 197])

 file name:  7922678762 \caption:   a man is painting a mural of a man .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 253/8946 [03:17<1:44:24,  1.39it/s]

torch.Size([1, 12, 197])

 file name:  000000382171 \caption:   a black and white dog is running on a path.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 254/8946 [03:18<1:49:03,  1.33it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000164810 \caption:  a person is skateboarding on a skate board in a park.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 255/8946 [03:19<1:57:44,  1.23it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])

 file name:  445348023 \caption:   a man is standing on a chair in front of a large construction house .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 256/8946 [03:20<1:55:52,  1.25it/s]

torch.Size([1, 13, 197])

 file name:  2885387575 \caption:  a dog runs through the grass with a ball in its mouth
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 257/8946 [03:20<1:46:56,  1.35it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000127657 \caption:  a young boy is lying on a bed.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  3%|▎         | 258/8946 [03:21<1:53:10,  1.28it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000053677 \caption:  a small black and white photo of a small black and white airplane.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 259/8946 [03:22<1:50:20,  1.31it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000466519 \caption:  a woman is at a table with food and drinks.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  3%|▎         | 260/8946 [03:23<1:42:57,  1.41it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000387153 \caption:  a kitchen with a stove and a stove.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 261/8946 [03:23<1:45:50,  1.37it/s]

torch.Size([1, 13, 197])

 file name:  000000399932 \caption:  a bunch of people are flying kites on the beach.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 262/8946 [03:24<1:47:24,  1.35it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000532580 \caption:  a train that is going down the tracks at the station.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 263/8946 [03:25<1:43:15,  1.40it/s]

torch.Size([1, 11, 197])

 file name:  000000158588 \caption:  people walking down the street with umbrellas.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000511454 

  3%|▎         | 264/8946 [03:25<1:40:21,  1.44it/s]

\caption:  a man holding a red umbrella over a beach.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  3%|▎         | 265/8946 [03:26<1:48:45,  1.33it/s]


 file name:  000000184282 \caption:  a train is traveling on a track with people standing on the tracks.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 266/8946 [03:27<1:46:42,  1.36it/s]

torch.Size([1, 12, 197])

 file name:  000000053800 \caption:  a man and boy on the couch with a remote control
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 267/8946 [03:28<1:45:15,  1.37it/s]

torch.Size([1, 12, 197])

 file name:  000000004956 \caption:  a man is standing next to a elephant with horns.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])


  3%|▎         | 268/8946 [03:29<1:51:52,  1.29it/s]


 file name:  000000550392 \caption:  a person that is eating a chocolate cone with a smile on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 269/8946 [03:29<1:48:56,  1.33it/s]

torch.Size([1, 12, 197])

 file name:  000000161940 \caption:  a woman and a man are sitting on the bed.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 270/8946 [03:30<1:44:02,  1.39it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000483517 \caption:  a wooden table with a wooden table and chairs.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])


  3%|▎         | 271/8946 [03:31<1:56:08,  1.24it/s]

torch.Size([1, 17, 197])

 file name:  000000024100 \caption:  a man on a skateboarder is doing a trick on a cement ramp.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 272/8946 [03:32<1:56:53,  1.24it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  000000156282 \caption:  a man is flying a kite while flying a kite.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])


  3%|▎         | 273/8946 [03:32<1:44:44,  1.38it/s]

torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  000000127926 \caption:   a man is fixing up a machine .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])


  3%|▎         | 274/8946 [03:33<1:49:24,  1.32it/s]

torch.Size([1, 13, 197])
torch.Size([1, 14, 197])

 file name:  3627679667 \caption:  a person is riding a wave on top of a surfboard.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 275/8946 [03:34<1:47:26,  1.34it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000470995 \caption:  a woman and a child are playing with an elephant.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 276/8946 [03:34<1:42:59,  1.40it/s]

torch.Size([1, 11, 197])

 file name:  000000358247 \caption:  people are walking around in a store with bananas.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 277/8946 [03:35<1:42:54,  1.40it/s]

torch.Size([1, 12, 197])

 file name:  000000471839 \caption:  a pizza with a lot of toppings on it.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 278/8946 [03:36<1:37:55,  1.48it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000046171 \caption:  a teddy bear sitting on a window sill
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 279/8946 [03:36<1:36:43,  1.49it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  4726019778 \caption:   a woman is spraying a plant into a pot.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  3%|▎         | 280/8946 [03:37<1:33:09,  1.55it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000211302 \caption:  a refrigerator, refrigerator, and a refrigerator.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  3%|▎         | 281/8946 [03:38<1:30:43,  1.59it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000549459 \caption:  a kitchen with a stove and a window.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 282/8946 [03:38<1:31:31,  1.58it/s]

torch.Size([1, 11, 197])

 file name:  000000323964 \caption:  a girl smiles while holding a plate of cookies.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 283/8946 [03:39<1:37:04,  1.49it/s]

torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000280023 \caption:  a pizza with a knife on it on a cutting board.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  3%|▎         | 284/8946 [03:40<1:31:00,  1.59it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  1680126311 \caption:   two boys are playing soccer with a ball
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 285/8946 [03:40<1:31:42,  1.57it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000367881 \caption:  a large brown bear standing next to a tree.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 286/8946 [03:41<1:32:19,  1.56it/s]

torch.Size([1, 11, 197])

 file name:  000000195267 \caption:  a woman on a bike walks down a street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 287/8946 [03:42<1:35:26,  1.51it/s]

torch.Size([1, 12, 197])

 file name:  000000311904 \caption:  a woman in a white dress holding a tennis racket.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 288/8946 [03:42<1:37:32,  1.48it/s]

torch.Size([1, 12, 197])

 file name:  000000303743 \caption:  a green and green train engine is pulling a cart.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 289/8946 [03:43<1:38:44,  1.46it/s]

torch.Size([1, 12, 197])

 file name:  000000252280 \caption:  a busy street with a large number of parked bikes.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 290/8946 [03:44<1:34:38,  1.52it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000302710 \caption:  a couple of motorcycles parked on the street.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 291/8946 [03:44<1:36:46,  1.49it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000524651 \caption:  a couple of skiers are posing for a picture.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  3%|▎         | 292/8946 [03:45<1:45:49,  1.36it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000564443 \caption:   a man in a white shirt and black pants walks down a sidewalk .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 293/8946 [03:46<1:47:18,  1.34it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])

 file name:  000000160014 \caption:  a room with a couch, chair, and a television.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 294/8946 [03:47<1:45:20,  1.37it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  3368819708 \caption:  a plane is flying in the air behind a tree.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  3%|▎         | 295/8946 [03:47<1:46:55,  1.35it/s]


 file name:  000000266437 \caption:  a young man riding a skateboard down a wooden bench.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])


  3%|▎         | 296/8946 [03:48<1:45:23,  1.37it/s]

torch.Size([1, 12, 197])

 file name:  000000302141 \caption:  a large airplane is parked next to a large building.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 297/8946 [03:49<1:39:04,  1.45it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])

 file name:  000000302489 \caption:  a white and white structure with a large umbrella
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 298/8946 [03:49<1:37:21,  1.48it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000509267 \caption:  a man is skiing down a snow covered road.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 299/8946 [03:50<1:36:03,  1.50it/s]

torch.Size([1, 11, 197])

 file name:  7525845590 \caption:   a boy in a white shirt is playing golf .
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])


  3%|▎         | 300/8946 [03:51<1:35:03,  1.52it/s]

torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000314788 \caption:  a large plane is flying over a large city.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  3%|▎         | 301/8946 [03:51<1:37:01,  1.48it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000559136 \caption:  a snowboarder is jumping a steep snow course.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])


  3%|▎         | 302/8946 [03:52<1:36:06,  1.50it/s]

torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000022440 \caption:  a bus driving down a street in a city.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])


  3%|▎         | 303/8946 [03:53<1:46:02,  1.36it/s]

torch.Size([1, 14, 197])
torch.Size([1, 15, 197])

 file name:  000000492025 \caption:  a small child is holding a large teddy bear in the garden.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 304/8946 [03:54<1:44:50,  1.37it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000532030 \caption:  a herd of sheep are all standing on a road.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])


  3%|▎         | 305/8946 [03:54<1:36:40,  1.49it/s]

torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])

 file name:  000000445267 \caption:  a cat is sitting on a stuffed animal
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])


  3%|▎         | 306/8946 [03:55<1:35:34,  1.51it/s]

torch.Size([1, 10, 197])
torch.Size([1, 11, 197])

 file name:  000000199819 \caption:  a young boy holding a banana in his hand.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 307/8946 [03:55<1:37:31,  1.48it/s]

torch.Size([1, 11, 197])
torch.Size([1, 12, 197])

 file name:  000000405675 \caption:  a yellow fire hydrant that is in the grass.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])


  3%|▎         | 308/8946 [03:56<1:36:15,  1.50it/s]

torch.Size([1, 11, 197])

 file name:  000000145391 \caption:  a pair of scissors and a pair of scissors.
torch.Size([1, 1, 197])
torch.Size([1, 2, 197])
torch.Size([1, 3, 197])
torch.Size([1, 4, 197])
torch.Size([1, 5, 197])
torch.Size([1, 6, 197])
torch.Size([1, 7, 197])
torch.Size([1, 8, 197])
torch.Size([1, 9, 197])
torch.Size([1, 10, 197])
torch.Size([1, 11, 197])
torch.Size([1, 12, 197])
torch.Size([1, 13, 197])
torch.Size([1, 14, 197])
torch.Size([1, 15, 197])
torch.Size([1, 16, 197])
torch.Size([1, 17, 197])
torch.Size([1, 18, 197])
torch.Size([1, 19, 197])
torch.Size([1, 20, 197])
torch.Size([1, 21, 197])
torch.Size([1, 22, 197])
torch.Size([1, 23, 197])
torch.Size([1, 24, 197])
torch.Size([1, 25, 197])
torch.Size([1, 26, 197])
torch.Size([1, 27, 197])
torch.Size([1, 28, 197])
torch.Size([1, 29, 197])
torch.Size([1, 30, 197])
torch.Size([1, 31, 197])
torch.Size([1, 32, 197])
torch.Size([1, 33, 197])
torch.Size([1, 34, 197])
torch.Size([1, 35, 197])
torch.Size([1, 36, 197])
tor

  3%|▎         | 308/8946 [04:07<1:55:32,  1.25it/s]


KeyboardInterrupt: ignored

In [None]:
# model = Decoder(cfg).to(device)
# model.load_state_dict(torch.load('/content/trainable_weights', map_location=device), strict=False)

In [None]:
# evaluation_dict = {}
# for data in tqdm(val_loader):
#     img = data['img'].to(device)
#     file_name = data['filename']
#     start_token = torch.tensor([[50256]]).to(device)

#     for i in range(250):
#         with torch.no_grad():
#             pred = model(start_token, img)
#         out_token = pred.argmax(dim=2)[0][-1]
#         start_token = torch.cat((start_token, out_token.unsqueeze(0).unsqueeze(0)), dim=1)
#         end_token = torch.sum(start_token[0] == 50256).item()
#         if end_token == 2:
#             pred_token = start_token[start_token != 50256]
#             pred_token = pred_token.tolist()
#             pred_caption = encoding.decode(pred_token)
#             break

#     evaluation_dict[file_name[0]] = pred_caption
#     print('\n', 'file name: ', file_name[0], '\caption: ', evaluation_dict[file_name[0]])

In [None]:
# # Convert dictionary to JSON string
# json_string = json.dumps(evaluation_dict, indent=2)  # The indent parameter is optional and adds indentation for better readability

# # Write JSON string to a file
# with open('/content/drive/MyDrive/NTU_DLCV/Hw3/p2_output/large_epoch6_output.json', 'w') as json_file:
#     json_file.write(json_string)

# # Write JSON string to a file
# with open('output.json', 'w') as json_file:
#     json_file.write(json_string)

#### CIDEr & CLIPScore

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
!pip install git+https://github.com/bckim92/language-evaluation.git
!python -c "import language_evaluation; language_evaluation.download('coco')"

In [None]:
!python3 /content/p2_evaluate.py --pred_file /content/output.json --annotation_file /content/hw3_data/p2_data/val.json --images_root /content/hw3_data/p2_data/images/val