<a href="https://colab.research.google.com/github/R12942159/NTU_DLCV/blob/Hw3/p2_Image_caption_large_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install timm
!pip install loralib

In [2]:
import os
import re
import math
import timm
import json
import torch
import collections
import numpy as np
import loralib as lora
import torch.nn.functional as F
import torchvision.transforms as tr

from PIL import Image
from tqdm import tqdm
from pathlib import Path
from torch import nn, Tensor
from torch.utils.data import DataLoader

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")

Using: cuda


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Download dataset and unzip zip file.

In [None]:
!gdown 1SUiRrG6zQVtyrVSVh9hOBq5_fX-oV2Lh -O hw3_data.zip # 11rP6KmR5Qwjhx0rfag0b5TZGBTRuPtQR
!unzip /content/hw3_data.zip

#### Tokenizer ('<|endoftext|>', 50256) -> 250dim

In [6]:
class BPETokenizer:

    def __init__(self, encoder_file, vocab_file):
        with open(encoder_file, 'r', encoding='utf-8') as f:
            self.encoder = json.load(f)
        self.decoder = {v:k for k,v in self.encoder.items()}
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = f.read().split('\n')[1:-1]
        self.bpe_ranks = {tuple(line.split()): i for i, line in enumerate(vocab)}
        assert len(self.encoder) == 50257 and len(self.bpe_ranks) == 49999 # len(self.bpe_ranks) == 50000
        bs = list(range(33, 127)) + list(range(161, 256))
        xs = list(range(0, 33)) + list(range(127, 161))
        cs = bs[:] + [2**8 + i for i in range(len(xs))]
        self.byte_encoder = dict(zip(bs + xs, [chr(n) for n in cs]))
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}

    def encode(self, text, allowed_special=None):
        tokens = re.findall(r"""<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d| ?""" +
                            r"""\w+| ?\d+| ?[^\s\w\d]+|\s+(?!\S)|\s+""", text, re.UNICODE)
        def translate(token):
            if token == '<|endoftext|>':
                assert allowed_special and token in allowed_special
                return [token]
            word = tuple(''.join(self.byte_encoder[byte] for byte in token.encode('utf-8')))
            while len(word) != 1:
                pairs = set((word[i], word[i+1]) for i in range(len(word)-1))
                bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
                if bigram not in self.bpe_ranks:
                    break
                a, b = bigram
                new_word = []
                i = 0
                while i < len(word):
                    j = word.index(a, i) if a in word[i:] else len(word)
                    new_word.extend(word[i:j])
                    i = j
                    if i < len(word):
                        j = 2 if i < len(word)-1 and word[i] == a and word[i+1] == b else 1
                        new_word.append(a+b if j == 2 else word[i])
                        i += j
                word = tuple(new_word)
            return word
        return [self.encoder[_] for token in tokens for _ in translate(token)]

    def decode(self, tokens):
        tokens = [self.decoder[token] for token in tokens]
        buffer = bytearray([self.byte_decoder[c] for c in ''.join(tokens)])
        return buffer.decode('utf-8', errors='replace')

In [7]:
encoding = BPETokenizer('/content/encoder.json', '/content/vocab.bpe')
# prompt = 'a kitchen with a sink and many cooking machines and a pot of food'

# text_embedding_len = 250

# context = encoding.encode(prompt)
# context = [50256] + context + [50256]*(text_embedding_len - len(context) - 1)
# # context
# encoding.decode(context)

#### Define function

In [8]:
def json_load(json_path: str):
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [9]:
def caption_with_id(json_path: str) -> list:
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    data = [{'caption': row['caption'], 'image_id': row['image_id']} for row in json_data['annotations']]
    return data

In [10]:
def id2file_name(json_path: str) -> dict:
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    data = {row['id']: row['file_name'] for row in json_data['images']}
    return data

In [11]:
encoder_joson_path = '/content/encoder.json'
vocab_bpe_path = '/content/vocab.bpe'
def collate_fn(batch, tokenizer=BPETokenizer(encoder_joson_path, vocab_bpe_path)):
    # Get the individual elements of the batch
    images = [item['img'] for item in batch]
    captions = [item['caption'] for item in batch]
    filenames = [item['filename'] for item in batch]

    # Tokenize captions
    tokenized_captions = [tokenizer.encode(caption) for caption in captions]

    # Pad the vector length into stop token to dimension 250
    text_len = 250 # text_embedding_len
    tokenized_captions_train = [
        [50256] + caption + [50256] * (text_len - len(caption) - 1) for caption in tokenized_captions
    ]
    tokenized_captions_inf = [
        caption + [50256] + [-100] * (text_len - len(caption) - 1) for caption in tokenized_captions
    ]

    # Convert tokenized captions to PyTorch tensors
    tokenized_captions_train = [torch.tensor(caption) for caption in tokenized_captions_train]
    tokenized_captions_inf = [torch.tensor(caption) for caption in tokenized_captions_inf]

    # Create a new batch with tokenized captions
    tokenized_batch = {
        'img': torch.stack(images, dim=0),
        'tokenized_captions_train': torch.stack(tokenized_captions_train, dim=0),
        'filename': filenames,
        'tokenized_captions_inf': torch.stack(tokenized_captions_inf, dim=0),
    }

    return tokenized_batch

#### Build Dataset

In [12]:
class ImgCaptionDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, json_path, transform) -> None:
        super(ImgCaptionDataset, self).__init__()
        self.img_dir = img_dir
        self.transform = transform

        # Connect caption -> image_id -> file_name
        self.caption_with_id = caption_with_id(json_path)
        self.id2file_name = id2file_name(json_path)
    def __len__(self) -> int:
        return len(self.caption_with_id)

    def __getitem__(self, idx):
        caption_id = self.caption_with_id[idx]
        file_name = self.id2file_name[caption_id['image_id']]
        img = Image.open(os.path.join(self.img_dir, file_name)).convert('RGB')
        img = self.transform(img)
        return {'img': img, 'caption': caption_id['caption'], 'filename': os.path.splitext(file_name)[0]}

#### Build Dataloader

In [13]:
train_ds = ImgCaptionDataset(
    img_dir='/content/hw3_data/p2_data/images/train',
    json_path='/content/hw3_data/p2_data/train.json',
    transform=tr.Compose([
        tr.Resize(224),
        tr.CenterCrop(224),
        tr.ToTensor(),
        tr.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)
val_ds = ImgCaptionDataset(
    img_dir='/content/hw3_data/p2_data/images/val',
    json_path='/content/hw3_data/p2_data/val.json',
    transform=tr.Compose([
        tr.Resize(224),
        tr.CenterCrop(224),
        tr.ToTensor(),
        tr.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

train_loader = DataLoader(
    train_ds,
    batch_size=16,
    collate_fn=collate_fn,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(
    val_ds,
    batch_size=1,
    collate_fn=collate_fn,
    shuffle=True,
    num_workers=4,
)

#### Config

In [14]:
class Config:

    def __init__(self, checkpoint=None):
        self.n_layer = 12
        self.n_head = 12
        self.n_embd = 768
        self.vocab_size = 50257
        self.block_size = 1024
        self.checkpoint = checkpoint

In [15]:
cfg = Config(checkpoint='/content/hw3_data/p2_data/decoder_model.bin')

#### decoder

In [16]:
class Attention(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.c_attn = lora.Linear(cfg.n_embd, 3 * cfg.n_embd, r=4)
        self.c_proj = nn.Linear(cfg.n_embd, cfg.n_embd)
        self.n_head = cfg.n_head
        self.n_embd = cfg.n_embd
        size = cfg.block_size
        self.register_buffer('bias', torch.tril(torch.ones(size, size)).view(1, 1, size, size))

    def forward(self, x):
        B, T, C = x.size() # batch, context, embedding
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        return self.c_proj((att @ v).transpose(1, 2).contiguous().view(B, T, C))

class CrossAttention(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(cfg.n_embd, cfg.n_head, batch_first=True)

    def forward(self, query, encoder_out):
        """
        Q is the source from the decoder, K, V are the sources from the encoder.
        Q: (N, L, Eq), where L is the target embedding dim, Eq is embed_dim and batch_first=True.
        {K, V}: (N, L, E{k,v}), where L is the source embedding dim, E{k,v} is {k,v}_dim and batch_first=True.
        """
        attn_output, attn_output_weights = self.multihead_attn(query, encoder_out, encoder_out)
        return attn_output #, attn_output_weights

class Block(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.ln_1 = nn.LayerNorm(cfg.n_embd)
        self.ln_2 = nn.LayerNorm(cfg.n_embd) # add
        self.ln_3 = nn.LayerNorm(cfg.n_embd)
        self.attn = Attention(cfg)
        self.crs_attn = CrossAttention(cfg) # add
        self.mlp = nn.Sequential(collections.OrderedDict([
            ('c_fc', nn.Linear(cfg.n_embd, 4 * cfg.n_embd)),
            ('act', nn.GELU(approximate='tanh')),
            ('c_proj', nn.Linear(4 * cfg.n_embd, cfg.n_embd))
        ]))

    def forward(self, x, encoder_out) -> Tensor: # add
        x = x + self.attn(self.ln_1(x))
        cross_x = self.crs_attn(self.ln_2(x), self.ln_2(encoder_out)) # add #, weights
        x = cross_x + x
        x = x + self.mlp(self.ln_3(x))
        return x #, weights

class Decoder(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.block_size = cfg.block_size
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(cfg.vocab_size, cfg.n_embd), # 文字投影
            wpe = nn.Embedding(cfg.block_size, cfg.n_embd), # position
            h = nn.Sequential(*[Block(cfg) for _ in range(cfg.n_layer)]), # Nx
            ln_f = nn.LayerNorm(cfg.n_embd)
        ))
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        # timm's ViT encoder (vit_base_patch16_224_in21k, , vit_huge_patch14_224_in21k)
        self.encoder = timm.create_model('vit_large_patch16_224_in21k', pretrained=True)
        self.linear = nn.Linear(1024, cfg.n_embd) # [16, 197, 1024]
        # load checkpoint
        if self.cfg.checkpoint is not None:
            state_dict = torch.load(self.cfg.checkpoint)
            transposed = [ '.c_attn.weight', '.c_fc.weight', '.c_proj.weight' ]
            for key, value in state_dict.items():
                if any(key.endswith(w) for w in transposed):
                    state_dict[key] = value.t()
            self.transformer.load_state_dict(state_dict, strict=False)

    def forward(self, x: Tensor, img: Tensor) -> Tensor: # add
        x = torch.narrow(x, 1, 0, min(x.size(1), self.block_size))
        pos = torch.arange(x.size()[1], dtype=torch.long, device=x.device).unsqueeze(0)
        x = self.transformer.wte(x) + self.transformer.wpe(pos)
        with torch.no_grad():
            encoder_out = self.encoder.forward_features(img)
        for block in self.transformer.h:
            x = block(x, self.linear(encoder_out)) #, weights
        x = self.lm_head(self.transformer.ln_f(x)) # add
        return x #, weights

#### Decoding test

In [17]:
# decoder = Decoder(cfg)

In [18]:
# for batch in train_loader:
#     img = batch['img']
#     tokenized_captions_train = batch['tokenized_captions_train']
#     tokenized_captions_inf = batch['tokenized_captions_inf']
#     break

In [19]:
# pred = decoder(tokenized_captions_train, img)

In [20]:
# pred.size(), tokenized_captions_train.size(), img.size(), tokenized_captions_inf.size()

In [21]:
# encoding.decode(pred[0][0].argmax(dim=1).tolist())

In [22]:
# loss_fn = nn.CrossEntropyLoss() # ignore_index=50256

# pred = pred[0].reshape(-1, 50257)
# tokenized_captions_inf = tokenized_captions_inf.reshape(-1)
# loss_fn(pred, tokenized_captions_inf)

##### Freeze parameters

In [None]:
# model = Decoder(cfg).to(device)

# # Freeze parameters
# for name, param in model.named_parameters():
#     param.requires_grad=False

# # Unfreeze some parameters
# for i in range(12):
#     model.transformer.h[i].ln_2.weight.requires_grad = True
#     model.transformer.h[i].ln_2.bias.requires_grad = True
#     model.transformer.h[i].attn.c_attn.lora_A.requires_grad = True
#     model.transformer.h[i].attn.c_attn.lora_B.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.in_proj_bias.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.weight.requires_grad = True
#     model.transformer.h[i].crs_attn.multihead_attn.out_proj.bias.requires_grad = True
# model.linear.weight.requires_grad = True
# model.linear.bias.requires_grad = True

# trainable_weights = [name for name, param in model.named_parameters() if param.requires_grad == True]
# # list for True
# for name, param in model.named_parameters():
#     print(f"{name}: {param.requires_grad}")

#### Training

In [24]:
# def training(dataloader, model, loss_fn, optimizer):

#     size = len(dataloader.dataset) # number of samples
#     num_batches = len(dataloader) # batches per epoch
#     epoch_loss = 0

#     model.train() # to training mode
#     for batch_i, data in enumerate(tqdm(dataloader)):
#         data['img'] = data['img'].to(device, non_blocking=True)
#         data['tokenized_captions_train'] = data['tokenized_captions_train'].to(device, non_blocking=True)
#         data['tokenized_captions_inf'] = data['tokenized_captions_inf'].to(device, non_blocking=True)

#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # Compute prediction loss
#         pred = model(data['tokenized_captions_train'], data['img'])
#         # reshape to (B, C)
#         data['tokenized_captions_inf'] = data['tokenized_captions_inf'].reshape(-1)
#         pred = pred.reshape(-1, 50257)
#         loss = loss_fn(pred, data['tokenized_captions_inf']) # tokenized captions inf

#         # Optimization by gradients
#         loss.backward() # backpropagation to compute gradients
#         optimizer.step() # update model params

#         # write to logs
#         epoch_loss += loss.item() # tensor -> python value
#     return epoch_loss/num_batches

In [None]:
# EPOCHS = 8
# loss_fn = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# # logs
# logs = {
#     'train_loss': []
# }

# for epoch in tqdm(range(EPOCHS)):
#     train_loss = training(train_loader, model, loss_fn, optimizer)

#     print(f'EPOCH: {epoch:04d} \train_loss: {train_loss:.4f}')

#     logs['train_loss'].append(train_loss)

#     # Save model
#     save_weights = {k: v for k, v in model.state_dict().items() if k in trainable_weights}
#     torch.save(save_weights, f'/content/drive/MyDrive/NTU_DLCV/Hw3/p2_ckpt_large_LoRA/trainable_weights_LoRA_epoch{epoch}_{train_loss:.4f}.pth')
#     print('---------- Model Save ----------')

#### Check the model params less than 35M

In [18]:
!gdown 1-A5b7or2nZfLUXxKJ66q9QdupiZLRfvf -O LoRA_trainable_weights

Downloading...
From: https://drive.google.com/uc?id=1-A5b7or2nZfLUXxKJ66q9QdupiZLRfvf
To: /content/LoRA_trainable_weights
100% 117M/117M [00:02<00:00, 41.0MB/s]


In [20]:
model = Decoder(cfg)
# Freeze parameters
for name, param in model.named_parameters():
    param.requires_grad=False
    # print(f"{name}: {param.requires_grad}")
# Unfreeze some parameters
for i in range(12):
    model.transformer.h[i].ln_2.weight.requires_grad = True
    model.transformer.h[i].ln_2.bias.requires_grad = True
    model.transformer.h[i].attn.c_attn.lora_A.requires_grad = True
    model.transformer.h[i].attn.c_attn.lora_B.requires_grad = True
    model.transformer.h[i].crs_attn.multihead_attn.in_proj_weight.requires_grad = True
    model.transformer.h[i].crs_attn.multihead_attn.in_proj_bias.requires_grad = True
    model.transformer.h[i].crs_attn.multihead_attn.out_proj.weight.requires_grad = True
    model.transformer.h[i].crs_attn.multihead_attn.out_proj.bias.requires_grad = True
model.linear.weight.requires_grad = True
model.linear.bias.requires_grad = True

model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

In [22]:
model.load_state_dict(torch.load('/content/LoRA_trainable_weights', map_location=device), strict=False)
print('Total params: ', sum(params.numel() for params in model.parameters() if params.requires_grad))

Total params:  29301504


#### inference

In [None]:
!gdown 1-A5b7or2nZfLUXxKJ66q9QdupiZLRfvf -O trainable_weights4 # epoch4
!gdown 1-7X5RmB1vBgglTG4Hnzt7Xrk5z6NSQCo -O trainable_weights3 # epoch3
!gdown 1-2aI-6St6eOompVzT5OrIcbELkgTSSIl -O trainable_weights2 # epoch2
!gdown 1-1siArvAkGcXUke88C4X9WOcw6QboFlZ -O trainable_weights1 # epoch1

In [None]:
for i in range(4, 0, -1):
    model = Decoder(cfg).to(device)
    model.load_state_dict(torch.load(f'/content/trainable_weights{i}', map_location=device), strict=False)
    print(f'---------- trainable weights {i} is using ----------')

    evaluation_dict = {}
    for data in tqdm(val_loader):
        img = data['img'].to(device)
        file_name = data['filename']
        start_token = torch.tensor([[50256]]).to(device)

        for j in range(250):
            with torch.no_grad():
                pred = model(start_token, img)
                # print(weights.size())

            out_token = pred.argmax(dim=2)[0][-1]
            start_token = torch.cat((start_token, out_token.unsqueeze(0).unsqueeze(0)), dim=1)
            end_token = torch.sum(start_token[0] == 50256).item()
            if end_token == 2:
                pred_token = start_token[start_token != 50256]
                pred_token = pred_token.tolist()
                pred_caption = encoding.decode(pred_token)
                break

        evaluation_dict[file_name[0]] = pred_caption
        print('\n', 'file name: ', file_name[0], '\caption: ', evaluation_dict[file_name[0]])

    json_string = json.dumps(evaluation_dict, indent=2)  # The indent parameter is optional and adds indentation for better readability
    with open(f'/content/drive/MyDrive/NTU_DLCV/Hw3/p2_ckpt_large_LoRA/LoRA_large_epoch{i}_output.json', 'w') as json_file:
        json_file.write(json_string)
    print(f'---------- Epoch{i} large params Saved ----------')
    with open('output.json', 'w') as json_file:
        json_file.write(json_string)

---------- trainable weights 4 is using ----------


  0%|          | 1/8946 [00:00<2:23:23,  1.04it/s]


 file name:  000000402095 \caption:  a man holding a cell phone in his hands.


  0%|          | 2/8946 [00:01<2:04:21,  1.20it/s]


 file name:  000000206587 \caption:  a blue and blue train is flying through a grassy area.


  0%|          | 3/8946 [00:02<1:54:47,  1.30it/s]


 file name:  000000484835 \caption:  a bathroom with a sink and a sink in a bathroom.


  0%|          | 4/8946 [00:03<1:53:42,  1.31it/s]


 file name:  000000473261 \caption:   a young girl in a blue shirt is sitting at a computer .


  0%|          | 5/8946 [00:03<1:52:54,  1.32it/s]


 file name:  000000162855 \caption:  a young man is throwing a frisbee in a field.


  0%|          | 6/8946 [00:04<1:52:36,  1.32it/s]


 file name:  3052038928 \caption:  a couple of people are playing wii while playing video games.


  0%|          | 7/8946 [00:05<1:44:19,  1.43it/s]


 file name:  000000189295 \caption:  a traffic light is shown on a street light.


  0%|          | 8/8946 [00:05<1:28:52,  1.68it/s]


 file name:  4439517165 \caption:   two people walk through a building


  0%|          | 9/8946 [00:06<1:33:27,  1.59it/s]


 file name:  000000509620 \caption:  a double decker bus is parked next to a bus.


  0%|          | 10/8946 [00:07<1:36:43,  1.54it/s]


 file name:  000000289263 \caption:  a woman in a bikini bikini bikini holding a surfboard.


  0%|          | 11/8946 [00:07<1:51:41,  1.33it/s]


 file name:  2069279767 \caption:   a man and a woman sitting on a red and red chair on a red wall .


  0%|          | 12/8946 [00:08<1:51:58,  1.33it/s]


 file name:  000000333621 \caption:  a laptop computer sitting on a desk with a laptop on it.


  0%|          | 13/8946 [00:09<1:47:15,  1.39it/s]


 file name:  000000005757 \caption:  a school bus is parked in front of a bus.


  0%|          | 14/8946 [00:09<1:41:28,  1.47it/s]


 file name:  000000141207 \caption:  a blue and white bus parked outside a building.


  0%|          | 15/8946 [00:10<1:37:44,  1.52it/s]


 file name:  2617812188 \caption:   a woman is painting a sculpture of a sculpture .


  0%|          | 16/8946 [00:11<1:34:58,  1.57it/s]


 file name:  000000092342 \caption:  a man is playing tennis on a tennis court.


  0%|          | 17/8946 [00:11<1:35:29,  1.56it/s]


 file name:  000000383605 \caption:  a baseball player swinging a bat at a baseball game.


  0%|          | 18/8946 [00:12<1:38:22,  1.51it/s]


 file name:  000000394132 \caption:  a teddy bear with a hat sitting on a table.


  0%|          | 19/8946 [00:13<1:37:33,  1.53it/s]


 file name:  000000311904 \caption:  a woman in a bikini top holding a tennis racket.


  0%|          | 20/8946 [00:13<1:34:34,  1.57it/s]


 file name:  000000040100 \caption:  a man holding a baseball bat in his hands.


  0%|          | 21/8946 [00:14<1:35:05,  1.56it/s]


 file name:  4727583716 \caption:   a group of men are standing in a rail yard .


  0%|          | 22/8946 [00:15<1:40:11,  1.48it/s]


 file name:  000000062053 \caption:  a laptop computer, keyboard, and a keyboard on a desk.


  0%|          | 23/8946 [00:15<1:36:33,  1.54it/s]


 file name:  000000030151 \caption:  a vase with a statue of flowers on it


  0%|          | 24/8946 [00:16<1:34:10,  1.58it/s]


 file name:  000000382171 \caption:  a black and white dog running on a path.


  0%|          | 25/8946 [00:17<1:37:06,  1.53it/s]


 file name:  3787451302 \caption:   a man in a green shirt is holding a wooden axe .


  0%|          | 26/8946 [00:17<1:44:11,  1.43it/s]


 file name:  000000306627 \caption:  a man with a mustache and glasses sitting in front of a cake.


  0%|          | 27/8946 [00:18<1:39:32,  1.49it/s]


 file name:  000000156202 \caption:  a man is taking a shot of a man.


  0%|          | 28/8946 [00:19<1:45:57,  1.40it/s]


 file name:  177222949 \caption:  a man in a white shirt and jeans is walking down the street.


  0%|          | 29/8946 [00:19<1:31:04,  1.63it/s]


 file name:  4439517165 \caption:   two people walk through a building


  0%|          | 30/8946 [00:20<1:37:28,  1.52it/s]


 file name:  000000406452 \caption:  a pair of black and white cell phones sitting on a table.


  0%|          | 31/8946 [00:21<1:34:41,  1.57it/s]


 file name:  000000182736 \caption:  a large clock tower with a large clock tower.


  0%|          | 32/8946 [00:21<1:37:44,  1.52it/s]


 file name:  000000480797 \caption:  a boat with a bicycle and a bicycle on the water.


  0%|          | 33/8946 [00:22<1:44:43,  1.42it/s]


 file name:  000000074411 \caption:  a fire truck with a fire engine on the side of the road.


  0%|          | 34/8946 [00:23<1:37:22,  1.53it/s]


 file name:  000000403464 \caption:  a busy city street with cars and pedestrians.


  0%|          | 35/8946 [00:23<1:32:12,  1.61it/s]


 file name:  3909183873 \caption:   two musicians are playing guitar and playing guitar .


  0%|          | 36/8946 [00:24<1:31:09,  1.63it/s]


 file name:  000000459465 \caption:  a goat is standing next to a wooden fence.


  0%|          | 37/8946 [00:25<1:40:04,  1.48it/s]


 file name:  8170441792 \caption:   a man in a black shirt and jeans jumps into a rock formation .


  0%|          | 38/8946 [00:25<1:38:59,  1.50it/s]


 file name:  2173061319 \caption:   a person in a blue skis in the snow .


  0%|          | 39/8946 [00:26<1:43:28,  1.43it/s]


 file name:  000000091996 \caption:   a man in a blue shirt is standing in a living room .


  0%|          | 40/8946 [00:27<1:43:58,  1.43it/s]


 file name:  000000022051 \caption:  a group of young boys playing tennis on a tennis court.


  0%|          | 41/8946 [00:27<1:49:16,  1.36it/s]


 file name:  000000181850 \caption:  a person is holding a fork in front of a plate with food.


  0%|          | 42/8946 [00:28<1:48:03,  1.37it/s]


 file name:  000000518194 \caption:  a slice of pizza is on a plate with a fork.


  0%|          | 43/8946 [00:29<1:42:24,  1.45it/s]


 file name:  000000573483 \caption:  a clock on a wall with a clock on it


  0%|          | 44/8946 [00:30<1:45:45,  1.40it/s]


 file name:  000000116557 \caption:  a person holding a hot dog with a hot dog on it.


  1%|          | 45/8946 [00:30<1:43:20,  1.44it/s]


 file name:  000000296871 \caption:  a cat sitting on a table with a beer bottle.


  1%|          | 46/8946 [00:31<1:44:01,  1.43it/s]


 file name:  000000137573 \caption:  a table with a computer and a computer on the floor.


  1%|          | 47/8946 [00:32<1:39:31,  1.49it/s]


 file name:  4563139415 \caption:  a woman is standing in front of a building .


  1%|          | 48/8946 [00:32<1:39:08,  1.50it/s]


 file name:  000000217495 \caption:  a person with a dog standing next to a person.


  1%|          | 49/8946 [00:33<1:43:34,  1.43it/s]


 file name:  000000082312 \caption:  a woman in a red hat is standing next to a bicycle.


  1%|          | 50/8946 [00:34<1:39:15,  1.49it/s]


 file name:  000000495615 \caption:  a person dressed in a costume with a red ribbon


  1%|          | 51/8946 [00:35<1:56:00,  1.28it/s]


 file name:  4209480025 \caption:   a man in a black hat is walking down a street with a snow covered in snow .


  1%|          | 52/8946 [00:36<2:10:23,  1.14it/s]


 file name:  2815256108 \caption:   a man in a white shirt and white shirt standing in front of a yellow and white shop .


  1%|          | 53/8946 [00:36<2:00:31,  1.23it/s]


 file name:  000000459347 \caption:   a man flying a kite on a grassy hill


  1%|          | 54/8946 [00:37<2:03:32,  1.20it/s]


 file name:  000000014475 \caption:  a teddy bear is sitting on a table with a large stuffed animal.


  1%|          | 55/8946 [00:38<2:03:17,  1.20it/s]


 file name:  177222949 \caption:  a man in a white shirt and jeans is walking down the street.


  1%|          | 56/8946 [00:39<1:58:21,  1.25it/s]


 file name:  000000205757 \caption:  a close up of a piece of bread and a sandwich.


  1%|          | 57/8946 [00:39<1:37:34,  1.52it/s]


 file name:  000000051530 \caption:   a man with a hat


  1%|          | 58/8946 [00:40<1:37:43,  1.52it/s]


 file name:  000000240655 \caption:  a snowboarder in the snow on a ramp.


  1%|          | 59/8946 [00:41<1:40:30,  1.47it/s]


 file name:  000000113897 \caption:  a white toilet sitting in a bathroom next to a toilet.


  1%|          | 60/8946 [00:41<1:30:01,  1.65it/s]


 file name:  4014757090 \caption:   two men walk down the street .


  1%|          | 61/8946 [00:41<1:25:03,  1.74it/s]


 file name:  000000480223 \caption:  a white plane flying through the sky.


  1%|          | 62/8946 [00:42<1:36:36,  1.53it/s]


 file name:  000000559768 \caption:  a sandwich and a sandwich on a plate with a drink in it.


  1%|          | 63/8946 [00:43<1:39:33,  1.49it/s]


 file name:  000000373792 \caption:  a herd of sheep are in a field with a fence.


  1%|          | 64/8946 [00:44<1:41:40,  1.46it/s]


 file name:  000000031865 \caption:  two cats looking out of a window looking out the window.


  1%|          | 65/8946 [00:45<1:48:13,  1.37it/s]


 file name:  000000382557 \caption:  a group of people standing around a table with a large framed picture.


  1%|          | 66/8946 [00:45<1:45:26,  1.40it/s]


 file name:  000000536252 \caption:  an elephant is walking through the foliage in the jungle.


  1%|          | 67/8946 [00:46<1:46:00,  1.40it/s]


 file name:  000000334939 \caption:  a plate of food is on a plate with a salad.


  1%|          | 68/8946 [00:47<1:41:22,  1.46it/s]


 file name:  000000417303 \caption:   people walk down a busy street with people walking .


  1%|          | 69/8946 [00:47<1:48:01,  1.37it/s]


 file name:  000000284144 \caption:  a person is sitting in front of a tv in a large room.


  1%|          | 70/8946 [00:48<1:42:40,  1.44it/s]


 file name:  000000416165 \caption:  a building with a clock tower and a tower.


  1%|          | 71/8946 [00:49<1:46:34,  1.39it/s]


 file name:  000000055389 \caption:  a person holding a teddy bear with a small pink ribbon.


  1%|          | 72/8946 [00:49<1:44:16,  1.42it/s]


 file name:  000000310136 \caption:  a baseball player is throwing a baseball on the mound.


  1%|          | 73/8946 [00:50<1:45:09,  1.41it/s]


 file name:  000000549168 \caption:  a plate of food is on a plate with a fork.


  1%|          | 74/8946 [00:51<1:45:46,  1.40it/s]


 file name:  000000373923 \caption:  a woman in blue skiing gear is skiing on the snow.


  1%|          | 75/8946 [00:52<1:48:50,  1.36it/s]


 file name:  211402278 \caption:   a man and a woman are standing in a field with sheep .


  1%|          | 76/8946 [00:52<1:35:55,  1.54it/s]


 file name:  3089992945 \caption:   a man is playing a guitar .


  1%|          | 77/8946 [00:53<1:44:17,  1.42it/s]


 file name:  000000047916 \caption:  a small dog is playing with a frisbee in the grass.


  1%|          | 78/8946 [00:54<1:58:07,  1.25it/s]


 file name:  000000451131 \caption:  a picture of a person in a chair with a small table in front of it.


  1%|          | 79/8946 [00:55<1:54:52,  1.29it/s]


 file name:  000000063856 \caption:   a skateboarder is doing tricks on a skateboard .


  1%|          | 80/8946 [00:55<1:50:17,  1.34it/s]


 file name:  000000081784 \caption:  a man is on the beach holding a surfboard.


  1%|          | 81/8946 [00:56<1:47:06,  1.38it/s]


 file name:  000000213023 \caption:  a zebra is standing in a grassy area.


  1%|          | 82/8946 [00:57<1:49:48,  1.35it/s]


 file name:  000000181330 \caption:  a large metal suitcase is sitting on a bench near a bench.


  1%|          | 83/8946 [00:58<1:49:09,  1.35it/s]


 file name:  000000283382 \caption:  a kitchen with a sink and sink next to a sink.


  1%|          | 84/8946 [00:58<1:56:32,  1.27it/s]


 file name:  7130336193 \caption:  a group of people playing a game of soccer on a grassy field.


  1%|          | 85/8946 [00:59<1:56:36,  1.27it/s]


 file name:  000000216820 \caption:  a plate of fruit is on a plate with bananas and oranges.


  1%|          | 86/8946 [01:00<1:53:59,  1.30it/s]


 file name:  000000345160 \caption:  a woman and a child are sitting on a motorbike.


  1%|          | 87/8946 [01:01<1:54:54,  1.29it/s]


 file name:  000000307989 \caption:  two cats are sitting on a wooden tray next to a bowl.


  1%|          | 88/8946 [01:01<1:50:24,  1.34it/s]


 file name:  000000217393 \caption:  a man standing next to a deer in a pasture.


  1%|          | 89/8946 [01:02<1:42:19,  1.44it/s]


 file name:  000000447879 \caption:  a bird is perched on a tree branch.


  1%|          | 90/8946 [01:02<1:29:11,  1.65it/s]


 file name:  5109882423 \caption:   a man dressed in traditional clothing


  1%|          | 91/8946 [01:03<1:34:57,  1.55it/s]


 file name:  000000556892 \caption:  a man laying on the beach laying on a surfboard.


  1%|          | 92/8946 [01:04<1:46:50,  1.38it/s]


 file name:  000000241918 \caption:  a man in a blue jacket and ski pants standing on a snowy slope.


  1%|          | 93/8946 [01:05<1:42:27,  1.44it/s]


 file name:  000000545950 \caption:  a man riding a horse on a dirt road.


  1%|          | 94/8946 [01:05<1:41:52,  1.45it/s]


 file name:  118717792 \caption:   a group of people are playing drums in a park .


  1%|          | 95/8946 [01:06<1:41:31,  1.45it/s]


 file name:  000000361253 \caption:  a pair of scissors sitting on top of a paper.


  1%|          | 96/8946 [01:07<1:43:50,  1.42it/s]


 file name:  000000139040 \caption:  a man in a blue helmet is sitting on a motorcycle.


  1%|          | 97/8946 [01:08<1:48:16,  1.36it/s]


 file name:  000000305451 \caption:  a man and a woman are playing a game on a table.


  1%|          | 98/8946 [01:08<1:45:54,  1.39it/s]


 file name:  000000330356 \caption:  a giraffe is standing in a grassy field.


  1%|          | 99/8946 [01:09<1:39:17,  1.49it/s]


 file name:  000000169683 \caption:  a herd of sheep grazing in a field.


  1%|          | 100/8946 [01:10<1:57:39,  1.25it/s]


 file name:  000000126263 \caption:  a black and white photo of a black and white photo of a black and white dog.


  1%|          | 101/8946 [01:11<1:59:59,  1.23it/s]


 file name:  000000408425 \caption:  a room with a bed, chair, chair, and a chair.


  1%|          | 102/8946 [01:11<1:48:59,  1.35it/s]


 file name:  000000100667 \caption:  two men are holding bananas on their fingers.


  1%|          | 103/8946 [01:12<1:48:30,  1.36it/s]


 file name:  000000018183 \caption:  a clock is lit up on the wall of a building.


  1%|          | 104/8946 [01:13<1:46:01,  1.39it/s]


 file name:  000000206731 \caption:  a blue teddy bear is sitting on a counter.


  1%|          | 105/8946 [01:13<1:41:23,  1.45it/s]


 file name:  3315250232 \caption:  a little girl running on a grassy field.


  1%|          | 106/8946 [01:14<1:43:34,  1.42it/s]


 file name:  000000072396 \caption:  a woman in a blue shirt is sitting at a table.


  1%|          | 107/8946 [01:15<1:42:25,  1.44it/s]


 file name:  000000329175 \caption:  a skateboarder doing a trick on a ramp.


#### CIDEr & CLIPScore

In [None]:
# !gdown 1SUiRrG6zQVtyrVSVh9hOBq5_fX-oV2Lh -O hw3_data.zip # 11rP6KmR5Qwjhx0rfag0b5TZGBTRuPtQR
# !unzip /content/hw3_data.zip

In [None]:
# import json

# with open('/content/large_epoch6_output.json', 'r') as file:
#     data = json.load(file)

In [None]:
# !gdown 1VKQV8b5IIL6AWXnNlOXe45_njmUDyWCP -O large_epoch5_output.json

In [None]:
# !pip install git+https://github.com/openai/CLIP.git

In [None]:
# !pip install git+https://github.com/bckim92/language-evaluation.git
# !python -c "import language_evaluation; language_evaluation.download('coco')"

In [None]:
# !python3 /content/p2_evaluate.py --pred_file /content/large_epoch6_output.json --annotation_file /content/hw3_data/p2_data/val.json --images_root /content/hw3_data/p2_data/images/val