# Image Captioning with Transformers

In [1]:
!nvidia-smi

Sun Jun 27 02:53:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!apt install -qq pigz
%pip install -q timm wandb
%pip install -q --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110

The following NEW packages will be installed:
  pigz
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 57.4 kB of archives.
After this operation, 259 kB of additional disk space will be used.
Selecting previously unselected package pigz.
(Reading database ... 160772 files and directories currently installed.)
Preparing to unpack .../archives/pigz_2.4-1_amd64.deb ...
Unpacking pigz (2.4-1) ...
Setting up pigz (2.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
[K     |████████████████████████████████| 348kB 5.9MB/s 
[K     |████████████████████████████████| 1.8MB 8.1MB/s 
[K     |████████████████████████████████| 133kB 21.7MB/s 
[K     |████████████████████████████████| 174kB 20.4MB/s 
[K     |████████████████████████████████| 102kB 9.4MB/s 
[K     |████████████████████████████████| 71kB 7.8MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |███

In [3]:
!git clone https://github.com/ShivamShrirao/Image-Captioning-Transformers

Cloning into 'Image-Captioning-Transformers'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 46 (delta 16), reused 38 (delta 8), pack-reused 0[K
Unpacking objects: 100% (46/46), done.


# Download Dataset and Annotations

In [4]:
!mkdir ~/.kaggle/
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d shivamshrirao/coco-trainval2017-320x320

Downloading coco-trainval2017-320x320.zip to /content
100% 3.45G/3.46G [00:37<00:00, 108MB/s]
100% 3.46G/3.46G [00:37<00:00, 100MB/s]


In [6]:
!unzip -q coco-trainval2017-320x320.zip

In [7]:
# !gdown --id 1-3vdwBlY-CdVultkrFwOhyJTGC5TFUV8

In [8]:
# !pigz -dc coco_trainval2017_320x320.tar.gz | tar xf -

In [9]:
from torchvision.datasets.utils import download_and_extract_archive
DATA_DIR = "datasets/COCO"

In [10]:
download_and_extract_archive("http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
                             download_root=DATA_DIR,
                             remove_finished=True)

Downloading http://images.cocodataset.org/annotations/annotations_trainval2017.zip to datasets/COCO/annotations_trainval2017.zip


HBox(children=(FloatProgress(value=0.0, max=252907541.0), HTML(value='')))


Extracting datasets/COCO/annotations_trainval2017.zip to datasets/COCO


In [11]:
!rm coco-trainval2017-320x320.* datasets/COCO/annotations_trainval2017.zip

# Import libraries

In [12]:
%cd Image-Captioning-Transformers

/content/Image-Captioning-Transformers


In [13]:
#hide
%load_ext autoreload
%autoreload 2

In [14]:
# TODO: Try pre trained CLIP

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
from torchvision import datasets, transforms as T

In [16]:
import math
import random
from random import randint
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import OrderedDict

In [17]:
import timm

In [18]:
plt.rcParams['figure.facecolor'] = 'white'

# Wandb Parameters

In [19]:
import wandb

In [20]:
# Default hyper-parameters for the run. They are handed to wandb.init() as the
# run config and read back through CONFIG everywhere else in the notebook.
config_defaults = {
    'BATCH_SIZE'        : 256,                  # samples per batch for both the train and val loaders
    'd_model'           : 512,                  # forwarded to CaptionModel(d_model=...)
    'dim_feedforward'   : 1024,                 # forwarded to CaptionModel(dim_feedforward=...)
    'nheads'            : 8,                    # forwarded to CaptionModel(nheads=...)
    'num_decoder_layers': 6,                    # forwarded to CaptionModel(num_decoder_layers=...)
    'dp_rate'           : 0.1,                  # forwarded to CaptionModel(dp_rate=...)
    'encoder'           : 'seresnext50_32x4d',  # model name given to timm.create_model
    'activation'        : 'gelu',               # forwarded to CaptionModel(activation=...)
    'max_lr'            : 6e-4,                 # Adam `lr` and OneCycleLR `max_lr`
    'betas'             : (0.9, 0.98),          # Adam betas
    'eps'               : 1e-9,                 # Adam eps
    'seed'              : 62134,                # passed to seed_everything()
    'use_amp'           : True,                 # enables autocast and the GradScaler
    'use_pe'            : True,                 # NOTE(review): not read anywhere in this notebook - confirm it is still needed
    'log_interval'      : 5,                    # train_epoch logs to wandb every this many batches
}
# Plain-dict fallback; replaced by wandb.config once wandb.init() has run.
CONFIG = config_defaults

In [21]:
# #hide
# run = wandb.init(id='xjgc55j0', project="Image_Captioning_Transformer", resume='must')
# CONFIG = run.config

In [22]:
run = wandb.init(project="Image_Captioning_Transformer", entity="shivamshrirao", config=config_defaults)
CONFIG = wandb.config

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [24]:
def seed_everything(seed=33):
    """Seed every random number generator the notebook relies on.

    Seeds NumPy, Python's ``random`` module, PyTorch and the current CUDA
    device with the same ``seed``, then turns on cuDNN benchmark
    (autotuning) mode. cuDNN's deterministic mode is not enabled here.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.benchmark = True
    
seed_everything(CONFIG['seed'])

# Preprocessing Transforms

In [25]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
input_size = 224

In [27]:
# first transform crop while loading, then do rest later in batch on device
# PIL-based preprocessing pipelines keyed by split. Both end with ToTensor and
# normalisation using the mean/std triples listed below.
preproc = {
    # Training: random scale/aspect crop to `input_size`, then a random horizontal flip.
    'train': T.Compose([
        T.RandomResizedCrop(input_size, interpolation=T.InterpolationMode.BICUBIC),
        # RandomHorizontalFlip's first argument is the flip probability `p`.
        # Passing `input_size` (224) there made every image flip; use the
        # default p=0.5 so the augmentation is actually random.
        T.RandomHorizontalFlip(),
        lambda image: image.convert("RGB"),     # force 3 channels (greyscale / palette images)
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    # Validation: deterministic resize of the shorter side followed by a centre crop.
    'val': T.Compose([
        T.Resize(input_size, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(input_size),
        lambda image: image.convert("RGB"),     # force 3 channels (greyscale / palette images)
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

## Read COCO dataset

In [28]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchvision.io import read_file, decode_jpeg, ImageReadMode
import torchtext
import os

In [29]:
DATA_DIR = "../datasets/COCO/"

In [30]:
class TensorCocoCaptions(datasets.CocoDetection):
    """COCO captions dataset that serves undecoded image files and pre-tokenised captions.

    Images are returned as the output of ``torchvision.io.read_file`` (the file
    contents, not a decoded picture), so decoding can happen later in the
    loading pipeline. Captions are converted to index tensors once, up front,
    by :meth:`fill_token_dict`; that method must be called before items are
    fetched, otherwise ``self.tokens_dict`` does not exist.
    """

    def __getitem__(self, index: int):
        """Return the item at ``index``, wrapping around past the end of the dataset."""
        # The modulo lets a caller keep counting past the last sample (for
        # example to fill up a final partial batch) without an IndexError.
        return super().__getitem__(index % len(self.ids))

    def _load_target(self, id):
        """Return the list of pre-tokenised caption tensors for image ``id``."""
        return self.tokens_dict[id]

    def fill_token_dict(self, tokenizer, vocab, bos_idx, eos_idx):
        """Tokenise every caption once and cache the result per image id.

        Args:
            tokenizer: Callable mapping a caption string to a list of tokens.
            vocab: Callable mapping a list of tokens to a list of indices.
            bos_idx: Index prepended to every caption.
            eos_idx: Index appended to every caption.
        """
        self.tokens_dict = {}                       # image id -> list of caption index tensors
        for id in tqdm(self.ids):
            captions = self._load_caption(id)
            # Default integer dtype (int64) is kept on purpose; the closing
            # parenthesis of torch.tensor(...) used to be swallowed by a
            # trailing comment, which made this statement a syntax error.
            self.tokens_dict[id] = [
                torch.tensor([bos_idx] + vocab(tokenizer(cap)) + [eos_idx])
                for cap in captions
            ]

    def _load_caption(self, id):
        """Return the raw caption strings of image ``id`` from the parent's annotations."""
        return [ann["caption"] for ann in super()._load_target(id)]

    def _load_image(self, id):
        """Read the image file of ``id`` from disk without decoding it."""
        path = self.coco.loadImgs(id)[0]["file_name"]
        return read_file(os.path.join(self.root, path))

In [31]:
train_data = TensorCocoCaptions(root=DATA_DIR+"/train2017/",
                                annFile=DATA_DIR+"/annotations/captions_train2017.json")

val_data = TensorCocoCaptions(root=DATA_DIR+"/val2017/",
                              annFile=DATA_DIR+"/annotations/captions_val2017.json")

loading annotations into memory...
Done (t=1.07s)
creating index...
index created!
loading annotations into memory...
Done (t=0.06s)
creating index...
index created!


## Tokenizer and Build Vocab

In [32]:
tokenizer = get_tokenizer('basic_english')

In [33]:
def yield_tokens(cap_data):
    """Lazily yield the tokenised caption of every annotation in ``cap_data``.

    Iterates ``cap_data.coco.anns`` and runs the notebook-level ``tokenizer``
    on each annotation's ``'caption'`` string.
    """
    annotations = cap_data.coco.anns.values()
    yield from (tokenizer(entry['caption']) for entry in annotations)

In [34]:
# Special tokens are placed first in the vocabulary (special_first=True).
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
# The vocabulary is built from the training captions only.
en_vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=special_symbols, special_first=True)

# Look up the index assigned to each special token, in the order listed above.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = en_vocab(special_symbols)
# Tokens missing from the vocabulary map to <unk> instead of raising.
en_vocab.set_default_index(UNK_IDX)

In [35]:
len(en_vocab)

28940

In [36]:
train_data.fill_token_dict(tokenizer, en_vocab, BOS_IDX, EOS_IDX)
val_data.fill_token_dict(tokenizer, en_vocab, BOS_IDX, EOS_IDX)

100%|██████████| 118287/118287 [00:22<00:00, 5319.52it/s]
100%|██████████| 5000/5000 [00:00<00:00, 5530.46it/s]


## Pretrained GloVe Embeddings (currently unused)

In [37]:
# vec.get_vecs_by_tokens(tokens, lower_case_backup=True)

In [38]:
# vec = torchtext.vocab.GloVe('6B', dim=300)
# unk_vec = vec.vectors.mean(dim=0)
# vec.unk_init = lambda x: unk_vec

# Load dataset into batches

In [39]:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from random import shuffle

In [40]:
class ExternalInputIterator(object):
    """Batch source that feeds (image, caption) batches to an external-source pipeline.

    Each call to ``next`` returns ``(img_batch, cap_batch)`` where ``img_batch``
    is a list of ``batch_size`` images taken from ``dataset`` and ``cap_batch``
    is the matching captions padded with ``PAD_IDX`` into one batch-first tensor.

    Args:
        dataset: Indexable dataset returning ``(image, captions)``; it must
            expose an ``ids`` list (shuffled in place when training) and
            tolerate indices past its end, because the last batch is always
            filled to ``batch_size``.
        batch_size: Number of samples per batch.
        training: When true, the sample order is shuffled on every pass and a
            random caption is chosen per image; otherwise the first caption
            is always used.
    """

    def __init__(self, dataset, batch_size, training=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.training = training
        self.idx = 0        # defined up front so next() works before iter() is called
        # Shuffle the ids of *this* dataset rather than a notebook-level global.
        if self.training: shuffle(self.dataset.ids)

    def __iter__(self):
        """Restart from the first sample, reshuffling first when training."""
        self.idx = 0
        if self.training: shuffle(self.dataset.ids)
        return self

    def __next__(self):
        """Return the next ``(img_batch, cap_batch)`` or raise ``StopIteration``."""
        if self.idx >= len(self.dataset):
            # Rewind (and reshuffle) so the object is ready for the next epoch.
            self.__iter__()
            raise StopIteration

        img_batch = []
        cap_batch = []
        for _ in range(self.batch_size):
            img, caps = self.dataset[self.idx]
            img_batch.append(img)
            # Random caption while training, first caption otherwise.
            cap = caps[randint(0, len(caps) - 1) if self.training else 0]
            cap_batch.append(cap)
            self.idx += 1
        cap_batch = pad_sequence(cap_batch, batch_first=True, padding_value=PAD_IDX)
        return (img_batch, cap_batch)

    def __len__(self):
        """Number of samples (not batches) in the underlying dataset."""
        return len(self.dataset)

    next = __next__

In [41]:
def ExternalSourcePipeline(batch_size, num_threads, device_id, external_data, training=True):
    """Build the DALI pipeline that decodes, resizes, crops and normalises images.

    Args:
        batch_size: Batch size handed to ``Pipeline``.
        num_threads: Thread count handed to ``Pipeline``.
        device_id: GPU id handed to ``Pipeline``.
        external_data: Source object passed to ``fn.external_source``; it must
            produce two outputs per batch (images and labels).
        training: When true, images are decoded with a random crop and
            mirrored by a coin flip; otherwise they are decoded whole and
            never mirrored.

    Returns:
        The configured ``Pipeline`` whose outputs are ``(images, labels)``.
    """
    pipeline = Pipeline(batch_size, num_threads, device_id)
    with pipeline:
        encoded, captions = fn.external_source(source=external_data, num_outputs=2)

        if not training:
            decoded = fn.decoders.image(encoded, device='mixed', output_type=types.RGB)
            flip = False
        else:
            decoded = fn.decoders.image_random_crop(
                encoded,
                device='mixed',
                output_type=types.RGB,
                num_attempts=100,
                memory_stats=True,
            )
            flip = fn.random.coin_flip(probability=0.5)

        resized = fn.resize(
            decoded,
            device='gpu',
            resize_shorter=input_size,
            interp_type=types.INTERP_TRIANGULAR,
        )

        # Mean/std are scaled by 255 below.
        channel_mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
        channel_std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
        normalized = fn.crop_mirror_normalize(
            resized.gpu(),
            dtype=types.FLOAT,
            output_layout="CHW",
            crop=(input_size, input_size),
            mean=channel_mean,
            std=channel_std,
            mirror=flip,
        )

        pipeline.set_outputs(normalized, captions.gpu())
    return pipeline

In [None]:
# Training loader: the iterator supplies batches, the pipeline processes them,
# and the DALI iterator exposes them to the training loop.
# `size` is the number of samples, since len(train_iter) returns len(dataset).
train_iter = ExternalInputIterator(train_data, CONFIG['BATCH_SIZE'])
pipe = ExternalSourcePipeline(batch_size=CONFIG['BATCH_SIZE'], num_threads=4, device_id=0, external_data=train_iter)
train_loader = DALIClassificationIterator(pipe, dynamic_shape=True, auto_reset=True, last_batch_padded=True, size=len(train_iter))

# Validation loader: same construction with training=False.
# Note that `pipe` is rebound here; the training pipeline stays referenced by train_loader.
val_iter = ExternalInputIterator(val_data, CONFIG['BATCH_SIZE'], training=False)
pipe = ExternalSourcePipeline(batch_size=CONFIG['BATCH_SIZE'], num_threads=4, device_id=0, external_data=val_iter, training=False)
val_loader = DALIClassificationIterator(pipe, dynamic_shape=True, auto_reset=True, last_batch_padded=True, size=len(val_iter))

# Initialize Model

In [43]:
from imcap.layers import *
from imcap.utils import *

In [None]:
# Build the captioning model from a pretrained timm encoder and the decoder
# hyper-parameters in CONFIG, then move it to DEVICE.
# NOTE(review): num_classes=0 with global_pool='' presumably makes timm return
# unpooled feature maps with no classifier head - confirm against CaptionModel.
model = CaptionModel(encoder = timm.create_model(CONFIG['encoder'], pretrained=True, num_classes=0, global_pool=''),
                     vocab_size = len(en_vocab),
                     num_decoder_layers = CONFIG['num_decoder_layers'],
                     nheads = CONFIG['nheads'],
                     d_model = CONFIG['d_model'],
                     dim_feedforward = CONFIG['dim_feedforward'],
                     dp_rate = CONFIG['dp_rate'],
                     activation = CONFIG['activation']).to(DEVICE, non_blocking=True)

# Learning Rate Schedule

In [45]:
steps_per_epoch = len(train_loader)

In [None]:
# def lr_schedule(step, d_model=512, warmup_steps=2*steps_per_epoch):
#     # return 1
#     step = max(1,step)
#     arg1 = step ** -0.5
#     arg2 = step * (warmup_steps ** -1.5)
#     return (d_model ** -0.6) * min(arg1, arg2)

In [None]:
# plt.plot([scheduler.get_last_lr()[0] for _ in range(steps_per_epoch*50) if not scheduler.step()])
# plt.show()

In [None]:
# plt.plot(list(map(lr_schedule, range(steps_per_epoch*50))))
# plt.show()

# Loss Function and Optimizer

In [46]:
# Targets equal to PAD_IDX are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Only parameters with requires_grad=True are handed to the optimizer.
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=CONFIG['max_lr'], betas=CONFIG['betas'], eps=CONFIG['eps']
)
# One-cycle schedule stepped once per batch. The 50 in total_steps is a
# hard-coded epoch count; it matches NUM_EPOCHS (defined further down) only
# by convention, and stepping past total_steps raises an error.
# pct_start=0.04 puts the LR peak after 4% of the steps (2 of 50 epochs).
# final_div_factor < 1 means the final LR is *higher* than the initial LR:
# per the PyTorch docs min_lr = initial_lr / final_div_factor.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=CONFIG['max_lr'], total_steps=50*steps_per_epoch, pct_start=0.04, final_div_factor=0.31)

# Gradient scaler for mixed precision; disabled when use_amp is False.
scaler = torch.cuda.amp.GradScaler(enabled=CONFIG['use_amp'])

In [47]:
wandb.watch(model, log=None)

[<wandb.wandb_torch.TorchGraph at 0x7f04b452bc50>]

# Training functions

In [48]:
from torch.cuda import amp

In [49]:
def train_epoch(model, train_loader, optimizer, scaler, scheduler, epoch=1, use_amp=True, log_interval=10):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    The model is put in train mode while its ``encoder`` sub-module is kept in
    eval mode. Every batch performs a teacher-forced forward pass under
    autocast, a scaled backward pass, an optimizer step and a scheduler step.
    Every ``log_interval`` batches the running mean loss and current learning
    rate are sent to wandb and shown on the progress bar.

    Args:
        model: Captioning model called as ``model(img, tgt_inp, tgt_mask, tgt_pad_mask)``.
        train_loader: Iterable of batches; ``batch[0]`` is a dict with ``'data'`` and ``'label'``.
        optimizer: Optimizer stepped through ``scaler``.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        scheduler: LR scheduler stepped once per batch.
        epoch: Epoch number, used only in the progress-bar description.
        use_amp: Whether autocast is enabled.
        log_interval: Logging period in batches.

    Returns:
        Sum of the batch losses divided by ``len(train_loader)``, as a float.
    """
    model.train()
    model.encoder.eval()
    loss_sum = 0
    progress = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch}")
    with progress as bar:
        for step, batch in bar:
            img = batch[0]['data']
            tgt = batch[0]['label'].transpose(0,1)

            # Teacher forcing: the input drops the last position along the
            # first axis, the target drops the first.
            tgt_inp, tgt_out = tgt[:-1,:], tgt[1:, :]

            tgt_mask = subsequent_mask(tgt_inp.size(0), DEVICE)
            tgt_pad_mask = padding_mask(tgt_inp, PAD_IDX)

            optimizer.zero_grad(set_to_none=True)
            with amp.autocast(enabled=use_amp):
                logits = model(img, tgt_inp, tgt_mask, tgt_pad_mask)
                flat_logits = logits.reshape(-1, logits.size(-1))
                loss = loss_fn(flat_logits, tgt_out.reshape(-1))
                del flat_logits

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            loss_sum += loss.detach_()
            # Drop references to this batch's tensors before the next iteration.
            del loss, logits, batch, img

            if step % log_interval == 0:
                # Converting to float forces a device sync, so it is done only when logging.
                loss_sum = float(loss_sum)
                info = {
                    'loss': loss_sum/(step+1),
                    'lr': optimizer.param_groups[0]['lr'],
                }
                wandb.log(info)
                bar.set_postfix(info)

    optimizer.zero_grad(set_to_none=True)
    return float(loss_sum)/len(train_loader)

In [50]:
@torch.no_grad()
def evaluate(model, val_loader, use_amp=True):
    """Compute the mean batch loss of ``model`` over ``val_loader`` without gradients.

    Args:
        model: Captioning model called as ``model(img, tgt_inp, tgt_mask, tgt_pad_mask)``.
        val_loader: Iterable of batches; ``batch[0]`` is a dict with ``'data'`` and ``'label'``.
        use_amp: Whether autocast is enabled for the forward pass.

    Returns:
        Sum of the batch losses divided by ``len(val_loader)``, as a float.
    """
    model.eval()
    loss_sum = 0
    progress = tqdm(enumerate(val_loader), total=len(val_loader), desc="Evaluating")
    with progress as bar:
        for step, batch in bar:
            img = batch[0]['data']
            tgt = batch[0]['label'].transpose(0,1)

            # Same shifted input/target split as in training.
            tgt_inp, tgt_out = tgt[:-1,:], tgt[1:, :]

            tgt_mask = subsequent_mask(tgt_inp.size(0), DEVICE)
            tgt_pad_mask = padding_mask(tgt_inp, PAD_IDX)

            with amp.autocast(enabled=use_amp):
                logits = model(img, tgt_inp, tgt_mask, tgt_pad_mask)
                loss = loss_fn(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

            # Accumulate as a Python float so the progress bar shows a number.
            loss_sum += float(loss.detach_())
            bar.set_postfix({'val_loss': loss_sum/(step+1)})
    return float(loss_sum)/len(val_loader)

# Functions to Make Predictions

In [51]:
@torch.no_grad()
def greedy_decode(model, img, max_len=100, start_symbol=BOS_IDX):
    """Generate a token sequence for one image by always taking the top-scoring token.

    Starting from ``start_symbol``, the decoder is re-run on the sequence so
    far, the highest-scoring next token is appended, and the loop stops once
    ``EOS_IDX`` is produced or ``max_len`` tokens have been added.

    Args:
        model: Model exposing ``encode_image``, ``decode_text`` and ``generator``.
        img: Image tensor; moved to ``DEVICE`` before encoding.
        max_len: Maximum number of tokens generated after the start symbol.
        start_symbol: Index of the first token.

    Returns:
        Detached long tensor with the generated indices stacked along the
        first axis and a second axis of size 1.
    """
    model.eval()
    memory = model.encode_image(img.to(DEVICE, non_blocking=True))
    tokens = torch.ones(1, 1).fill_(start_symbol).long().to(DEVICE, non_blocking=True)
    for _ in range(max_len):
        causal_mask = subsequent_mask(tokens.size(0), DEVICE)
        hidden = model.decode_text(tokens, memory, causal_mask).transpose(0,1)
        # Only the most recent position is scored.
        scores = model.generator(hidden[:,-1])
        next_word = torch.max(scores, dim = 1)[1].item()
        appended = torch.ones(1, 1).fill_(next_word).long().to(DEVICE)
        tokens = torch.cat([tokens, appended], dim=0)
        if next_word == EOS_IDX:
            break
    return tokens.detach()

@torch.no_grad()
def generate_caption(model, img, tgt_vocab):
    """Greedy-decode a caption for ``img`` and return it as plain text.

    Args:
        model: Captioning model passed through to ``greedy_decode``.
        img: Image tensor passed through to ``greedy_decode``.
        tgt_vocab: Vocabulary whose ``lookup_tokens`` maps indices back to tokens.

    Returns:
        The decoded tokens joined by single spaces, with the ``<bos>`` and
        ``<eos>`` markers removed and no leading or trailing whitespace.
    """
    token_ids = greedy_decode(model, img, max_len=100, start_symbol=BOS_IDX).flatten().tolist()
    caption = " ".join(tgt_vocab.lookup_tokens(token_ids))
    caption = caption.replace("<bos>", "").replace("<eos>", "")
    # Removing the markers leaves their separator spaces behind; strip them so
    # the caption does not start or end with whitespace.
    return caption.strip()

# Begin Training

In [53]:
init_epoch = 1
NUM_EPOCHS = 50

In [54]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [55]:
import glob
val_paths = glob.glob("../datasets/COCO/val2017/*")

In [None]:
#collapse-output
# Main loop: one training pass, one validation pass and one sample caption per epoch.
# NOTE(review): when resuming with init_epoch > 1 the scheduler is still limited
# to the total_steps it was created with - confirm it will not be stepped past that.
for epoch in range(init_epoch, NUM_EPOCHS+1):
    train_loss = train_epoch(model, train_loader, optimizer, scaler, scheduler,
                             epoch, CONFIG['use_amp'], CONFIG['log_interval'])
    # evaluate() and generate_caption() are already decorated with
    # torch.no_grad(); this context additionally covers the preprocessing below.
    with torch.no_grad():
        val_loss = evaluate(model, val_loader, CONFIG['use_amp'])

        # Caption one random validation image as a qualitative check.
        img = Image.open(random.choice(val_paths))
        # [None,:] adds a leading batch axis of size 1.
        caps = generate_caption(model, preproc['val'](img)[None,:], en_vocab)
        wandb.log({"val_loss": val_loss, "epoch": epoch, "examples": wandb.Image(img, caption=caps)})
        print(f"\nEpoch: {epoch}/{NUM_EPOCHS}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}\n")
    # Checkpointing is currently disabled.
    # gc.collect()
    # if not epoch%10:
    #     save_model(model, optimizer, epoch)

Epoch 1: 100%|██████████| 463/463 [06:33<00:00,  1.18it/s, loss=4.95, lr=0.000311]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.9450, device='cuda:0')]
Epoch 2:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 1/50, Train loss: 4.942, Val loss: 2.945



Epoch 2: 100%|██████████| 463/463 [06:28<00:00,  1.19it/s, loss=2.92, lr=0.0006]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.4849, device='cuda:0')]
Epoch 3:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 2/50, Train loss: 2.921, Val loss: 2.485



Epoch 3: 100%|██████████| 463/463 [06:24<00:00,  1.20it/s, loss=2.65, lr=0.000599]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.3640, device='cuda:0')]
Epoch 4:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 3/50, Train loss: 2.650, Val loss: 2.364



Epoch 4: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.55, lr=0.000598]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.2774, device='cuda:0')]
Epoch 5:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 4/50, Train loss: 2.545, Val loss: 2.277



Epoch 5: 100%|██████████| 463/463 [06:27<00:00,  1.19it/s, loss=2.48, lr=0.000595]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.2343, device='cuda:0')]
Epoch 6:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 5/50, Train loss: 2.479, Val loss: 2.234



Epoch 6: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.42, lr=0.000591]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.2035, device='cuda:0')]
Epoch 7:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 6/50, Train loss: 2.425, Val loss: 2.204



Epoch 7: 100%|██████████| 463/463 [06:28<00:00,  1.19it/s, loss=2.4, lr=0.000586]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.1791, device='cuda:0')]
Epoch 8:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 7/50, Train loss: 2.395, Val loss: 2.179



Epoch 8: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.36, lr=0.00058]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.1588, device='cuda:0')]
Epoch 9:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 8/50, Train loss: 2.363, Val loss: 2.159



Epoch 9: 100%|██████████| 463/463 [06:27<00:00,  1.20it/s, loss=2.34, lr=0.000573]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.57it/s, val_loss=tensor(2.1302, device='cuda:0')]
Epoch 10:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 9/50, Train loss: 2.337, Val loss: 2.130



Epoch 10: 100%|██████████| 463/463 [06:26<00:00,  1.20it/s, loss=2.31, lr=0.000565]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.1230, device='cuda:0')]
Epoch 11:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 10/50, Train loss: 2.314, Val loss: 2.123



Epoch 11: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.3, lr=0.000556]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.57it/s, val_loss=tensor(2.1109, device='cuda:0')]
Epoch 12:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 11/50, Train loss: 2.295, Val loss: 2.111



Epoch 12: 100%|██████████| 463/463 [06:27<00:00,  1.20it/s, loss=2.28, lr=0.000546]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0954, device='cuda:0')]
Epoch 13:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 12/50, Train loss: 2.279, Val loss: 2.095



Epoch 13: 100%|██████████| 463/463 [06:28<00:00,  1.19it/s, loss=2.26, lr=0.000535]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0877, device='cuda:0')]
Epoch 14:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 13/50, Train loss: 2.263, Val loss: 2.088



Epoch 14: 100%|██████████| 463/463 [06:28<00:00,  1.19it/s, loss=2.25, lr=0.000523]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0774, device='cuda:0')]
Epoch 15:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 14/50, Train loss: 2.248, Val loss: 2.077



Epoch 15: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.24, lr=0.000511]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0666, device='cuda:0')]
Epoch 16:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 15/50, Train loss: 2.236, Val loss: 2.067



Epoch 16: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.22, lr=0.000498]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0582, device='cuda:0')]
Epoch 17:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 16/50, Train loss: 2.221, Val loss: 2.058



Epoch 17: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.2, lr=0.000484]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0500, device='cuda:0')]
Epoch 18:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 17/50, Train loss: 2.203, Val loss: 2.050



Epoch 18: 100%|██████████| 463/463 [06:27<00:00,  1.19it/s, loss=2.19, lr=0.000469]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0426, device='cuda:0')]
Epoch 19:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 18/50, Train loss: 2.190, Val loss: 2.043



Epoch 19: 100%|██████████| 463/463 [06:29<00:00,  1.19it/s, loss=2.18, lr=0.000454]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0358, device='cuda:0')]
Epoch 20:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 19/50, Train loss: 2.181, Val loss: 2.036



Epoch 20: 100%|██████████| 463/463 [06:28<00:00,  1.19it/s, loss=2.17, lr=0.000439]
Evaluating: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s, val_loss=tensor(2.0282, device='cuda:0')]
Epoch 21:   0%|          | 0/463 [00:00<?, ?it/s]


Epoch: 20/50, Train loss: 2.171, Val loss: 2.028



Epoch 21:  12%|█▏        | 55/463 [00:47<05:44,  1.18it/s, loss=2.15, lr=0.000437]

In [None]:
init_epoch = epoch
init_epoch

In [None]:
# def save_model(model, optimizer, epoch):
#     torch.save({
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'epoch': epoch,
#                 }, '/content/model.pth')

# Make Predictions

In [None]:
# Caption one random validation image with the current model.
img = Image.open(random.choice(val_paths))
# [None,:] adds a leading batch axis of size 1.
caps = generate_caption(model, preproc['val'](img)[None,:], en_vocab)
# wandb.log({"examples": wandb.Image(img, caption=caps)})
print(caps)
# Last expression of the cell: the notebook renders the image itself.
img

In [None]:
run.finish()