In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torchvision import datasets, transforms as T

In [2]:
import timm

In [3]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from collections import OrderedDict

In [4]:
# TODO: Try a pretrained CLIP model

In [5]:
from torchvision.datasets.utils import download_and_extract_archive

In [6]:
# download_and_extract_archive("http://images.cocodataset.org/zips/train2017.zip",
#                              download_root="../datasets/COCO",
#                              remove_finished=True)

In [7]:
# download_and_extract_archive("http://images.cocodataset.org/zips/val2017.zip",
#                              download_root="../datasets/COCO",
#                              remove_finished=True)

In [8]:
# download_and_extract_archive("http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
#                              download_root="../datasets/COCO",
#                              remove_finished=True)

In [9]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
# Side length (in pixels) of the square image crops produced by `preproc`.
input_size = 224

In [11]:
# Image pipelines keyed by split name.
#   'train': random-resized crop + random horizontal flip (stochastic augmentation).
#   'val'  : resize the shorter side to `input_size`, then centre-crop (deterministic).
# Both end with ToTensor and a per-channel Normalize using the standard ImageNet
# mean / std values.
preproc = {
    'train': T.Compose([
        T.RandomResizedCrop(input_size, interpolation=T.InterpolationMode.BICUBIC),
        # RandomHorizontalFlip's only argument is the flip *probability* `p`
        # (default 0.5). Passing `input_size` (224) made every image flip, which
        # turned the augmentation into a fixed mirror of the training set.
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    'val': T.Compose([
        T.Resize(input_size, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(input_size),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [12]:
class ProjectionHead(nn.Module):
    """Project features to `d_model` and refine them with a residual block.

    Computes ``LayerNorm(p + Dropout(Linear(Dropout(GELU(p)))))`` where
    ``p = proj(x)``. All layers act on the last dimension only, so any leading
    shape ``(..., features)`` is accepted and ``(..., d_model)`` is returned.

    Args:
        d_model: Output feature width.
        dp_rate: Dropout probability used after the activation and after the
            second linear layer.
        in_features: Width of the incoming features. When ``None`` (default)
            the first layer is an ``nn.LazyLinear`` that infers the width on the
            first forward call; when given, a regular ``nn.Linear`` is used so
            that all parameters exist right after construction.
    """

    def __init__(self, d_model=768, dp_rate=0.1, in_features=None):
        super().__init__()
        if in_features is None:
            self.proj = nn.LazyLinear(d_model)
        else:
            self.proj = nn.Linear(in_features, d_model)
        self.activation = nn.GELU()
        self.dropout1 = nn.Dropout(dp_rate)
        # The input of `dense` is always the d_model-wide output of `proj`, so its
        # shape is known up front and a lazy layer is unnecessary here.
        self.dense = nn.Linear(d_model, d_model)
        self.dropout2 = nn.Dropout(dp_rate)
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x):   # (..., features)
        p = self.proj(x)                        # (..., d_model)
        x = self.dropout1(self.activation(p))
        x = self.dropout2(self.dense(x))
        x = self.ln(x + p)                      # residual connection around the MLP branch
        return x

In [14]:
# Random stand-in for a single RGB image batch (B=1, C=3, H=W=224); used only for shape checks.
inp = torch.randn((1,3,224,224))

In [15]:
# Stand-alone copy of the backbone configuration that `CaptionModel` builds, to inspect its raw output.
backbone = timm.create_model('seresnext50_32x4d', pretrained=False, num_classes=0, global_pool='')#, features_only=True)

In [18]:
# Merge the last two (spatial) dims and move them to the front:
# (B, features, h, w) -> (B, features, h*w) -> (h*w, B, features).
out = backbone(inp).flatten(-2).permute(2,0,1)

In [19]:
# Shape check: expected layout is (h*w, B, features).
out.shape

torch.Size([49, 1, 2048])

In [21]:
# Default head (d_model=768); `nn.LazyLinear` infers the input width on the first call.
ph = ProjectionHead()

In [22]:
# Shape check: the head should only change the last dimension (features -> d_model).
ph(out).shape

torch.Size([49, 1, 768])

In [13]:
# https://github.com/fastai/fastai2/blob/8d798c881c1eda564bdf92079bdfe43b43525767/fastai2/callback/training.py
bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

def set_bn_eval(m:nn.Module):
    """Put frozen batch-norm layers among the recursive children of `m` in eval mode.

    A batch-norm layer is treated as frozen when its first parameter does not
    require gradients, or when it has no parameters at all (``affine=False``).
    `m` itself is not inspected, only its descendants. Any later call to
    ``.train()`` on a parent module switches the layers back to training mode,
    so this has to be re-applied afterwards.
    """
    for l in m.children():
        if isinstance(l, bn_types):
            # `next(..., None)` avoids a StopIteration on parameter-free BN layers.
            first_param = next(l.parameters(), None)
            if first_param is None or not first_param.requires_grad:
                l.eval()
        set_bn_eval(l)

def freeze_weights(m):
    """Turn off gradient tracking, in place, for every parameter of `m`."""
    params = m.parameters()
    for tensor in params:
        tensor.requires_grad_(False)

In [14]:
class CaptionModel(nn.Module):
    """Image captioner: frozen CNN backbone -> projection head -> Transformer decoder.

    The backbone's spatial feature map is flattened into a sequence of
    ``h*w`` feature vectors, projected to ``d_model`` and used as the decoder
    memory. Caption tokens are embedded, scaled by ``sqrt(d_model)``, given
    sinusoidal positional encodings and decoded with a causal mask.

    Args:
        projection_head: If this is an ``nn.Module`` it is used as the
            projection head; any other value (e.g. ``None``) builds the default
            ``ProjectionHead(d_model, dp_rate)``.
        tgt_vocab_size: Size of the caption vocabulary (embedding rows and
            number of output logits).
        num_decoder_layers: Number of stacked decoder layers.
        nhead: Attention heads per decoder layer.
        d_model: Model / embedding width.
        dim_feedforward: Hidden width of each decoder feed-forward block.
        dp_rate: Dropout probability for the head and the decoder layers.
        activation: Activation name passed to ``nn.TransformerDecoderLayer``.
        bn_eval: Keep the frozen backbone's batch-norm layers in eval mode,
            including after later ``.train()`` calls.
        max_len: Longest caption length supported by the positional encoding.
    """

    def __init__(self, projection_head, tgt_vocab_size, num_decoder_layers=6, nhead=8, d_model=768,
                 dim_feedforward=2048, dp_rate=0.1, activation='relu', bn_eval=True, max_len=512):
        super().__init__()
        self.bn_eval = bn_eval
        # NOTE(review): the backbone is frozen but created with pretrained=False,
        # i.e. with random weights -- confirm that pretrained weights are loaded
        # elsewhere before training.
        self.backbone = timm.create_model('seresnext50_32x4d', pretrained=False, num_classes=0, global_pool='')
        freeze_weights(self.backbone)
        if bn_eval: set_bn_eval(self.backbone)

        # Honour a caller-supplied head instead of silently ignoring the argument.
        if isinstance(projection_head, nn.Module):
            self.projection_head = projection_head
        else:
            self.projection_head = ProjectionHead(d_model, dp_rate)

        self.embedding = nn.Embedding(tgt_vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dp_rate,
                                                   activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        self.generator = nn.Linear(d_model, tgt_vocab_size)
        self.scale = d_model**0.5
        # Non-persistent buffer: follows .to(device) but stays out of the state_dict.
        self.register_buffer('pos_enc', self._sinusoidal_encoding(max_len, d_model),
                             persistent=False)

    @staticmethod
    def _sinusoidal_encoding(max_len, d_model):
        """Return fixed sin/cos positional encodings of shape (max_len, 1, d_model)."""
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)    # (max_len, 1)
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float32)           # (ceil(d_model/2),)
        inv_freq = torch.pow(10000.0, -even_idx / d_model)
        angles = position * inv_freq                                          # (max_len, ceil(d_model/2))
        enc = torch.zeros(max_len, d_model)
        enc[:, 0::2] = torch.sin(angles)
        enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])                   # slice handles odd d_model
        return enc.unsqueeze(1)

    def train(self, mode=True):
        """Switch train/eval mode while keeping frozen backbone batch-norm in eval mode."""
        super().train(mode)
        # nn.Module.train() flips every submodule, which would undo set_bn_eval().
        if mode and getattr(self, 'bn_eval', False):
            set_bn_eval(self.backbone)
        return self

    def forward(self, x, tgt, tgt_mask=None, tgt_key_padding_mask=None):
        """Compute next-token logits.

        Args:
            x: Images, shape (B, C, H, W).
            tgt: Caption token ids, shape (B, seq_len).
            tgt_mask: Optional (seq_len, seq_len) attention mask; defaults to a
                causal mask (True above the diagonal = blocked).
            tgt_key_padding_mask: Optional (B, seq_len) mask, True at padding.

        Returns:
            Logits of shape (seq_len, B, tgt_vocab_size) (sequence-first).
        """
        x = self.backbone(x)
        # (B, features, h, w)
        x = x.flatten(-2)
        # (B, features, h*w)
        x = x.permute(2, 0, 1)
        # (h*w, B, features) -- the decoder is sequence-first (batch_first=False)
        x = self.projection_head(x)
        # (h*w, B, d_model)

        seq_len = tgt.size(1)
        if seq_len > self.pos_enc.size(0):
            raise ValueError(
                f"caption length {seq_len} exceeds max_len={self.pos_enc.size(0)}")

        tgt = self.embedding(tgt) * self.scale
        # (B, seq_len, d_model)
        tgt = tgt.transpose(0, 1)
        # (seq_len, B, d_model)
        tgt = tgt + self.pos_enc[:seq_len]

        if tgt_mask is None:
            # Causal mask: position i may not attend to positions j > i.
            tgt_mask = torch.ones((seq_len, seq_len), dtype=torch.bool,
                                  device=tgt.device).triu(1)

        x = self.decoder(tgt, memory=x, tgt_mask=tgt_mask,
                         tgt_key_padding_mask=tgt_key_padding_mask)
        x = self.generator(x)
        # (seq_len, B, tgt_vocab_size)
        return x

In [15]:
def subsequent_mask(sz):
    """Return a boolean (1, 1, sz, sz) mask that is True strictly above the diagonal.

    The tensor is created on the notebook-level `device`.
    """
    all_true = torch.ones((1, 1, sz, sz), dtype=bool, device=device)
    return torch.triu(all_true, diagonal=1)

In [None]:
# Adam over the trainable parameters only; tensors with requires_grad=False are skipped.
# NOTE(review): `model` and `CONFIG` are not defined in any cell visible here -- confirm
# they exist (and that lazy layers were initialised by a dry run) before this cell runs.
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad),
    lr=CONFIG['lr'],
    betas=CONFIG['betas'],
    eps=CONFIG['eps'],
)

In [39]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext

from collections import Counter

In [17]:
# Tokenizer applied to all caption text in this notebook (torchtext's 'basic_english').
tokenizer = get_tokenizer('basic_english')

In [18]:
# COCO val2017 images with their captions; images go through the deterministic 'val' pipeline.
# Paths are relative to the notebook's working directory.
cap_data = datasets.CocoCaptions(root="../datasets/COCO/val2017/",
                                 annFile="../datasets/COCO/annotations/captions_val2017.json",
                                 transform=preproc['val'])

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


In [21]:
def yield_tokens(cap_data):
    """Lazily yield the token list of every caption annotation held by `cap_data`."""
    annotations = cap_data.coco.anns
    for record in annotations.values():
        caption_text = record['caption']
        yield tokenizer(caption_text)

In [70]:
# Reserved tokens: unknown word, padding, beginning-of-sequence, end-of-sequence.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [71]:
# Build the caption vocabulary, with the special tokens included.
# NOTE(review): `cap_data` wraps the val2017 captions, so this vocabulary comes from the
# validation split only -- confirm that is intended.
en_vocab = build_vocab_from_iterator(yield_tokens(cap_data), specials=special_symbols)

In [75]:
# Integer ids of the special tokens, in the same order as `special_symbols`.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = en_vocab(special_symbols)

In [76]:
# Map any out-of-vocabulary token to the '<unk>' id.
en_vocab.set_default_index(UNK_IDX)

In [77]:
# Sanity check: tokenize one caption and convert it to vocabulary ids.
en_vocab(tokenizer('A black and white small dog sitting next to a brown and white small dog.'))

[4, 44, 11, 24, 45, 50, 15, 23, 13, 4, 113, 11, 24, 45, 50, 5]

In [66]:
# Token list of the same sample caption, reused for the GloVe lookup below.
tokens = tokenizer('A black and white small dog sitting next to a brown and white small dog.')

In [53]:
# 300-dimensional GloVe '6B' word vectors (torchtext may download and cache them on first use).
vec = torchtext.vocab.GloVe('6B', dim=300)

In [67]:
# Look up one GloVe vector per token of the sample caption.
vec.get_vecs_by_tokens(tokens)

tensor([[-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
        [-0.0868,  0.0362,  0.4657,  ...,  0.0679,  0.0267,  0.2247],
        [ 0.0385, -0.0398,  0.0827,  ..., -0.3343,  0.0118,  0.0597],
        ...,
        [-0.4330,  0.3283, -0.0943,  ..., -0.1941, -0.1111, -0.0581],
        [-0.1104,  0.8122,  0.0737,  ...,  0.3394,  0.5799,  0.0681],
        [-0.1256,  0.0136,  0.1031,  ..., -0.3422, -0.0224,  0.1368]])

In [73]:
def generate_batch(data_batch):
    """Pass-through collate function: return the list of dataset items unchanged.

    Keeps each (image, captions) item intact so the raw samples can be inspected.
    """
    # TODO: tokenize / numericalize the captions and pad them, e.g.
    #     en_batch = pad_sequence(en_batch, batch_first=False, padding_value=PAD_IDX)
    return data_batch

In [74]:
# Small shuffled loader over the caption dataset; items are collated by `generate_batch`.
val_loader = DataLoader(
    dataset=cap_data,
    batch_size=2,
    shuffle=True,
    num_workers=3,
    collate_fn=generate_batch,
)

In [75]:
# Pull one batch; with the pass-through collate_fn this is a list of raw dataset items.
o = next(iter(val_loader))

In [93]:
# Second element of the first item in the batch: the captions for that image.
o[0][1]

['Two small dogs on leashes walking on a brick pathway.',
 'Two different colored dogs standing on a brick walkway.',
 'a black and white dog and a brown and white dog both on leashes',
 'A black and white small dog sitting next to a brown and white small dog.',
 'TWO PET DOGS ARE ON A LEASH ']