# Setup

## Setup CLIP

In [None]:
!git clone https://github.com/openai/CLIP.git
!pip install -e CLIP/

fatal: destination path 'CLIP' already exists and is not an empty directory.
Obtaining file:///content/CLIP
Installing collected packages: clip
  Found existing installation: clip 1.0
    Can't uninstall 'clip'. No files were found to uninstall.
  Running setup.py develop for clip
Successfully installed clip


In [None]:
from CLIP.clip import *
# `load` is not defined in this notebook; it is presumably provided by the star
# import above. Its two return values are bound to `clip` and
# `clip_image_preprocess`: later cells call `clip.encode_image(...)` on the first
# and pass the second to the dataset as its image transform.
clip, clip_image_preprocess = load('ViT-B/32', jit=False)

## Setup BERT

In [None]:
!pip install transformers



In [None]:
# Getting familiar with transformers
from transformers import BertTokenizer
import torch

# Load the 'bert-base-uncased' tokenizer and encode a one-word example as
# PyTorch tensors; the trailing expression displays the encoded batch.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("Hello", padding=True, return_tensors="pt")
inputs

{'input_ids': tensor([[ 101, 7592,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [None]:
import torch.nn as nn

class BertEmbeddingsWithVisualEmbedding(nn.Module):
  """
  Build the joint input sequence for a BERT encoder from text tokens and visual features.

  Text tokens are embedded as word + position + token-type embeddings. Visual
  features are projected to ``hidden_size`` and receive their own (separate)
  position and token-type embeddings. The two parts are concatenated along the
  sequence axis (text first), layer-normalised and passed through dropout.
  """
  def __init__(self, vocab_size, max_position_embeddings, type_vocab_size,
               visual_embedding_dim, # 512 for CLIP
               hidden_size, hidden_dropout_prob):
    """
    :param vocab_size: number of rows in the word embedding table
    :param max_position_embeddings: number of rows in each position embedding table
    :param type_vocab_size: number of rows in each token-type embedding table
    :param visual_embedding_dim: size of the last axis of the incoming visual features
    :param hidden_size: embedding width produced for every token
    :param hidden_dropout_prob: dropout probability applied to the final embeddings
    """
    super(BertEmbeddingsWithVisualEmbedding, self).__init__()
    self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
    # any TensorFlow checkpoint file
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(hidden_dropout_prob)
    #### Below are specific for encoding visual features

    # Segment and position embedding for image features
    self.token_type_embeddings_visual = nn.Embedding(type_vocab_size, hidden_size)
    self.position_embeddings_visual = nn.Embedding(max_position_embeddings, hidden_size)
    self.projection = nn.Linear(visual_embedding_dim, hidden_size)

  def forward(self, input_ids, visual_embeddings):
    """
    :param input_ids: integer tensor [batch_size, sequence_length] of token ids
    :param visual_embeddings: tensor [batch_size, image_feature_length, visual_embedding_dim]
    :return: tensor [batch_size, sequence_length + image_feature_length, hidden_size]
    """
    # Compute embeddings for Text
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    # Every text token is assigned segment id 0.
    token_type_ids = torch.zeros_like(input_ids)
    text_embeddings = (
        self.word_embeddings(input_ids)
        + self.position_embeddings(position_ids)
        + self.token_type_embeddings(token_type_ids)
    )

    # Compute embeddings for Image
    visual_embeddings = self.projection(visual_embeddings)
    # The visual segment/position tables were previously constructed but never
    # applied, so image tokens carried no marker distinguishing them from text
    # and these parameters never received a gradient. All visual tokens share
    # index 0 of their own tables (no text/region alignment is available here).
    visual_ids = torch.zeros(visual_embeddings.shape[:-1], dtype=torch.long,
                             device=visual_embeddings.device)
    visual_embeddings = (
        visual_embeddings
        + self.position_embeddings_visual(visual_ids)
        + self.token_type_embeddings_visual(visual_ids)
    )

    # Text tokens come first, visual tokens are appended along the sequence axis.
    embeddings = torch.cat((text_embeddings, visual_embeddings), dim=1)
    embeddings = self.LayerNorm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings

In [None]:
# Smoke test: push the tokenized "Hello" example plus one random 512-d visual
# feature through a freshly built embedding layer and display the output shape.
PRETRAINED_BERT_VOCAB_SIZE = 30522
BertEmbeddingsWithVisualEmbedding(
    vocab_size=PRETRAINED_BERT_VOCAB_SIZE,
    max_position_embeddings=10,
    type_vocab_size=2,
    visual_embedding_dim=512,
    hidden_size=768,
    hidden_dropout_prob=0.1,
)(inputs["input_ids"], torch.randn(1, 1, 512)).shape

torch.Size([1, 4, 768])

In [None]:
from transformers import BertModel, BertConfig


class VisualBERTCLIP(nn.Module):
  """
  Two-class classifier over (text, image) pairs.

  The image is encoded by CLIP into a single feature vector, which is appended
  as one extra token to the embedded text sequence. The joint sequence goes
  through a BERT encoder and pooler, followed by a linear layer producing two logits.

  NOTE(review): the encoder and pooler come from ``BertModel(BertConfig())``; no
  ``from_pretrained`` call is made here, so presumably they start from random
  initialisation - confirm this is intended.
  """
  def __init__(self):
    super().__init__()
    config = BertConfig()
    bert = BertModel(config)
    # CLIP is put in eval mode and used as the image feature extractor.
    self.clip = clip.eval()
    self.embedding = BertEmbeddingsWithVisualEmbedding(
        vocab_size=PRETRAINED_BERT_VOCAB_SIZE,
        max_position_embeddings=512,
        type_vocab_size=2,
        visual_embedding_dim=512,
        hidden_size=768,
        hidden_dropout_prob=0.1)
    self.encoder = bert.encoder
    self.pooler = bert.pooler
    self.classifier = nn.Linear(768, 2)

  def forward(self, text, image):
    """
    :param text: mapping with an ``input_ids`` tensor [batch, seq_len] and, optionally,
                 an ``attention_mask`` tensor of the same shape (1 = real token, 0 = padding)
    :param image: batch of preprocessed images accepted by ``clip.encode_image``
    :return: logits tensor [batch, 2]
    """
    # CLIP acts as a fixed feature extractor: it is kept in eval mode and the
    # Trainer in this notebook never hands its parameters to the optimizer, so
    # building an autograd graph through it only costs memory and compute.
    with torch.no_grad():
      image_feat = self.clip.encode_image(image)
    # One visual token per image; align its dtype with the projection layer so the
    # forward pass also works outside of autocast.
    image_feat = image_feat.unsqueeze(1).to(self.embedding.projection.weight.dtype)
    embeddings = self.embedding(text['input_ids'], image_feat)

    # Build an additive attention mask so padded text positions are ignored by
    # self-attention; the appended visual token is always attended to.
    attention_mask = None
    if 'attention_mask' in text:
      text_mask = text['attention_mask']
      visual_mask = torch.ones(image_feat.shape[:2], dtype=text_mask.dtype,
                               device=text_mask.device)
      keep = torch.cat((text_mask, visual_mask), dim=1)
      # Shape [batch, 1, 1, seq]: 0 where attention is allowed, a large negative
      # value where it is not, so it broadcasts over heads and query positions.
      attention_mask = (1.0 - keep[:, None, None, :].to(embeddings.dtype)) \
          * torch.finfo(embeddings.dtype).min

    outputs = self.encoder(embeddings, attention_mask=attention_mask)
    # outputs[0] is the encoder's hidden-state sequence; the pooler reduces it to one vector per sample.
    outputs = self.pooler(outputs[0])
    logits = self.classifier(outputs)
    return logits

In [None]:
# Instantiate the model once to check that construction succeeds. This instance
# is not referenced again in the visible notebook; the training cell builds its own.
visualBERTCLIP = VisualBERTCLIP()

# Training

## Setup Training

In [None]:
"""
Most of the source code is taken from
https://www.drivendata.co/blog/hateful-memes-benchmark/
"""
from torch.utils.data import Dataset
import pandas as pd
import torch

from PIL import Image


class HatefulMemesDataset(Dataset):
    """
    Map-style dataset over one Hateful Memes ``.jsonl`` split.

    Each item is ``(image, text, label)`` when the jsonl has a ``label`` column
    and ``(image, text)`` otherwise.
    """

    def __init__(self, jsonl_path, path_to_img_dir,
                 image_transform, text_transform):
        """
        :param jsonl_path: path to jsonl provided by Facebook (e.g. data/train.jsonl)
        :param path_to_img_dir: path to parent directory of img dir
        :param image_transform: callable applied to the RGB PIL image
                                (e.g. torchvision.transforms.Compose)
        :param text_transform: callable applied to the raw text of a sample
        """
        frame = pd.read_json(jsonl_path, lines=True).reset_index(drop=True)
        # Prefix every relative image path with its parent directory in a single
        # vectorised string operation.
        frame["img"] = str(path_to_img_dir) + "/" + frame["img"]
        self.samples_frame = frame
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """Number of samples (rows of the jsonl)."""
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """
        :param idx: row index (an int, or a tensor holding one)
        :return: ``(image, text, label)`` if labels are available, else ``(image, text)``
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Open the file inside a context manager so its handle is released as soon
        # as the pixels have been converted to an in-memory RGB image.
        with Image.open(self.samples_frame.loc[idx, "img"]) as raw_image:
            image = raw_image.convert("RGB")
        image = self.image_transform(image)

        # TODO: Find a better way for reducing length of a sentence.
        # this is an actual sentence from the dataset:
        #
        # we only want to make you register them, restrict transfers,
        # ban certain guns, limit magazine capacity, prohibit carrying them,
        # ban or limit ammo, make other arbitrary laws, and, if we catch you
        # violating any of these made-up rules, throw you in prison.... at
        # which point we will take your guns!

        text = self.samples_frame.loc[idx, "text"]
        text = self.text_transform(text)

        # case: development
        if "label" in self.samples_frame.columns:
            # 0-dim int64 tensor holding the class index.
            label = torch.tensor(self.samples_frame.loc[idx, "label"], dtype=torch.long)
            return image, text, label

        # case: inference
        return image, text

In [None]:
"""
Simple training loop
"""
import math
import logging
from itertools import chain

from tqdm.notebook import tqdm
import numpy as np

import torch

from torch.utils.data import DistributedSampler, DataLoader

from torch.cuda.amp import autocast

class Trainer:
    """
    Minimal train/validate loop for the VisualBERT + CLIP classifier.

    Only the ``embedding``, ``encoder``, ``pooler`` and ``classifier`` sub-modules
    of the model are optimised; any other sub-module is left untouched.
    """

    def __init__(self, model, loss_f, image_preprocess, text_preprocess, h, ckpt_path):
        """
        :param model: torch.Module(text, image) -> per-class scores (binary classification)
        :param loss_f: (model's output, target) -> a real number wrapped by torch.Tensor
        :param image_preprocess: transform handed to the dataset for images
        :param text_preprocess: transform handed to the dataset for texts
        :param h: dictionary that contains the hyper-parameter values
                  (keys read: lr, betas, eps, batch_size, num_workers, max_epochs)
        :param ckpt_path: where to save the model's state dict, or None to never save
        """
        self.model = model.cuda()
        self.loss_f = loss_f
        self.h = h
        self.image_preprocess = image_preprocess
        self.text_preprocess = text_preprocess
        self.ckpt_path = ckpt_path

    def save_checkpoint(self):
        """Write the model's state dict to ``self.ckpt_path``."""
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        print("saving", self.ckpt_path)
        torch.save(raw_model.state_dict(), self.ckpt_path)

    def train(self, trainset_jsonl, trainset_image_dir_path, valset_jsonl, valset_image_dir_path):
        """
        Train for ``h["max_epochs"]`` epochs, validating after each one and saving a
        checkpoint whenever the validation loss improves.

        :param trainset_jsonl: jsonl file describing the training split
        :param trainset_image_dir_path: parent directory of the training images
        :param valset_jsonl: jsonl file describing the validation split
        :param valset_image_dir_path: parent directory of the validation images
        """
        model, loss_f, h = self.model, self.loss_f, self.h
        # Custom code for VisualBERT + CLIP: only these sub-modules are trained.
        trainable_modules = (model.embedding, model.encoder, model.pooler, model.classifier)
        optimizer = torch.optim.Adam(chain(*(m.parameters() for m in trainable_modules)),
                                     h["lr"], h["betas"], h["eps"])
        # The forward pass runs under autocast; scaling the loss keeps small
        # half-precision gradients from underflowing to zero during backward.
        scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

        train_dataset = HatefulMemesDataset(
            trainset_jsonl,
            trainset_image_dir_path,
            image_transform=self.image_preprocess,
            text_transform=self.text_preprocess)

        val_dataset = HatefulMemesDataset(
            valset_jsonl,
            valset_image_dir_path,
            image_transform=self.image_preprocess,
            text_transform=self.text_preprocess
        )

        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()

        def run_epoch(split):
            """Run one pass over the given split; returns the mean loss for non-train splits."""
            is_train = split == 'train'
            # Dropout must be active while training and disabled while validating.
            for module in trainable_modules:
                module.train(is_train)

            data = train_dataset if is_train else val_dataset
            # Shuffling is only useful for training; validation order is kept fixed.
            loader = DataLoader(data, shuffle=is_train, pin_memory=True,
                                batch_size=h["batch_size"],
                                num_workers=h["num_workers"])

            losses = []
            num_correct_pred = 0
            num_pred = 0
            pbar = tqdm(enumerate(loader), total=len(loader), position=0, leave=True) if is_train else enumerate(loader)
            for it, (image, text, label) in pbar:
                # NOTE: relies on the notebook-level `tokenizer` defined in an earlier cell.
                text = tokenizer(text, return_tensors="pt", padding=True)
                # place data on the correct device
                text = text.to(self.device)
                image = image.to(self.device)
                label = label.type(torch.LongTensor).to(self.device)

                with torch.set_grad_enabled(is_train):
                    with autocast():
                        output = model(text=text, image=image)
                        loss = loss_f(output, label)
                losses.append(loss.item())

                if is_train:
                    optimizer.zero_grad()
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                    # `epoch` is the loop variable of the enclosing epoch loop below.
                    pbar.set_description(f"epoch {epoch + 1} iter {it}: train loss {loss.item():.5f}")
                else:
                    prediction = torch.argmax(output, dim=1)
                    # .item() keeps the running count as a Python int instead of a device tensor.
                    num_correct_pred += torch.sum(prediction == label).item()
                    num_pred += prediction.shape[0]

            if not is_train:
                test_loss = float(np.mean(losses))
                acc = num_correct_pred / num_pred
                print("test loss:", test_loss)
                print("test accL", acc)
                return test_loss

        best_loss = float('inf')
        for epoch in range(h["max_epochs"]):
            run_epoch('train')
            test_loss = run_epoch('test') if val_dataset is not None else None

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = test_loss is None or test_loss < best_loss
            if self.ckpt_path is not None and good_model:
                if test_loss is not None:
                    best_loss = test_loss
                self.save_checkpoint()

In [None]:
# Thank you Mario 🙏
from google.colab import drive
# Mount Google Drive at /gdrive so the dataset archive stored there can be read.
drive.mount('/gdrive')

# Copy the archive from Drive to /content/data.zip, then unpack it quietly (-q).
# NOTE(review): `unzip` is given the relative name data.zip, so this assumes the
# notebook's working directory is /content - confirm.
!cp '/gdrive/MyDrive/MemesDeepLearning/dataFB.zip' '/content/data.zip'
!unzip -q data.zip

Mounted at /gdrive


## Start training!

In [None]:
import torch.nn.functional as F 


# Model and objective.
model = VisualBERTCLIP()
loss_f = nn.CrossEntropyLoss()

# Images go through CLIP's own preprocessing; texts are passed through unchanged.
image_preprocess, text_preprocess = clip_image_preprocess, (lambda x: x)

# Hyper-parameters read by the Trainer.
h = {
  "lr": 2e-4,
  "betas": [0.9, 0.998],
  "batch_size": 128,
  "num_workers": 2,
  "max_epochs": 32,
  "eps": 1e-6,
}

trainer = Trainer(
  model=model,
  loss_f=loss_f,
  image_preprocess=image_preprocess,
  text_preprocess=text_preprocess,
  h=h,
  ckpt_path="experiment_0.pt",
)

In [None]:
# Train on data/train.jsonl and validate on data/dev_seen.jsonl; both splits
# resolve their image paths relative to the 'data' directory.
trainer.train('data/train.jsonl', 'data', 'data/dev_seen.jsonl', 'data')

HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7593238949775696
test accL tensor(0.5060, device='cuda:0')
saving experiment_0.pt


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7163524180650711
test accL tensor(0.5060, device='cuda:0')
saving experiment_0.pt


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7110382318496704
test accL tensor(0.5060, device='cuda:0')
saving experiment_0.pt


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7190508097410202
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7097197473049164
test accL tensor(0.5060, device='cuda:0')
saving experiment_0.pt


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7403099238872528
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7368562817573547
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.711145430803299
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7279451191425323
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7335953563451767
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.724214106798172
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.8003707379102707
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7707920074462891
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7355788052082062
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7705855220556259
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.725182518362999
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7283454537391663
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7246707677841187
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7363489717245102
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7229664623737335
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7209437042474747
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7275687456130981
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7136619538068771
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.733703225851059
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7779152542352676
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.758018359541893
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7065934985876083
test accL tensor(0.5060, device='cuda:0')
saving experiment_0.pt


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7234548926353455
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7082326710224152
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7345936000347137
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7264803349971771
test accL tensor(0.5060, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


test loss: 0.7226838618516922
test accL tensor(0.5060, device='cuda:0')


In [None]:
# Scratch cell: a bare literal whose only effect is to display 1.
1