In [None]:
# Thank you Mario 🙏
# Mount Google Drive at /gdrive; the following cells copy the dataset, the
# test file and the model checkpoint out of /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')
# Alternative mount point, currently disabled:
#drive.mount('/content/drive')

In [None]:
# Copy the zipped dataset from Drive to /content/data.zip and unpack it quietly.
# NOTE(review): `unzip` uses a relative path, so this assumes the working
# directory is /content. Later cells read ./data/test.jsonl and
# ./data/dev_seen.jsonl, so the archive presumably unpacks into data/ — confirm.
!cp '/gdrive/MyDrive/MemesDeepLearning/dataFB.zip' '/content/data.zip'
!unzip -q data.zip

In [None]:
# Copy a standalone test.jsonl from Drive to /content/test.jsonl.
# NOTE(review): the dataset and label cells further down read
# ./data/test.jsonl and /content/data/test.jsonl, not this copy — confirm
# whether this file is still needed or should replace the one under data/.
!cp '/gdrive/MyDrive/MemesDeepLearning/test.jsonl' '/content/test.jsonl'

In [None]:
# Copy the zipped checkpoint from Drive to /content/best_model.zip and unpack it.
# NOTE(review): the model cell loads ./model_best.pt, presumably the file
# contained in this archive — confirm the archive layout.
!cp '/gdrive/MyDrive/MemesDeepLearning/mmbt_model_best.zip' '/content/best_model.zip'
!unzip -q best_model.zip

In [None]:
# Install pinned PyTorch 1.8.1 / torchvision 0.9.1 (the +cu111 builds) and
# torchaudio 0.8.1, resolving wheels from the PyTorch stable wheel index.
!pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Remaining Python dependencies. The scikit-learn distribution is installed
# under its real name: the "sklearn" alias on PyPI is deprecated and fails to
# install. It still provides the `sklearn` import used by the evaluation cells.
!pip install scikit-learn pytorch-pretrained-bert numpy tqdm matplotlib

In [None]:
import torch
import torch.nn as nn
import torchvision
from pytorch_pretrained_bert.modeling import BertModel
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from pytorch_pretrained_bert import BertTokenizer
import json
import numpy as np
import os
from PIL import Image
from torch.utils.data import DataLoader
import functools

In [None]:
class Vocab(object):
    """Two-way token <-> index table.

    Attributes:
        stoi: dict mapping a token string to its integer index.
        itos: list mapping an integer index back to its token string.
        vocab_sz: number of tokens currently stored.
    """

    def __init__(self, emptyInit=False):
        """Start with the five special tokens, or empty when ``emptyInit`` is truthy."""
        self.stoi = {}
        self.itos = []
        if not emptyInit:
            for token in ("[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"):
                self.stoi[token] = len(self.itos)
                self.itos.append(token)
        self.vocab_sz = len(self.itos)

    def add(self, words):
        """Append every word not yet present, assigning consecutive indices."""
        for word in words:
            if word not in self.stoi:
                # The next free index is always the current table length.
                self.stoi[word] = len(self.itos)
                self.itos.append(word)
        self.vocab_sz = len(self.itos)

In [None]:
class ImageBertEmbeddings(nn.Module):
    """Embed image feature vectors as a ``[CLS] img ... img [SEP]`` token segment.

    The word, position and token-type embedding tables and the LayerNorm are
    the very modules of the ``embeddings`` object passed in (shared, not
    copied). Only the 2048 -> 768 image projection and the dropout are created
    here.
    """

    def __init__(self, embeddings, vocab):
        """
        Args:
            embeddings: module exposing ``position_embeddings``,
                ``token_type_embeddings``, ``word_embeddings`` and ``LayerNorm``.
            vocab: object whose ``stoi`` mapping contains "[CLS]" and "[SEP]".
        """
        super(ImageBertEmbeddings, self).__init__()
        self.img_embeddings = nn.Linear(2048, 768)
        self.position_embeddings = embeddings.position_embeddings
        self.token_type_embeddings = embeddings.token_type_embeddings
        self.word_embeddings = embeddings.word_embeddings
        self.LayerNorm = embeddings.LayerNorm
        self.dropout = nn.Dropout(p=0.1)
        self.vocab = vocab

    def _special_token_embeds(self, token, bsz, device):
        """Return the word embedding of ``token`` repeated for every batch row."""
        token_id = torch.tensor(
            [[self.vocab.stoi[token]]], dtype=torch.long, device=device
        ).expand(bsz, 1)
        return self.word_embeddings(token_id)

    def forward(self, input_imgs, token_type_ids):
        """Project the image features and add position / token-type embeddings.

        Args:
            input_imgs: float tensor of shape (batch, n_img, 2048).
            token_type_ids: long tensor of shape (batch, n_img + 2).

        Returns:
            Tensor of shape (batch, n_img + 2, 768).
        """
        bsz = input_imgs.size(0)
        # All helper tensors are created on the input's device instead of
        # unconditionally on CUDA, so the module also runs on CPU.
        device = input_imgs.device
        # +2 for the CLS and SEP tokens wrapped around the image vectors.
        seq_length = input_imgs.size(1) + 2

        cls_token_embeds = self._special_token_embeds("[CLS]", bsz, device)
        sep_token_embeds = self._special_token_embeds("[SEP]", bsz, device)

        imgs_embeddings = self.img_embeddings(input_imgs)
        token_embeddings = torch.cat(
            [cls_token_embeds, imgs_embeddings, sep_token_embeds], dim=1
        )

        position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).expand(bsz, seq_length)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = token_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class MultimodalBertEncoder(nn.Module):
    """BERT encoder fed with an image token segment followed by the text tokens.

    Images go through ``ImageEncoder`` and ``ImageBertEmbeddings``, text goes
    through BERT's own embedding module. Both embedded sequences are
    concatenated (image first), passed through the BERT encoder, and the
    pooler output is returned.
    """

    def __init__(self, vocab):
        """Load pretrained ``bert-base-uncased`` and reuse its sub-modules."""
        super(MultimodalBertEncoder, self).__init__()
        bert = BertModel.from_pretrained("bert-base-uncased")
        self.txt_embeddings = bert.embeddings
        self.vocab = vocab
        self.img_embeddings = ImageBertEmbeddings(self.txt_embeddings, self.vocab)
        self.img_encoder = ImageEncoder()
        self.encoder = bert.encoder
        self.pooler = bert.pooler
        # Not used by forward(); kept so the module's parameter names (and
        # therefore its state_dict keys) stay exactly as they were.
        self.clf = nn.Linear(768, 2)

    def forward(self, input_txt, attention_mask, segment, input_img):
        """Encode a batch of (text, image) pairs.

        Args:
            input_txt: long tensor of token ids, shape (batch, text_len).
            attention_mask: tensor of shape (batch, text_len), 1 for real
                tokens and 0 for padding.
            segment: token-type ids for the text, shape (batch, text_len).
            input_img: image batch forwarded unchanged to ``self.img_encoder``.

        Returns:
            The output of ``self.pooler`` on the last encoder output.
        """
        bsz = input_txt.size(0)
        # Helper tensors follow the device of the text input instead of being
        # forced onto CUDA, so the model also runs on CPU.
        device = input_txt.device

        img = self.img_encoder(input_img)  # BxNx3x224x224 -> BxNx2048
        # Image segment length: N image vectors plus the CLS and SEP tokens.
        num_img_tokens = img.size(1) + 2

        # Every image position is attended to; text keeps the caller's mask.
        attention_mask = torch.cat(
            [
                torch.ones(bsz, num_img_tokens, dtype=torch.long, device=device),
                attention_mask.to(device),
            ],
            dim=1,
        )
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype
        )
        # 0 where attention is allowed, -10000 where it is masked out.
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Token-type id 0 marks the whole image segment.
        img_tok = torch.zeros(bsz, num_img_tokens, dtype=torch.long, device=device)
        img_embed_out = self.img_embeddings(img, img_tok)
        txt_embed_out = self.txt_embeddings(input_txt, segment)
        encoder_input = torch.cat([img_embed_out, txt_embed_out], 1)  # Bx(TEXT+IMG)xHID

        encoded_layers = self.encoder(
            encoder_input, extended_attention_mask, output_all_encoded_layers=False
        )

        return self.pooler(encoded_layers[-1])


class MultimodalBertClf(nn.Module):
    """Multimodal BERT encoder topped with a linear 768 -> 2 classification head."""

    def __init__(self, vocab):
        """Build the encoder for ``vocab`` and a fresh linear head."""
        super().__init__()
        self.enc = MultimodalBertEncoder(vocab)
        self.clf = nn.Linear(768, 2)

    def forward(self, txt, mask, segment, img):
        """Return the head's two-way output for the encoded batch."""
        pooled = self.enc(txt, mask, segment, img)
        logits = self.clf(pooled)
        return logits


In [None]:
class ImageEncoder(nn.Module):
    """Pretrained ResNet-152 trunk whose feature map is pooled to three vectors."""

    def __init__(self):
        """Keep all but the last two child modules of ResNet-152 and add the pooling."""
        super().__init__()
        resnet = torchvision.models.resnet152(pretrained=True)
        # The final two children are dropped (presumably the global pooling
        # and the classification layer — confirm against torchvision).
        trunk = list(resnet.children())[:-2]
        self.model = nn.Sequential(*trunk)
        # Pool the spatial map down to 3 x 1, i.e. three vectors per image.
        self.pool = nn.AdaptiveAvgPool2d((3, 1))

    def forward(self, x):
        """Map Bx3x224x224 images to BxNx2048 feature vectors (N = 3)."""
        feature_map = self.model(x)        # Bx2048x7x7
        pooled = self.pool(feature_map)    # Bx2048x3x1
        # Merge the spatial dims, then put the N axis before the channel axis.
        return pooled.flatten(start_dim=2).transpose(1, 2).contiguous()


class ImageClf(nn.Module):
    """Image-only classifier: ``ImageEncoder`` features into a linear head."""

    def __init__(self):
        """Create the encoder and a (2048 * 3) -> 2 linear layer."""
        super().__init__()
        self.img_encoder = ImageEncoder()
        self.clf = nn.Linear(2048 * 3, 2)

    def forward(self, x):
        """Encode ``x``, flatten everything but the batch axis and classify."""
        features = self.img_encoder(x)
        flat = features.flatten(start_dim=1)
        return self.clf(flat)

In [None]:
def get_labels_and_frequencies(path):
    """Collect the distinct labels of a JSON-lines file and count them.

    Args:
        path: file with one JSON object per line, each carrying a "label"
            key whose value is a single label or a list of labels.

    Returns:
        A ``(labels, label_freqs)`` tuple: the distinct labels in first-seen
        order, and a ``collections.Counter`` of their occurrences. An empty
        file yields ``([], Counter())``.
    """
    # Local import so the function does not depend on a module-level
    # ``Counter`` import being present.
    from collections import Counter

    label_freqs = Counter()
    # The context manager closes the file; blank lines are skipped.
    with open(path) as handle:
        data_labels = [json.loads(line)["label"] for line in handle if line.strip()]

    if data_labels and isinstance(data_labels[0], list):
        # Multi-label rows: count every label inside each row.
        for label_row in data_labels:
            label_freqs.update(label_row)
    else:
        label_freqs.update(data_labels)

    return list(label_freqs.keys()), label_freqs

def collate_fn(batch):
    """Pad a list of dataset rows into batch tensors.

    Each row starts with ``(tokens, segment, image, ...)``. Tokens and segment
    ids are right-padded with zeros to the longest row; the mask is 1 on real
    tokens and 0 on padding.

    Returns:
        ``(text_tensor, segment_tensor, mask_tensor, img_tensor)``; the first
        three are long tensors of shape (batch, max_len), the last one is the
        stacked images.
    """
    seq_lens = [len(sample[0]) for sample in batch]
    n_rows = len(batch)
    width = max(seq_lens)

    mask_tensor = torch.zeros(n_rows, width).long()
    text_tensor = torch.zeros(n_rows, width).long()
    segment_tensor = torch.zeros(n_rows, width).long()

    img_tensor = torch.stack([sample[2] for sample in batch])

    for row_idx, sample in enumerate(batch):
        n_tokens = seq_lens[row_idx]
        text_tensor[row_idx, :n_tokens] = sample[0]
        segment_tensor[row_idx, :n_tokens] = sample[1]
        mask_tensor[row_idx, :n_tokens] = 1

    return text_tensor, segment_tensor, mask_tensor, img_tensor

In [None]:
import json
import numpy as np
import os
from PIL import Image

import torch
from torch.utils.data import Dataset

class JsonlDataset(Dataset):
    """Dataset over a JSON-lines file of text/image records.

    Every record must provide "text" and "img" keys; "label" is read only
    when ``train`` is truthy. Image paths are resolved relative to the
    directory that contains ``data_path``.
    """

    def __init__(self, data_path, tokenizer, transforms, vocab, train):
        """
        Args:
            data_path: path to the JSON-lines file.
            tokenizer: callable mapping a text string to a list of tokens.
            transforms: callable applied to the PIL image of each record.
            vocab: object whose ``stoi`` maps tokens to ids and contains "[UNK]".
            train: when truthy, labels are loaded and returned by ``__getitem__``.
        """
        # The context manager closes the file; blank lines are skipped.
        with open(data_path) as handle:
            self.data = [json.loads(line) for line in handle if line.strip()]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.n_classes = 2
        self.text_start_token = ["[SEP]"]
        self.train = train

        # Stored for reference only: no method of this class reads it.
        self.max_seq_len = 512
        self.max_seq_len -= 3

        self.transforms = transforms

        if train:
            # get_labels_and_frequencies returns (labels, frequencies). Only
            # the label list may live in self.labels, because __getitem__
            # resolves a record's label to its class id with .index().
            self.labels, self.label_freqs = get_labels_and_frequencies(data_path)

    def __len__(self):
        """Number of records in the file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(sentence, segment, image)`` plus ``label`` in train mode."""
        record = self.data[index]

        # A leading [SEP] is prepended and stripped again below, so at most
        # 511 text tokens survive the truncation.
        sentence = (
            self.text_start_token
            + self.tokenizer(record["text"])[
                : (512 - 1)
            ]
        )
        segment = torch.zeros(len(sentence))

        sentence = torch.LongTensor(
            [
                self.vocab.stoi[w] if w in self.vocab.stoi else self.vocab.stoi["[UNK]"]
                for w in sentence
            ]
        )

        if self.train:
            label = torch.LongTensor([self.labels.index(record["label"])])

        if record["img"]:
            image = Image.open(
                os.path.join(self.data_dir, record["img"])
            ).convert("RGB")
        else:
            # No image path: substitute a uniform mid-grey 256x256 picture.
            image = Image.fromarray(128 * np.ones((256, 256, 3), dtype=np.uint8))
        image = self.transforms(image)

        # The first SEP is part of Image Token.
        segment = segment[1:]
        sentence = sentence[1:]
        # The first segment (0) is of images.
        segment += 1

        if self.train:
            return sentence, segment, image, label
        else:
            return sentence, segment, image


In [None]:
# Only the bound ``tokenize`` method is kept: the dataset calls it as a plain
# function from text to a list of word pieces.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True
).tokenize

# Image pipeline: resize to 256, centre-crop to 224, convert to a tensor and
# normalise each channel with the statistics below.
image_normalize = transforms.Normalize(
    mean=[0.46777044, 0.44531429, 0.40661017],
    std=[0.12221994, 0.12145835, 0.14380469],
)
transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        image_normalize,
    ]
)

# NOTE(review): Vocab() contains only the five special tokens, so the dataset
# maps every other word piece to "[UNK]". Confirm this matches how the
# checkpoint was trained before changing it.
vocab = Vocab()

In [None]:
# Build the architecture, then restore the trained weights from the checkpoint.
model = MultimodalBertClf(vocab)
# map_location keeps every tensor on the CPU while loading; the model is moved
# to the GPU in the next cell.
# NOTE(review): torch.load unpickles the file — only load checkpoints that
# come from a trusted source.
best_checkpoint = torch.load('./model_best.pt', map_location=torch.device('cpu'))
# The weights live under the "state_dict" key of the checkpoint dictionary.
model.load_state_dict(best_checkpoint["state_dict"])

In [None]:
# Switch to inference mode (e.g. the dropout in the embedding module is disabled).
model.eval()
# Move the parameters to the GPU; being the last expression of the cell, the
# returned module is also displayed.
model.cuda()

In [None]:
# collate_fn needs no extra arguments; the partial only gives it the name the
# loaders use.
collate = functools.partial(collate_fn)

# Settings shared by both evaluation loaders. shuffle=False keeps predictions
# in file order, which the label tensors built from the same files rely on.
eval_loader_kwargs = dict(
    batch_size=128,
    shuffle=False,
    num_workers=1,
    collate_fn=collate,
)

# Both datasets are built with train=False, so each item is
# (sentence, segment, image) without a label.
test_set = JsonlDataset(
    "./data/test.jsonl",
    tokenizer,
    transform,
    vocab,
    False,
)

test_loader = DataLoader(test_set, **eval_loader_kwargs)

val_set = JsonlDataset(
    "./data/dev_seen.jsonl",
    tokenizer,
    transform,
    vocab,
    False,
)

# The validation loader must iterate val_set; wrapping test_set here would
# make every "validation" result a repeat of the test results.
val_loader = DataLoader(val_set, **eval_loader_kwargs)

In [None]:
import json
import torch

# Read the test records and collect their "label" values, in file order, as a
# float32 tensor. The context manager closes the file afterwards.
with open('/content/data/test.jsonl', 'r') as handle:
    resultTest = [json.loads(jline) for jline in handle]
print(len(resultTest))
resultTensorTest = torch.tensor(
    [record['label'] for record in resultTest], dtype=torch.float32
)
print(resultTensorTest)

In [None]:
import json
import torch

# Read the dev_seen records and collect their "label" values, in file order,
# as a float32 tensor. The context manager closes the file afterwards.
with open('/content/data/dev_seen.jsonl', 'r') as handle:
    resultVal = [json.loads(jline) for jline in handle]
print(len(resultVal))
resultTensorVal = torch.tensor(
    [record['label'] for record in resultVal], dtype=torch.float32
)
print(resultTensorVal)

In [None]:
from sklearn import metrics

# Score the test split: per sample, keep the probability of class 1 and the
# arg-max class.
preds, probs = [], []
with torch.no_grad():
    for batch in test_loader:
        # The loader yields (text, segment, mask, image); all go to the GPU.
        txt, segment, mask, img = (tensor.cuda() for tensor in batch)
        out = model(txt, mask, segment, img)

        class_probs = torch.nn.functional.softmax(out, dim=1)
        probs.append(class_probs[:, 1].cpu().numpy())
        preds.append(class_probs.argmax(dim=1).cpu().numpy())

preds = np.concatenate(preds, axis=0)
probs = np.concatenate(probs, axis=0)

predictions = torch.from_numpy(preds)

# Accuracy in percent against the test labels.
numCorrect = (predictions == resultTensorTest).sum().item()
acc = 100.0 * numCorrect / resultTensorTest.shape[0]

# Area under the ROC curve, computed from the class-1 probabilities.
fpr, tpr, thresholds = metrics.roc_curve(resultTensorTest, probs, pos_label=1)
auroc = metrics.auc(fpr, tpr)

print(f"Accuracy: {acc} AUROC: {auroc}")

# One "probability,prediction" row per sample, in loader order.
with open("submission1.csv", "w") as f:
    f.write("proba,label\n")
    f.writelines(f"{proba},{label}\n" for proba, label in zip(probs, preds))

In [None]:
# Score the validation split (dev_seen) against its own labels.
# The loader is built here from val_set, reusing val_loader's batching
# settings, so the predictions line up one-to-one with resultTensorVal (both
# are read from dev_seen.jsonl in file order).
val_eval_loader = DataLoader(
    val_set,
    batch_size=val_loader.batch_size,
    shuffle=False,
    num_workers=val_loader.num_workers,
    collate_fn=val_loader.collate_fn,
)

preds, probs = [], []
for batch in val_eval_loader:
    with torch.no_grad():
        txt, segment, mask, img = batch
        txt, img = txt.cuda(), img.cuda()
        mask, segment = mask.cuda(), segment.cuda()
        out = model(txt, mask, segment, img)

        class_probs = torch.nn.functional.softmax(out, dim=1)
        prob = class_probs[:, 1].cpu().numpy()
        pred = class_probs.argmax(dim=1).cpu().numpy()

        probs.append(prob)
        preds.append(pred)

preds = np.concatenate(preds, axis=0)
probs = np.concatenate(probs, axis=0)

predictions = torch.from_numpy(preds)

# Guard against scoring one split with another split's labels.
assert predictions.shape[0] == resultTensorVal.shape[0], (
    "prediction/label count mismatch on the validation split"
)

# Metrics use the validation labels, not the test labels.
numCorrect = (predictions == resultTensorVal).sum().item()
acc = (100.0 * numCorrect / resultTensorVal.shape[0])

fpr, tpr, thresholds = metrics.roc_curve(resultTensorVal, probs, pos_label=1)
auroc = metrics.auc(fpr, tpr)

print(f"Accuracy: {acc} AUROC: {auroc}")

with open("submission2.csv", "w") as f:
    f.write("proba,label\n")
    for i in range(preds.shape[0]):
        f.write(f"{probs[i]},{preds[i]}\n")