In [None]:
# general
from __future__ import print_function, division
import os
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
import cv2
import random

import GPUtil

# torch
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils.data import random_split

from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

import torch.nn as nn
import torchvision.models as models
from pytorch_pretrained_bert import BertModel
from transformers import DistilBertModel
import torch.nn.functional as F

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [None]:
# Reproducibility: bring every random number generator used in this notebook
# (python, numpy, torch CPU, torch CUDA) into the same known state
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(0)
# ask cuDNN for deterministic behaviour
torch.backends.cudnn.deterministic = True

### Structure
- plot images
- text preprocessing
- data loader
- image augmentation

### Plot images

In [None]:
dataset_df = pd.read_json('../data/train.jsonl', lines=True)

# plot a few images together with labels
nx = 3
ny = 3

fig, ax = plt.subplots(figsize=(16, ny*4), nrows=ny, ncols=nx)
for i in range(ny):
    for j in range(nx):
        img_id = dataset_df['id'].values[i*nx+ j]

        # restore the leading zeros lost when the id was parsed as a number;
        # file stems are assumed to be (at least) 5 characters long, which is what
        # the previous "prepend one 0 below 1e4" rule implied
        img_path = '../data/img/' + str(img_id).zfill(5) + '.png'
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable / missing file by returning None
            raise FileNotFoundError('could not read image: ' + img_path)

        # plot and remove axes
        ax[i,j].imshow(img, cmap='gray')
        ax[i,j].set_title('Racist' if dataset_df['label'].values[i*nx+ j] else 'Non-Racist')
        ax[i,j].axes.get_xaxis().set_visible(False)
        ax[i,j].axes.get_yaxis().set_visible(False)

plt.show()

### Text Preprocessing

BERT requires: <br>
- tokenization
- special tokens (`[CLS]`, `[SEP]`)
- padding
- attention mask

Techniques to try (for other models): <br>
- Lower casing
- Punctuation removal
- Stopwords removal
- Frequent words removal
- Rare words removal
- Spelling correction
- Tokenization
- Stemming
- Lemmatization
- word embedding/ bag of words

In [None]:
# (intentionally empty: a stray undefined name in this cell raised NameError and halted "Run All")

In [None]:
# one dict per row of the jsonl file
# (presumably already shuffled upstream - TODO confirm; random_split shuffles again anyway)
data = dataset_df.to_dict(orient='records')

# 80 % of the records are used for training, the remainder for validation
n_records = len(data)
train_size = int(0.8 * n_records)
val_size = n_records - train_size

# random train / validation split
train_data, valid_data = random_split(data, [train_size, val_size])

#train_size = int(0.05 * len(valid_data))
#val_size = len(valid_data) - train_size
#valid_data, _ = random_split(valid_data, [train_size, val_size])

In [None]:
# turn the list of records into three parallel tuples per split:
# caption text and image path (inputs), label (output)
train_texts, train_imgs, train_labels = zip(*[(rec['text'], rec['img'], rec['label']) for rec in train_data])
valid_texts, valid_imgs, valid_labels = zip(*[(rec['text'], rec['img'], rec['label']) for rec in valid_data])

tuple(len(seq) for seq in (train_texts, train_labels, valid_texts, valid_labels))

In [None]:
# class imbalance: fraction of samples carrying label 1, for training then validation
for split_labels in (train_labels, valid_labels):
    print(sum(split_labels)/ len(split_labels))

In [None]:
# caption length analysis: number of characters of every training caption
sentences = list(map(len, train_texts))
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})
plt.bar(range(1, len(sentences)+1), sentences, color = ['red'])
plt.gca().set(
    title='No. of characters in each sentence',
    xlabel='Number of sentence',
    ylabel='Number of Characters in each sentence',
);

# -> bert with 64 tokens should be sufficient

In [None]:
# bert tokenizer
# tokenizer of the pretrained 'bert-base-cased' checkpoint; lower-casing is switched off
# (do_lower_case=False) so the caption text keeps its original casing
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
# tokenize every caption and wrap it in the [CLS] ... [SEP] markers;
# at most 62 word pieces are kept, i.e. 64 tokens including the two markers
train_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:62] + ['[SEP]'] for text in train_texts]
valid_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:62] + ['[SEP]'] for text in valid_texts]

len(train_tokens), len(valid_tokens)

In [None]:
# to be deleted -> SYNONYMS
# scratch demo of the synonym augmentation on the first training caption.
# It works on a copy, so the demo does not permanently alter train_tokens[0].
demo_tokens = list(train_tokens[0])
print(demo_tokens)

# draw ~20 % of the positions strictly between [CLS] (index 0) and [SEP]
sep_pos = demo_tokens.index('[SEP]')
if sep_pos > 1:
    synonyme_idx = np.unique(np.random.randint(1, sep_pos, int(0.2*sep_pos)))
else:
    # empty caption: nothing between the markers (randint would raise for low >= high)
    synonyme_idx = np.array([], dtype=int)
print([demo_tokens[idx] for idx in synonyme_idx])

# remove stopwords and punctuation (stop word list is built once, not once per token)
stop_words = set(stopwords.words())
synonyme_idx = [idx for idx in synonyme_idx if demo_tokens[idx] not in stop_words]
synonyme_idx = [idx for idx in synonyme_idx if demo_tokens[idx].isalnum()]

# find synonyms of words to be replaced
replace_words = [demo_tokens[idx] for idx in synonyme_idx] # don't use stopwords

print(replace_words)
synonymes = [wordnet.synsets(word) for word in replace_words]
# keep only the positions for which WordNet returned at least one synset
with_synsets = [(idx, syn) for idx, syn in zip(synonyme_idx, synonymes) if syn != []]
synonyme_idx = [idx for idx, _ in with_synsets]
synonymes = [syn for _, syn in with_synsets]
print(synonyme_idx)

# replace words by the first lemma of a randomly chosen synset
for i_sen, candidates in zip(synonyme_idx, synonymes):
    demo_tokens[i_sen] = np.random.choice(candidates).lemmas()[0].name()

print(demo_tokens)

In [None]:
# bundle the parallel sequences of each split into one dict
# (the layout HatefulMemesDataset reads: 'img_names', 'tokens', 'labels')
data_train = dict(img_names=train_imgs, tokens=train_tokens, labels=train_labels)

data_valid = dict(img_names=valid_imgs, tokens=valid_tokens, labels=valid_labels)

## Dataloader

In [None]:
class HatefulMemesDataset(Dataset):
    """Hateful Memes dataset: yields image, BERT token ids, attention mask and label.

    Uses names defined at notebook level: ``tokenizer``, ``pad_sequences``,
    ``stopwords`` / ``wordnet`` (nltk) and ``cv2``.
    """

    # fixed length the token id sequence is padded / truncated to
    MAX_LEN = 64
    # lazily filled cache of the nltk stop words (building the list per token is very slow)
    _stop_words = None

    def __init__(self, data, img_dir, normalize=False, synonyme=False, transform=None):
        """
        Args:
            data (dict): parallel sequences under the keys 'img_names', 'tokens' and 'labels'
            img_dir (string): directory the entries of data['img_names'] are relative to
            normalize (bool): placeholder, image normalization is not implemented yet
            synonyme (bool): randomly replace some tokens by WordNet synonyms
            transform (callable): optional transform to be applied on a sample
        """
        # text
        self.data = data

        # image
        self.img_dir = img_dir
        self.normalize = normalize
        self.transform = transform
        self.synonyme = synonyme

    def __len__(self):
        return len(self.data['labels'])

    @classmethod
    def _get_stop_words(cls):
        """Return the stop words of all nltk languages, loading them only once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words())
        return cls._stop_words

    def _replace_by_synonyms(self, token):
        """Return a copy of `token` with roughly 10 % of its words replaced by synonyms."""
        # copy: the stored token list must stay untouched, otherwise the
        # replacements accumulate every time the sample is drawn
        token = list(token)
        sep_pos = token.index('[SEP]')
        if sep_pos <= 1:
            # nothing between [CLS] and [SEP] (randint would raise for low >= high)
            return token

        synonyme_idx = np.unique(np.random.randint(1, sep_pos, int(0.1*sep_pos)))
        # remove stopwords and punctuation
        stop_words = self._get_stop_words()
        synonyme_idx = [i for i in synonyme_idx if token[i] not in stop_words and token[i].isalnum()]

        # candidate synsets per selected position
        candidates = [(i, wordnet.synsets(token[i])) for i in synonyme_idx]

        # replace back to front: a synonym may expand into several word pieces,
        # which would shift every position behind it
        for i_sen, synsets in reversed(candidates):
            if synsets == []:
                continue
            lemma = np.random.choice(synsets).lemmas()[0].name()
            token[i_sen:i_sen+1] = tokenizer.tokenize(lemma)

        # keep the closing [SEP] if the replacements made the sequence too long
        if len(token) > self.MAX_LEN:
            token = token[:self.MAX_LEN - 1] + ['[SEP]']
        return token

    def __getitem__(self, idx):
        """Return {'image', 'token_id', 'mask', 'label'} for sample `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # load image ------------------------------------
        img_name = os.path.join(self.img_dir, str(self.data['img_names'][idx]))
        image = cv2.imread(img_name)
        if image is None:
            # cv2.imread returns None instead of raising for unreadable files
            raise FileNotFoundError('could not read image: {}'.format(img_name))

        if self.normalize:
            # TODO: normalize image by mean and std -> what data type do networks require?
            print('TODO')

        # label ------------------------------------------
        label = np.array([self.data['labels'][idx]]).astype('int').reshape(-1)

        # token -------------------------------------------
        token = self.data['tokens'][idx]

        # add variance by synonyms
        if self.synonyme:
            token = self._replace_by_synonyms(token)

        # prepare token ids: each token (word fragment) corresponds to an id in the bert vocabulary;
        # all sequences need the same length -> pad too short ones, truncate too long ones
        token_id = pad_sequences([tokenizer.convert_tokens_to_ids(token)], maxlen=self.MAX_LEN,
                                 truncating="post", padding="post", dtype="int")
        token_id = token_id.reshape(-1)

        # mask for padding -> required by bert (1. for real tokens, 0. for padding id 0)
        mask = (token_id > 0).astype('float').reshape(-1)

        sample = {'image': image, 'token_id': token_id, 'mask': mask, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [None]:
memes_dataset = HatefulMemesDataset(data_train, img_dir='../data', normalize=False)

fig = plt.figure(figsize=(16, 4))

# show the first four samples next to each other
for i in range(len(memes_dataset)):
    sample = memes_dataset[i]
    print(i, sample['image'].shape, sample['label'].shape)

    ax = plt.subplot(1, 4, i + 1)
    plt.tight_layout()
    ax.set_title('Sample #%d' % i)
    ax.axis('off')
    ax.imshow(sample['image'])

    if i != 3:
        continue
    # fourth panel drawn -> render the figure and stop
    plt.show()
    break

In [None]:
memes_dataset = HatefulMemesDataset(data_train, img_dir='../data', synonyme=True, normalize=False)

fig = plt.figure(figsize=(16, 4))

# same preview as before, this time with the synonym augmentation switched on
for i in range(len(memes_dataset)):
    sample = memes_dataset[i]
    print(i, sample['image'].shape, sample['label'].shape)

    ax = plt.subplot(1, 4, i + 1)
    plt.tight_layout()
    ax.set_title('Sample #%d' % i)
    ax.axis('off')
    ax.imshow(sample['image'])

    if i != 3:
        continue
    # fourth panel drawn -> render the figure and stop
    plt.show()
    break

## Image augmentation

In [None]:
# aspect ratio = 1.3 (width / height) median
# image dimensions = 128 x 128 (16k pixels) for CNN's

class Rescale(object):
    """Pad strongly elongated images, then resize the image of a sample to a square.

    Args:
        size (int): edge length in pixels of the square output image
        max_distortion (float): largest long-side / short-side ratio that is resized
            without padding; more elongated images first get a constant border
            along their short side
    """

    def __init__(self, size, max_distortion):
        self.size = size
        self.max_distortion = max_distortion

    def __call__(self, sample):
        """Return the sample with its 'image' padded (if needed) and resized to size x size."""
        image, token_id, mask, label = sample['image'], sample['token_id'], sample['mask'], sample['label']

        # array layout is (rows, columns[, channels]), i.e. height comes first
        height, width = image.shape[:2]

        # resulting distortion too high -> padding
        # (with max_distortion == 2 the amount equals the former hard-coded `long - 2*short`)
        if width > self.max_distortion * height:
            # too wide: add rows above and below
            pad = int((width - self.max_distortion * height) / 2.)
            img = cv2.copyMakeBorder(image, pad, pad, 0, 0, cv2.BORDER_CONSTANT)
        elif height > self.max_distortion * width:
            # too tall: add columns left and right
            pad = int((height - self.max_distortion * width) / 2.)
            img = cv2.copyMakeBorder(image, 0, 0, pad, pad, cv2.BORDER_CONSTANT)
        else:
            img = image.copy()

        # resize image
        img = transform.resize(img, (self.size, self.size))

        return {'image': img, 'token_id': token_id, 'mask': mask, 'label': label}

In [None]:
# cropping, scaling, rotation, noise, color?
class Transform(object):
    """Randomly rotate the image of a sample by an angle between -20 and +20 degrees.

    The input image is multiplied by 255 and cast to uint8 before the rotation, so it
    is presumably expected as a float array in [0, 1] (TODO confirm against the
    preceding transform). The returned 'image' is a PIL image.
    """

    def __init__(self):
        # build the augmentation pipeline once instead of on every call
        self.augment = transforms.Compose([transforms.ToPILImage(),
                                           transforms.RandomRotation(degrees=(-20, 20))])

    def __call__(self, sample):
        """Return the sample with its 'image' randomly rotated; other entries pass through."""
        image, token_id, mask, label = sample['image'], sample['token_id'], sample['mask'], sample['label']

        # TODO: these lines needed?
        img = self.augment((255.* image).astype(np.uint8))

        # TODO: use normalization as required by pretrained models
        #       (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        return {'image': img,'token_id': token_id, 'mask': mask, 'label': label}

In [None]:
# dataset with the full augmentation chain: pad / resize to 224 x 224, then random rotation
composed = transforms.Compose([Rescale(224, 2.), Transform()])
memes_dataset = HatefulMemesDataset(data_train, img_dir='../data', normalize=False, transform=composed)

# plots: grid of the first ncols*nrows augmented samples, filled row by row
ncols = 7
nrows = 8
fig, ax = plt.subplots(figsize=(16, 16), ncols=ncols, nrows=nrows)

for i in range(ncols*nrows):
    y, x = divmod(i, ncols)
    sample = memes_dataset[i]

    panel = ax[y,x]
    panel.imshow(sample['image'])
    panel.set_title('Sample #%d' % i)
    panel.axis('off')

plt.show()

## Model 

In [None]:
# bert model class
class BertBinaryClassifier(nn.Module):
    """Text-only binary classifier: DistilBERT encoder -> dropout -> linear -> sigmoid.

    Args:
        dropout (float): dropout probability applied to the sentence representation
    """

    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = DistilBertModel.from_pretrained('distilbert-base-cased')#BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, imgs, tokens, masks=None):
        """Return the probability of label 1 per sample, shape (batch, 1).

        `imgs` is accepted for signature compatibility with the image + text
        model and is not used.
        """
        # DistilBertModel has neither a pooler output nor an `output_all_encoded_layers`
        # argument: its first output is the last hidden state, from which the vector
        # at the first ([CLS]) position is taken as sentence representation
        hidden_states = self.bert(tokens, attention_mask=masks)[0]
        pooled_output = hidden_states[:, 0]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba


In [None]:
# https://discuss.pytorch.org/t/combining-trained-models-in-pytorch/28383/2
class MyEnsemble(nn.Module):
    """Image + text classifier: DenseNet-121 and BERT features feed a small MLP head.

    forward() returns one sigmoid probability per sample.
    """

    def __init__(self):
        super(MyEnsemble, self).__init__()
        n_img_features, n_txt_features = 1000, 768

        # pretrained backbones
        self.modelVision = models.densenet121(pretrained=True)
        self.modelNLP = BertModel.from_pretrained('bert-base-cased')

        # freeze BERT completely and the convolutional part of the DenseNet
        for frozen in (self.modelNLP, self.modelVision.features):
            for param in frozen.parameters():
                param.requires_grad = False

        # swap the DenseNet classifier for a fresh, untrained linear layer
        self.modelVision.classifier = nn.Linear(1024, n_img_features)

        # classification head on the concatenated text + image features
        # (the two dropout layers are created but not applied in forward)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.layernorm1 = torch.nn.LayerNorm(n_txt_features + n_img_features)
        self.classifier1 = nn.Linear(n_txt_features + n_img_features, 1000)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.layernorm2 = torch.nn.LayerNorm(1000)
        self.classifier2 = nn.Linear(1000, 1)

    def forward(self, imgs, tokens, masks):
        # vision branch; the relu completes the new classifier layer with a nonlinearity
        img_feat = F.relu(self.modelVision(imgs))

        # text branch: second BERT output is the pooled representation used for classification
        txt_feat = self.modelNLP(tokens, attention_mask=masks, output_all_encoded_layers=False)[1]

        # TODO: dropout needed?
        fused = self.layernorm1(torch.cat((img_feat, txt_feat), dim=1))
        hidden = self.layernorm2(F.relu(self.classifier1(fused)))
        return torch.sigmoid(self.classifier2(hidden))

In [None]:
# allocate on gpu (first visible device) when CUDA is available, otherwise on the cpu
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

model = MyEnsemble().to(device)
print('{}M'.format(torch.cuda.memory_allocated(device)/1000000))

GPUtil.showUtilization()

# parameter counts: all parameters first, then only the trainable ones
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
print(sum(n for n, _ in param_sizes))
print(sum(n for n, trainable in param_sizes if trainable))

In [None]:
from torch.optim import Adam
# TODO: not sure if this does what I think it should do
# optimizer
# (name, parameter) pairs of the three trainable heads
param_optimizer = [named
                   for head in (model.modelVision.classifier, model.classifier1, model.classifier2)
                   for named in head.named_parameters()]
# NOTE: this group is built but not handed to the optimizer, which receives model.parameters()
optimizer_grouped_parameters = [{"params": [param for _, param in param_optimizer]}]

lr = 1e-4
optimizer = Adam(model.parameters(), lr=lr) # TODO: SGD for just the final classifier

In [None]:
# shapes of the parameters of the three trainable heads
print([param.size() for _, param in param_optimizer])

In [None]:
'''
densenet = models.densenet121(pretrained=True)
for name, child in densenet.named_children():
    print(name)
    for param in child.parameters():
            print(param.shape)
print(sum(p.numel() for p in densenet.parameters()))
'''

In [None]:
'''
bert = BertModel.from_pretrained('bert-base-uncased')
for name, child in bert.named_children():
    print(name)
    for param in child.parameters():
            print(param.shape)
'''

In [None]:
def weighted_binary_cross_entropy(output, target, weights=None):
    """Mean binary cross entropy with optional per-sample weights.

    Args:
        output (Tensor): predicted probabilities
        target (Tensor): ground truth (0 or 1), broadcastable to `output`
        weights (Tensor, optional): per-sample weights, broadcastable to `output`

    Returns:
        Tensor: scalar, ``-mean(weights * (t*log(o) + (1-t)*log(1-o)))``
    """
    if not output.is_floating_point():
        output = output.float()

    # keep the probabilities strictly inside (0, 1): a saturated prediction would
    # otherwise produce log(0) = -inf and 0 * -inf = nan in loss and gradient
    eps = torch.finfo(output.dtype).eps
    output = torch.clamp(output, min=eps, max=1 - eps)

    loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)
    if weights is not None:
        loss = weights * loss

    return torch.neg(torch.mean(loss))

In [None]:
from datetime import date, datetime
# summary writer
# one log directory per run, stamped hour-day-month-year; datetime.now() is needed
# because a plain date carries no time and always formats %H as '00'
log_dir = './summaries/summary'+ datetime.now().strftime('%H-%d-%m-%Y')
from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer for the scalars logged during training; it writes to log_dir
writer = SummaryWriter(log_dir)

## Training

In [None]:
from torch.nn.utils import clip_grad_norm_
from sklearn.metrics import roc_auc_score

# training
torch.cuda.empty_cache()   # Clearing Cache space for a fresh Model run

EPOCHS=14
BATCH_SIZE_TRAIN=8 # 4 if grad for all parameters
BATCH_SIZE_VALID=4

# training
train_dataset = HatefulMemesDataset(data_train, img_dir='../data',
                                    normalize=False, synonyme=True, transform=Rescale(224, 2.))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, num_workers=1, shuffle=True)

# validation
valid_dataset = HatefulMemesDataset(data_valid, img_dir='../data',
                                    normalize=False, transform=Rescale(224, 2.))
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE_VALID, shuffle=False)

# binary cross entropy loss (classification)
# NOTE: not used below, the loop calls weighted_binary_cross_entropy instead
loss_func = nn.BCELoss()

# monitor train progress
stats = {'train_loss': [], 'train_acc': [], 'valid_loss': [], 'valid_acc': [], 'val_rocauc': []}

time_tot = []

for epoch_num in range(EPOCHS):
    # per-epoch accumulators: summed batch losses, weighted count of correct
    # predictions and total sample weight (the denominator of the weighted accuracy)
    train_loss = 0.
    train_correct = 0.
    train_weight = 0.

    valid_loss = 0.
    valid_correct = 0.
    valid_weight = 0.

    # training
    model.train()

    # unfreeze weights for the last two epochs
    if epoch_num > EPOCHS -3:
        for param in model.modelNLP.parameters():
            param.requires_grad = True

        for param in model.modelVision.features.parameters():
            param.requires_grad = True

        # no new optimizer needed here: it is expected to have been created from
        # model.parameters() and therefore already to hold the unfrozen tensors

        # not so elegant, but allows to first use a larger batch size
        #lr = 3e-6
        BATCH_SIZE_TRAIN=4
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, num_workers=1, shuffle=True)


    for step_num, batch in enumerate(train_loader):

        # sample = {'image': image, 'token_id': token_id, 'mask': mask, 'label': label}
        # images arrive channel-last (batch, 224, 224, 3) from the dataset, the CNN expects
        # channel-first: permute moves the axis, whereas view() would only reinterpret
        # the memory and scramble the pixels.
        # NOTE: training and inference must use the same layout conversion
        imgs = batch['image'].to(device).permute(0, 3, 1, 2)

        labels = batch['label'].to(device)
        masks = batch['mask'].to(device)
        token_ids = batch['token_id'].to(device)
        # imgs, token_ids, masks, labels = tuple(t.to(device) for t in batch)
        # print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

        logits = model(imgs.float(), token_ids, masks)

        # account for class imbalance (66% label 0, 33% label 1):
        # weight 0.63 for label 0 and 1.26 for label 1
        eye = torch.ones(labels.shape, device=device)
        weight_imbalance = (eye+ (labels==eye).int())* 0.63

        # loss
        batch_loss = weighted_binary_cross_entropy(logits, labels.float(), weight_imbalance)
        train_loss += batch_loss.item()

        train_correct += ((torch.round(logits)==labels)*weight_imbalance).sum().item()
        train_weight += weight_imbalance.sum().item()

        # reset gradient and calculate new ones
        model.zero_grad()
        batch_loss.backward()

        # gradient clipping and backward pass
        #clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()

        # logging
        # clear_output(wait=True) -> from IPython.display import clear_output
        if step_num%100==0 and step_num> 0:
            print('Epoch: ', epoch_num + 1)
            print("\r" + "{0}/{1} loss: {2} acc: {3} ".format(step_num,
                                                              len(train_loader),
                                                              round(train_loss / (step_num + 1), 3),
                                                              round(train_correct/ train_weight, 3)))

    # train stats: batch losses are already means over the batch, so the epoch loss is
    # their average over the number of batches; accuracy is normalized by total weight
    epoch_train_loss = train_loss / (step_num+1)
    epoch_train_acc = train_correct / train_weight
    stats['train_loss'].append(epoch_train_loss)
    stats['train_acc'].append(epoch_train_acc)

    writer.add_scalar('Loss/train', epoch_train_loss, epoch_num)
    writer.add_scalar('Accuracy/train', epoch_train_acc, epoch_num)

    # validation
    torch.cuda.empty_cache()
    model.eval()

    # TODO to calculate metrics
    ypred = []
    ytrue = []
    weight = []

    # no gradients are needed for validation
    with torch.no_grad():
        for step_num, batch in enumerate(valid_loader):
            # sample = {'image': image, 'token_id': token_id, 'mask': mask, 'label': label}
            imgs = batch['image'].to(device).permute(0, 3, 1, 2)

            labels = batch['label'].to(device)
            masks = batch['mask'].to(device)
            token_ids = batch['token_id'].to(device)

            # account for class imbalance (66% label 0, 33% label 1)
            eye = torch.ones(labels.shape, device=device)
            weight_imbalance = (eye+ (labels==eye).int())* 0.63

            # loss
            logits = model(imgs.float(), token_ids, masks)

            batch_loss = weighted_binary_cross_entropy(logits, labels.float(), weight_imbalance)
            valid_loss += batch_loss.item()

            valid_correct += ((torch.round(logits)==labels)*weight_imbalance).sum().item()
            valid_weight += weight_imbalance.sum().item()

            # ROC AUC
            ypred += logits.cpu().numpy().reshape(-1).tolist()

            ytrue += labels.cpu().numpy().reshape(-1).tolist()
            weight += weight_imbalance.cpu().numpy().reshape(-1).tolist()

    epoch_valid_loss = valid_loss / (step_num+1)
    epoch_valid_acc = valid_correct / valid_weight
    epoch_valid_rocauc = roc_auc_score(ytrue, ypred, average='weighted', sample_weight=weight)

    # logging
    # clear_output(wait=True) -> from IPython.display import clear_output
    print('Epoch: ', epoch_num + 1)
    print("\r" + "Validation loss: {0} acc: {1} ".format(round(epoch_valid_loss, 3),
                                                         round(epoch_valid_acc, 3)))

    # valid stats
    stats['valid_loss'].append(epoch_valid_loss)
    stats['valid_acc'].append(epoch_valid_acc)
    stats['val_rocauc'].append(epoch_valid_rocauc)

    writer.add_scalar('Loss/valid', epoch_valid_loss, epoch_num)
    writer.add_scalar('Accuracy/valid', epoch_valid_acc, epoch_num)
    writer.add_scalar('ROCAUC/valid', epoch_valid_rocauc, epoch_num)

    # optimizer lr decay: lower the rate of the existing optimizer instead of creating
    # a new one, which would throw away Adam's running moment estimates every epoch
    lr*= 0.9
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

writer.close()

In [None]:
# three panels: loss (log scale), accuracy and validation ROC AUC per epoch
fig, ax = plt.subplots(figsize=(12, 4), ncols=3)
loss_ax, acc_ax, auc_ax = ax

for key in ('train_loss', 'valid_loss'):
    loss_ax.plot(stats[key])
loss_ax.legend(['train', 'valid'])
loss_ax.set_ylabel('Loss')
loss_ax.set_yscale('log')

for key in ('train_acc', 'valid_acc'):
    acc_ax.plot(stats[key])
acc_ax.legend(['train', 'valid'])
acc_ax.set_ylabel('Accuracy')

auc_ax.plot(stats['val_rocauc'])
auc_ax.legend(['valid'])
auc_ax.set_ylabel('ROC AUC')

plt.savefig('../classifier_11_10.png')
plt.show()

In [None]:
# NOTE(review): `asdfsd` is not defined anywhere in the visible notebook, so this cell raises
# NameError; presumably a deliberate stop so that "Run All" halts here - confirm or remove
asdfsd

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# weighted confusion matrix of the hard (rounded) predictions
ypred_rounded = np.round(np.asarray(ypred))
conf_matr = confusion_matrix(ytrue, ypred_rounded, sample_weight=weight)

# rows: true label, columns: predicted label
print(conf_matr.astype('int'))

In [None]:
# loader
valid_dataset = HatefulMemesDataset(data_valid, img_dir='../data',
                                    normalize=False, transform=Rescale(224, 2.))
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE_VALID, shuffle=False)

# TODO to calculate metrics
ypred = []
ytrue = []
weight = []

# start the accumulators from zero instead of continuing with leftovers of a previous cell
valid_loss = 0.
valid_correct = 0.

# evaluation only: inference mode of the layers, no autograd bookkeeping
model.eval()
with torch.no_grad():
    for step_num, batch in enumerate(valid_loader):
        # sample = {'image': image, 'token_id': token_id, 'mask': mask, 'label': label}
        # keep the channel-last batch for plotting; the model gets a channel-first copy
        # (permute moves the axis, view() would scramble the pixels).
        # NOTE: training and inference must use the same layout conversion
        imgs_hwc = batch['image']
        imgs = imgs_hwc.to(device).permute(0, 3, 1, 2)

        labels = batch['label'].to(device)
        masks = batch['mask'].to(device)
        token_ids = batch['token_id'].to(device)

        # account for class imbalance (66% label 0, 33% label 1)
        eye = torch.ones(labels.shape, device=device)
        weight_imbalance = (eye+ (labels==eye).int())* 0.63

        # loss
        logits = model(imgs.float(), token_ids, masks)
        predicted = torch.round(logits)

        batch_loss = weighted_binary_cross_entropy(logits, labels.float(), weight_imbalance)
        valid_loss += batch_loss.item()

        valid_correct += ((predicted==labels)*weight_imbalance).sum().item()

        # ROC AUC
        ypred += logits.cpu().numpy().reshape(-1).tolist()
        ytrue += labels.cpu().numpy().reshape(-1).tolist()
        weight += weight_imbalance.cpu().numpy().reshape(-1).tolist()

        #plots: show every misclassified sample of this batch
        # (iterate over the actual batch size, the last batch may be smaller)
        for i in range(labels.shape[0]):
            true_label = int(labels[i].item())
            # round the probability; truncating it with int() would always give 0
            pred_label = int(predicted[i].item())
            if true_label != pred_label:
                plt.figure()

                print(imgs[i].cpu().shape)
                plt.imshow(imgs_hwc[i].numpy(), cmap='gray')
                plt.title('Label '+ str(true_label)+ ' Predicted '+ str(pred_label))
                plt.show()

In [None]:
from sklearn.metrics import roc_curve

# false / true positive rates of the collected predictions (third return value is not needed)
lr_fpr, lr_tpr, _ = roc_curve(ytrue, ypred)
# plot the roc curve for the model on a fresh figure
roc_ax = plt.figure().gca()
roc_ax.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.show()

## Predictions

In [None]:
# test data for competition
path_test = '../data/test.jsonl'

# read the jsonl file and turn it into a list of per-meme record dicts
dataset_test = pd.read_json(path_test, lines=True)
data_test = dataset_test.to_dict(orient='records')

# caption text and image path (inputs) plus the id needed for the submission
test_texts, test_imgs, test_ids = zip(*[(rec['text'], rec['img'], rec['id']) for rec in data_test])
test_labels = np.zeros(len(test_ids)) # TODO: hack to use current implementation of Dataset class

print(len(test_texts))

# tokenize and wrap every caption in [CLS] ... [SEP] (at most 62 word pieces are kept)
test_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:62] + ['[SEP]'] for text in test_texts]

# token ids, padded / truncated to 64 entries
test_tokens_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens],
                                maxlen=64, truncating="post", padding="post", dtype="int")

# mask for padding -> required by bert
test_masks = [[float(token_id > 0) for token_id in row] for row in test_tokens_ids]

# put everything back into a dictionary
data_test = {'img_names': test_imgs, 'tokens': test_tokens, 'token_ids': test_tokens_ids,
             'masks': test_masks, 'labels': test_labels}


# test
# TODO: I somewhere read that image models need 224x224 input
test_dataset = HatefulMemesDataset(data_test, img_dir='../data',
                                   normalize=False, transform=Rescale(224, 2.))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE_VALID, shuffle=False)

In [None]:
# quick look at the first test id
print(test_ids[0])
#d4 = dict(d1, **d2)

In [None]:
# prepare hand-in dataframe
proba = []
label = []

# validation
model.eval()
torch.cuda.empty_cache()

# inference only: no autograd graph is needed
with torch.no_grad():
    for step_num, batch in enumerate(test_loader):
        # sample = {'image': image, 'token_id': token_id, 'mask': mask, 'label': label}
        # images arrive channel-last (batch, 224, 224, 3), the CNN expects channel-first:
        # permute moves the axis, view() would only reinterpret memory and scramble pixels.
        # NOTE: training and inference must use the same layout conversion
        imgs = batch['image'].to(device).permute(0, 3, 1, 2)

        masks = batch['mask'].to(device)
        token_ids = batch['token_id'].to(device)

        logits = model(imgs.float(), token_ids, masks)

        # predicted probability and the hard label derived from it
        proba += logits.cpu().numpy().reshape(-1).tolist()
        label += torch.round(logits).cpu().numpy().reshape(-1).tolist()

#
#proba = np.round(np.asarray(proba), 2)
label = (np.asarray(label)).astype(int)


In [None]:
#
# one row per test meme: id, predicted probability and hard label
dictt = dict(id=test_ids, proba=proba, label=label)

df = pd.DataFrame(data=dictt)

In [None]:
# plain-text dump of the submission frame
print(df)

In [None]:
# write the predictions as csv, without the index column (note: the file name has no extension)
df.to_csv('../data/pred_10_10', index=False)

In [None]:
import nltk
from nltk.corpus import wordnet
# NOTE: the synonym augmentation earlier in the notebook needs both corpora, so on a
# fresh environment these downloads have to run before those cells
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# scratch: WordNet synsets of an example word; the second call displays them as cell output
syns = wordnet.synsets("program")
wordnet.synsets("program")

In [None]:
# scratch: two ways of drawing a random synset and printing the name of its first lemma
print(syns)
print(wordnet.synsets("program")[np.random.randint(len(syns))].lemmas()[0].name())
print(np.random.choice(wordnet.synsets("program")).lemmas()[0].name())

In [None]:
# scratch: look up and print the synsets of the example word again
syns = wordnet.synsets("program")
print(syns)

In [None]:
# scratch: lemmas of the first synset
print(syns[0].lemmas())