## 0. IMPORTING/INSTALLING NECESSARY LIBRARIES

In [1]:
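# sklearn and pandas were not available for me on mltgpu, so uncomment these lines if you cannot import them
# (the PyPI package that provides `sklearn` is called scikit-learn; `%pip` installs into this kernel's environment)
# %pip install scikit-learn
# %pip install pandas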

In [1]:
from pycocotools.coco import COCO
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from PIL import Image
import numpy as np
import pandas as pd
import math
import transformers
from transformers import BertTokenizerFast
from transformers import BertModel
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
datadir="/scratch/lt2316-h18-resources/coco/train2017"
from sklearn.preprocessing import StandardScaler
import pickle
# importing the Python file with the model definition, training loop, testing loop, and some other functions
import modelA2 as ma2

## 1. COLLECTING TRAINING DATA  

Keep in mind that many of these cells do *not* need to be re-run; the data was saved at different stages to make it easier to work on the assignment without having to re-run everything. 

In [2]:
# loading in the instance annotations for train2017; this is the object used below to look up category IDs,
# image IDs and image file names. Only the annotation JSON is read here - the image files themselves are
# opened later from `datadir`.
coco = COCO(annotation_file="/scratch/lt2316-h18-resources/coco/annotations/instances_train2017.json")

loading annotations into memory...
Done (t=20.40s)
creating index...
index created!


In [3]:
# loading in the caption annotations for train2017; this object is only used below to look up the captions
# that belong to an image ID (the image files themselves are not read here)
coco_caps = COCO(annotation_file="/scratch/lt2316-h18-resources/coco/annotations/captions_train2017.json")

loading annotations into memory...
Done (t=1.27s)
creating index...
index created!


In [4]:
def get_coco_data(categories, coco):
    """Collect the IDs of all images that belong to any of the requested categories.

    Args:
        categories: category names, either as one string separated by commas (e.g. "cat, dog") or as a
            list/tuple of names. An empty string (or empty list) selects all categories.
        coco: the COCO object loaded from the instances annotation file.

    Returns:
        A list of unique image IDs, in the order in which they were first encountered.
    """
    # the string is split into individual names before it is handed to getCatIds: a raw string would be
    # treated as an iterable and category names would be matched by substring, so asking for "hot dog"
    # would also select "dog"
    if isinstance(categories, str):
        category_names = [name.strip() for name in categories.split(',') if name.strip()]
    else:
        category_names = list(categories)

    # retrieving a list of category IDs for the requested names (all categories if the list is empty)
    all_cats = coco.getCatIds(catNms=category_names)

    # creating a list of the image IDs of all the images that belong to the categories defined above
    all_img_ids = []
    for id_nr in all_cats:
        all_img_ids += coco.getImgIds(catIds=id_nr)

    # removing duplicates while keeping the order (images can belong to multiple categories)
    return list(dict.fromkeys(all_img_ids))

In [5]:
def get_coco_with_anns(categories, coco, coco_caps):
    """Gather file name, captions and category IDs for every image of the selected categories.

    Args:
        categories: category selection, passed on unchanged to get_coco_data.
        coco: the main COCO object (instance annotations).
        coco_caps: the COCO object holding the captions.

    Returns:
        A dictionary mapping each image ID to
        {'file_name': str, 'captions': list of caption strings, 'categories': list of unique category IDs}.
    """
    # accessing the captions and annotations inspired by
    # https://leimao.github.io/blog/Inspecting-COCO-Dataset-Using-COCO-API/
    coco_with_anns = {}

    for img_id in get_coco_data(categories, coco):
        # image info: only the file name is needed
        image_record = coco.loadImgs(img_id)[0]

        # all captions written for this image
        caption_anns = coco_caps.loadAnns(coco_caps.getAnnIds(imgIds=[img_id], iscrowd=None))
        image_captions = [ann['caption'] for ann in caption_anns]

        # category IDs of all annotated objects, duplicates dropped, first-seen order kept
        instance_anns = coco.loadAnns(coco.getAnnIds(imgIds=[img_id], iscrowd=None))
        image_categories = list(dict.fromkeys(ann['category_id'] for ann in instance_anns))

        coco_with_anns[img_id] = {
            'file_name': image_record['file_name'],
            'captions': image_captions,
            'categories': image_categories,
        }

    return coco_with_anns
    

In [6]:
def create_samples(categories, coco, coco_caps, smart=False):
    """Create matching (class 1) and mismatched (class 0) image-caption samples.

    Args:
        categories: category selection, passed on to get_coco_with_anns.
        coco: the main COCO object (instance annotations).
        coco_caps: the COCO object holding the captions.
        smart: if False, a shuffled caption counts as a negative sample whenever it comes from a different
            image; otherwise it additionally must not share any category with the image.

    Returns:
        A tuple (false_samples, trimmed_true_samples). Every sample is (file_name, caption, class); the true
        samples are cut down to the number of false samples. The sizes vary between calls, as they depend
        on the random shuffling.
    """
    # every image is paired with this many of the shuffled captions
    captions_per_image = 5

    coco_with_anns = get_coco_with_anns(categories, coco, coco_caps)

    # one pass builds the positive samples as well as the ID/category-tagged lists of images and captions,
    # so that accidental good matches can be identified after the captions have been shuffled
    true_samples = []
    names = []
    captions = []
    for key, info in coco_with_anns.items():
        image_categories = set(info['categories'])
        names.append((key, image_categories, info['file_name']))
        for caption in info['captions']:
            true_samples.append((info['file_name'], caption, 1))
            captions.append((key, image_categories, caption))

    # shuffling the captions so that they are randomly assigned to the images
    random.shuffle(captions)

    # kept as an equality test against False so that non-bool arguments behave exactly as before
    check_categories = not (smart == False)

    false_samples = []
    for i, (img_id, img_categories, file_name) in enumerate(names):
        # image 0 gets captions 0 through 4, image 1 gets 5 through 9, etc.; slicing (rather than indexing)
        # simply yields fewer captions if the caption list runs out instead of raising an IndexError
        start = i * captions_per_image
        for cap_img_id, cap_categories, caption in captions[start:start + captions_per_image]:
            if cap_img_id == img_id:
                # the caption landed on its own image again
                continue
            if check_categories and not img_categories.isdisjoint(cap_categories):
                # smart mode: image and caption share at least one category
                continue
            false_samples.append((file_name, caption, 0))

    # shuffling both lists
    random.shuffle(true_samples)
    random.shuffle(false_samples)

    # both lists should have equal length; with the smart version the true samples may need to be trimmed
    # by a lot, with the non-smart version the difference is not that large
    trimmed_true_samples = true_samples[:len(false_samples)]

    return (false_samples, trimmed_true_samples)

In [31]:
# generating and saving the samples with smart=True as this has proven to work better for the model
smart_samples = create_samples('', coco, coco_caps, smart=True)
# the context manager closes the file as soon as the dump is complete
with open('all_smart_samples.pickle', 'wb') as samples_file:
    pickle.dump(smart_samples, samples_file)

In [5]:
# loading in the samples (pickle can execute arbitrary code, so only load files created by this notebook)
with open('all_smart_samples.pickle', 'rb') as samples_file:
    smart_samples = pickle.load(samples_file)

In [7]:
def samples_splits(true_samples, false_samples, max_size, train_split=0.7, test_split=0.15, val_split=0.15):
    """Mix positive and negative samples, load and scale their images, and split them.

    Args:
        true_samples: list of (file_name, caption, class) samples of the positive class.
        false_samples: list of (file_name, caption, class) samples of the negative class.
        max_size: total number of samples to keep across all three splits.
        train_split, test_split, val_split: proportions of the splits; they must add up to 1. The test and
            validation sizes are rounded up, the training split receives whatever is left.

    Returns:
        A tuple (train_samples, val_samples, test_samples) of lists of (scaled image array, caption, class)
        samples. If the proportions are invalid or there are not enough samples, a message is printed and
        None is returned.
    """
    # avoiding potential errors; the sum is compared with a tolerance because float addition is not exact
    # (0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999, for example)
    if not math.isclose(train_split + test_split + val_split, 1.0):
        print('Invalid data split proportions!')
        return
    if (len(true_samples) + len(false_samples)) < max_size:
        print('Not enough samples to create splits of the required max size!')
        return

    # combining the positive and negative samples and shuffling them
    all_samples = true_samples + false_samples
    random.shuffle(all_samples)
    # selecting the total requested size of the data
    selected_samples = all_samples[:max_size]

    # retrieving the actual image data, inspired by Demo 1
    # creating independent lists of images, captions, and classes; order is maintained so they can be
    # reassembled into samples
    images = []
    captions = []
    classes = []
    for sample in selected_samples:
        captions.append(sample[1])
        classes.append(sample[2])
        # every image is resized to 100x100 and converted to RGB; the context manager closes the image file
        with Image.open("{}/{}".format(datadir, sample[0])) as image_file:
            image = image_file.resize((100,100)).convert('RGB')
        images.append(np.array(image))

    # scaling the images: each image is flattened to 100*100*3 = 30000 features for the scaler and
    # reshaped back afterwards
    # NOTE(review): the scaler is fitted on all selected samples before they are split, so the statistics
    # of the validation and test images influence the scaling of the training images
    image_array = np.array(images)
    image_array_scaled = StandardScaler().fit_transform(image_array.reshape(len(image_array),30000)).reshape(len(image_array), 100, 100, 3)

    # reassembling the samples
    image_samples = [(image_array_scaled[i], captions[i], classes[i]) for i in range(0,max_size)]

    # creating the splits according to the given proportions
    test_size = math.ceil(max_size * test_split)
    val_size = math.ceil(max_size * val_split)
    test_samples = image_samples[:test_size]
    val_samples = image_samples[test_size:(test_size + val_size)]
    train_samples = image_samples[(test_size + val_size):]

    return train_samples, val_samples, test_samples
    

In [8]:
class CaptionsDataset(Dataset):
    """Dataset over (image, caption, class) samples, used to encode and batch the data later on.

    This is inspired by how we did dataloaders for the LT2213 V22 Komputationell semantik class; the
    assignments there were group work and one of my group members, Sarab Youssef, is taking this course,
    which means it is likely that her approach to this will be similar; if it is of any consolation, the
    dataloading for BERT was my section of that assignment :)
    """

    def __init__(self, data):
        """Store the list of sample tuples as a Pandas dataframe (columns 0, 1, 2)."""
        self.data = pd.DataFrame(data)

    def __len__(self):
        """Return the number of samples, i.e. the number of rows of the dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position idx as a dictionary with keys 'image', 'caption', 'cls'."""
        row = self.data.iloc[idx]
        # column 0 is the image, column 1 the caption (always returned as a string), column 2 the class
        return dict(image=row[0], caption=str(row[1]), cls=row[2])

In [9]:
def save_datasets(train_samples, test_samples, val_samples, file_name):
    """Wrap the three sample splits in CaptionsDataset objects and pickle them to file_name.

    The datasets are stored as one tuple in the order (train, test, val). Note that the parameter order
    here is train, test, val.

    In this case, I believe, the CaptionsDataset class itself does have to be re-run for this to work
    properly.
    """
    datasets = (
        CaptionsDataset(train_samples),
        CaptionsDataset(test_samples),
        CaptionsDataset(val_samples),
    )

    # the context manager makes sure the file is flushed and closed once the dump is complete
    with open(file_name, 'wb') as datasets_file:
        pickle.dump(datasets, datasets_file)
    
def load_datasets(file_name):
    """Load previously saved datasets from file_name.

    Returns:
        The tuple (train_dataset, test_dataset, val_dataset) that was stored in the pickle file.
    """
    # NOTE: pickle can execute arbitrary code while loading, so only open files created by this notebook
    with open(file_name, 'rb') as datasets_file:
        train_dataset, test_dataset, val_dataset = pickle.load(datasets_file)

    return train_dataset, test_dataset, val_dataset

In [None]:
# https://stats.stackexchange.com/questions/226672/how-few-training-examples-is-too-few-when-training-a-neural-network
# saving datasets of multiple different sizes; I have done 1k and 2k, with 1k being much faster to save, load in, and train,
# but 2k yielding better results; I tried generating larger sizes, like 10k or 20k, but these were getting insanely large and
# too cumbersome to save and load, also given the current state of the server; this is, however, definitely a bottleneck.

In [9]:
# create_samples returns (false_samples, true_samples) while samples_splits expects the true samples first,
# hence index 1 before index 0; 2000 samples are kept in total and split with the default proportions
smart_train_samples2k, smart_val_samples2k, smart_test_samples2k = samples_splits(smart_samples[1], smart_samples[0], 2000)

In [10]:
# save_datasets takes the splits in (train, test, val) order, which differs from the (train, val, test) order
# returned by samples_splits
save_datasets(smart_train_samples2k, smart_test_samples2k, smart_val_samples2k, 'smart_datasets2k.pickle')

In [10]:
# loading the pretrained BERT model with the hidden states of all layers included in its output;
# get_embeddings below reads this module-level `bert_model`
bert_model = BertModel.from_pretrained(model_name, return_dict=True, output_hidden_states=True)
# switching to eval mode, since BERT is only used to produce embeddings here
bert_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
def get_embeddings(input_ids, attention_mask, bert_device):
    """Retrieve word embeddings for a tokenized batch from BERT.

    Requires BERT to be initialized as the global bert_model beforehand, with output_hidden_states=True and
    put in eval mode (see above).
    Inspired by https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings

    Args:
        input_ids: token IDs of the batch.
        attention_mask: attention mask of the batch.
        bert_device: device that bert_model is moved to before it is called.

    Returns:
        The second-to-last hidden layer, with the shape of batch * max_len of the sentence * 768 (BERT
        embedding size).
    """
    bert_model.to(bert_device)

    # no gradients are tracked, BERT is only used as a feature extractor
    with torch.no_grad():
        bert_output = bert_model(input_ids, attention_mask)

    # hidden states for every layer are stored at position 2 of the output; there are 13 elements, meaning
    # the initial embeddings and the hidden states from 12 layers
    all_hidden_states = bert_output[2]

    # according to the link above, there are many approaches for what elements of these to use as word
    # embeddings; the second-to-last hidden layer (index 11) is used here
    return all_hidden_states[11]

In [12]:
class CaptionsCollate():
    """Custom collate class that turns a list of dataset items into model-ready batch tensors.

    Also based on the assignments from the course mentioned in the CaptionsDataset class comments.
    """

    def __init__(self, tokenizer, datadir, device, bert_device, max_token_len=16):
        """Store the tokenizer, the maximum length of the "sentence" and the two devices.

        datadir is accepted for the sake of the call signature but is not stored.
        """
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.device = device
        self.bert_device = bert_device

    def __call__(self, batch):
        """Collate one batch.

        Args:
            batch: list of dictionaries with the keys 'image' (numpy array), 'caption' and 'cls'.

        Returns:
            A tuple (images, bert_embeddings, classes) of tensors moved to self.device.
        """
        # every image goes from numpy to a torch tensor; every class is wrapped in its own one-element
        # tensor (which is possibly unnecessary); the captions stay strings for the tokenizer
        images = [torch.from_numpy(element['image']) for element in batch]
        captions = [element['caption'] for element in batch]
        classes = [torch.tensor([element['cls']]) for element in batch]

        # calling the BERT tokenizer on the captions
        tokens = self.tokenizer(
            captions,
            add_special_tokens=True,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_token_len,
            padding=True,
            return_attention_mask=True,
            is_split_into_words=False,
        )

        # the tokenizer output has to be on the same device as BERT before the embeddings are computed
        input_ids = tokens['input_ids'].to(self.bert_device)
        attention_masks = tokens['attention_mask'].to(self.bert_device)

        # the network is fed BERT embeddings, which are moved to the main device once computed
        bert_embeddings = get_embeddings(input_ids, attention_masks, self.bert_device).to(self.device)

        # classes and images are stacked since they have to be tensors, not lists
        stacked_classes = torch.stack(classes).to(self.device)
        stacked_images = torch.stack(images).to(self.device)

        return stacked_images, bert_embeddings, stacked_classes

In [13]:
def captions_dataloader(dataset, tokenizer, datadir, device, bert_device, batch_size=32, shuffle=True): 
    """Build a DataLoader over dataset that collates its batches with the custom CaptionsCollate class."""
    collate = CaptionsCollate(tokenizer, datadir, device, bert_device)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)

In [14]:
def save_dataloaders(dataset, tokenizer, datadir, device, bert_device, epochs, file_name, batch_size=32, shuffle=True):
    """Generate one dataloader per epoch for later training and pickle the list to file_name.

    Args:
        dataset, tokenizer, datadir, device, bert_device, batch_size, shuffle: passed on to
            captions_dataloader for every loader.
        epochs: number of dataloaders to create.
        file_name: path of the pickle file to write.
    """
    loaders = [
        captions_dataloader(dataset, tokenizer, datadir, device, bert_device, batch_size=batch_size, shuffle=shuffle)
        for _ in range(epochs)
    ]

    # the context manager makes sure the file is flushed and closed once the dump is complete
    with open(file_name, 'wb') as loaders_file:
        pickle.dump(loaders, loaders_file)
    
def load_dataloaders(file_name):
    """Load a previously saved list of dataloaders from file_name and return it."""
    # NOTE: pickle can execute arbitrary code while loading, so only open files created by this notebook
    with open(file_name, 'rb') as loaders_file:
        loaders = pickle.load(loaders_file)

    return loaders

In [16]:
# loading in the datasets
# the datasets come back in the (train, test, val) order in which save_datasets pickled them
smart_train_dataset2k, smart_test_dataset2k, smart_val_dataset2k = load_datasets('smart_datasets2k.pickle')

In [17]:
# saving the dataloaders (20 epochs; 10 was not performing good at all for 2k, but was decent for 1k)
# positional arguments after datadir: device='cpu' (where the collated batches end up), bert_device='cuda:3'
# (where BERT computes the embeddings), epochs=20 (one loader is generated per epoch)
save_dataloaders(smart_train_dataset2k, tokenizer, datadir, 'cpu', 'cuda:3', 20, 'smart_train_loaders2k.pickle', batch_size=8)

In [18]:
# shuffle is turned off for the test dataloader as then we can explore the results better
# a single loader (epochs=1) is enough for testing; same devices and batch size as for the training loaders
save_dataloaders(smart_test_dataset2k, tokenizer, datadir, 'cpu', 'cuda:3', 1, 'smart_test_loaders2k.pickle', batch_size=8, shuffle=False)

## 2. MODELLING  
This whole section can be found in the modelA2.py file, as requested in the assignment description. I commented out the code here but still kept it for quicker reference, if needed. The commented out code may not be up to date so **please consult the original file for the most up-to-date code**.

`class ImageCaptionClassifier(nn.Module):
    # class of the model itself; while initializing it needs to be told the desired size of the hidden layer as well as the
    # dimensions of the RGB images it will be fed
    def __init__(self, hidden_size, height, width):
        super(ImageCaptionClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.height = height
        self.width = width`
        
        # defining the three components of the model: layers that will process the image, a layer that will process text, and
        # a layer to process the combined outputs of the previous two
        
        # the image processing layers are mostly taken from Demo 1
        self.layers_image = nn.Sequential(
            nn.Conv2d(3, 3, 3, padding=2), 
            nn.BatchNorm2d(3),
            nn.ReLU(),  # I decided to keep these (this and MaxPool2d) in as they will reduce the size of the output
            nn.MaxPool2d(2, 2),
            nn.Tanh()
        )
        
        # the text processing layer is a bidirectional LSTM; the input size here is the size of BERT embeddings, and the 
        # hidden size is tailored to, in the end, be comparable in size to the output from the image layers
        self.layers_text = nn.LSTM(input_size=768, hidden_size=4000, num_layers=1, batch_first=True, bidirectional=True)
        
        # the classification layers are linear layers of decreasing size with different activation functions between them,
        # concluding with reducing the output to 1 and feeding it to a Sigmoid function for a "probability" spread of 0 to 1
        self.layers_classification = nn.Sequential(
            nn.Linear(int(((self.height/2+1)*(self.width/2+1)*3)+8000), self.hidden_size),
            nn.Dropout(0.1),
            nn.LeakyReLU(),
            nn.Linear(self.hidden_size, int(self.hidden_size / 2)),
            nn.LeakyReLU(),
            nn.Linear(int(self.hidden_size / 2), int(self.hidden_size / 4)),
            nn.Tanh(),
            nn.Linear(int(self.hidden_size / 4), 1),
            nn.Sigmoid()            
        )
        
    def forward(self, images, bert_embeddings, device):
        # for calling the model we need to input the images and BERT embeddings from the batch
        # since I have had issues with tensors being 64bit FloatTensors, and not 32bit ones, this part is essential in making
        # it work and it relies on whether it runs on CPU or one of the GPUs
        # the images batches are also restructured to fit what the convolutional layer expects, as per Demo 1
        #if self.device == 'cpu':
        converted_images = images.permute(0, 3, 1, 2).type(torch.FloatTensor).to(device)  
        converted_embeddings = bert_embeddings.type(torch.FloatTensor).to(device)
        #else:
            #print('aha!')
            #converted_images = images.permute(0, 3, 1, 2).type(torch.cuda.FloatTensor).to(self.device)  
            #converted_embeddings = bert_embeddings.type(torch.cuda.FloatTensor).to(self.device)
        
        # the images are fed through the image layers and then reshaped to have fewer dimensions
        processed_images = self.layers_image(converted_images)
        flattened_images = processed_images.reshape(-1, int((self.height/2+1)*(self.width/2+1)*3))
        
        # the captions are fed through the LSTM, and the final hidden states from both directions are combined to form a 
        # sentence representation
        timestep_representation, (final_hidden, final_cell) = self.layers_text(converted_embeddings)
        processed_embeddings = torch.cat((final_hidden[0, :, :], final_hidden[1, :, :]), dim=1)
        
        # data from both of the above is combined and fed to the classification layer
        combined_data = torch.cat((flattened_images, processed_embeddings), dim=1)
        output = self.layers_classification(combined_data)
        
        return output`

`def train(loaders, device, hidden_size=7000, height=100, width=100, model=None):
    # this function trains a given model (or, by default, an image caption classifier model)
    if not model:
        m = ImageCaptionClassifier(hidden_size, height, width).to(device)
    else:
        m = model.to(device)`
    
    # BCELoss is used as this is a binary classification problem
    loss = nn.BCELoss()
    optimizer = optim.Adam(m.parameters(), lr=0.00005)
    
    for i in range(0, len(loaders)):
        # len(loaders) is the number of epochs that we set while generating the loaders; it'd be easier to set the number of
        # epochs had this function not had to be in a separate Python file, where I cannot make it dependent on so much
        # of the data processing steps; I would just call the dataloader here separately for every epoch. one of the problems
        # is that the dataloader has to use BERT for embeddings too.
        loader = loaders[i]
        tot_loss = 0
        for j, batch in enumerate(loader):
            # iterating over the loader batch by batch,
            o = m(batch[0], batch[1], device)
            l = loss(o, batch[2].type(torch.FloatTensor).to(device))
            tot_loss += l
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            if j%25==0:
                print('\tStill training...')
                
        print("Total loss in epoch {} is {}.".format(i, tot_loss))

    return m


`def save_model(model, file_name):
    # a small function intended to be used to save a trained model
    pickle.dump(model, open(file_name, 'wb'))`
        
`def load_model(file_name):
    # a small function intended to be used to load a trained model
    model = pickle.load(open(file_name, 'rb'))`
    
    return model

`def test_model(model, loaders, device):
    # this function puts a given model in eval mode and then runs it over "one epoch" of the test data, saving, as lists,
    # the predictions and true classes.
    model.to(device)
    model.eval()  # setting model in eval mode
    all_predictions = []
    all_classes = []
    # this with-statement is recommended for further making sure that the model is not learning from the test data 
    with torch.no_grad():
        for i in range(0, len(loaders)):
            loader = loaders[i]
            for j, batch in enumerate(loader):
                a = batch[0].type(torch.FloatTensor).to(device)
                b = batch[1].type(torch.FloatTensor).to(device)`
                
                o = model(a, b, device)
                predictions = torch.squeeze(o).tolist()  # making sure that we get predictions in the correct format
                # encoding the predictions to reflect not probabilities, but classes; anything above 0.5 probability is
                # considered to be a matching image-caption pair (class 1), and below to not be considered that (class 0).
                for i in range(0, len(predictions)):
                    if predictions[i] > 0.5:
                        predictions[i] = 1
                    else:
                        predictions[i] = 0
                all_predictions += predictions
                all_classes += torch.squeeze(batch[2]).tolist()
    
    print('Testing complete!')
        
    return all_predictions, all_classes


`def measures(all_predictions, all_classes):
    # this function goes over the output of the testing function and provides some basic evaluation measures, such as
    # accuracy, recall, precision, and f1. 
    print('The following measures have been recorded for this model:')
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    # counting up the true and false positives and negatives
    for i in range(0, len(all_predictions)):
        if all_predictions[i] == all_classes[i]:
            if all_predictions[i] == 1:
                tp += 1
            else:  # if == 0
                tn += 1
        else:  # if not the same
            if all_predictions[i] == 1:
                fp += 1
            else:
                fn += 1`
    
    # accuracy can always be calculated
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    print(f'\tAccuracy = {accuracy}')
    
    # the following measures cannot always be calculated, so certain conditions are checked for first
    if tp == 0 and fn == 0:
        print('No true positives or false negatives have been recorded, impossible to calculate recall!')
    else:
        recall = tp / (tp + fn)
        print(f'\tRecall = {recall}')
    if tp == 0 and fp == 0:
        print('No true or false positives have been recorded, impossible to calculate precision!')
    else:
        precision = tp / (tp + fp)
        print(f'\tPrecision = {precision}')
    
    if (tp == 0 and fn == 0) or (tp == 0 and fp == 0):
        print('Impossible to calculate f1!')
    else:
        f1 = (2 * recall * precision) / (recall + precision)
        print(f'\tF1 = {f1}')  


## 3. TRAINING AND TESTING  

In [20]:
# retrieving the dataloaders
# this is the list of training dataloaders (one per epoch) that save_dataloaders pickled earlier
smart_dataloaders2k = load_dataloaders('smart_train_loaders2k.pickle')

In [31]:
# training the model
# the training loop lives in modelA2.py; the model is trained on 'cuda:0'
# NOTE(review): presumably one epoch is run per loader in the list - confirm against modelA2.py
model = ma2.train(smart_dataloaders2k, 'cuda:0')

	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
Total loss in epoch 0 is 123.02696990966797.
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
Total loss in epoch 1 is 121.75601196289062.
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
Total loss in epoch 2 is 120.65149688720703.
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
Total loss in epoch 3 is 118.6038589477539.
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
Total loss in epoch 4 is 114.04524993896484.
	Still training...
	Still training...
	Still training...
	Still training...
	Still training...
	Still training.

In [34]:
# saving the model
# writing the trained model to 'model2k.pickle' so that the evaluation below can start from the saved file
ma2.save_model(model, 'model2k.pickle')

## 4. EVALUATION AND ERROR ANALYSIS  

In [15]:
# loading the trained model saved in section 3
model = ma2.load_model('model2k.pickle')

In [23]:
# loading the test dataloaders (a list with a single, unshuffled loader, as saved in section 1)
test_loaders = load_dataloaders('smart_test_loaders2k.pickle')

In [32]:
# running the model over the test data on the CPU
# NOTE(review): presumably both results are flat lists of 0/1 classes in loader order - confirm against modelA2.py
all_predictions, all_classes = ma2.test_model(model, test_loaders, 'cpu:0')

Testing complete!


In [33]:
# printing the evaluation measures for the test predictions
ma2.measures(all_predictions, all_classes)

The following measures have been recorded for this model:
	Accuracy = 0.5433333333333333
	Recall = 0.8115942028985508
	Precision = 0.5022421524663677
	F1 = 0.6204986149584487


In [35]:
# putting predictions and true classes side by side; the row index is the position of the sample in the test run
model_df = pd.DataFrame({'predictions': all_predictions, 'classes': all_classes})

In [49]:
# collecting the true positives: samples predicted as class 1 whose true class in the test dataset is 1 as well
# (the test loader was not shuffled, so the index of a prediction is also the row of the sample in the dataset)
tp_preds = []
for ind in model_df.index:
    if model_df['predictions'][ind] == 1 and smart_test_dataset2k.data[2][ind] == 1:
        pred = model_df['predictions'][ind]
        cap = smart_test_dataset2k.data[1][ind]
        true = smart_test_dataset2k.data[2][ind]
        tp_preds.append([cap, pred, true])

# the column names are passed to the constructor so that an empty result still gives a valid (empty) table
# instead of raising a length-mismatch error
tp_df = pd.DataFrame(tp_preds, columns=["caption", "predicted class", "true class"])

pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', None)
display(tp_df)

Unnamed: 0,caption,predicted class,true class
0,A chocolate cupcake with a smiling giraffe face.,1,1
1,Two people in a field playing with a white frisbee.,1,1
2,A lady looks blurry while standing in her kitchen.,1,1
3,a photo of a fire hydrant on a sidewalk next to a road,1,1
4,A boy outside doing tricks with his skateboard.,1,1
5,a person standing near a desk with a laptop and a printer,1,1
6,People watching a skater doing tricks at the park.,1,1
7,A kitchen scene with focus on the oven.,1,1
8,The boy is skate boarding up the ramp.,1,1
9,The two traffic lights above the street are green.,1,1


In [51]:
# False positives: captions the model labelled 1 whose true class is 0.
# NOTE(review): rows of model_df are matched to smart_test_dataset2k by index,
# which assumes the test loaders kept the dataset order -- confirm.
fp_preds = []
for ind in model_df.index:
    # `and` short-circuits, so the dataset is only consulted for rows predicted as 1
    if model_df['predictions'][ind] == 1 and smart_test_dataset2k.data[2][ind] == 0:
        pred = model_df['predictions'][ind]
        cap = smart_test_dataset2k.data[1][ind]
        true = smart_test_dataset2k.data[2][ind]
        fp_preds.append([cap, pred, true])

# Naming the columns in the constructor (rather than assigning .columns afterwards)
# keeps this cell from raising a length-mismatch ValueError when no row matched.
fp_df = pd.DataFrame(fp_preds, columns=["caption", "predicted class", "true class"])

# Show all rows (up to 250) and the captions at full width
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', None)
display(fp_df)

Unnamed: 0,caption,predicted class,true class
0,The group of children are posing around a baby elephant.,1,0
1,a blue plate that has some food on it,1,0
2,A LOT OF VEGETABLES AND EGGS ARE OUT ON THE TABLE,1,0
3,a man on a surfboard in a river near a couple of trees and branches,1,0
4,An eighteen wheeler truck carrying logs on its trailer.,1,0
5,three red chairs sitting around a table with a checker board on the top of it,1,0
6,A hand holding a smart phone with 20 new messages.,1,0
7,A city street with lots of blurry traffic on top of it.,1,0
8,A woman preparing to hit a tennis ball while a man watches.,1,0
9,A man surfing along a small wave in the ocean.,1,0


In [52]:
# True negatives: captions the model labelled 0 whose true class is also 0.
# NOTE(review): rows of model_df are matched to smart_test_dataset2k by index,
# which assumes the test loaders kept the dataset order -- confirm.
tn_preds = []
for ind in model_df.index:
    # `and` short-circuits, so the dataset is only consulted for rows predicted as 0
    if model_df['predictions'][ind] == 0 and smart_test_dataset2k.data[2][ind] == 0:
        pred = model_df['predictions'][ind]
        cap = smart_test_dataset2k.data[1][ind]
        true = smart_test_dataset2k.data[2][ind]
        tn_preds.append([cap, pred, true])

# Naming the columns in the constructor (rather than assigning .columns afterwards)
# keeps this cell from raising a length-mismatch ValueError when no row matched.
tn_df = pd.DataFrame(tn_preds, columns=["caption", "predicted class", "true class"])

# Show all rows (up to 250) and the captions at full width
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', None)
display(tn_df)

Unnamed: 0,caption,predicted class,true class
0,A teddy bear placed on a table with snow,0,0
1,A colorful blue bird perched on top of a persons arms.,0,0
2,A large clock tower stands tall with a gold lined clock.,0,0
3,"Table of food including potatoes, apples, squash, snap peas, and tomatoes",0,0
4,A bed that is next to a desk.,0,0
5,A small kitten is lying on the seat of a large motorcycle.,0,0
6,A girl is sitting down eating a bread stick and pizza.,0,0
7,A giraffe stands beside a tree trunk on a grassy plain.,0,0
8,"A bathroom with a sink, toilet and large mirror.",0,0
9,A lone skier on a very big snow covered hill.,0,0


In [53]:
# False negatives: captions the model labelled 0 whose true class is 1.
# NOTE(review): rows of model_df are matched to smart_test_dataset2k by index,
# which assumes the test loaders kept the dataset order -- confirm.
fn_preds = []
for ind in model_df.index:
    # `and` short-circuits, so the dataset is only consulted for rows predicted as 0
    if model_df['predictions'][ind] == 0 and smart_test_dataset2k.data[2][ind] == 1:
        pred = model_df['predictions'][ind]
        cap = smart_test_dataset2k.data[1][ind]
        true = smart_test_dataset2k.data[2][ind]
        fn_preds.append([cap, pred, true])

# Naming the columns in the constructor (rather than assigning .columns afterwards)
# keeps this cell from raising a length-mismatch ValueError when no row matched.
fn_df = pd.DataFrame(fn_preds, columns=["caption", "predicted class", "true class"])

# Show all rows (up to 250) and the captions at full width
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', None)
display(fn_df)

Unnamed: 0,caption,predicted class,true class
0,The woman is holding the dog between her legs.,0,1
1,A white plate with an art deco design around the edge is holding plenty of food and a fork.,0,1
2,a man is swinging a baseball bat outside,0,1
3,A man wearing a tie stares across a the room.,0,1
4,A herd of giraffe walking across a lush green field.,0,1
5,a street light on a street next to a tree lined median.,0,1
6,Barber shop sign next to a red and black fire hydrant.,0,1
7,A glass bowl with an upside bundle of broccoli.,0,1
8,Zebra standing next to each other grazing on a lush green field.,0,1
9,Seagulls on old pier struts poking out of the water,0,1


## BONUS: NEGATIVE SAMPLING  

For this, please consult the `create_samples()` function and the Markdown file accompanying this submission.

## EXTRA PART: PIPELINE TESTING  
Since the trained model is too large to share on GitHub, here is a quick way for you to test the pipeline with a small amount of data, just to see that it works.  
#### Preparatory steps:  
+ Make sure that the COCO objects and the BERT model from Part 1 have been loaded.
+ Make sure that all the function definitions in the earlier sections have been run.
+ Technically you can also just run the notebook above, but that includes larger data chunks (2k samples, 20 epochs).

In [15]:
# Drawing the samples from the COCO objects (smart=False is also possible)
small_samples = create_samples('', coco, coco_caps, smart=True)

# Splitting into train/val/test with a total size of 100, so that everything runs fast
small_train_samples, small_val_samples, small_test_samples = samples_splits(
    small_samples[1], small_samples[0], 100
)

# Wrapping the train and test splits in datasets
small_train_dataset = CaptionsDataset(small_train_samples)
small_test_dataset = CaptionsDataset(small_test_samples)

# Building the dataloaders (the function would also have saved them; not needed here).
# The first device ('cpu') may be changed; the second one must stay as it is,
# because that is the device BERT is on.
# Training: one loader per epoch, 3 epochs in total.
small_dataloaders = [
    captions_dataloader(small_train_dataset, tokenizer, datadir, 'cpu', 'cuda:3', batch_size=8, shuffle=True)
    for _ in range(3)
]
# Testing: a single, unshuffled pass over the test set.
small_test_loaders = [
    captions_dataloader(small_test_dataset, tokenizer, datadir, 'cpu', 'cuda:3', batch_size=8, shuffle=False)
    for _ in range(1)
]

In [17]:
# training the model (feel free to change the device)
# small_dataloaders holds one loader per epoch, so this trains for 3 epochs
# (see the per-epoch losses in the output below).
small_model = ma2.train(small_dataloaders, 'cuda:1')

	Still training...
Total loss in epoch 0 is 6.612432956695557.
	Still training...
Total loss in epoch 1 is 5.892775058746338.
	Still training...
Total loss in epoch 2 is 5.499391078948975.


In [21]:
# part of the testing and evaluation (again, feel free to choose a different device)
# NOTE: this rebinds all_predictions / all_classes, replacing the values from the
# 2k-model evaluation in Section 4.
all_predictions, all_classes = ma2.test_model(small_model, small_test_loaders, 'cpu')
# reporting accuracy, recall, precision and F1 for the small model (see output below)
ma2.measures(all_predictions, all_classes)

Testing complete!
The following measures have been recorded for this model:
	Accuracy = 0.4666666666666667
	Recall = 1.0
	Precision = 0.4666666666666667
	F1 = 0.6363636363636364
