# Baseline Description Generation Model

In this notebook is the implementation of the baseline model for a description generation model. This model is based on an encoder-decoder model, where the encoder is a CNN and the decoder is a LSTM-RNN language model. 

The encoder will be pretrained on a dataset with a relative small number of labels, for the classification task. After the pretraining, both the encoder and decoder are jointly trained for the task of generating descriptions. 

For the current baseline a simple implementation will be used without any form of attention. 

## import packages

In [15]:
# loadbars to track the run/speed
from tqdm import tqdm_notebook, tnrange

# numpy for arrays/matrices/mathematical stuff
import numpy as np
np.set_printoptions(threshold=np.nan) #will print entire matrix without dots...

# nltk for tokenizer
from nltk.tokenize import wordpunct_tokenize   

# torch for the NN stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD

# torch tools for data processing
from torch.utils.data import DataLoader
import pycocotools #cocoAPI

# torchvision for the image dataset and image processing
from torchvision.datasets import CocoCaptions
from torchvision import transforms
from torchvision import models

#coco captions evaluation
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# packages for plotting
import matplotlib.pyplot as plt
import skimage.io as io
import seaborn
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

# additional stuff
import pickle
from collections import Counter
from collections import defaultdict
import os
from datetime import datetime

import json
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.3f')

SyntaxError: Missing parentheses in call to 'print'. Did you mean print('tokenization...')? (eval.py, line 30)

#### test if device has GPU

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
device

device(type='cpu')

#### Set up for Validation

In [None]:
# set up file names and pathes
annFile='~/annotations/captions_val2014.json'
subtypes=['results', 'evalImgs', 'eval']
# download Stanford models
!./get_stanford_models.sh

## Hyper Parameters

In the code many hyper parameters will be used. For instance the file locations, dimensions for the networks layers, etc.

In [4]:
learning_rate = 1e-1
max_epochs = 30
batch_size = 16

vocab_size = 30000
embedding_size = 2048

save_step = 100

PAD = '<PAD>'
START = '<START>'
END = '<END>'
UNK = '<UNK>'

crop_size = 224
transform = transforms.Compose([ 
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(), 
            transforms.ToTensor(), 
            transforms.Normalize((0.485, 0.456, 0.406), 
                                 (0.229, 0.224, 0.225))])

## Data Processing

First we load the data from the COCO captions Dataset

In [50]:
temp_data = CocoCaptions(root = '/home/victor/coco/images/train2014/',annFile = '/home/victor/coco/annotations/captions_train2014.json', transform=transforms.ToTensor())
train_data = CocoCaptions(root = '/home/victor/coco/images/train2014/',annFile = '/home/victor/coco/annotations/captions_train2014.json', transform=transform)
val_data = CocoCaptions(root = '/home/victor/coco/images/val2014/',annFile = '/home/victor/coco/annotations/captions_val2014.json', transform=transform)

loading annotations into memory...
Done (t=1.70s)
creating index...
index created!
loading annotations into memory...
Done (t=0.87s)
creating index...
index created!
loading annotations into memory...
Done (t=0.75s)
creating index...
index created!


A vocabulary class is created to keep track of words in the dataset.

In [60]:
class DataProcessor():
    def __init__(self, data, vocab_size, filename=None):
        self.vocab_size = vocab_size
        if filename == None:
            filename = 'vocab_'+str(self.vocab_size)+'.pkl'
        self.filename = filename
        if os.path.isfile(self.filename):
            self.vocab, self.vocab_size, self.vocab_weight = self.load()
        else: 
            self.vocab, self.vocab_size, self.vocab_weight = self.build_vocab(data)            
        self.w2i, self.i2w = self.build_dicts()
    
    def build_dicts(self):
        """
        creates lookup tables to find the index given the word 
        and the otherway around 
        """
        w2i = defaultdict(lambda: w2i[UNK])
        i2w = dict()
        for i,w in enumerate(self.vocab):
            i2w[i] = w
            w2i[w] = i
        return w2i, i2w
    
    def build_vocab(self, data): 
        """
        builds a vocabulary with the most occuring words, in addition to
        the UNK token at index 1 and PAD token at index 0. 
        START and END tokens are added to the vocabulary through the
        preprocessed sentences.
        with vocab size none, all existing words in the data are used
        """
        vocab = Counter()
        for item in tqdm_notebook(data):
            for sent in item[1]:
                s = wordpunct_tokenize(sent[0].lower())
                for w in s:
                    vocab[w] += 1

        vocab = [k for k,_ in vocab.most_common(self.vocab_size - 4)] #minus 4 because of the default tokens
        vocab_weights = list(range(len(vocab)))
        vocab = [PAD,UNK,START,END] + vocab # padding needs to be first, because of the math
        vocab_weights = [0.,1.,1.,1.] + vocab_weights
        return vocab,len(vocab), vocab_weights 
    
    def save(self):
        pickle.dump(self.vocab, open(self.filename, 'wb'))
        
    def load(self):
        vocab = pickle.load(open(self.filename, 'rb'))
        vocab_size = len(vocab)
        vocab_weights = [0.,1.,1.,1.] + list(range(len(vocab)))
        return vocab, vocab_size, vocab_weights
        

### function for preparing the batch in correct format

In [52]:
def transform_batch(batch, processor):
    """
    input batch: tuple with the images and a list of tuples of sentences. 
    the lenght of the list is the number of sentences for an image. 
    the length of the tuple is the batch size.
    
    output batch: a tensor with for each image one of the sentences randomly chosen. 
    the first dim is the batchsize. second dim is the sentence length. 
    the sentences are padded with zeros and prefixed and post fixed with the 
    START and END token. The words are transformed to indices. 
    """
    sent_lengths = []
    longest = -1
    images,captions = batch
    trans_images = None
    trans_captions = []
    repeat_size = images[0].size()
#     repeat_size[0] *= 5
    print(repeat_size)
    print(images.size())
    for sample_num in range(len(captions[0])):
        number_of_captions = len(captions)
        if trans_images is None:
            print(images[sample_num].size())
            trans_images = images[sample_num].repeat([5,3,224,224])
            print(trans_images.size())
        else:
            result = torch.cat([trans_images, images[sample_num].repeat(repeat_size)],0)
        for sentnum in range(number_of_captions):
            s = [START] + wordpunct_tokenize(captions[sentnum][sample_num].lower()) + [END]
            l = len(s)
            trans_captions.append(s)
            sent_lengths.append(l)
            if longest < l:
                longest = l

    final_images = np.array(trans_images)
    final_images = torch.from_numpy(final_images).type(torch.LongTensor).to(device)
    final_captions = np.zeros((len(trans_captions), longest))
    for i,s in enumerate(trans_captions):
        final_captions[i,:len(s)] = np.array([processor.w2i[w] for w in s])
    batch = torch.from_numpy(trans_batch).type(torch.LongTensor).to(device)
    sent_lengths = torch.FloatTensor(sent_lengths).to(device)
    return batch, sent_lengths

## Encoder

The encoder is a CNN which first is pretrained on the image classification task. Once pretrained, it will be used for encoding in an vector representation.

This can be extended to deviding the image into a grid, where each gridcell is encoded into a vector. During decoding, an attention can then be used over the grid vectors. 

In [53]:
class EncoderCNN(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embedding_size)
        self.batchnorm = nn.BatchNorm1d(embedding_size)
        
    def forward(self, x):
        # the resnet is pretrained, so turn of the gradient
        with torch.no_grad():
            out = self.resnet(x)
        out = out.reshape(out.size(0), -1)
        out = self.linear(out)
        out = self.batchnorm(out)
        return out

## Decoder

The decoder is a LSTM-RNN which for each timestep generates a single word. In the first step, the hidden layer is initialized with the encoded vector. 

In [54]:
class Decoder(nn.Module):
    def __init__(self, target_vocab_size, embedding_size):
        super().__init__()
        
        self.embedding_size = embedding_size
    
        self.target_embeddings = nn.Embedding(target_vocab_size, embedding_size)
        self.LSTM = nn.LSTM(embedding_size, embedding_size)
        self.logit_lin = nn.Linear(embedding_size, target_vocab_size) # out
        
    def forward(self, input_words, hidden_input):  
        # find the embedding of the correct word to be predicted
        emb = self.target_embeddings(input_words)
        # reshape to the correct order for the LSTM
        emb = emb.view(1,emb.size(0),self.embedding_size)
        # Put through the next LSTM step
        lstm_output, hidden = self.LSTM(emb, hidden_input)
        output = self.logit_lin(lstm_output)

        return output, hidden

## Encoder-Decoder

A single model is created to tie both the networks together

In [55]:
class CaptionModel(nn.Module):
    def __init__(self, 
                 embedding_size,
                 target_vocab_size,
                 device):
        
        super().__init__()
        self.target_vocab_size = target_vocab_size
        
        self.encoder = EncoderCNN(embedding_size).to(device)
        self.decoder = Decoder(target_vocab_size,embedding_size).to(device)

        self.loss = nn.CrossEntropyLoss(ignore_index=0, reduce=False).to(device)

    def forward(self,images, captions, caption_lengths):        
        # Encode
        h0 = self.encoder(images)
        
        #prepare decoder initial hidden state
        h0 = h0.unsqueeze(0)
        c0 = torch.zeros(h0.shape)
        hidden_state = (h0,c0)
        
        # Decode
        batch_size, max_sent_len = captions.shape
        out = torch.zeros((batch_size))  
        for w_idx in range(max_sent_len-1):
            prediction, hidden_state = self.decoder(captions[:,w_idx].view(-1,1), hidden_state)
            out += self.loss(prediction.squeeze(0), captions[:,w_idx+1])
        
        #normalize loss
        out = torch.mean(torch.div(out,caption_lengths))  # the loss is the average of losses, so divide over number of words in each sentence
        
        return out

## Setup Network

the model is initialised and the optimizer for the model is set. 

In [56]:
caption_model = CaptionModel(embedding_size, vocab_size, device)
caption_model.train(True) #probably not needed. better to be safe
opt = SGD(caption_model.parameters(), lr=learning_rate)

An dataprocessor is created. If a pickle with the given vocabsize already exists, it is loaded, otherwise a new one is created. 

In [61]:
# setup dataloaders with train and val data
temploader = DataLoader(dataset=temp_data, batch_size=1, shuffle=False, drop_last=False, num_workers=1)
processor = DataProcessor(data=temploader, vocab_size=vocab_size)
processor.save()
del(temploader)
del(temp_data)


Dataloader for processing the data for both the training and validation data are loaded. 

In [62]:
trainloader = DataLoader(dataset=train_data, batch_size=13, shuffle=True, drop_last=True, num_workers=4)
valloader = DataLoader(dataset=val_data, batch_size=1, shuffle=True, drop_last=True, num_workers=4)

In [63]:
for b in trainloader:
    break
transform_batch(b,processor)

torch.Size([3, 224, 224])
torch.Size([13, 3, 224, 224])
torch.Size([3, 224, 224])


RuntimeError: $ Torch: not enough memory: you tried to allocate 422GB. Buy new RAM! at /pytorch/aten/src/TH/THGeneral.c:218

## validation for after every epoch

In [None]:
def validation_predictions(model, prediction_file_name):
    # Use submodules for prediction
    encoder = model.encoder
    decoder = model.decoder

    predicted_sentences = []

    for s,ts in zip(source_processor_val.sentences,target_processor_val.sentences):
        mask = torch.from_numpy(np.ones([1,len(s)])).type(torch.FloatTensor)
        words_tokens = torch.LongTensor([source_processor.w2i[w] for w in s])
        words_tokens_tar = torch.LongTensor([target_processor.w2i[w] for w in ts])
        pos_tokens = torch.LongTensor([i for i in range(len(s))])
        len_s = torch.FloatTensor([len(s)])
        
        # Encode
        all_embs, mean_emb = encoder(words_tokens.view(1, len(s)),
                                     pos_tokens.view(1, len(s)), 
                                     len_s,
                                     mask)
        del(mask)
        del(len_s)
        del(pos_tokens)
        del(words_tokens)

        # Decode
        start_token = torch.LongTensor([target_processor.w2i[START]])
        predicted_words = []
        
        prediction = start_token.view(1,1)
        hidden_state_batch = mean_emb

        hidden_state_batch = hidden_state_batch.unsqueeze(0)
        for w_idx in range(target_processor.max_sentence_length):# loop until EOS is produced or a max is reached (max_sentence_length)
            prediction, hidden_state_batch,_ = decoder(prediction, # the previous prediction
                                                       hidden_state_batch,
                                                       all_embs,
                                                       run_gpu=run_gpu)

            index_predicted_word = np.argmax(prediction.detach().numpy(), axis=2)[0][0]
            predicted_word = target_processor.i2w[index_predicted_word]
            predicted_words.append(predicted_word)

            if predicted_word == END:
                break
            
            prediction = torch.LongTensor([index_predicted_word]).view(1,1)
        
        predicted_sentences.append(predicted_words)

        del(start_token)
        del(mean_emb)
        del(hidden_state_batch)
        del(all_embs)
        del(prediction)
    
    with open(prediction_file_name, 'w', encoding='utf-8') as f:
        for p in predicted_sentences:
            if p[-1] == END:
                p = p[:-1]
            f.write(' '.join(p) + '\n')
    
    # execute a powershell script for removing tokens
    prediction_restored = prediction_file_name[:-5] + '_restored.pred'
    _ = %ps get-content {prediction_file_name} | %{{$$_ -replace "(@@ )|(@@ ?$)",""}} | out-file {prediction_restored} -encoding Ascii
    perl_script = subprocess.Popen(["C:/Strawberry/perl/bin/perl.exe", 
                                    "./tools/mosesdecoder/scripts/generic/multi-bleu.perl", 
                                    "-lc", 
                                    "./data/val/val_tokenized_lowercased.en",
                                    "<",prediction_restored],
                                   shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    bleu_out,bleu_err = perl_script.communicate()
    run_gpu=True
    return bleu_out.decode("utf-8"), bleu_err.decode("utf-8")

## Train

In [107]:
losses = []

opt.zero_grad()

#loop over number of epochs
for it in tnrange(1):
    batch_losses = []
    #loop over all the training batches
    for i_batch, batch in tqdm_notebook(enumerate(trainloader), total=len(trainloader),leave=False):
        image, caption, caption_lengths = transform_batch(batch, processor)
        image = image.to(device)
        loss = caption_model(image, caption, caption_lengths)
        loss.backward()
        batch_losses.append(float(loss))
        opt.step()
    losses += batch_losses
    #create validation result file
    caption_model.train(False)
    #perform validation
    timestamp = datetime.now()
    prediction_file_name = 'val_epoch_{}_baseline_t_{:%m_%d_%H_%M}.pred'.format(it, timestamp)
    blue,_ = validation_predictions(encdec,prediction_file_name)
    print("Pseudo-Epoch {}:\t{}".format(pseudo_epoch,blue.strip()))
    # Dump trained models
    torch.save(encdec.state_dict(), 'encmean_model_it_{}_t_{:%m_%d_%H_%M}.torchsave'.format(it, timestamp))
    if run_gpu:
        encdec = encdec.cuda()
    encdec.train(True)
    
    ## perform the coco evaluation provided by COCO. 
    # create coco object and cocoRes object# creat 
    algName = 'baseline_epoch%d'%it
    [resFile, evalImgsFile, evalFile] = ['./results/captions_%s_%s_%s.json'%(dataType,algName,subtype) for subtype in subtypes]
    coco = COCO(annFile)
    cocoRes = coco.loadRes(resFile)
    

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=41391), HTML(value='')))

#### save the trained model

In [None]:
# # Dump trained models
# timestamp = datetime.now()
# last_model_file_name = 'encmean_model_last-it_{}_t_{:%m_%d_%H_%M}.torchsave'.format(it, timestamp)
# torch.save(encdec.state_dict(), last_model_file_name)

# print('Model saved in file: {}'.format(last_model_file_name))