##  CS6910: Fundamentals of Deep Learning
#### Assignment 4 - Team 3

S Renganathan, CH16B058	     
S Nithya, CH16B113		         
Vasistha Singhal, CH16B119

In [1]:
# # Run only the first time!
# # from pyunpack import Archive
# # Archive('Images.zip').extractall('Assignment4_Images')
# !pip install torchtext==0.6

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import tarfile
import pandas as pd
import os
import re
import pdb
import pickle
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from io import StringIO
from PIL import Image
from torchtext.vocab import GloVe
from torchtext.data import Field, get_tokenizer
import torchvision.models as models
from torchtext.data.metrics import bleu_score
import torchtext

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
class DatasetClass(Dataset):
    """Dataset with one item per (image, caption) pair.

    Every file name in `image_list` is expanded into five keys
    '<name>#0' ... '<name>#4'; each key is looked up in `captions` to get
    the caption text, and the part before the '#' is the image file name
    inside `folder`.

    Parameters
    ----------
    folder : str
        Directory prefix that is concatenated directly with the file name,
        so it must end with a path separator.
    image_list : sequence of str
        Image file names (without the '#<k>' suffix).
    captions : object supporting `.loc[key, 'Caption']`
        In this notebook, a pandas DataFrame indexed by the '<name>#<k>' keys.
    """

    def __init__(self, folder, image_list, captions):

        self.folder = folder
        self.captions = captions
        self.image_list = self._expand_image_names(image_list)
        self.size = len(self.image_list)
        # Built once here rather than on every __getitem__ call.
        self.transform = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()])

    @staticmethod
    def _expand_image_names(image_list, captions_per_image=5):
        """Return the '<name>#<k>' caption keys for every image name, in order."""
        return [image + '#' + str(i) for image in image_list for i in range(captions_per_image)]

    @staticmethod
    def _file_name(image_key):
        """Strip the trailing '#<k>' caption index from a caption key."""
        return image_key.rsplit('#', 1)[0]

    def __getitem__(self, idx):
        """Return `(image_tensor, caption)` for the caption key at position `idx`."""
        image_name = self.image_list[idx]
        caption = self.captions.loc[image_name, 'Caption']
        # The context manager closes the file as soon as the pixels are decoded.
        # convert('RGB') guarantees three channels, so a grayscale file cannot
        # produce a tensor that fails to stack with the rest of the batch.
        with Image.open(self.folder + self._file_name(image_name)) as img:
            img = img.convert('RGB')
        return self.transform(img), caption

    def __len__(self):
        """Number of (image, caption) pairs."""
        return self.size

In [5]:
def train_test_loader(directory, image_list, captions, train_fraction=0.8, num_workers=0, batch_size=32):
    """Build shuffled train and test DataLoaders from a random split of the dataset.

    The first `int(N * train_fraction)` slots of the random split become the
    training subset and the remainder the test subset, where N is the number
    of (image, caption) pairs.

    Returns
    -------
    (trainloader, testloader, train_size, test_size)
    """
    full_dataset = DatasetClass(directory, image_list, captions)

    n_total = full_dataset.size
    n_train = int(n_total*train_fraction)
    n_test = n_total - n_train

    train_subset, test_subset = torch.utils.data.random_split(full_dataset, [n_train, n_test])

    # Both loaders share the same batching options.
    loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=num_workers)
    trainloader = DataLoader(train_subset, **loader_options)
    testloader = DataLoader(test_subset, **loader_options)

    return trainloader, testloader, n_train, n_test

def test_loader(directory, image_list, captions, num_workers=0, batch_size=32):
    """Wrap the whole dataset in a single shuffled DataLoader.

    Returns
    -------
    (testloader, test_size) where test_size is the number of
    (image, caption) pairs in the dataset.
    """
    full_dataset = DatasetClass(directory, image_list, captions)
    loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    return loader, full_dataset.size

In [6]:
# with open('/content/drive/My Drive/captions.txt') as f:
#     captions = pd.read_csv(StringIO(f.read()), sep='\t', header=None, names=['Image', 'Caption']).set_index('Image', drop=True)
    
# with open('/content/drive/My Drive/image_names.txt') as f:
#     names = list(map(lambda x: x.rstrip(), f.readlines()))

################### change the file paths below to match your machine!

# Caption table: tab-separated, no header row; column 'Image' becomes the index
# that DatasetClass uses to look up the 'Caption' text.
with open('D:/_SEM8/DL/Assignment 4/captions.txt') as f:
    captions = pd.read_csv(f, sep='\t', header=None, names=['Image', 'Caption'])
captions = captions.set_index('Image', drop=True)

# One image file name per line; trailing whitespace/newlines are stripped.
with open('D:/_SEM8/DL/Assignment 4/image_names.txt') as f:
    names = [line.rstrip() for line in f]

In [7]:
# trainloader, testloader, train_size, test_size = train_test_loader(directory='/content/drive/My Drive/Assignment4_Data/Images/', image_list=names, captions=captions)

# Evaluation-only run: wrap every (image, caption) pair in one loader.
testloader, test_size = test_loader(
    directory='D:/_SEM8/DL/Assignment 4/Assignment4_Images/Images/',
    image_list=names,
    captions=captions,
)

In [8]:
test_size

20000

#### GloVe Representation

Available GloVe Representations: 

1. glove.42B.300d 
2. glove.840B.300d 
3. glove.twitter.27B.25d 
4. glove.twitter.27B.50d 
5. glove.twitter.27B.100d 
6. glove.twitter.27B.200d 
7. glove.6B.50d 
8. glove.6B.100d 
9. glove.6B.200d 
10. glove.6B.300d

In [9]:
# 100-dimensional GloVe vectors from the '6B' set (entry 8 in the list above);
# passed to build_vocab below as the pre-trained word vectors.
embedding_glove = GloVe(name='6B', dim=100)

In [10]:
# Tokens removed from every caption during preprocessing.
punct = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']
text_field = Field(
    tokenize='basic_english',
    lower=True,
    eos_token='eos',
    init_token='sos',
    stop_words=punct,
)
preprocessed_text = captions['Caption'].apply(text_field.preprocess)

text_field.build_vocab(preprocessed_text, vectors=embedding_glove)
# Drop the first two vocabulary entries from both the embedding matrix and the
# token array, so row i of `embedding_trained` belongs to `vocab_tokens[i]`.
# NOTE(review): presumably those two entries are the '<unk>' and '<pad>'
# specials - confirm against torchtext's Vocab ordering.
embedding_trained = text_field.vocab.vectors[2:]
vocab_tokens = np.array(text_field.vocab.itos)[2:]

### Encoder 

In [12]:
# Pre-trained AlexNet; its parameters are kept in a flat list so that later
# cells can address them by position.
alexnet = models.alexnet(pretrained=True)
params_alex = [param for param in alexnet.parameters()]

In [16]:
# Per-channel mean pixel value over every item served by `testloader`,
# accumulated batch by batch.
RGB_mean = torch.zeros(3)
i = 0
for X, y in testloader:
    i += 1
    # X is indexed as (batch, channel, height, width). The pixel count per
    # channel is height * width; the previous code squared the height, which
    # was only correct because the crop happens to be square.
    pixels_per_image = X.shape[2]*X.shape[3]
    RGB_mean += (X.sum(dim=(0, 2, 3))/pixels_per_image)/test_size
    print(i, '/', len(testloader), end=', ')

1 / 625, 2 / 625, 3 / 625, 4 / 625, 5 / 625, 6 / 625, 7 / 625, 8 / 625, 9 / 625, 10 / 625, 11 / 625, 12 / 625, 13 / 625, 14 / 625, 15 / 625, 16 / 625, 17 / 625, 18 / 625, 19 / 625, 20 / 625, 21 / 625, 22 / 625, 23 / 625, 24 / 625, 25 / 625, 26 / 625, 27 / 625, 28 / 625, 29 / 625, 30 / 625, 31 / 625, 32 / 625, 33 / 625, 34 / 625, 35 / 625, 36 / 625, 37 / 625, 38 / 625, 39 / 625, 40 / 625, 41 / 625, 42 / 625, 43 / 625, 44 / 625, 45 / 625, 46 / 625, 47 / 625, 48 / 625, 49 / 625, 50 / 625, 51 / 625, 52 / 625, 53 / 625, 54 / 625, 55 / 625, 56 / 625, 57 / 625, 58 / 625, 59 / 625, 60 / 625, 61 / 625, 62 / 625, 63 / 625, 64 / 625, 65 / 625, 66 / 625, 67 / 625, 68 / 625, 69 / 625, 70 / 625, 71 / 625, 72 / 625, 73 / 625, 74 / 625, 75 / 625, 76 / 625, 77 / 625, 78 / 625, 79 / 625, 80 / 625, 81 / 625, 82 / 625, 83 / 625, 84 / 625, 85 / 625, 86 / 625, 87 / 625, 88 / 625, 89 / 625, 90 / 625, 91 / 625, 92 / 625, 93 / 625, 94 / 625, 95 / 625, 96 / 625, 97 / 625, 98 / 625, 99 / 625, 100 / 625, 101 / 62

In [17]:
class NetVLAD(nn.Module):
    """AlexNet-style convolutional stack followed by a NetVLAD aggregation layer.

    Parameters
    ----------
    k : int
        Number of clusters; the output has `k * 256` features per image.
    RGB_mean : torch.Tensor
        Stored on `device` as `self.RGB_mean`. It is not used in `forward`.
    """

    def __init__(self, k, RGB_mean):
        super().__init__()

        self.RGB_mean = RGB_mean.to(device)

        # Convolutional feature extractor (five conv layers, three max-pools).
        self.c1 = nn.Conv2d(3, 64, 11, stride=4, padding = (2,2))
        self.mp1 = nn.MaxPool2d(3, stride=2)
        self.c2 = nn.Conv2d(64, 192, 5, stride=1, padding=(2,2))
        self.mp2 = nn.MaxPool2d(3, stride=2)
        self.c3 = nn.Conv2d(192, 384, 3, stride=1, padding=(1,1))
        self.c4 = nn.Conv2d(384, 256, 3, stride=1, padding=(1,1))
        self.c5 = nn.Conv2d(256, 256, 3, stride=1, padding=(1,1))
        self.mp3 = nn.MaxPool2d(3, stride=2)

        # Soft-assignment branch: a 1x1 convolution to k channels, then a
        # softmax across those channels at every spatial location.
        self.K = k
        self.nv_conv = nn.Conv2d(256, k, 1)
        self.nv_soft_ass = nn.Softmax2d()

        # Learnable cluster centres, one 256-dimensional row per cluster.
        self.c = nn.Parameter(torch.rand(self.K, 256))

        # Collapses (batch, K, 256) into (batch, K*256).
        self.flat = nn.Flatten(1, -1)

    def forward(self, x):
        """Return the flattened descriptor of shape (batch, K*256).

        As a side effect, the conv feature map with its two spatial axes
        merged is stored in `self.z_pre`.
        """
        feats = self.mp1(F.relu(self.c1(x)))
        feats = self.mp2(F.relu(self.c2(feats)))
        for conv in (self.c3, self.c4, self.c5):
            feats = F.relu(conv(feats))
        feats = self.mp3(feats)

        self.z_pre = feats.flatten(2, 3)

        # Step 1: soft assignment of every spatial location to the K clusters.
        assignment = self.nv_soft_ass(self.nv_conv(feats))

        # Step 2: per cluster, sum the assignment-weighted residuals to its
        # centre over all spatial locations.
        per_cluster = []
        for k in range(self.K):
            weight_k = assignment[:, k, :, :].unsqueeze(1)
            centre_k = self.c[k, :].reshape(1, -1, 1, 1)
            residual_k = (feats - centre_k)*weight_k
            per_cluster.append(torch.sum(residual_k, axis=(2, 3)))
        Z = torch.stack(per_cluster, 1)

        return self.flat(Z)

### Decoder

In [18]:
class RNN(nn.Module):
    """Single-layer RNN decoder that emits log-probabilities over the vocabulary.

    Parameters
    ----------
    embed_size : int
        Input size of the recurrent layer; must match the width of
        `embedding_pre_trained`.
    hidden_size : int
        Size of the recurrent hidden state.
    output_size : int
        Number of output classes (vocabulary entries).
    embedding_pre_trained : torch.Tensor
        Embedding matrix handed to `nn.Embedding.from_pretrained`.
    """

    def __init__(self, embed_size, hidden_size, output_size, embedding_pre_trained):
        super().__init__()
        self.hidden_size = hidden_size

        # Token index -> pre-trained word vector.
        self.embedding = nn.Embedding.from_pretrained(embedding_pre_trained)

        # Word vector -> hidden state -> vocabulary log-probabilities.
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_vec, hidden_vec):
        '''
        Run one decoding step.

        Parameters:
        ------------
        input_vec  - tensor holding the index of the input token,
                     e.g. torch.LongTensor([[0]]) for the sos token
        hidden_vec - hidden state: the encoded image for the first step,
                     afterwards the hidden state returned by the previous step

        Returns:
        ------------
        (log-probabilities over the vocabulary, new hidden state)
        '''
        embedded = self.embedding(input_vec)
        rnn_out, next_hidden = self.rnn(embedded, hidden_vec)
        # Only the first time step of the recurrent output is scored.
        log_probs = self.softmax(self.out(rnn_out[0]))
        return log_probs, next_hidden


In [19]:
class LSTM(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities over the vocabulary.

    Parameters
    ----------
    embed_size : int
        Input size of the recurrent layer; must match the width of
        `embedding_pre_trained`.
    hidden_size : int
        Size of the hidden state and of the cell state.
    output_size : int
        Number of output classes (vocabulary entries).
    embedding_pre_trained : torch.Tensor
        Embedding matrix handed to `nn.Embedding.from_pretrained`.
    """

    def __init__(self, embed_size, hidden_size, output_size, embedding_pre_trained):
        super().__init__()
        self.hidden_size = hidden_size

        # Token index -> pre-trained word vector.
        self.embedding = nn.Embedding.from_pretrained(embedding_pre_trained)

        # Word vector -> hidden state -> vocabulary log-probabilities.
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_vec, hidden_vec, cell_state):
        '''
        Run one decoding step.

        Parameters:
        ------------
        input_vec  - tensor holding the index of the input token,
                     e.g. torch.LongTensor([[0]]) for the sos token
        hidden_vec - hidden state: the encoded image for the first step,
                     afterwards the hidden state returned by the previous step
        cell_state - LSTM cell state carried over from the previous step

        Returns:
        ------------
        (log-probabilities over the vocabulary, (new hidden state, new cell state))
        '''
        embedded = self.embedding(input_vec)
        lstm_out, next_state = self.lstm(embedded, (hidden_vec, cell_state))
        next_hidden, next_cell = next_state
        # Only the first time step of the recurrent output is scored.
        log_probs = self.softmax(self.out(lstm_out[0]))
        return log_probs, (next_hidden, next_cell)
    
    

In [20]:
def train_one_image(train_image, image_caption, encoder_obj, decoder_obj, encoder_optim, decoder_optim, loss_func, vocab_tokens, is_LSTM=False, print_pred = False):
    '''
    Run one optimisation step of the encoder/decoder pair on a single image.

    The encoder output becomes the decoder's initial hidden state. Decoding is
    greedy: each step is fed the token the decoder predicted at the previous
    step, starting from 'sos'. The loss is accumulated against the reference
    tokens, and decoding stops early once 'eos' is predicted.

    Parameters:
    -----------
    train_image     - Image tensor passed straight to `encoder_obj`; the encoder
                      output is reshaped to (1, 1, -1), so it must describe one image
    image_caption   - Reference caption as a list of tokens, each present in `vocab_tokens`
    encoder_obj     - Encoder module
    decoder_obj     - Decoder module (RNN-style call, or LSTM-style when `is_LSTM`)
    encoder_optim   - Optimiser stepping the encoder parameters
    decoder_optim   - Optimiser stepping the decoder parameters
    loss_func       - Loss called as loss_func(log_probs, target_index_tensor)
    vocab_tokens    - numpy array of token strings; position == token index
    is_LSTM         - If True, also carry a zero-initialised cell state
    print_pred      - If True, print the reference and the predicted tokens

    Returns:
    -----------
    Summed step loss divided by the reference caption length (a float).

    Raises:
    -----------
    ValueError if `vocab_tokens` lacks a unique 'sos'/'eos' entry or if
    `image_caption` is empty.
    '''
    # Start of sentence and End of Sentence token
    try:
        eos_token = np.argwhere(vocab_tokens=='eos').item()
        sos_token = np.argwhere(vocab_tokens=='sos').item()
    except ValueError as err:
        # Fail loudly rather than dropping into a debugger mid-training.
        raise ValueError("vocab_tokens must contain exactly one 'sos' and one 'eos' entry") from err

    # Length of the image caption
    caption_length = len(image_caption)
    if caption_length == 0:
        raise ValueError('image_caption must contain at least one token')

    encoder_optim.zero_grad()
    decoder_optim.zero_grad()

    encoder_output = encoder_obj(train_image).view(1, 1, -1)

    # The token index is turned into an embedding inside the decoder.
    decoder_input = torch.tensor([[sos_token]], device=device)
    decoder_hidden = encoder_output
    decoder_hidden_size = decoder_hidden.shape[-1]
    if is_LSTM:
        decoder_cell_state = torch.zeros(1, 1, decoder_hidden_size, device=device)

    loss = 0
    output_caption = []
    for i in range(caption_length):
        if is_LSTM:
            decoder_output, (decoder_hidden, decoder_cell_state) = decoder_obj(decoder_input, decoder_hidden, decoder_cell_state)
        else:
            decoder_output, decoder_hidden = decoder_obj(decoder_input, decoder_hidden)
        # Greedy choice: the most probable token becomes the next input.
        _, decoder_input = decoder_output.topk(1)
        predicted_index = decoder_input.item()
        output_caption.append(vocab_tokens[predicted_index])
        target_index = np.argwhere(vocab_tokens == image_caption[i]).item()
        loss += loss_func(decoder_output, torch.tensor([target_index]).to(device))
        if predicted_index == eos_token:
            break

    if print_pred:
        print('\n')
        print('Target: ', image_caption)
        print('Predicted :', output_caption)

    loss.backward()
    encoder_optim.step()
    decoder_optim.step()

    # Normalised by the reference length even when decoding stopped early.
    return loss.item()/caption_length
    

In [27]:
def create_checkpoint(ckp_state, ckp_path):
    """Serialise `ckp_state` to `ckp_path` with torch.save and print a confirmation line."""
    torch.save(obj=ckp_state, f=ckp_path)
    print('\nCreated Checkpoint!')
    
def load_checkpoint(ckp_path, model, optimizer = None, load_optimizer_state = False, map_location = None):
    """Restore a model (and optionally an optimiser) from a checkpoint file.

    The checkpoint is a dict with the keys 'state_dict', 'epoch' and 'loss',
    plus 'optimizer' when the optimiser state is requested.

    Parameters
    ----------
    ckp_path : str
        Path of the checkpoint file.
    model : nn.Module
        Module whose `load_state_dict` receives `ckp['state_dict']`.
    optimizer : optimiser, optional
        Required when `load_optimizer_state` is True.
    load_optimizer_state : bool
        Also restore `ckp['optimizer']` into `optimizer`.
    map_location : optional
        Forwarded to `torch.load`; pass e.g. 'cpu' to open a checkpoint that
        was saved from a GPU on a machine without one. The default (None)
        keeps the previous behaviour.

    Returns
    -------
    (model, epoch, loss), or (model, optimizer, epoch, loss) when
    `load_optimizer_state` is True.

    Raises
    ------
    ValueError
        If `load_optimizer_state` is True but no optimiser was given.
    """
    # Checked before the model is touched, so a bad call leaves it unchanged.
    if load_optimizer_state and optimizer is None:
        raise ValueError('load_optimizer_state=True requires an optimizer to load the state into')

    # NOTE(review): torch.load unpickles the file - only open trusted checkpoints.
    ckp = torch.load(ckp_path, map_location=map_location)
    model.load_state_dict(ckp['state_dict'])
    if load_optimizer_state:
        optimizer.load_state_dict(ckp['optimizer'])
        return model, optimizer, ckp['epoch'], ckp['loss']
    return model, ckp['epoch'], ckp['loss']

In [22]:
def train(lr, trainloader_images, train_images_size, encoder_obj, decoder_obj, vocab_tokens, params, is_LSTM=False):
    '''
    Train the encoder/decoder pair with SGD until the epoch loss stops changing.

    Every caption is tokenised with the 'basic_english' tokenizer, stripped of
    punctuation tokens, terminated with 'eos' and passed to `train_one_image`,
    so there is one optimiser step per (image, caption) pair. Every fifth
    epoch the loss history and both model/optimiser states are written under
    'checkpoints/'. Predictions are printed during the first epoch and during
    the epoch that follows each checkpoint.

    Parameters:
    -----------
    lr                  - Learning rate for both SGD optimisers
    trainloader_images  - Iterable of (image batch, caption batch)
    train_images_size   - Number of (image, caption) pairs; divides the summed loss
    encoder_obj         - Encoder module
    decoder_obj         - Decoder module
    vocab_tokens        - numpy array of token strings; position == token index
    params              - Unused; kept so existing calls keep working
    is_LSTM             - Forwarded to `train_one_image`

    Returns:
    -----------
    List with the mean loss of every epoch, once the relative change between
    two consecutive epochs falls below 1e-5.
    '''
    loss_func = nn.NLLLoss()

    tokenizer = get_tokenizer("basic_english")
    punct = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']

    encoder_optim = optim.SGD(encoder_obj.parameters(), lr=lr)
    decoder_optim = optim.SGD(decoder_obj.parameters(), lr=lr)

    old_loss = np.inf
    epoch = 0
    losses = []
    print_pred = True
    while True:
        new_loss = 0
        epoch += 1
        for data in trainloader_images:
            images, captions = data[0].to(device), data[1]
            bs = images.shape[0]

            for i in range(bs):
                # Keep a leading batch axis of size one for the encoder.
                train_image = images[i].view(1, images.shape[1], images.shape[2], images.shape[3])
                image_caption = tokenizer(captions[i])
                image_caption = list(filter(lambda x: x not in punct, image_caption))
                image_caption.append('eos')
                new_loss += train_one_image(train_image, image_caption, encoder_obj, decoder_obj, encoder_optim, decoder_optim, loss_func, vocab_tokens, is_LSTM, print_pred)

        new_loss = new_loss/train_images_size

        print('\n')
        print('-----------------------------------------------------------------------------------------------')
        print('Epoch {0}: Loss = {1}, Rel loss = {2}'.format(epoch, new_loss, (old_loss-new_loss)/new_loss))

        losses.append(new_loss)

        if epoch%5 == 0:
            print_pred = True
            # Create the target directory on demand so the first save cannot
            # fail, and close the loss file deterministically.
            os.makedirs('checkpoints', exist_ok=True)
            with open('checkpoints/losses.sav', 'wb') as losses_file:
                pickle.dump(losses, losses_file)
            checkpoint_encoder = {'epoch': epoch, 'loss': new_loss, 'state_dict': encoder_obj.state_dict(), 'optimizer': encoder_optim.state_dict()}
            checkpoint_decoder = {'epoch': epoch, 'loss': new_loss, 'state_dict': decoder_obj.state_dict(), 'optimizer': decoder_optim.state_dict()}
            create_checkpoint(ckp_state=checkpoint_encoder, ckp_path='checkpoints/encoder{0}.pt'.format(epoch))
            create_checkpoint(ckp_state=checkpoint_decoder, ckp_path='checkpoints/decoder{0}.pt'.format(epoch))
        else:
            print_pred = False

        if abs(new_loss-old_loss)/new_loss < 1e-5:
            print('Converged')
            return losses

        old_loss = new_loss

In [23]:
def evaluate_one_image(encoder_obj, decoder_obj, image, image_caption, vocab_tokens, is_LSTM=False):
    """Greedily decode a caption for one image without tracking gradients.

    Decoding starts from 'sos', feeds every prediction back as the next input
    and runs for at most `len(image_caption)` steps, stopping early once 'eos'
    is predicted. Both captions are printed.

    Returns
    -------
    (predicted token list, [reference token list])
    """
    with torch.no_grad():

        # Indices of the sentence delimiters in the vocabulary.
        sos_token = np.argwhere(vocab_tokens=='sos').item()
        eos_token = np.argwhere(vocab_tokens=='eos').item()

        # The reference length bounds the number of decoding steps.
        max_steps = len(image_caption)

        # The encoded image is the decoder's initial hidden state.
        decoder_hidden = encoder_obj(image).view(1, 1, -1)
        decoder_input = torch.tensor([[sos_token]], device=device)
        if is_LSTM:
            decoder_cell_state = torch.zeros(decoder_hidden.shape[-1]).view(1, 1, -1).to(device)

        output_caption = []

        for _ in range(max_steps):
            if is_LSTM:
                decoder_output, (decoder_hidden, decoder_cell_state) = decoder_obj(decoder_input, decoder_hidden, decoder_cell_state)
            else:
                decoder_output, decoder_hidden = decoder_obj(decoder_input, decoder_hidden)
            # The most probable token becomes the next decoder input.
            _, decoder_input = decoder_output.topk(1)
            predicted_index = decoder_input.item()
            if predicted_index == eos_token:
                output_caption.append('eos')
                break
            output_caption.append(vocab_tokens[predicted_index])

        print('\n')
        print('Image caption: ', image_caption)
        print('Predicted caption: ', output_caption)

        return output_caption, [image_caption]
    
def evaluate(trainloader_images, train_images_size, encoder_obj, decoder_obj, is_LSTM = False, vocab_tokens = None):
    """Caption every item of a loader and score the predictions with BLEU-1..4.

    Parameters
    ----------
    trainloader_images : iterable of (image batch, caption batch)
    train_images_size : int
        Unused; kept so existing calls keep working.
    encoder_obj, decoder_obj : nn.Module
        Trained encoder and decoder.
    is_LSTM : bool
        Must be True when `decoder_obj` expects a cell state (the LSTM decoder).
    vocab_tokens : numpy array of str, optional
        Token strings, position == token index. When omitted, the
        notebook-level `vocab_tokens` is used, which is what this function
        silently relied on before the argument existed.

    Returns
    -------
    (candidate_corpus, references_corpus, [bleu1, bleu2, bleu3, bleu4])
    """
    if vocab_tokens is None:
        # Backward-compatible fallback for calls that do not pass the vocabulary.
        vocab_tokens = globals()['vocab_tokens']

    punct = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']
    tokenizer = get_tokenizer("basic_english")

    candidate_corpus = []
    references_corpus = []
    for data in trainloader_images:
        images, captions = data[0].to(device), data[1]
        bs = images.shape[0]

        for i in range(bs):
            # Keep a leading batch axis of size one for the encoder.
            train_image = images[i].view(1, images.shape[1], images.shape[2], images.shape[3])
            image_caption = tokenizer(captions[i])
            image_caption = list(filter(lambda x: x not in punct, image_caption))
            image_caption.append('eos')
            oc, ic = evaluate_one_image(encoder_obj, decoder_obj, train_image, image_caption, vocab_tokens, is_LSTM)
            candidate_corpus.append(oc)
            references_corpus.append(ic)

    # BLEU-n uses uniform weights over the first n n-gram orders; the last
    # call relies on bleu_score's default weights.
    bleu1 = bleu_score(candidate_corpus, references_corpus, weights=(1.0, 0, 0, 0))
    bleu2 = bleu_score(candidate_corpus, references_corpus, weights=(0.5, 0.5, 0, 0))
    bleu3 = bleu_score(candidate_corpus, references_corpus, weights=(1/3, 1/3, 1/3, 0))
    bleu4 = bleu_score(candidate_corpus, references_corpus)

    bleu_scores = [bleu1, bleu2, bleu3, bleu4]

    return candidate_corpus, references_corpus, bleu_scores

### Question 1

In [24]:
k = 3
encoder_obj = NetVLAD(k, RGB_mean).to(device)

# Initialise the five convolution layers with the pre-trained AlexNet values.
# The values are copied into the encoder's own parameters. Assigning the
# `params_alex` entries directly would make this encoder share its Parameter
# objects with `alexnet` and with every other encoder initialised the same
# way, so loading a checkpoint into one of them would overwrite the others.
conv_layers = [encoder_obj.c1, encoder_obj.c2, encoder_obj.c3, encoder_obj.c4, encoder_obj.c5]
with torch.no_grad():
    for layer_idx, conv_layer in enumerate(conv_layers):
        # params_alex alternates weight, bias for consecutive layers.
        conv_layer.weight.copy_(params_alex[2*layer_idx])
        conv_layer.bias.copy_(params_alex[2*layer_idx + 1])


embed_size = embedding_trained.shape[1]
decoder_output_size = embedding_trained.shape[0]
# The decoder's hidden state is initialised with the encoder output (k*256 features).
decoder_hidden_size = k*256
decoder_obj = RNN(embed_size, decoder_hidden_size, decoder_output_size, embedding_trained).to(device)

In [29]:
# Restore the trained Question 1 weights; the optimiser state is not loaded.
encoder_trained1, e_epoch1, e_loss1 = load_checkpoint(ckp_path="q1_encoder_wts.pt", model=encoder_obj)
decoder_trained1, d_epoch1, d_loss1 = load_checkpoint(ckp_path="q1_decoder_wts.pt", model=decoder_obj)

In [None]:
# Caption every test item with the RNN decoder and compute BLEU-1..4.
candidate_corpus1, references_corpus1, bleu_scores1 = evaluate(
    testloader, test_size, encoder_trained1, decoder_trained1, is_LSTM=False
)

In [33]:
bleu_scores1

[0.40879117523956954,
 0.23757306996097471,
 0.14627992614805915,
 0.09392114428659279]

### Question 2

In [18]:
k = 3
encoder_obj = NetVLAD(k, RGB_mean).to(device)

# Initialise the five convolution layers with the pre-trained AlexNet values.
# The values are copied into the encoder's own parameters. Assigning the
# `params_alex` entries directly would make this encoder share its Parameter
# objects with `alexnet` and with every other encoder initialised the same
# way, so loading a checkpoint into one of them would overwrite the others.
conv_layers = [encoder_obj.c1, encoder_obj.c2, encoder_obj.c3, encoder_obj.c4, encoder_obj.c5]
with torch.no_grad():
    for layer_idx, conv_layer in enumerate(conv_layers):
        # params_alex alternates weight, bias for consecutive layers.
        conv_layer.weight.copy_(params_alex[2*layer_idx])
        conv_layer.bias.copy_(params_alex[2*layer_idx + 1])

embed_size = embedding_trained.shape[1]
decoder_output_size = embedding_trained.shape[0]
# The decoder's hidden state is initialised with the encoder output (k*256 features).
decoder_hidden_size = k*256
decoder_obj = LSTM(embed_size, decoder_hidden_size, decoder_output_size, embedding_trained).to(device)

In [29]:
# Restore the trained Question 2 weights; the optimiser state is not loaded.
encoder_trained2, e_epoch2, e_loss2 = load_checkpoint(ckp_path="q2_encoder_wts.pt", model=encoder_obj)
decoder_trained2, d_epoch2, d_loss2 = load_checkpoint(ckp_path="q2_decoder_wts.pt", model=decoder_obj)

In [None]:
# The Question 2 decoder is the LSTM class, whose forward() also needs a cell
# state. is_LSTM=True makes the evaluation loop create and pass it; without
# the flag the decoder is called with too few arguments.
candidate_corpus2, references_corpus2, bleu_scores2 = evaluate(
    testloader, test_size, encoder_trained2, decoder_trained2, is_LSTM=True
)

In [None]:
bleu_scores2