In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision.models as models
import os
import pickle


## CNN Encoder
It encodes the input image into a 1D feature vector. The CNN model uses a pretrained VGG16 model followed by a fully connected linear layer at its end to generate this feature vector. From the VGG16 model we use only the first two top-level blocks and omit the last one.

In [None]:
class EncoderCNN(nn.Module):
    """Image encoder: a pretrained VGG16 backbone followed by one linear layer.

    The backbone is every child module of torchvision's VGG16 except the last
    one. Its output is flattened and projected to a 512-dimensional feature
    vector by ``fc1`` followed by a ReLU.

    Args:
        freeze_backbone: When True (default) the pretrained backbone
            parameters get ``requires_grad=False`` so that no gradients are
            computed for them. The forward output is unaffected; this only
            avoids back-propagating through VGG16 when just ``fc1`` is being
            optimized. Pass False to fine-tune the backbone as well.
    """

    def __init__(self, freeze_backbone = True):
        super().__init__()
        vgg16 = models.vgg16(pretrained = True)
        # Keep everything but the last child module of VGG16.
        layers = list(vgg16.children())[:-1]
        self.pretrainedCNN = nn.Sequential(*layers)
        if freeze_backbone:
            for param in self.pretrainedCNN.parameters():
                param.requires_grad_(False)
        self._to_linear = None
        # Probe the backbone with one dummy 224x224 RGB image to learn the
        # flattened feature size; no autograd graph is needed for this.
        with torch.no_grad():
            self.convs(torch.randn(1,3,224,224))
        self.fc1 = nn.Linear(self._to_linear,512)

    def convs(self,x):
        """Run the pretrained backbone and record the per-sample flattened size.

        Side effect: ``self._to_linear`` is set to the product of the three
        trailing dimensions of the backbone output.
        """
        x = self.pretrainedCNN(x)
        self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
        return x

    def forward(self,x):
        """Encode a batch of images into feature vectors of size 512.

        Args:
            x: image batch accepted by the backbone; the constructor probes it
                with shape (1, 3, 224, 224).

        Returns:
            Tensor with one 512-element, ReLU-activated row per input image.
        """
        x = self.convs(x)
        x = x.view(-1,self._to_linear)
        x = F.relu(self.fc1(x))
        return x
# Instantiate the encoder once here; constructing it loads the pretrained
# VGG16 weights (downloaded on first use, see the output below).
encoder = EncoderCNN()

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




# RNN Decoder
For the decoder part I used the same model which was used in the previous part.

In [None]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder that is primed with an image feature vector.

    Args:
        embed_size: size of the word embeddings; the image feature passed to
            ``forward`` must have the same size because both are concatenated
            along the time axis.
        hidden_size: hidden size of the LSTM.
        vocab_size: number of words; size of the embedding table and of the
            output scores.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,embed_size,hidden_size,vocab_size,num_layers = 2):
        super().__init__()
        # Submodules are created in the order embedding -> LSTM -> projection.
        self.embeding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self,features,captions):
        """Score every vocabulary word at every time step.

        The image feature is used as the first LSTM input, followed by the
        embeddings of all caption tokens except the last one, so the output
        has as many time steps as ``captions``.

        Args:
            features: image features, one row per sample.
            captions: integer token ids, one row per sample.

        Returns:
            Tensor of scores with dimensions (batch, caption length, vocab).
        """
        image_step = features.unsqueeze(1)
        word_steps = self.embeding_layer(captions[:, :-1])
        sequence = torch.cat([image_step, word_steps], dim=1)
        hidden_states = self.lstm(sequence)[0]
        return self.linear(hidden_states)


### Vocabulary:
This class generates a vocabulary consisting of all the words used more than 10 times across the training set, and then assigns an index to each of these words.

In [None]:
import nltk
nltk.download('punkt')
from collections import Counter
import numpy as np
from tqdm import tqdm
import csv
import string

class Vocabulary():
    """Word-to-index mapping built from the training captions and cached on disk.

    Attributes:
        vocab: dict mapping each word to its integer index.
        start_word: marker token for the start of a caption ('<start>').
        end_word: marker token for the end of a caption ('<end>').
    """

    def __init__(self,train_captions = '/content/drive/My Drive/Assignment4/train_captions.tsv',
                 word_count_threshold = 5, vocab_file = '/content/drive/My Drive/Assignment4/vocab.pk'):
        """Load the vocabulary from ``vocab_file`` if it exists, otherwise build it.

        Args:
            train_captions: path of the tab-separated captions file.
            word_count_threshold: accepted for compatibility but not used.
                NOTE(review): ``load_vocab`` keeps words that occur more than
                10 times regardless of this value; honoring it would change
                the vocabulary (and any cached vocab / trained model).
            vocab_file: path of the pickle cache for the vocabulary.
        """
        self.vocab = None
        self.start_word = '<start>'
        self.end_word = '<end>'
        if os.path.exists(vocab_file):
            # pickle can execute arbitrary code while loading: only point
            # vocab_file at a cache you created yourself.
            with open(vocab_file,'rb') as cache:
                self.vocab = pickle.load(cache)
            print("loading from logs")
        else:
            # Announce generation only when it actually happens.
            print("generating vocabulary")
            self.vocab = self.load_vocab(train_captions,vocab_file)

    def load_vocab(self,file_path,vocab_file):
        """Build the vocabulary from a captions file and pickle it.

        Each row of the file is read as an id column followed by caption
        columns; at most the first five caption columns are used. Captions are
        tokenized with NLTK, tokens found in ``string.punctuation`` are
        dropped, and words occurring more than 10 times are kept. The start
        and end markers are added and indices follow the sorted word order.

        Args:
            file_path: path of the tab-separated captions file.
            vocab_file: path the resulting dict is pickled to.

        Returns:
            dict mapping word -> index.
        """
        words_count = Counter()
        with open(file_path) as file:
            rows = csv.reader(file, delimiter="\t")
            for row in tqdm(rows):
                # Slicing (instead of indexing 1..5) tolerates rows that have
                # fewer than five captions.
                for caption in row[1:6]:
                    word_tokenized_list = nltk.tokenize.word_tokenize(caption)
                    words_count.update(x for x in word_tokenized_list if x not in string.punctuation)
        words_thresholded = [x for x in words_count if words_count[x]>10]
        words_thresholded.append(self.start_word)
        words_thresholded.append(self.end_word)
        words_thresholded.sort()
        vocab = {word: i for i, word in enumerate(words_thresholded)}
        with open(vocab_file,'wb') as cache:
            pickle.dump(vocab,cache)
        return vocab

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Vectorizer
Given a caption, this function tokenizes it and turns it into a vector in which each element is the index of the corresponding word in the vocabulary dictionary.

In [None]:
def word2vec(vocab,caption,maxl):
    """Convert a caption string into a padded tensor of vocabulary indices.

    The caption is tokenized with NLTK, tokens found in ``string.punctuation``
    are removed, and the start/end markers are placed around it. Tokens that
    are not in the vocabulary are skipped. The result is padded with the end
    marker's index until it holds at least ``maxl + 2`` entries.

    Args:
        vocab: object exposing ``vocab`` (word -> index dict), ``start_word``
            and ``end_word``.
        caption: raw caption text.
        maxl: target caption length, not counting the two markers.

    Returns:
        Tensor built with ``torch.Tensor`` and moved to the global ``device``.
    """
    table = vocab.vocab
    words = [tok for tok in nltk.tokenize.word_tokenize(caption) if tok not in string.punctuation]
    sequence = [vocab.start_word, *words, vocab.end_word]
    ids = [table[tok] for tok in sequence if tok in table]
    missing = (maxl + 2) - len(ids)
    if missing > 0:
        ids.extend([table[vocab.end_word]] * missing)
    return torch.Tensor(ids).to(device)

## Processing
Preprocessing the image so that any variable size image can be accepted and then normalizing the image and converting it into a tensor.

In [None]:
from PIL import Image

def transform(input_image_path):
    """Load an image file and turn it into a normalized 3x224x224 tensor.

    The image is resized to 256, center-cropped to 224, converted to a tensor
    and normalized per channel with the mean/std given below.

    Args:
        input_image_path: path of the image file to load.

    Returns:
        The preprocessed image tensor.
    """
    pipeline = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    # The context manager closes the file handle. convert('RGB') guarantees
    # three channels, so grayscale / RGBA / palette images do not break the
    # 3-channel normalization.
    with Image.open(input_image_path) as image:
        return pipeline(image.convert('RGB'))

### This function loads all the captions and indices from the related file

In [None]:
import csv
def load_captions(captions_path = '/content/drive/My Drive/Assignment4/train_captions.tsv'):
    """Read the tab-separated captions file.

    The first column of every row is used as the key and the remaining
    columns as that key's list of captions.

    Args:
        captions_path: path of the captions file.

    Returns:
        (indices, captions): a NumPy array with the first-column values in
        file order, and a dict mapping each of them to its caption list.
    """
    order = []
    by_key = {}
    with open(captions_path) as handle:
        for record in tqdm(csv.reader(handle, delimiter="\t")):
            key = record[0]
            by_key[key] = record[1:]
            order.append(key)
    return np.array(order), by_key

In [None]:
# Hyper-parameters. embedding_size matches the 512-dimensional output of the
# encoder's fc1 layer, because the decoder concatenates the image feature with
# the word embeddings along the time axis.
embedding_size = 512
hidden_size = 512
num_epochs = 5
batch_size = 100
# Vocabulary() prints whether it was loaded from the cache or generated.
vocab  = Vocabulary()
vocab_size = len(vocab.vocab)
# Fresh models for training (this replaces the encoder created earlier).
encoder = EncoderCNN()
decoder = DecoderRNN(embedding_size,hidden_size,vocab_size)
# indices: first-column values of the captions file in file order;
# captions: dict from each of those values to its list of captions.
indices,captions = load_captions()
# `device` is also read as a global by word2vec().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
# Last expression of the cell: its return value (the decoder) is displayed.
decoder.to(device)

loading from logs
generating vocabulary


29000it [00:06, 4806.39it/s]


DecoderRNN(
  (embeding_layer): Embedding(5217, 512)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True)
  (linear): Linear(in_features=512, out_features=5217, bias=True)
)

In [None]:
# Cross-entropy over the vocabulary scores; moved to the GPU when one exists.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    criterion = criterion.cuda()

# Only the decoder and the encoder's final linear layer are optimized; the
# pretrained CNN inside the encoder is not handed to the optimizer.
params = [*decoder.parameters(), *encoder.fc1.parameters()]

optimizer = torch.optim.Adam(params, lr = 0.002)

In [None]:
# Number of full batches per epoch; a trailing partial batch is skipped.
num_batches = len(indices) // batch_size
# Every image is trained against caption columns 0..4 of its row.
captions_per_image = 5
train_image_prefix = '/content/drive/My Drive/Assignment4/train_images/train_images/image_'

for epoch in tqdm(range(num_epochs)):
    for k in tqdm(range(num_batches)):
        train_indices = indices[k*batch_size:(k+1)*batch_size]

        # Load and preprocess the images of this batch once; they are reused
        # for each of the caption columns below.
        images = torch.stack([
            transform(input_image_path=train_image_prefix + image + '.jpg')
            for image in train_indices
        ]).to(device)

        for j in range(captions_per_image):
            batch_captions = [captions[index][j] for index in train_indices]

            # Longest caption of the batch (in tokens) fixes the padded length.
            maxl = max((len(nltk.tokenize.word_tokenize(caption)) for caption in batch_captions), default=0)

            word_vec = torch.stack([word2vec(vocab,caption,maxl).long() for caption in batch_captions])
            # Tensor.to() is not in-place: keep the returned tensor.
            word_vec = word_vec.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, word_vec)

            # The decoder emits one score vector per caption position, so the
            # caption token ids themselves are the targets.
            loss = criterion(outputs.view(-1, vocab_size), word_vec.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

    # Loss of the last step of the epoch.
    print(loss.item())
        

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/290 [00:00<?, ?it/s][A
  0%|          | 1/290 [00:50<4:02:44, 50.40s/it][A
  1%|          | 2/290 [01:37<3:57:29, 49.48s/it][A
  1%|          | 3/290 [02:23<3:51:57, 48.49s/it][A
  1%|▏         | 4/290 [03:15<3:55:23, 49.38s/it][A
  2%|▏         | 5/290 [04:01<3:49:24, 48.30s/it][A
  2%|▏         | 6/290 [04:52<3:52:50, 49.19s/it][A
  2%|▏         | 7/290 [05:37<3:46:20, 47.99s/it][A
  3%|▎         | 8/290 [06:32<3:55:33, 50.12s/it][A
  3%|▎         | 9/290 [07:17<3:46:58, 48.47s/it][A
  3%|▎         | 10/290 [08:05<3:45:49, 48.39s/it][A
  4%|▍         | 11/290 [08:53<3:44:05, 48.19s/it][A
  4%|▍         | 12/290 [09:40<3:41:56, 47.90s/it][A
  4%|▍         | 13/290 [10:28<3:41:05, 47.89s/it][A
  5%|▍         | 14/290 [11:16<3:40:21, 47.90s/it][A
  5%|▌         | 15/290 [12:02<3:37:15, 47.40s/it][A
  6%|▌         | 16/290 [12:50<3:37:51, 47.71s/it][A
  6%|▌         | 17/290 [13:39<3:37:41, 47.84s/it][A
  6%|▌    

1.5117665529251099



  0%|          | 1/290 [00:06<33:36,  6.98s/it][A
  1%|          | 2/290 [00:14<33:40,  7.02s/it][A
  1%|          | 3/290 [00:21<33:44,  7.05s/it][A
  1%|▏         | 4/290 [00:28<33:47,  7.09s/it][A
  2%|▏         | 5/290 [00:35<33:49,  7.12s/it][A
  2%|▏         | 6/290 [00:42<33:52,  7.16s/it][A
  2%|▏         | 7/290 [00:50<33:53,  7.18s/it][A
  3%|▎         | 8/290 [00:57<33:51,  7.20s/it][A
  3%|▎         | 9/290 [01:04<33:51,  7.23s/it][A
  3%|▎         | 10/290 [01:11<33:49,  7.25s/it][A
  4%|▍         | 11/290 [01:19<33:49,  7.27s/it][A
  4%|▍         | 12/290 [01:26<33:50,  7.30s/it][A
  4%|▍         | 13/290 [01:33<33:48,  7.32s/it][A
  5%|▍         | 14/290 [01:41<33:46,  7.34s/it][A
  5%|▌         | 15/290 [01:48<33:45,  7.37s/it][A
  6%|▌         | 16/290 [01:56<33:44,  7.39s/it][A
  6%|▌         | 17/290 [02:03<33:48,  7.43s/it][A
  6%|▌         | 18/290 [02:11<33:46,  7.45s/it][A
  7%|▋         | 19/290 [02:18<33:43,  7.47s/it][A
  7%|▋         | 20/

1.3849236965179443



  0%|          | 1/290 [00:07<36:12,  7.52s/it][A
  1%|          | 2/290 [00:15<36:18,  7.56s/it][A
  1%|          | 3/290 [00:22<36:22,  7.60s/it][A
  1%|▏         | 4/290 [00:30<36:19,  7.62s/it][A
  2%|▏         | 5/290 [00:38<36:15,  7.63s/it][A
  2%|▏         | 6/290 [00:45<36:13,  7.65s/it][A
  2%|▏         | 7/290 [00:53<36:07,  7.66s/it][A
  3%|▎         | 8/290 [01:01<35:59,  7.66s/it][A
  3%|▎         | 9/290 [01:08<35:51,  7.66s/it][A
  3%|▎         | 10/290 [01:16<35:44,  7.66s/it][A
  4%|▍         | 11/290 [01:24<35:37,  7.66s/it][A
  4%|▍         | 12/290 [01:31<35:31,  7.67s/it][A
  4%|▍         | 13/290 [01:39<35:24,  7.67s/it][A
  5%|▍         | 14/290 [01:47<35:16,  7.67s/it][A
  5%|▌         | 15/290 [01:54<35:08,  7.67s/it][A
  6%|▌         | 16/290 [02:02<35:00,  7.67s/it][A
  6%|▌         | 17/290 [02:10<34:56,  7.68s/it][A
  6%|▌         | 18/290 [02:17<34:47,  7.68s/it][A
  7%|▋         | 19/290 [02:25<34:41,  7.68s/it][A
  7%|▋         | 20/

1.3024377822875977



  0%|          | 1/290 [00:07<36:05,  7.49s/it][A
  1%|          | 2/290 [00:15<36:13,  7.55s/it][A
  1%|          | 3/290 [00:22<36:17,  7.59s/it][A
  1%|▏         | 4/290 [00:30<36:17,  7.61s/it][A
  2%|▏         | 5/290 [00:38<36:14,  7.63s/it][A
  2%|▏         | 6/290 [00:45<36:12,  7.65s/it][A
  2%|▏         | 7/290 [00:53<36:08,  7.66s/it][A
  3%|▎         | 8/290 [01:01<36:00,  7.66s/it][A
  3%|▎         | 9/290 [01:08<35:53,  7.66s/it][A
  3%|▎         | 10/290 [01:16<35:46,  7.67s/it][A
  4%|▍         | 11/290 [01:24<35:38,  7.66s/it][A
  4%|▍         | 12/290 [01:31<35:31,  7.67s/it][A
  4%|▍         | 13/290 [01:39<35:23,  7.67s/it][A
  5%|▍         | 14/290 [01:47<35:15,  7.67s/it][A
  5%|▌         | 15/290 [01:54<35:08,  7.67s/it][A
  6%|▌         | 16/290 [02:02<34:59,  7.66s/it][A
  6%|▌         | 17/290 [02:10<34:55,  7.68s/it][A
  6%|▌         | 18/290 [02:17<34:47,  7.68s/it][A
  7%|▋         | 19/290 [02:25<34:42,  7.68s/it][A
  7%|▋         | 20/

1.2314636707305908



  0%|          | 1/290 [00:07<36:15,  7.53s/it][A
  1%|          | 2/290 [00:15<36:20,  7.57s/it][A
  1%|          | 3/290 [00:22<36:23,  7.61s/it][A
  1%|▏         | 4/290 [00:30<36:21,  7.63s/it][A
  2%|▏         | 5/290 [00:38<36:17,  7.64s/it][A
  2%|▏         | 6/290 [00:45<36:15,  7.66s/it][A
  2%|▏         | 7/290 [00:53<36:08,  7.66s/it][A
  3%|▎         | 8/290 [01:01<36:02,  7.67s/it][A
  3%|▎         | 9/290 [01:08<35:55,  7.67s/it][A
  3%|▎         | 10/290 [01:16<35:47,  7.67s/it][A
  4%|▍         | 11/290 [01:24<35:39,  7.67s/it][A
  4%|▍         | 12/290 [01:31<35:33,  7.67s/it][A
  4%|▍         | 13/290 [01:39<35:25,  7.67s/it][A
  5%|▍         | 14/290 [01:47<35:17,  7.67s/it][A
  5%|▌         | 15/290 [01:55<35:10,  7.67s/it][A
  6%|▌         | 16/290 [02:02<35:01,  7.67s/it][A
  6%|▌         | 17/290 [02:10<34:56,  7.68s/it][A
  6%|▌         | 18/290 [02:18<34:49,  7.68s/it][A
  7%|▋         | 19/290 [02:25<34:42,  7.68s/it][A
  7%|▋         | 20/

1.1769545078277588





In [None]:
# Persist the trained models. The context managers make sure each file is
# flushed and closed before the next cell reads it back.
with open('/content/drive/My Drive/Assignment4/encoder_pretrained.pk','wb') as encoder_file:
    pickle.dump(encoder,encoder_file)
with open('/content/drive/My Drive/Assignment4/decoder_pretrained.pk','wb') as decoder_file:
    pickle.dump(decoder,decoder_file)

In [None]:
# Restore the trained models. Unpickling needs the EncoderCNN / DecoderRNN
# class definitions to be in scope. pickle can execute arbitrary code while
# loading, so only load files that you wrote yourself.
with open('/content/drive/My Drive/Assignment4/encoder_pretrained.pk','rb') as encoder_file:
    encoder = pickle.load(encoder_file)
with open('/content/drive/My Drive/Assignment4/decoder_pretrained.pk','rb') as decoder_file:
    decoder = pickle.load(decoder_file)

In [None]:
def test(inputs,stop_index,states=None, max_len=30):
    """Greedily decode a caption with the global ``decoder``.

    Starting from ``inputs`` the LSTM is stepped one token at a time; at each
    step the highest-scoring word is picked and its embedding becomes the next
    input. Decoding stops when ``stop_index`` is picked (it is not included in
    the result) or after ``max_len`` steps.

    Args:
        inputs: first LSTM input, i.e. the encoded image. The pick is
            converted with ``int(...)``, so a single sample with a single
            time step is expected.
        stop_index: vocabulary index that terminates decoding.
        states: optional initial LSTM state.
        max_len: maximum number of decoding steps.

    Returns:
        List of at most ``max_len`` predicted word indices (Python ints).
    """
    output_sentence = []
    decoder.lstm.flatten_parameters()
    # Inference only: do not record an autograd graph for the decoding steps.
    with torch.no_grad():
        for _ in range(max_len):
            lstm_outputs, states = decoder.lstm(inputs, states)
            lstm_outputs = lstm_outputs.squeeze(1)
            out = decoder.linear(lstm_outputs)
            # max over the vocabulary axis; [1] is the argmax index.
            last_pick = out.max(1)[1]
            if int(last_pick) == int(stop_index):
                break
            output_sentence.append(last_pick.item())
            inputs = decoder.embeding_layer(last_pick).unsqueeze(1)
    return output_sentence


In [None]:
# Accumulates one [image id, caption] row per test image.
test_file = []
# Parallel lists of vocabulary indices and words, used to map a predicted
# index back to its word.
val_list = [*vocab.vocab.values()]
key_list = [*vocab.vocab]

In [None]:
# Caption every test image and collect [image id, caption] rows in test_file.
test_images_dir = '/content/drive/My Drive/Assignment4/private_test_images/private_test_images/'
# Reverse mapping index -> word, built once instead of searching a list for
# every predicted token.
index_to_word = {index: word for word, index in vocab.vocab.items()}

for path,subdir,f in os.walk(test_images_dir):
    for im in tqdm(f):
        # Join with the directory os.walk is currently visiting so that files
        # inside sub-directories resolve correctly as well.
        image = transform(input_image_path=os.path.join(path, im))
        # Inference only: no autograd graph is needed for the encoder pass.
        with torch.no_grad():
            features = encoder(image.view(-1,3,224,224).to(device))
        features = torch.stack([features])
        out_test = test(features,vocab.vocab[vocab.end_word])
        # The first predicted token is skipped (the decoder is trained to emit
        # the start marker first). Slicing keeps this safe when decoding
        # returns zero or one token: the caption is then just ".".
        caption_words = [index_to_word[int(token)] for token in out_test[1:]]
        out_string = " ".join(caption_words) + "."
        # File names look like image_<id>.jpg; keep the numeric id.
        out_row = [int(im[6:-4]), out_string]
        test_file.append(out_row)

100%|██████████| 1000/1000 [09:27<00:00,  1.76it/s]


In [24]:
# Write one "<image id><TAB><caption>" line per test image. Using the tab as
# the csv delimiter (instead of pre-joining both values into a single field of
# a comma-delimited writer) keeps captions that contain a comma from being
# wrapped in quotes. newline='' is what the csv module expects for files it
# writes to.
with open('/content/drive/My Drive/Assignment4/2017CS10359_NA_private.tsv', 'w',encoding='utf8',newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter='\t')
    for row in test_file:
        csvwriter.writerow([row[0], row[1]])