In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import os
import pickle


# CNN Encoder
The encoder maps the input image to a 1D feature vector. The input image is of size 224×224. The network consists of three convolutional layers followed by one fully connected (linear) layer.

In [2]:
class EncoderCNN(nn.Module):
    """Convolutional encoder that turns a batch of images into 512-d feature vectors.

    Three 5x5 convolutions (3 -> 32 -> 64 -> 128 channels), each followed by
    ReLU and 3x3 max-pooling, feed a single fully connected layer. The input
    width of that layer is measured in the constructor by pushing one random
    224x224 image through the convolutional stack.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 5)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.conv3 = nn.Conv2d(64, 128, 5)

        # convs() records the flattened per-sample size here; probe once so
        # that fc1 can be created with the right number of inputs.
        self._to_linear = None
        probe = torch.randn(3, 224, 224).view(-1, 3, 224, 224)
        self.convs(probe)
        self.fc1 = nn.Linear(self._to_linear, 512)

    def convs(self, x):
        """Apply the conv/ReLU/max-pool stack and remember the flattened size of one sample."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.max_pool2d(F.relu(conv(x)), (3, 3))
        sample_shape = x[0].shape
        self._to_linear = sample_shape[0] * sample_shape[1] * sample_shape[2]
        return x

    def forward(self, x):
        """Return ReLU-activated 512-d features for each image in the batch."""
        feature_maps = self.convs(x)
        flattened = feature_maps.view(-1, self._to_linear)
        return F.relu(self.fc1(flattened))


encoder = EncoderCNN()

In [3]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is used as the first element of the LSTM input
    sequence, followed by the embedded caption tokens (all but the last one).
    Every LSTM output is projected onto the vocabulary.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=2):
        super().__init__()
        # The attribute name (spelling included) is referenced by the
        # inference code in this notebook, so it is kept as-is.
        self.embeding_layer = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Return vocabulary scores for every position of the caption sequence.

        features: one vector per sample; it is concatenated with the token
            embeddings along dim 1, so its width must equal ``embed_size``.
        captions: integer token ids; the last column is dropped before
            embedding, so the output has as many steps as ``captions`` has columns.
        """
        token_embeddings = self.embeding_layer(captions[:, :-1])
        image_step = features.unsqueeze(1)
        sequence = torch.cat((image_step, token_embeddings), dim=1)
        hidden_states, _ = self.lstm(sequence)
        return self.linear(hidden_states)


In [4]:
import nltk
nltk.download('punkt')
from collections import Counter
import numpy as np
from tqdm import tqdm
import csv
import string

class Vocabulary():
    """Word-to-index mapping for captions, built from a TSV file or loaded from a pickle cache.

    Attributes:
        vocab: dict mapping each kept word (plus the start/end markers) to an integer index.
        start_word: marker token '<start>'.
        end_word: marker token '<end>'.
    """

    def __init__(self,train_captions = '/content/drive/My Drive/Assignment4/train_captions.tsv',
                 word_count_threshold = 5, vocab_file = '/content/drive/My Drive/Assignment4/vocab.pk'):
        """Load the vocabulary from ``vocab_file`` if it exists, otherwise build and cache it.

        Args:
            train_captions: path to the tab-separated captions file.
            word_count_threshold: accepted for backward compatibility but not used.
                NOTE(review): the cutoff actually applied is ``load_vocab``'s
                ``min_count`` (10). Forwarding this argument would change the
                vocabulary (and therefore every token index) for default callers,
                so it is deliberately left unconnected -- confirm the intended value.
            vocab_file: path of the pickle cache to read or create.
        """
        self.vocab = None
        self.start_word = '<start>'
        self.end_word = '<end>'
        if os.path.exists(vocab_file):
            # NOTE(security): pickle.load can execute arbitrary code; only point
            # vocab_file at a cache this notebook wrote itself.
            with open(vocab_file,'rb') as cache:
                self.vocab = pickle.load(cache)
            print("loading from logs")
        else:
            # Only announce generation when the vocabulary is really being built.
            print("generating vocabulary")
            self.vocab = self.load_vocab(train_captions,vocab_file)

    def load_vocab(self,file_path,vocab_file,min_count = 10):
        """Build the vocabulary from a captions TSV and pickle it to ``vocab_file``.

        Args:
            file_path: tab-separated file; column 0 is skipped and captions are
                read from columns 1-5 of every row.
            vocab_file: destination of the pickled word -> index dict.
            min_count: a word is kept only if it occurs more than this many times.

        Returns:
            dict mapping each kept word and the start/end markers to its position
            in the sorted word list.
        """
        words = []
        with open(file_path) as file:
            rows = csv.reader(file, delimiter="\t")
            for row in tqdm(rows):
                # Slicing tolerates rows with fewer than five captions instead
                # of raising IndexError.
                for caption in row[1:6]:
                    tokens = nltk.tokenize.word_tokenize(caption)
                    # Substring test against the punctuation string; word2vec
                    # applies the same filter, so the two must stay in sync.
                    words.extend(x for x in tokens if x not in string.punctuation)
        words_count = Counter(words)
        words_thresholded = [word for word, count in words_count.items() if count > min_count]
        words_thresholded.append(self.start_word)
        words_thresholded.append(self.end_word)
        words_thresholded.sort()
        vocab = {word: i for i, word in enumerate(words_thresholded)}
        with open(vocab_file,'wb') as cache:
            pickle.dump(vocab,cache)
        return vocab

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def word2vec(vocab,caption,maxl):
    """Encode a caption string as a fixed-length tensor of vocabulary indices.

    The caption is tokenized, punctuation tokens are removed and the start/end
    markers are added around it. Tokens that are not in the vocabulary are
    dropped. The index list is then right-padded with the end-marker index
    until it holds ``maxl + 2`` entries and is moved to the global ``device``.
    """
    words = [tok for tok in nltk.tokenize.word_tokenize(caption)
             if tok not in string.punctuation]
    tokens = [vocab.start_word, *words, vocab.end_word]

    # Out-of-vocabulary tokens are skipped rather than mapped to a placeholder.
    out = [vocab.vocab[tok] for tok in tokens if tok in vocab.vocab]

    target_len = maxl + 2
    while len(out) < target_len:
        out.append(vocab.vocab[vocab.end_word])

    return torch.Tensor(out).to(device)

In [6]:
from PIL import Image

# Preprocessing pipeline, built once instead of on every call: resize the short
# side to 256, centre-crop 224x224, convert to a tensor and normalize with the
# per-channel mean/std constants below (the values commonly used for
# ImageNet-trained models).
_IMAGE_PREPROCESS = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


def transform(input_image_path):
    """Load an image file and return it as a normalized 3-channel 224x224 tensor.

    Args:
        input_image_path: path of the image file to open.

    Returns:
        The tensor produced by the resize / centre-crop / to-tensor / normalize pipeline.
    """
    # The context manager closes the underlying file as soon as the pixels are read.
    with Image.open(input_image_path) as image:
        # Force three channels: grayscale, RGBA or palette files would otherwise
        # give a tensor whose channel count does not match the 3-value mean/std
        # (nor the encoder's 3-channel first convolution).
        rgb_image = image.convert('RGB')
    return _IMAGE_PREPROCESS(rgb_image)

In [7]:
import csv
def load_captions(captions_path = '/content/drive/My Drive/Assignment4/train_captions.tsv'):
    """Read the tab-separated captions file.

    Args:
        captions_path: path to the TSV file; each row holds a key in column 0
            followed by that key's captions.

    Returns:
        (indices, captions): ``indices`` is a NumPy array of the column-0 values
        in file order; ``captions`` maps each column-0 value to the list of the
        remaining columns of its row.
    """
    captions = {}
    indices = []
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(captions_path, newline='') as file:
        for row in tqdm(csv.reader(file, delimiter="\t")):
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise IndexError.
                continue
            captions[row[0]] = row[1:]
            indices.append(row[0])
    return np.array(indices), captions

In [8]:
# Hyper-parameters.
embedding_size = hidden_size = 512
num_epochs = 5
batch_size = 100

# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The vocabulary size fixes the width of the decoder's output layer.
vocab = Vocabulary()
vocab_size = len(vocab.vocab)

# Fresh (untrained) networks and the training captions.
encoder = EncoderCNN()
decoder = DecoderRNN(embedding_size, hidden_size, vocab_size)
indices, captions = load_captions()

# Move both networks to the chosen device; the last expression is displayed by the cell.
encoder.to(device)
decoder.to(device)

0it [00:00, ?it/s]

loading from logs
generating vocabulary


29000it [00:02, 13016.42it/s]


DecoderRNN(
  (embeding_layer): Embedding(5217, 512)
  (lstm): LSTM(512, 512, num_layers=2, batch_first=True)
  (linear): Linear(in_features=512, out_features=5217, bias=True)
)

In [None]:
# Cross-entropy over the vocabulary scores, placed on the GPU when one is present.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    criterion = criterion.cuda()

# One Adam optimizer updates the decoder and encoder weights together.
params = [*decoder.parameters(), *encoder.parameters()]
optimizer = torch.optim.Adam(params=params, lr=0.002)

In [None]:
# Only full batches are used; a trailing partial batch is skipped.
num_batches = len(indices) // batch_size

for i in tqdm(range(num_epochs)):
    for k in tqdm(range(num_batches)):
        train_indices = indices[k*batch_size:(k+1)*batch_size]

        # Load and preprocess this batch's images once; they are reused for
        # each of the five caption passes below.
        images = torch.stack([
            transform(input_image_path='/content/drive/My Drive/Assignment4/train_images/train_images/image_'+image+'.jpg')
            for image in train_indices
        ]).to(device)

        # Every image comes with five captions; take one optimization step per caption set.
        for j in range(5):
            batch_captions = [captions[index][j] for index in train_indices]

            # Longest raw token count in the batch; word2vec pads every caption to maxl + 2.
            maxl = max((len(nltk.tokenize.word_tokenize(caption)) for caption in batch_captions), default=0)
            word_vec = torch.stack([word2vec(vocab,caption,maxl).long() for caption in batch_captions])
            # .to() returns the moved tensor rather than moving in place, so keep the result.
            word_vec = word_vec.to(device)

            # Clear the gradients of *both* networks: the optimizer owns encoder
            # and decoder parameters, and zeroing only the decoder would let the
            # encoder's gradients accumulate from step to step.
            optimizer.zero_grad()

            features = encoder(images)
            outputs = decoder(features, word_vec)

            # The output at step t is scored against the caption token at step t.
            loss = criterion(outputs.view(-1, vocab_size), word_vec.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

    # Loss of the last step of the epoch.
    print(loss.item())
        

In [None]:
# Persist the trained modules as whole-object pickles. The `with` blocks make
# sure each file is flushed and closed before the notebook moves on.
with open('/content/drive/My Drive/Assignment4/encoder.pk','wb') as encoder_file:
    pickle.dump(encoder,encoder_file)
with open('/content/drive/My Drive/Assignment4/decoder.pk','wb') as decoder_file:
    pickle.dump(decoder,decoder_file)

In [13]:
# Restore the trained modules from their pickles.
# NOTE(security): pickle.load can execute arbitrary code, so only load files
# that this notebook produced itself. Unpickling looks the classes up by name,
# so EncoderCNN and DecoderRNN must already be defined in the session.
with open('/content/drive/My Drive/Assignment4/encoder.pk','rb') as encoder_file:
    encoder = pickle.load(encoder_file)
with open('/content/drive/My Drive/Assignment4/decoder.pk','rb') as decoder_file:
    decoder = pickle.load(decoder_file)

In [11]:
def test(inputs,stop_index,states=None, max_len=30):
    """Greedily decode a caption from an encoded image using the global ``decoder``.

    Args:
        inputs: first LSTM input (the encoded image). The callers in this
            notebook pass a tensor of shape (1, 1, feature_dim).
        stop_index: vocabulary index that ends decoding; it is not included in
            the result.
        states: optional initial LSTM state; None lets the LSTM start from its default.
        max_len: maximum number of decoding steps.

    Returns:
        List of predicted token ids (Python ints), at most ``max_len`` long.
    """
    output_sentence = []
    decoder.lstm.flatten_parameters()
    # Inference only: disable autograd so no graph is built or retained across
    # the decoding steps.
    with torch.no_grad():
        for _ in range(max_len):
            lstm_outputs, states = decoder.lstm(inputs, states)
            out = decoder.linear(lstm_outputs.squeeze(1))
            # Index of the highest-scoring word for this step.
            last_pick = out.max(1)[1]
            # .item() needs exactly one element, i.e. a batch of a single image.
            token_id = last_pick.item()
            if token_id == int(stop_index):
                break
            output_sentence.append(token_id)
            # Feed the chosen word back in as the next one-step input sequence.
            inputs = decoder.embeding_layer(last_pick).unsqueeze(1)
    return output_sentence

In [10]:
# Parallel lists of vocabulary words and their indices (same dict order), used
# to map predicted indices back to words.
key_list = [word for word in vocab.vocab]
val_list = [index for index in vocab.vocab.values()]
# Collects one [image number, caption] row per test image.
test_file = list()

In [14]:
# Reverse lookup built once (vocabulary index -> word) instead of a linear
# list search for every predicted token.
index_to_word = {index: word for word, index in vocab.vocab.items()}

for path,subdir,filenames in os.walk('/content/drive/My Drive/Assignment4/private_test_images/private_test_images/'):
    for im in tqdm(filenames):
        # Join with the directory os.walk is currently visiting so that files
        # inside sub-folders resolve correctly as well.
        image = transform(input_image_path=os.path.join(path, im))
        # Inference only: no autograd graph is needed for the encoder pass.
        with torch.no_grad():
            features = encoder(image.view(-1,3,224,224).to(device))
        # (1, feat) -> (1, 1, feat): a one-step sequence for the batch-first LSTM.
        features = torch.stack([features])
        out_test = test(features,vocab.vocab[vocab.end_word])
        # Position 0 is not emitted (presumably the start marker -- TODO confirm).
        # Slicing also copes with an empty or single-token prediction, which
        # then yields just ".".
        words = [index_to_word[int(token)] for token in out_test[1:]]
        out_string = " ".join(words) + "."
        # The image number is the file name without its first 6 and last 4
        # characters (e.g. an "image_" prefix and a ".jpg" suffix).
        test_file.append([int(im[6:-4]), out_string])

100%|██████████| 1000/1000 [08:04<00:00,  2.06it/s]


In [16]:
# Write one "<image number><TAB><caption>" line per test image.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to csv.writer.
with open('/content/drive/My Drive/Assignment4/2017CS10359_NA_private.tsv', 'w',encoding='utf8',newline='') as csvfile:
    # Use a real tab delimiter with two fields. Packing "id\tcaption" into a
    # single comma-delimited field gets the whole line wrapped in quotes as soon
    # as a caption contains a comma or a double quote.
    csvwriter = csv.writer(csvfile, delimiter='\t')
    for row in test_file:
        csvwriter.writerow([row[0], row[1]])

In [17]:
# Start a fresh result list for the public test set.
test_file = []

# Reverse lookup built once (vocabulary index -> word) instead of a linear
# list search for every predicted token.
index_to_word = {index: word for word, index in vocab.vocab.items()}

for path,subdir,filenames in os.walk('/content/drive/My Drive/Assignment4/public_test_images/public_test_images/'):
    for im in tqdm(filenames):
        # Join with the directory os.walk is currently visiting so that files
        # inside sub-folders resolve correctly as well.
        image = transform(input_image_path=os.path.join(path, im))
        # Inference only: no autograd graph is needed for the encoder pass.
        with torch.no_grad():
            features = encoder(image.view(-1,3,224,224).to(device))
        # (1, feat) -> (1, 1, feat): a one-step sequence for the batch-first LSTM.
        features = torch.stack([features])
        out_test = test(features,vocab.vocab[vocab.end_word])
        # Position 0 is not emitted (presumably the start marker -- TODO confirm).
        # Slicing also copes with an empty or single-token prediction, which
        # then yields just ".".
        words = [index_to_word[int(token)] for token in out_test[1:]]
        out_string = " ".join(words) + "."
        # The image number is the file name without its first 6 and last 4
        # characters (e.g. an "image_" prefix and a ".jpg" suffix).
        test_file.append([int(im[6:-4]), out_string])

# Write one "<image number><TAB><caption>" line per test image.
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to csv.writer.
with open('/content/drive/My Drive/Assignment4/2017CS10359_NA_public.tsv', 'w',encoding='utf8',newline='') as csvfile:
    # Use a real tab delimiter with two fields. Packing "id\tcaption" into a
    # single comma-delimited field gets the whole line wrapped in quotes as soon
    # as a caption contains a comma or a double quote.
    csvwriter = csv.writer(csvfile, delimiter='\t')
    for row in test_file:
        csvwriter.writerow([row[0], row[1]])

100%|██████████| 1014/1014 [08:06<00:00,  2.08it/s]
