In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt
import os


In [2]:
class ImgEncoder(nn.Module):

    def __init__(self, embed_size):
        super(ImgEncoder, self).__init__()
        model = models.vgg19(pretrained=True)
        in_features = model.classifier[-1].in_features # input size 
        # print("Size of the features ", in_features)
        # print("VGG input size ", model.classifier)
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1]) # remove output layer

        self.model = model # vgg19 without output layer
        self.fc = nn.Linear(in_features, embed_size)

    def forward(self, image):
        with torch.no_grad():
            img_feature = self.model(image)
        img_feature = self.fc(img_feature)
        l2_norm = img_feature.norm(p=2, dim=1, keepdim=True).detach()
        img_feature = img_feature.div(l2_norm) #1xn 1d vector

        # return the new encoding of the input image

        return img_feature

In [3]:
img_enc = ImgEncoder(1024)
img_test = np.random.rand(2,3, 32,32) * 255
print("Image shape ", img_test.shape)
img_test = torch.tensor(img_test).float()

img_feature = img_enc(img_test)


Image shape  (2, 3, 32, 32)


In [4]:
class QstEncoder(nn.Module):
    def __init__(self, vocab_size, featd, hidden_size, num_layers, out_size):
        super(QstEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, featd)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(featd, hidden_size, num_layers)
        self.fc = nn.Linear(2*num_layers*hidden_size, out_size)

    def forward(self, question):
        qst_vec = self.embedding(question) # [batchsize, max_qst_len=30, word_emb=300]
        # print("emb size ", qst_vec.shape)
        qst_vec = self.tanh(qst_vec) # -1, 1
        qst_vec = qst_vec.transpose(0, 1) #[max_qst_len=30, batchsize, word_emb=300]
        _, (hidden, cell) = self.lstm(qst_vec) #[num_layer=2, batchsize, hidden_size=512]
        qst_feature = torch.cat((hidden, cell), 2) # [num_layer=2, batchsize, 2*hiddensize=1024]
        qst_feature = qst_feature.transpose(0, 1) #[batchsize, num_layers, hiddensize]
        qst_feature = qst_feature.reshape(qst_feature.size()[0], -1) #[batch, featsize]
        qst_feature = self.tanh(qst_feature)
        qst_feature = self.fc(qst_feature)

        return qst_feature



In [5]:
qst = np.array([[1,2,3,4,5]])
qst = torch.tensor(qst).int()
qst_enc = QstEncoder(vocab_size=100, featd=128, hidden_size=512, num_layers=2, out_size=1024)
print("Model input size ", qst.shape)
qst_feat = qst_enc(qst)
print("Shape of question feat ", qst_feat.shape)

Model input size  torch.Size([1, 5])
Shape of question feat  torch.Size([1, 1024])


In [6]:
class VqaModel(nn.Module):
    def __init__(self,vocab_size, feat_dim, hidden_size, num_layers, out_size ):
        super(VqaModel, self).__init__()
        self.img_enc = ImgEncoder(feat_dim)
        self.qst_enc = QstEncoder(vocab_size, feat_dim, hidden_size, num_layers, hidden_size)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(feat_dim, out_size)
        self.out = nn.Linear(out_size, out_size)
        self.outsoft = nn.Softmax()

    def forward(self, img, qst):
        img_feat = self.img_enc(img)
        qst_feat = self.qst_enc(qst)
        combined_feat = torch.mul(img_feat, qst_feat)
        combined_feat = self.tanh(combined_feat)
        combined_feat = self.dropout(combined_feat)
        combined_feat = self.fc1(combined_feat)
        combined_feat = self.dropout(combined_feat)
        output_probs = self.out(combined_feat) #[batch_size, vocab_size]
        softmaxout = self.outsoft(output_probs)
        return output_probs, softmaxout

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
import numpy as np
import sys

np_load_old = np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True)

# Set the recursion limit to a higher value temporarily
sys.setrecursionlimit(50000)
# xtrain = np.load("COCO-2015/datasets/train_drive_April_27.npy")

In [9]:
from data_loader import get_loader
data_loader = get_loader(
    input_dir='./COCO-2015/datasets',
    input_vqa_train='train.npy',
    input_vqa_valid='valid.npy',
    max_qst_length=30,
    max_num_ans=10,
    batch_size=16,
    num_workers=1)
embed_size = 1204
word_embed_size = 300
num_layers = 2
hidden_size = 300
qst_vocab_size = data_loader['train'].dataset.qst_vocab.vocab_size
ans_vocab_size = data_loader['train'].dataset.ans_vocab.vocab_size

In [10]:
print(f'Vocab size Qst {qst_vocab_size}')
print(f'Vocab size Ans {ans_vocab_size}')

Vocab size Qst 17856
Vocab size Ans 1000


In [11]:
qamodel = VqaModel(vocab_size=qst_vocab_size, feat_dim=word_embed_size, hidden_size=hidden_size, num_layers=num_layers, out_size=ans_vocab_size).to(device)

In [12]:
#qamodel = torch.load('models/best_model.pt')

In [13]:
import torch.optim as optim
from torch.optim import lr_scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(qamodel.parameters(), lr=1e-3)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
early_stop_threshold = 3
best_loss = 99999
val_increase_count = 0
stop_training = False
prev_loss = 9999

In [14]:
TOTAL_EPOCHS = 50
batch_size = 16
running_loss = 0
for epoch in range(TOTAL_EPOCHS):
    for phase in ['train', 'valid']:
        running_loss, running_corr_exp1, running_corr_exp2 = 0., 0, 0
        batch_step_size = len(data_loader[phase].dataset)/batch_size
        if phase == 'train':
            scheduler.step()
            qamodel.train()
        else:
            qamodel.eval()
        for batch_idx, batch_sample in enumerate(data_loader[phase]):
            image = batch_sample['image'].to(device) # 128 x 64 x 64 x 3
            question = batch_sample['question'].to(device) # 128 x 30
            label = batch_sample['answer_label'].to(device)
            multi_choice = batch_sample['answer_multi_choice']
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'train'):
                output, soft_probs = qamodel(image, question) # [batch_size, ans_vocabsize=1000]
                loss = criterion(output, label)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            total = len(label)
            correct = (predicted == label).sum().item()
            acc = 100 * (correct/total)
            if batch_idx % 10 == 0:
                print('| {} SET | Epoch [{:02d}/{:02d}], Step [{:04d}/{:04d}], Loss: {:.4f}, Acc: {:.4f}'
                        .format(phase.upper(), epoch+1, TOTAL_EPOCHS, batch_idx, int(batch_step_size), loss.item(), acc), end = '\r')                
    epoch_loss = running_loss / batch_step_size

    print('| {} SET | Epoch [{:02d}/{:02d}], Loss: {:.4f} \n'.format(phase.upper(), epoch+1, TOTAL_EPOCHS, epoch_loss))

    # with open(os.path.join('logs', '{}-{}-log-epoch-{:02}.txt')
    #             .format('./models/', phase, epoch+1), 'w') as f:
    #     f.write(str(epoch+1) + '\t'
    #             + str(epoch_loss) + '\t')


    if phase == 'valid':
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(qamodel.state_dict(), os.path.join('modelsv2', 'best_coco_base_model.pt'))
        if epoch_loss > prev_loss:
            val_increase_count += 1
        else:
            val_increase_count = 0
        if val_increase_count >= early_stop_threshold:
            stop_training = True
        prev_loss = epoch_loss




| VALID SET | Epoch [01/50], Loss: 2.5864 3397], Loss: 1.8578, Acc: 37.5000

| VALID SET | Epoch [02/50], Loss: 2.3704 3397], Loss: 1.9585, Acc: 43.7500

| VALID SET | Epoch [03/50], Loss: 2.2695 3397], Loss: 1.4388, Acc: 31.2500

| VALID SET | Epoch [04/50], Loss: 2.2222 3397], Loss: 2.3398, Acc: 56.2500

| VALID SET | Epoch [05/50], Loss: 2.1989 3397], Loss: 2.5075, Acc: 37.5000

| VALID SET | Epoch [06/50], Loss: 2.1717 3397], Loss: 2.2345, Acc: 37.5000

| VALID SET | Epoch [07/50], Loss: 2.1597 3397], Loss: 2.6038, Acc: 37.5000

| VALID SET | Epoch [08/50], Loss: 2.1431 3397], Loss: 2.3243, Acc: 43.7500

| VALID SET | Epoch [09/50], Loss: 2.1448 3397], Loss: 2.2134, Acc: 31.2500

| VALID SET | Epoch [10/50], Loss: 2.0479 3397], Loss: 1.3444, Acc: 37.5000

| VALID SET | Epoch [11/50], Loss: 2.0347 3397], Loss: 2.1239, Acc: 50.0000

| VALID SET | Epoch [12/50], Loss: 2.0304 3397], Loss: 2.0790, Acc: 43.7500

| VALID SET | Epoch [13/50], Loss: 2.0262 3397], Loss: 2.1723, Acc: 56.2500
