In [9]:
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt

In [10]:
class ImgEncoder(nn.Module):

    def __init__(self, embed_size):
        super(ImgEncoder, self).__init__()
        model = models.vgg19(pretrained=True)
        in_features = model.classifier[-1].in_features # input size 
        # print("Size of the features ", in_features)
        # print("VGG input size ", model.classifier)
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1]) # remove output layer

        self.model = model # vgg19 without output layer
        self.fc = nn.Linear(in_features, embed_size)

    def forward(self, image):
        with torch.no_grad():
            img_feature = self.model(image)
        img_feature = self.fc(img_feature)
        l2_norm = img_feature.norm(p=2, dim=1, keepdim=True).detach()
        img_feature = img_feature.div(l2_norm) #1xn 1d vector

        # return the new encoding of the input image

        return img_feature

In [11]:
class QstEncoder(nn.Module):
    def __init__(self, vocab_size, featd, hidden_size, num_layers, out_size):
        super(QstEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, featd)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(featd, hidden_size, num_layers)
        self.fc = nn.Linear(2*num_layers*hidden_size, out_size)

    def forward(self, question):
        qst_vec = self.embedding(question) # [batchsize, max_qst_len=30, word_emb=300]
        # print("emb size ", qst_vec.shape)
        qst_vec = self.tanh(qst_vec) # -1, 1
        qst_vec = qst_vec.transpose(0, 1) #[max_qst_len=30, batchsize, word_emb=300]
        _, (hidden, cell) = self.lstm(qst_vec) #[num_layer=2, batchsize, hidden_size=512]
        qst_feature = torch.cat((hidden, cell), 2) # [num_layer=2, batchsize, 2*hiddensize=1024]
        qst_feature = qst_feature.transpose(0, 1) #[batchsize, num_layers, hiddensize]
        qst_feature = qst_feature.reshape(qst_feature.size()[0], -1) #[batch, featsize]
        qst_feature = self.tanh(qst_feature)
        qst_feature = self.fc(qst_feature)

        return qst_feature



In [12]:
class VqaModel(nn.Module):
    def __init__(self,vocab_size, feat_dim, hidden_size, num_layers, out_size ):
        super(VqaModel, self).__init__()
        self.img_enc = ImgEncoder(feat_dim)
        self.qst_enc = QstEncoder(vocab_size, feat_dim, hidden_size, num_layers, hidden_size)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(feat_dim, out_size)
        self.out = nn.Linear(out_size, out_size)
        self.outsoft = nn.Softmax()

    def forward(self, img, qst):
        img_feat = self.img_enc(img)
        qst_feat = self.qst_enc(qst)
        combined_feat = torch.mul(img_feat, qst_feat)
        combined_feat = self.tanh(combined_feat)
        combined_feat = self.dropout(combined_feat)
        combined_feat = self.fc1(combined_feat)
        combined_feat = self.dropout(combined_feat)
        output_probs = self.out(combined_feat) #[batch_size, vocab_size]
        softmaxout = self.outsoft(output_probs)
        return output_probs, softmaxout

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
qst_vocab_size = 17856
ans_vocab_size = 1000

embed_size = 1204
word_embed_size = 300
num_layers = 2
hidden_size = 300
qamodel = VqaModel(vocab_size=qst_vocab_size, feat_dim=word_embed_size, hidden_size=hidden_size, num_layers=num_layers, out_size=ans_vocab_size).to(device)

In [14]:
qamodel.load_state_dict(torch.load('modelsv2/best_coco_base_model.pt'))

<All keys matched successfully>

In [15]:
from data_loader import get_loader
import numpy as np
import sys

np_load_old = np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True)

# Set the recursion limit to a higher value temporarily
sys.setrecursionlimit(50000)
# xtrain = np.load("COCO-2015/datasets/train_drive_April_27.npy")
data_loader = get_loader(
    input_dir='./COCO-2015/datasets',
    input_vqa_train='train_drive_April_28.npy',
    input_vqa_valid='train_drive_April_28.npy',
    max_qst_length=30,
    max_num_ans=10,
    batch_size=16,
    num_workers=1)
embed_size = 1204
word_embed_size = 300
num_layers = 2
hidden_size = 300
qst_vocab_size = data_loader['train'].dataset.qst_vocab.vocab_size
ans_vocab_size = data_loader['train'].dataset.ans_vocab.vocab_size

In [16]:
print(f'Vocab size Qst {qst_vocab_size}')
print(f'Vocab size Ans {ans_vocab_size}')

Vocab size Qst 17856
Vocab size Ans 1000


In [17]:
import torch.optim as optim
from torch.optim import lr_scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(qamodel.parameters(), lr=1e-3)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
early_stop_threshold = 3
best_loss = 99999
val_increase_count = 0
stop_training = False
prev_loss = 9999

In [18]:
import os
TOTAL_EPOCHS = 10
batch_size = 16
running_loss = 0
for epoch in range(TOTAL_EPOCHS):
    for phase in ['train', 'valid']:
        running_loss, running_corr_exp1, running_corr_exp2 = 0., 0, 0
        batch_step_size = len(data_loader[phase].dataset)/batch_size
        if phase == 'train':
            scheduler.step()
            qamodel.train()
        else:
            qamodel.eval()
        for batch_idx, batch_sample in enumerate(data_loader[phase]):
            image = batch_sample['image'].to(device) # 128 x 64 x 64 x 3
            question = batch_sample['question'].to(device) # 128 x 30
            label = batch_sample['answer_label'].to(device)
            multi_choice = batch_sample['answer_multi_choice']
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'train'):
                output, soft_probs = qamodel(image, question) # [batch_size, ans_vocabsize=1000]
                loss = criterion(output, label)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
            running_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            total = len(label)
            correct = (predicted == label).sum().item()
            acc = 100 * (correct/total)
            if batch_idx % 10 == 0:
                print('| {} SET | Epoch [{:02d}/{:02d}], Step [{:04d}/{:04d}], Loss: {:.4f}, Acc: {:.4f}'
                        .format(phase.upper(), epoch+1, TOTAL_EPOCHS, batch_idx, int(batch_step_size), loss.item(), acc), end = '\r')                
    epoch_loss = running_loss / batch_step_size

    print('| {} SET | Epoch [{:02d}/{:02d}], Loss: {:.4f} \n'.format(phase.upper(), epoch+1, TOTAL_EPOCHS, epoch_loss))

    # with open(os.path.join('logs', '{}-{}-log-epoch-{:02}.txt')
    #             .format('./models/', phase, epoch+1), 'w') as f:
    #     f.write(str(epoch+1) + '\t'
    #             + str(epoch_loss) + '\t')


    if phase == 'valid':
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(qamodel.state_dict(), os.path.join('modelsv2', 'best_drive_model_apr28.pt'))
        if epoch_loss > prev_loss:
            val_increase_count += 1
        else:
            val_increase_count = 0
        if val_increase_count >= early_stop_threshold:
            stop_training = True
        prev_loss = epoch_loss




| VALID SET | Epoch [01/10], Loss: 1.1640 15], Loss: 1.2065, Acc: 87.5000

| VALID SET | Epoch [02/10], Loss: 0.3429 15], Loss: 0.4085, Acc: 87.5000

| VALID SET | Epoch [03/10], Loss: 0.1894 15], Loss: 0.1942, Acc: 93.7500

| VALID SET | Epoch [04/10], Loss: 0.1699 15], Loss: 0.1761, Acc: 87.50000

| VALID SET | Epoch [05/10], Loss: 0.1358 15], Loss: 0.0755, Acc: 93.7500

| VALID SET | Epoch [06/10], Loss: 0.0812 15], Loss: 0.0798, Acc: 100.0000

| VALID SET | Epoch [07/10], Loss: 0.0621 15], Loss: 0.0595, Acc: 100.0000

| VALID SET | Epoch [08/10], Loss: 0.0452 15], Loss: 0.0425, Acc: 100.0000

| VALID SET | Epoch [09/10], Loss: 0.0359 15], Loss: 0.0214, Acc: 100.0000

| VALID SET | Epoch [10/10], Loss: 0.0372 15], Loss: 0.0166, Acc: 100.0000

