In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import torchvision.io as io
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import random
from sentence_transformers import SentenceTransformer
import json
import base64
import numpy as np
import csv
import sys
import cv2
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import zlib
import time
import pickle
import mmap
import warnings
warnings.filterwarnings("ignore")

In [56]:
# Loading the data required
file1 = open('q_data.pkl', 'rb')
file2 = open('fg_data.pkl', 'rb')
file3 = open('bg_data.pkl', 'rb')

q_data = pickle.load(file1)
fg_data = pickle.load(file2)
bg_data = pickle.load(file3)

file1.close()
file2.close()
file3.close()

In [96]:
count = {}
for i in range(16):
    count[i] = 0
for i in q_data:
    count[i['answer']] += 1

In [99]:
total = 0
for i in count:
    total += count[i]

In [101]:
for i in count:
    count[i] /= total

In [103]:
for i in count:
    print(count[i])

0.27944702228582813
0.1614966397899039
0.28181502983984597
0.11429878206215835
0.06407636639937443
0.03292341966847894
0.021607146809091378
0.013012976091238372
0.010394151537729516
0.006218786193261875
0.00503109393096631
0.0034597991988609958
0.0030171809644651327
0.0015934256438251068
0.0012688389386014739
0.00033934064637016165


In [4]:
# # Extracting image features for each image

# csv.field_size_limit(sys.maxsize)
   
# FIELDNAMES = ['image_id', 'image_w','image_h','num_boxes', 'boxes', 'features']
# infile = 'trainval_resnet101_faster_rcnn_genome_36.tsv'

# # Verify we can read a tsv
# in_data = {}
# c = 0
# with open(infile, "r") as tsv_in_file:
#     reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames = FIELDNAMES)
#     for item in reader:
#         c += 1
#         print(c)
#         item['image_id'] = int(item['image_id'])
#         item['image_h'] = int(item['image_h'])
#         item['image_w'] = int(item['image_w'])   
#         item['num_boxes'] = int(item['num_boxes'])
#         for field in ['boxes', 'features']:
#             item[field] = np.frombuffer(base64.b64decode(item[field]), 
#                   dtype=np.float32).reshape((item['num_boxes'],-1))
#         in_data[item['image_id']] = item

In [5]:
# # Defining model for extracting background features

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# resnet = models.resnet152(pretrained = True)
# newmodel = torch.nn.Sequential(*(list(resnet.children())[:-2]))
# net = nn.Conv2d(2048, 2048, kernel_size = 4, stride = (1, 1), bias = False)
# resnet_bg = nn.Sequential(newmodel, net).to(device)

In [6]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# sentence_embeddings = sbert_model.encode("How many dogs are present in this array?")

In [85]:
# Main architecture of the model 

class tallyQA(nn.Module):
    def __init__(self):
        super(tallyQA, self).__init__()
        I_CNN = 2048
        Q_GRU_out = 768
        Q_embedding = 300
        LINsize = 1024
        Boxcoords = 16
        self.Ncls = 16
        
        layers_g1 = [ nn.Linear( 2*I_CNN + Q_GRU_out, LINsize),
                       nn.ReLU(inplace=True),
                       nn.Dropout(0.5),
                       nn.Linear( LINsize, LINsize),
                       nn.ReLU(inplace=True),
                       nn.Dropout(0.5),
                       nn.Linear(LINsize,LINsize),
                       nn.ReLU(inplace=True)]

        self.g1 = nn.Sequential(*layers_g1)
        self.g2 = nn.Sequential(*layers_g1)

        D = LINsize//2
        
        layers_f = [nn.Linear(LINsize, LINsize),
                    nn.ReLU(inplace = True),
                    nn.Linear(LINsize, LINsize)]
        
        layers_f1 = [nn.Linear(2 * LINsize,D),
                      nn.ReLU(inplace=True),
                      nn.Dropout(0.5),
                      nn.Linear(D,self.Ncls)]

        self.f1 = nn.Sequential(*layers_f)
        self.f2 = nn.Sequential(*layers_f)
        self.gamma = nn.Sequential(*layers_f1)
    
    def forward(self, foreground, background, sentence_emb):
        sum1 = torch.zeros(foreground.shape[0], 1024).to(device)
        sum2 = torch.zeros(foreground.shape[0], 1024).to(device)
        for i in range(foreground.shape[1]):
            for j in range(foreground.shape[1]):
                if i != j:
                    f = torch.cat((foreground[:, i, :], foreground[:, j, :], sentence_emb), 1)
                    sum1.add(self.g1(f))

        for i in range(background.shape[1]):
            for j in range(foreground.shape[1]):
                t = torch.cat((foreground[:, j, :], background[:, i, :], sentence_emb), 1)
                sum2.add(self.g2(t))
        
        sum1 = self.f1(sum1)
        sum2 = self.f2(sum2)
        
        return self.gamma(torch.cat((sum1, sum2), 1))

In [8]:
# # Visualising bouding boxes

# images = glob.glob('train2014/*')

# i = 10
# l = 31
# iid = int(images[i].split('_')[-1].split('.')[0])
# box = in_data[iid]['boxes'][l]

# boxes = [int(s) for s in box]
# k = cv2.cvtColor(cv2.imread(images[i]), cv2.COLOR_BGR2RGB)

# a = cv2.rectangle(k,(boxes[0],boxes[1]),(boxes[2], boxes[3]),(0,255,0),2)
# plt.imshow(a)

In [86]:
class TallyDataset(Dataset):
    
    def __init__(self, foreground_data, question_data, background_data):
        self.fg_data = foreground_data
        self.q_data = question_data
        self.bg_data = background_data
        
    def __len__(self):
        return len(self.q_data)
        
    def __getitem__(self, idx):
        qe = torch.from_numpy(sbert_model.encode(self.q_data[idx]['question']))
        iid = self.q_data[idx]['image_id']
        ff = torch.from_numpy(self.fg_data[iid]['features'])
        bf = self.bg_data[iid]
        label = self.q_data[idx]['answer']
        return ff, bf, qe, label

In [87]:
dataset = TallyDataset(fg_data, new_q_data, bg_data)

In [88]:
n = len(dataset)
l = list(range(n))
random.shuffle(l)

train_indices = l[:int(n * 0.7)]
val_indices = l[int(n * 0.7) : int(n * 0.85)]
test_indices = l[int(n * 0.85):]

In [89]:
batch_size = 16

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)

In [90]:
# criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCEWithLogitsLoss()
# criterion = FocalLoss()
model = tallyQA()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
model.to("cuda")

tallyQA(
  (g1): Sequential(
    (0): Linear(in_features=4864, out_features=1024, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=1024, out_features=1024, bias=True)
    (4): ReLU(inplace=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU(inplace=True)
  )
  (g2): Sequential(
    (0): Linear(in_features=4864, out_features=1024, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=1024, out_features=1024, bias=True)
    (4): ReLU(inplace=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): ReLU(inplace=True)
  )
  (f1): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (f2): Sequential(
    (0): Linear(in_features=1024, o

In [91]:
# new_q_data = []
# count = {}

# for i in range(16):
#     count[i] = 0

# for i in q_data:
#     if count[i['answer']] > 500:
#         continue
#     else:
#         new_q_data.append(i)
#         count[i['answer']] += 1

In [92]:
# weights = [0.27944702228582813, 0.1614966397899039, 0.28181502983984597, 0.11429878206215835, 0.06407636639937443, 0.03292341966847894, 0.021607146809091378, 0.013012976091238372, 0.010394151537729516, 0.006218786193261875, 0.00503109393096631, 0.0034597991988609958, 0.0030171809644651327, 0.0015934256438251068, 0.0012688389386014739, 0.00033934064637016165]
# weights = torch.from_numpy(np.array(weights)).to(device)

In [None]:
from tqdm import tqdm
epochs = 20

history = []
total_loss_l = []
device = "cuda"

for k in tqdm(range(1, epochs + 1)):
    total_loss = 0
    dev_total_loss = 0  
    for idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        target = model(batch[0].to(device), batch[1].to(device), batch[2].to(device))
        print('Answer : ', batch[3])
#         ans = torch.from_numpy(np.array(batch[3]))
        ans = F.one_hot(torch.from_numpy(np.array(batch[3])), num_classes = 16)
        print('Prediction : ', torch.argmax(target, dim = 1))
        loss = criterion(target, ans.float().to(device))
        history.append(loss.item())
        total_loss += loss.item()
        print("Loss : {}, Batch Number : {}".format(loss.item() / batch_size, idx))
        loss.backward()
        optimizer.step()
    print(total_loss)
    total_loss_l.append(total_loss)

  0%|          | 0/20 [00:00<?, ?it/s]

Answer :  tensor([11, 12,  5,  9,  2,  1,  5,  1, 10, 10,  1,  1,  2,  8,  3,  3])
Prediction :  tensor([14, 14, 14, 14, 14, 14, 14,  5, 14,  7,  5, 14, 14,  5, 14, 14],
       device='cuda:0')
Loss : 0.042870089411735535, Batch Number : 0
Answer :  tensor([10,  0, 11,  9,  4,  2,  8,  2, 12,  9, 12,  7,  1, 10,  1,  6])
Prediction :  tensor([10,  5,  2, 11,  2,  7, 11,  2, 14,  5,  2,  7,  5, 11, 15,  5],
       device='cuda:0')
Loss : 0.04158645123243332, Batch Number : 1
Answer :  tensor([ 1,  4, 11, 11,  6,  6,  8,  1,  7, 10,  9,  2,  8,  8, 12,  0])
Prediction :  tensor([11, 11,  5, 11,  1, 14, 11, 10, 11,  5, 11, 11,  5, 11,  5,  2],
       device='cuda:0')
Loss : 0.0399036630988121, Batch Number : 2
Answer :  tensor([11,  0,  9,  9,  0,  5, 12,  5,  4,  0,  0,  4,  4, 10, 13,  6])
Prediction :  tensor([15, 11,  2,  5, 11,  1, 11, 10, 11, 11, 11, 11, 11,  4, 11, 10],
       device='cuda:0')
Loss : 0.03710630536079407, Batch Number : 3
Answer :  tensor([ 7,  2,  1,  3, 11, 13,  9

Answer :  tensor([ 4,  2,  2, 13,  4,  6,  5,  2, 11,  9,  8,  4, 10, 12,  8,  7])
Prediction :  tensor([ 4,  1,  4,  4,  3,  4, 11,  1,  4, 11,  4,  7,  7,  7,  4, 10],
       device='cuda:0')
Loss : 0.015134683810174465, Batch Number : 35
Answer :  tensor([ 8,  0,  5, 10,  9, 12,  0,  8, 12,  8,  7,  2,  8,  9, 10,  3])
Prediction :  tensor([ 4,  2,  6, 11,  2,  6, 12,  4,  4,  0,  2,  7,  4, 11,  4,  4],
       device='cuda:0')
Loss : 0.01668776012957096, Batch Number : 36
Answer :  tensor([11,  7,  2,  0,  7,  4, 10, 12, 12,  8,  9,  5,  6,  1, 13, 14])
Prediction :  tensor([4, 0, 6, 6, 6, 2, 4, 4, 2, 2, 9, 4, 4, 0, 4, 4], device='cuda:0')
Loss : 0.01608162745833397, Batch Number : 37
Answer :  tensor([ 4,  2,  0,  2, 13, 10, 12,  6,  5, 12, 11,  3,  2, 11, 15,  0])
Prediction :  tensor([2, 4, 2, 4, 6, 2, 6, 4, 4, 0, 2, 4, 4, 0, 4, 4], device='cuda:0')
Loss : 0.015234868973493576, Batch Number : 38
Answer :  tensor([ 0, 14,  8,  5,  4,  0,  6,  2,  8,  6,  1,  1, 12,  7,  2,  0])
P

Answer :  tensor([13,  9,  6,  6,  4,  4,  2, 10,  3,  5,  6,  6,  4,  4, 10, 11])
Prediction :  tensor([11,  0,  5,  1,  3,  8,  4,  1,  6, 11,  1,  4, 11,  8,  5,  8],
       device='cuda:0')
Loss : 0.01509128138422966, Batch Number : 70
Answer :  tensor([12,  4,  3,  0, 11,  9, 13,  4, 10,  6,  6,  1,  2,  3,  4,  6])
Prediction :  tensor([11,  8,  8,  4,  1,  8,  0,  8,  8,  8,  3,  4,  4,  1,  0,  8],
       device='cuda:0')
Loss : 0.014816218987107277, Batch Number : 71
Answer :  tensor([12,  2,  2,  9,  1, 12, 10,  6,  1, 14,  0, 14,  3, 11,  3,  3])
Prediction :  tensor([11,  8,  4,  4,  6,  1,  4,  6,  0, 11,  4,  1,  5,  8,  4,  0],
       device='cuda:0')
Loss : 0.014547810889780521, Batch Number : 72
Answer :  tensor([ 6,  1,  7,  0,  3, 10,  4,  6,  8,  2,  6,  9,  6,  4,  0,  0])
Prediction :  tensor([ 0,  1,  4,  8,  8,  4,  4,  3,  6,  4,  4, 11,  4,  0,  1,  1],
       device='cuda:0')
Loss : 0.014352530241012573, Batch Number : 73
Answer :  tensor([ 3, 10,  4,  6,  4,

Answer :  tensor([ 7, 12,  9, 10,  2, 11, 10, 10, 11, 13,  3,  6, 13,  4,  7,  1])
Prediction :  tensor([ 7,  1, 11, 11, 11,  1, 11, 11,  5,  7,  8, 11, 11, 11, 11,  8],
       device='cuda:0')
Loss : 0.014789354056119919, Batch Number : 105
Answer :  tensor([ 3,  5,  0,  1, 12,  4,  4,  0,  2, 14,  7,  0,  8, 14,  4,  4])
Prediction :  tensor([ 7, 11, 11, 11, 11, 11, 11, 11,  1, 10,  8,  5,  7, 11,  7,  1],
       device='cuda:0')
Loss : 0.01521175354719162, Batch Number : 106
Answer :  tensor([ 0,  2,  4,  6,  2,  6,  3,  3,  8,  7, 11, 13, 11,  9,  5,  0])
Prediction :  tensor([ 5,  7,  9, 11,  8,  5,  8,  7, 11,  7,  5, 12, 11, 11, 11,  7],
       device='cuda:0')
Loss : 0.014965638518333435, Batch Number : 107
Answer :  tensor([ 6,  0,  3, 13,  3,  3, 11,  4, 10,  2,  0,  2,  2,  1,  8,  0])
Prediction :  tensor([11, 11, 11,  7, 11, 11, 11, 11,  7,  7,  1,  7, 11,  8,  8,  9],
       device='cuda:0')
Loss : 0.0149496179074049, Batch Number : 108
Answer :  tensor([ 3,  5,  3,  6, 1

Answer :  tensor([15,  3,  7, 11,  2,  6, 10,  8, 10,  6,  4, 12, 10,  8,  8,  8])
Prediction :  tensor([6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 5, 3], device='cuda:0')
Loss : 0.015189919620752335, Batch Number : 140
Answer :  tensor([13, 10, 10,  7, 10, 12,  4,  3,  6, 14,  8,  7,  4,  6, 10,  0])
Prediction :  tensor([ 3,  9,  3,  3,  3,  6,  0,  3,  7,  3,  3,  3,  3,  0, 10,  4],
       device='cuda:0')
Loss : 0.014700943604111671, Batch Number : 141
Answer :  tensor([ 6, 10,  7, 10,  6,  6, 10,  6,  5, 12,  2,  8, 12, 10,  0,  7])
Prediction :  tensor([3, 3, 6, 4, 4, 6, 3, 8, 3, 3, 3, 1, 5, 3, 0, 5], device='cuda:0')
Loss : 0.014334278181195259, Batch Number : 142
Answer :  tensor([ 2,  2,  7,  2,  4,  7,  8,  6, 10,  3,  5, 12,  2,  1,  0,  2])
Prediction :  tensor([4, 4, 4, 3, 3, 6, 0, 4, 0, 7, 7, 3, 1, 3, 4, 3], device='cuda:0')
Loss : 0.015005810186266899, Batch Number : 143
Answer :  tensor([ 3,  3, 13,  5, 11, 11,  6,  0,  8, 13,  0,  8,  1, 11,  3,  4])
Prediction :  tens

Answer :  tensor([ 1, 10,  8,  9,  3,  4, 10,  3, 13, 11,  2,  5,  2, 10,  7,  9])
Prediction :  tensor([ 1,  1,  7, 11, 11, 11, 11,  2,  7,  1,  2,  7, 11, 11, 11,  7],
       device='cuda:0')
Loss : 0.014459320344030857, Batch Number : 175
Answer :  tensor([ 2,  0,  9,  9,  1,  7, 10,  6, 11,  6,  1,  3,  9,  7,  2,  0])
Prediction :  tensor([ 1,  7,  2,  1,  1,  7,  4,  1,  1,  1,  1,  2,  7,  2,  1, 11],
       device='cuda:0')
Loss : 0.013643164187669754, Batch Number : 176
Answer :  tensor([ 6, 13, 10,  8,  7,  5, 10,  3, 12,  7,  2, 14,  5,  9,  6,  6])
Prediction :  tensor([11,  6,  0,  7,  7,  4,  0,  0,  5,  1, 11,  1,  2,  7,  1,  1],
       device='cuda:0')
Loss : 0.014921611174941063, Batch Number : 177
Answer :  tensor([ 0,  7,  7, 13,  2,  6,  1,  2, 15,  2,  6, 13, 13, 10,  0,  8])
Prediction :  tensor([11,  1,  9, 11,  1, 11,  0,  1,  2, 11,  0,  1,  2,  7,  1,  2],
       device='cuda:0')
Loss : 0.01546761766076088, Batch Number : 178
Answer :  tensor([ 9,  8,  2, 14,

Answer :  tensor([ 2,  9,  4, 12,  5,  5,  0,  2,  3,  5,  3,  1, 11,  3,  2,  8])
Prediction :  tensor([11,  9, 11,  8, 10,  8, 10, 10,  8,  8,  3, 11,  9, 11,  8,  5],
       device='cuda:0')
Loss : 0.014094367623329163, Batch Number : 210
Answer :  tensor([ 3, 12,  9,  7,  1, 11,  6,  1,  8,  7,  9,  2,  5, 10,  3,  8])
Prediction :  tensor([ 8, 10, 11, 10,  7,  5, 10, 11, 11, 10,  7, 10, 10,  1, 10,  1],
       device='cuda:0')
Loss : 0.014229647815227509, Batch Number : 211
Answer :  tensor([ 7,  8,  9,  7, 11,  8, 12,  4,  1,  9, 12,  4, 11,  8,  2,  8])
Prediction :  tensor([10,  8, 11,  2, 10,  9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 10],
       device='cuda:0')
Loss : 0.014120611362159252, Batch Number : 212
Answer :  tensor([12,  3, 10, 10,  5, 12,  8,  4,  0,  6,  7,  6,  3, 13,  5,  6])
Prediction :  tensor([10,  8,  5,  3, 10, 10,  8, 10,  8, 11, 10, 11,  5,  8,  8, 10],
       device='cuda:0')
Loss : 0.014476647600531578, Batch Number : 213
Answer :  tensor([ 8,  0, 12,  4

Answer :  tensor([ 0,  7, 12,  6,  7, 10,  0,  4,  1,  4, 10,  0, 13,  6,  2,  7])
Prediction :  tensor([ 3,  6, 10,  3,  5,  6,  3,  5,  3,  6, 10,  6, 10, 10,  6,  6],
       device='cuda:0')
Loss : 0.014789103530347347, Batch Number : 245
Answer :  tensor([ 5,  4,  4, 10,  7,  7,  9,  4,  5,  2,  4, 12,  7, 10, 11, 12])
Prediction :  tensor([6, 6, 7, 6, 3, 5, 3, 5, 3, 3, 3, 5, 6, 3, 3, 3], device='cuda:0')
Loss : 0.013830984011292458, Batch Number : 246
Answer :  tensor([9, 0, 1, 8, 5, 0, 4, 6, 7, 7, 6, 3, 9, 5, 0, 5])
Prediction :  tensor([10,  9,  6,  7,  6,  7,  3,  7,  6,  6,  6, 10,  7,  3,  7,  5],
       device='cuda:0')
Loss : 0.013871477916836739, Batch Number : 247
Answer :  tensor([ 2, 13,  9,  6,  6,  1, 14, 10,  1, 11,  7,  4,  3, 11,  2,  5])
Prediction :  tensor([ 5,  3,  5, 10,  6, 10,  7,  7,  5,  7,  7, 10,  7,  3,  9,  4],
       device='cuda:0')
Loss : 0.014607803896069527, Batch Number : 248
Answer :  tensor([ 7,  2, 13,  3,  0, 12,  0, 12,  9,  0,  6,  5,  2,  

Answer :  tensor([ 6,  3, 12,  3, 11,  7,  2,  8,  8, 11,  6, 10,  2, 12,  0,  0])
Prediction :  tensor([1, 3, 5, 0, 0, 6, 5, 2, 2, 0, 6, 0, 1, 5, 1, 1], device='cuda:0')
Loss : 0.013793593272566795, Batch Number : 280
Answer :  tensor([ 7, 12, 12, 11, 12,  5,  8, 11,  6,  6, 12,  4, 10,  1,  7,  6])
Prediction :  tensor([ 4,  2,  5,  5,  3,  2, 12,  0,  5,  7,  0,  5,  1,  1,  0,  1],
       device='cuda:0')
Loss : 0.014369668439030647, Batch Number : 281
Answer :  tensor([ 8,  4,  3, 12,  4,  0, 13,  8,  7,  2,  0,  4, 12, 13,  9,  6])
Prediction :  tensor([ 0,  8,  1,  1,  2,  2,  8,  2,  0,  0, 10,  0,  8,  2, 12,  2],
       device='cuda:0')
Loss : 0.015088606625795364, Batch Number : 282
