In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

import numpy as np
import scipy.misc
import torch.nn as nn
import word2vec
import scipy.signal
import time
import os
import pickle
# If we want proper CUDA debug info.
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from gensim.scripts.glove2word2vec import glove2word2vec

import torch
from torch.autograd import Variable
from gensim.models.keyedvectors import KeyedVectors

import torch.nn.functional as F

from matplotlib import rcParams
rcParams['axes.grid'] = False


In [4]:
# Number-to-words conversion (e.g. '99' -> 'ninety-nine'). The engine `p` is
# used later to spell out numeric question tokens before the embedding lookup.
import inflect
p = inflect.engine()
p.number_to_words('99')



'ninety-nine'

In [5]:
# Directory holding the raw GloVe text files. Set the GLOVE_DIR environment
# variable to point elsewhere instead of editing this cell; the previous
# hard-coded location is kept as the default.
glove_dir = os.environ.get("GLOVE_DIR", "/Users/jawa/Desktop/mark_ra/falconet/models/resources/")
glove_dim = 200
glove_file = "glove.twitter.27B.{}d.txt".format(glove_dim)
glove_input_path = os.path.join(glove_dir, glove_file)

# Converted word2vec-format copy of the vectors; written once here and read
# back below, so the path is defined in a single place.
word2vec_path = os.path.join("resources", "gensim_glove_vectors.txt")
# The conversion fails if the output directory is missing, so create it first.
os.makedirs(os.path.dirname(word2vec_path), exist_ok=True)

glove2word2vec(glove_input_file=glove_input_path, word2vec_output_file=word2vec_path)
glove_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)


In [6]:
# Sanity check: a spelled-out number word is present in the embedding vocabulary.
'four' in glove_model

True

In [7]:
# Directory with the pre-processed VQA pickles.
model_dir = "data/processed/nans,2000_maxlength,26_minwcount,0_nlp,mcb_pad,left_trainsplit,train"


def _load_pickle(filename, directory=model_dir):
    """Load one pickle file from ``directory`` and close the file handle afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
    produced by a trusted pre-processing run.
    """
    with open(os.path.join(directory, filename), "rb") as handle:
        return pickle.load(handle)


# Answer vocabulary (answer id <-> answer string).
aid_to_ans = _load_pickle("aid_to_ans.pickle")
ans_to_aid = _load_pickle("ans_to_aid.pickle")
# Dataset splits.
testdevset = _load_pickle("testdevset.pickle")
testset = _load_pickle("testset.pickle")
trainset = _load_pickle("trainset.pickle")
valset = _load_pickle("valset.pickle")
# Question vocabulary (word id <-> word string).
wid_to_word = _load_pickle("wid_to_word.pickle")
word_to_wid = _load_pickle("word_to_wid.pickle")

In [8]:
# Number of examples in the test split.
len(testset)

447793

In [9]:
# Size of the answer vocabulary; it is used as the width of the one-hot
# targets and of the model's output layer.
len_answers = len(aid_to_ans)
print(len_answers)

2000


In [10]:
# Inspect one training record to see the fields available per example.
trainset[0] 

{'answer': 'net',
 'answer_aid': 934,
 'answers': ['net'],
 'answers_aid': [934],
 'answers_count': [8],
 'answers_occurence': [['net', 8], ['netting', 1], ['mesh', 1]],
 'image_name': 'COCO_train2014_000000458752.jpg',
 'question': 'What is this photo taken looking through?',
 'question_id': 458752000,
 'question_length': 7,
 'question_wids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7],
 'question_words': ['what',
  'is',
  'this',
  'photo',
  'taken',
  'looking',
  'through'],
 'question_words_UNK': ['what',
  'is',
  'this',
  'photo',
  'taken',
  'looking',
  'through'],
 'seq_length': 7}

In [11]:
class VisualQuestionsDataset(Dataset):
    """Question/answer dataset that embeds each question with word vectors.

    Every item is a ``(question_vec, y)`` pair:

    * ``question_vec`` -- float32 array of shape ``(len(question_wids), embedding_dim)``.
      Rows for padding (word id 0) and for words missing from the embeddings
      are all zeros.
    * ``y`` -- one-hot array of length ``len(aid_to_ans)`` marking ``answer_aid``.

    Image features are not produced by this dataset.
    """

    def __init__(self, ds, ans_to_aid, aid_to_ans, wid_to_word, word_to_wid,
                 image_root_dir=None, transform=None,
                 embeddings=None, embedding_dim=None, number_engine=None):
        """
        Args:
            ds (sequence of dict): Examples; each must provide
                ``'question_wids'`` and ``'answer_aid'``.
            ans_to_aid: Mapping from answer string to answer id.
            aid_to_ans: Mapping from answer id to answer string; its length
                sets the size of the one-hot target.
            wid_to_word: Mapping from word id to word string.
            word_to_wid: Mapping from word string to word id.
            image_root_dir (string, optional): Directory with the images
                (stored, not used yet).
            transform (callable, optional): Applied to the ``(question_vec, y)``
                sample before it is returned.
            embeddings (optional): Word -> vector lookup supporting ``in`` and
                ``[]``. Defaults to the notebook-level ``glove_model``.
            embedding_dim (int, optional): Vector width. Defaults to the
                notebook-level ``glove_dim``.
            number_engine (optional): Object with ``number_to_words(str)``.
                Defaults to the notebook-level inflect engine ``p``.
        """
        self.question_answer_ds = ds
        self.ans_to_aid = ans_to_aid
        self.aid_to_ans = aid_to_ans
        self.wid_to_word = wid_to_word
        self.word_to_wid = word_to_wid
        self.image_root_dir = image_root_dir
        self.transform = transform
        # Fall back to the notebook globals so existing call sites keep working.
        self.embeddings = glove_model if embeddings is None else embeddings
        self.embedding_dim = glove_dim if embedding_dim is None else embedding_dim
        self.number_engine = p if number_engine is None else number_engine

    def __len__(self):
        """Return the number of examples."""
        return len(self.question_answer_ds)

    def __getitem__(self, idx):
        """Return the embedded question and one-hot answer for example ``idx``."""
        entry = self.question_answer_ds[idx]
        question = entry['question_wids']

        # Pre-allocate as float32 so every sample has the same dtype, whether
        # or not any of its words were found in the embeddings.
        question_vec = np.zeros((len(question), self.embedding_dim), dtype=np.float32)
        for position, wid in enumerate(question):
            if wid == 0:
                # Padding token: keep the zero row.
                continue
            word = self.wid_to_word[wid]
            if word.isdigit():
                # Digits are spelled out before the lookup.
                word = self.number_engine.number_to_words(word)
            if word in self.embeddings:
                question_vec[position] = self.embeddings[word]
            # Out-of-vocabulary words keep the zero row.

        y = np.zeros(len(self.aid_to_ans))
        y[entry['answer_aid']] = 1

        sample = (question_vec, y)
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [23]:
# Wrap the training split, then batch it: 5 shuffled examples per batch,
# fetched by 4 worker processes.
train_dataset = VisualQuestionsDataset(
    ds=trainset,
    ans_to_aid=ans_to_aid,
    aid_to_ans=aid_to_ans,
    wid_to_word=wid_to_word,
    word_to_wid=word_to_wid,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=5,
    shuffle=True,
    num_workers=4,
)

In [24]:
class VQA_BASIC_WITHOUT_IMAGE_MODEL(nn.Module):
    """Question-only VQA baseline: LSTM over word vectors, then a linear classifier.

    The output is per-class **log**-probabilities, which is what
    ``torch.nn.NLLLoss`` expects as input.
    """

    def __init__(self, input_size, hidden_size=2048, n_layers=2, n_answers=None):
        """
        Args:
            input_size (int): Width of each input word vector.
            hidden_size (int): LSTM hidden state size.
            n_layers (int): Number of stacked LSTM layers.
            n_answers (int, optional): Number of answer classes. Defaults to
                the notebook-level ``len_answers``.
        """
        super().__init__()
        if n_answers is None:
            n_answers = len_answers
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=n_layers)
        self.linear = nn.Linear(hidden_size, n_answers)
        # Attribute name kept as-is (sic) for compatibility. It is a LogSoftmax
        # over the class dimension, because NLLLoss needs log-probabilities.
        self.sofmax = nn.LogSoftmax(dim=1)

    def forward(self, x, input=None):
        """Return log-probabilities of shape ``(batch, n_answers)``.

        Args:
            x: Tensor of shape ``(seq_len, batch, input_size)`` (the LSTM is
                not batch-first).
            input: Unused; kept for interface compatibility.
        """
        x, final_state = self.rnn(x)
        # Use the output at the last time step as the question summary.
        x = x[-1, :, :]
        x = self.linear(x)
        x = self.sofmax(x)
        return x

In [25]:
# Instantiate the question-only model (1024 hidden units, 2 LSTM layers),
# its Adam optimizer and the negative log-likelihood criterion.
model = VQA_BASIC_WITHOUT_IMAGE_MODEL(
    input_size=glove_dim,
    hidden_size=1024,
    n_layers=2,
)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
loss_f = nn.NLLLoss()

In [26]:
def train_step(sample_batch):
    """Run one optimization step on a batch and return its loss as a float.

    Args:
        sample_batch: ``(inputs, target)`` as produced by the data loader.
            ``inputs`` is ``(batch, seq_len, dim)``; ``target`` is either
            one-hot rows ``(batch, n_classes)`` or class indices ``(batch,)``.

    Uses the notebook-level ``model``, ``loss_f`` and ``optimizer``.
    Requires PyTorch >= 0.4 (``Tensor.item`` / ``argmax``).
    """
    model.train()
    inputs, target = sample_batch
    # The LSTM is not batch-first, so move the sequence axis to the front.
    inputs = inputs.permute(1, 0, 2).float()
    # NLLLoss expects class indices of shape (batch,), not one-hot rows.
    if target.dim() > 1:
        target = target.argmax(dim=1)
    target = target.long()

    output = model(inputs)
    loss = loss_f(output, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # .item() works on the 0-dim loss tensor, where loss.data[0] raises.
    return loss.item()

In [27]:
def eval_step(sample_batch):
    """Evaluate one batch without updating the model.

    Args:
        sample_batch: ``(inputs, target)`` as produced by the data loader.
            ``inputs`` is ``(batch, seq_len, dim)``; ``target`` is either
            one-hot rows ``(batch, n_classes)`` or class indices ``(batch,)``.

    Returns:
        tuple: ``(loss, accuracy)`` as Python floats, where accuracy is the
        fraction of correct predictions in the batch.

    Uses the notebook-level ``model``, ``loss_f`` and ``accuracy``; the latter
    is expected to return a tensor holding the number of correct predictions.
    Requires PyTorch >= 0.4.
    """
    model.eval()
    inputs, target = sample_batch
    # The LSTM is not batch-first, so move the sequence axis to the front.
    inputs = inputs.permute(1, 0, 2).float()
    # NLLLoss expects long class indices of shape (batch,), not one-hot rows.
    if target.dim() > 1:
        target = target.argmax(dim=1)
    target = target.long()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        output = model(inputs)
        loss = loss_f(output, target)
    acc = accuracy(output, target)
    return (loss.item(), float(acc.sum()) / target.size(0))

In [28]:
def accuracy(test, target):
    """Count how many predictions in a batch match the target class.

    Args:
        test: Model scores of shape ``(batch, n_classes)``; the predicted
            class is the arg-max over the class dimension.
        target: Either one-hot rows ``(batch, n_classes)`` or class indices
            ``(batch,)``.

    Returns:
        Tensor of shape ``(1,)`` holding the number of correct predictions
        (divide by the batch size to get a fraction).
    """
    predicted = test.argmax(dim=1)
    # Accept one-hot targets as well as plain class indices.
    labels = target.argmax(dim=1) if target.dim() > 1 else target
    # keepdim gives a 1-element tensor so callers can index it with [0].
    return (predicted == labels.long()).sum(dim=0, keepdim=True)

In [None]:
num_steps = 5000
num_steps_per_summary = 250

# History of the periodic summaries (one entry per summary step).
steps = []
train_losses = []
val_losses = []

# Held-out batches for the validation metric.
# NOTE(review): assumes valset records have the same fields as trainset
# ('question_wids', 'answer_aid') -- TODO confirm.
val_dataset = VisualQuestionsDataset(valset, ans_to_aid, aid_to_ans, wid_to_word, word_to_wid)
val_loader = DataLoader(val_dataset, batch_size=train_loader.batch_size, shuffle=True)

for step, sample_batch in enumerate(train_loader):
    # Stop after num_steps batches (or earlier if the loader is exhausted).
    if step >= num_steps:
        break
    train_step(sample_batch)
    if step % num_steps_per_summary == 0:
        # eval_step returns (loss, accuracy); only the loss is logged here.
        train_loss, _ = eval_step(sample_batch)
        # One randomly drawn validation batch per summary.
        val_loss, _ = eval_step(next(iter(val_loader)))
        steps.append(step)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f'Step {step:05d} / {num_steps:05d}. Train loss: {train_loss:.3f}. Val loss: {val_loss:.3f}.')
        # Number of training samples consumed so far (upper bound if the last batch is short).
        print('Samples:', (step + 1) * train_loader.batch_size)

In [436]:
# Look up the vocabulary id of a single word.
word_to_wid['sparsely']

7649