In [8]:
# standard library
import collections
import json
import operator
import os

# others libraries
import numpy as np
import spacy

# Keras
from keras import applications
from keras.callbacks import ModelCheckpoint
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from keras_tqdm import TQDMNotebookCallback


In [2]:
# Hyper-parameters of the neural network.
# NOTE(review): num_hidden_units_mlp and img_dim are not referenced by any
# visible cell — presumably left over from / reserved for an image+MLP variant.
num_hidden_units_mlp = 1024
# number of units of the LSTM layer
num_hidden_units_lstm = 512
img_dim = 4096
# size of one word vector (last axis of the LSTM input_shape)
word_vec_dim = 300
# number of time steps of the LSTM input_shape (questions are padded to it)
max_len = 30
# width of the final Dense + softmax layer (one unit per candidate answer)
nb_classes = 1000

In [3]:
# Question-only model: one LSTM reads the (max_len, word_vec_dim) sequence of
# word vectors and a softmax layer scores each of the nb_classes answers.
# A stacked variant (a first LSTM with return_sequences=True followed by
# Dropout(0.5)) is intentionally left out of the layer list.
model = Sequential([
    LSTM(num_hidden_units_lstm, activation='tanh', input_shape=(max_len, word_vec_dim)),
    Dense(nb_classes, kernel_initializer='uniform'),
    Activation('softmax'),
])

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [4]:
# TODO: debug the Keras installation so that plot_model works and the model can
# be rendered as a diagram instead of only the text summary printed below.
# from keras.utils import plot_model
# plot_model(model, to_file='LSTM_Q.png')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              513000    
_________________________________________________________________
activation_1 (Activation)    (None, 1000)              0         
Total params: 2,178,024.0
Trainable params: 2,178,024.0
Non-trainable params: 0.0
_________________________________________________________________


In [5]:
# Load the spaCy English pipeline together with its word vectors.
# NOTE(review): the vectors package name says GloVe (300-d), while the rest of
# the notebook calls them "word2vec" — confirm which embeddings are intended.
word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')

In [6]:
def question_features(question):
    """
        Builds the matrix of word vectors of a question.
        :param question string that represents the question
        :return array of shape (number of tokens, 300); row i holds the
                vector of the i-th token produced by `word_embeddings`
    """
    doc = word_embeddings(question)
    vectors = np.zeros((len(doc), 300))

    # each `row` is a view into `vectors`, so writing to it fills the matrix
    for row, tok in zip(vectors, doc):
        row[:] = tok.vector

    return vectors

## Preprocess the data

In [9]:
# Load the train2014 questions and annotations.
# `with` closes each file handle as soon as its JSON content has been parsed
# (a bare json.load(open(...)) leaves the handle open until garbage collection).
with open('Questions/v2_OpenEnded_mscoco_train2014_questions.json') as questions_file:
    data_question = json.load(questions_file)
with open('Annotations/v2_mscoco_train2014_annotations.json') as annotations_file:
    data_answer = json.load(annotations_file)

In [10]:
def preprocess_data(data_question, data_answer):
    """
        Flattens the question and annotation files into five parallel lists.
        :param data_question dict with a 'questions' list; every entry provides
               'question_id', 'image_id' and 'question'
        :param data_answer dict with an 'annotations' list; every entry provides
               'image_id', 'question_id' and 'multiple_choice_answer'
        :return (images_id, questions_id, answers, questions, questions_len),
                one element per annotation, in annotation order
    """
    annotations = data_answer['annotations']

    # image_id -> question_id -> [question text, number of words]
    by_image = collections.defaultdict(dict)
    for entry in data_question['questions']:
        text = entry['question']
        # one is added to the whitespace word count for the question mark
        by_image[entry['image_id']][entry['question_id']] = [text, len(text.split()) + 1]

    images_id, questions_id, answers = [], [], []
    questions, questions_len = [], []

    for annotation in annotations:
        image = annotation['image_id']
        q_id = annotation['question_id']

        questions_id.append(q_id)
        images_id.append(image)
        answers.append(annotation['multiple_choice_answer'])

        text, n_words = by_image[image][q_id]
        questions.append(text)
        questions_len.append(n_words)

    return images_id, questions_id, answers, questions, questions_len

In [12]:
# TODO: questions_len is not used by the training cells below; either drop it
# or use it to improve the training.
# preprocess_data returns 5 parallel lists: image ids, question ids, answers (words),
# questions (sentences) and questions_len (number of words per question).
images_id, questions_id, answers, questions, questions_len = preprocess_data(data_question, data_answer)

In [13]:
def topKFrequentAnswer(data_q, data_a, K=1000):
    """
        Returns the image_id, question_id, answers, questions and questions_len
        of the samples whose answer is one of the K most frequent answers.
        param: data_q json file of questions
        param: data_a json file of answers
        param: K number of most frequent answers (K = 1000 by default as in the paper they use K = 1000)
        return: five parallel lists (images_id, questions_id, answers,
                questions, questions_len) restricted to the kept samples;
                five empty lists when no sample is kept
    """

    images_id, questions_id, answers, questions, questions_len = preprocess_data(data_q, data_a)

    # The K most frequent answers, as a set for constant-time membership tests.
    # Ties are resolved in favour of the answer seen first.
    top_answers = set(
        answer for answer, _ in collections.Counter(answers).most_common(K)
    )

    # Filter all five lists together so that they stay aligned index by index
    # (the question ids must be filtered like every other column).
    kept = [
        sample
        for sample in zip(images_id, questions_id, answers, questions, questions_len)
        if sample[2] in top_answers
    ]

    if not kept:
        return [], [], [], [], []

    new_images_id, new_questions_id, new_answers, new_questions, new_questions_len = (
        list(column) for column in zip(*kept)
    )

    return new_images_id, new_questions_id, new_answers, new_questions, new_questions_len

In [14]:
# Keep only the training samples whose answer is among the K most frequent
# answers (K is left at the function's default).
K_images_id, K_questions_id, K_answers, K_questions, K_questions_len = topKFrequentAnswer(data_question, data_answer)

In [15]:
# Peek at the first ten entries of each filtered list.
for preview in (K_images_id, K_questions_id, K_answers, K_questions, K_questions_len):
    print(preview[:10])

[458752, 458752, 458752, 458752, 262146, 262146, 262146, 524291, 524291, 524291]
[458752000, 458752001, 458752002, 458752003, 262146000, 262146001, 262146002, 524291000, 524291001, 524291002]
['net', 'pitcher', 'orange', 'yes', 'white', 'skiing', 'red', 'frisbee', 'yes', 'frisbee']
['What is this photo taken looking through?', 'What position is this man playing?', 'What color is the players shirt?', 'Is this man a professional baseball player?', 'What color is the snow?', 'What is the person doing?', 'What color is the persons headwear?', "What is in the person's hand?", 'Is the dog waiting?', 'Is the dog looking at a tennis ball or frisbee?']
[8, 7, 7, 8, 6, 6, 7, 7, 5, 11]


## Train the model

In [16]:
def batch(iterable, n=1):
    """
        Yields consecutive slices of at most n elements (from stackoverflow).
        :param iterable sequence supporting len() and slicing
        :param n size of every slice; the last one may be shorter
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield iterable[start:stop]

In [17]:
def atot(answers, encoder):
    """
        atot stands for answers to tensors: transforms a batch of answers
        into its one-hot encoding.
        :param answers batch of answers accepted by encoder.transform
        :param encoder fitted label encoder (provides transform and classes_)
        :return one row per answer, one column per class known to the encoder
    """
    class_ids = encoder.transform(answers)
    return np_utils.to_categorical(class_ids, encoder.classes_.shape[0])

In [18]:
def qtot(questions, max_len, word_vec_dim=300):
    """
        qtot stands for questions to tensors: transforms a batch of questions
        into one zero-padded tensor of word embeddings.
        :param questions batch (sequence) of question strings
        :param max_len number of time steps of the tensor; shorter questions
               are zero-padded at the end, longer ones are truncated
        :param word_vec_dim size of one word vector (300 by default, the size
               produced by question_features)
        :return array of shape (len(questions), max_len, word_vec_dim)
    """
    res = np.zeros((len(questions), max_len, word_vec_dim))

    for i, question in enumerate(questions):
        q_vectors = question_features(question)
        # Clamp to max_len: assigning more rows than the tensor has time steps
        # would raise a broadcasting ValueError.
        nb_words = min(q_vectors.shape[0], max_len)
        res[i, :nb_words] = q_vectors[:nb_words]

    return res

In [None]:
from keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback
from keras.utils import np_utils
from tqdm import tqdm

# avoid 'Set changed size during iteration' bug
tqdm.monitor_interval = 0

from sklearn import preprocessing

from spacy.en import English
nlp = English()

# number of epochs that you would like to use to train the model.
epochs = 20

# batch size
batch_size = 128

# save the best weights during training.
# NOTE(review): this callback is only referenced by the commented-out
# model.fit call below; the manual batch loop never invokes it.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.from_scratch.hdf5', 
                               verbose=1, save_best_only=True)
# train the model
# to avoid memory overload we cannot use the below line with embeddings in memory (too big),
# so we need to iterate over batches and embed each batch on the fly!
# model.fit(X_train, y_train, 
#          validation_data=(X_val, y_val),
#          epochs=epochs, batch_size=128, callbacks=[checkpointer, TQDMNotebookCallback()], verbose=0)

labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(K_answers)
# NOTE(review): this must equal the width of the model's softmax layer, which
# was fixed when the model was built — confirm both agree before training.
nb_classes = len(list(labelencoder.classes_))

# Create the weights directory up front: otherwise the first save_weights call
# would fail only after a whole epoch of training.
if not os.path.isdir('LSTM_Q'):
    os.makedirs('LSTM_Q')

# number of batches per epoch (ceiling division), used for the progress bar
nb_batches = (len(K_questions) + batch_size - 1) // batch_size

# TODO: maybe add validation data and train on both train + validation (paper did that)
# 388158 questions to treat by epoch !

for k in tqdm(range(epochs), desc="Simulating {}".format("...")):
    # Questions and answers are batched in lockstep; the image ids are not
    # needed by this question-only model.
    batches = zip(batch(K_questions, batch_size), batch(K_answers, batch_size))

    # a per-epoch progress bar replaces printing a counter for every batch
    for batch_q, batch_a in tqdm(batches, total=nb_batches,
                                 desc="epoch {:02d}".format(k), leave=False):
        # qtot stands for questions to tensor; pad to the same number of time
        # steps as the model's input_shape
        X_batch = qtot(batch_q, max_len)

        # atot stands for answers to tensor
        Y_batch = atot(batch_a, labelencoder)

        # train the model (loss of the last batch stays available afterwards)
        loss = model.train_on_batch(X_batch, Y_batch)

    # save weights at each epoch
    model.save_weights('LSTM_Q/LSTM_Q_epoch_{:02d}.hdf5'.format(k))

## Evaluate the model

### test on single question

In [None]:
# Load the test-dev questions; `with` closes the file handle once parsed.
with open('Questions/v2_OpenEnded_mscoco_test-dev2015_questions.json') as test_file:
    data_test = json.load(test_file)

In [None]:
# Embed one test question, padded to the same number of time steps (max_len)
# as the model's input_shape, and show the resulting tensor shape.
sample_question = data_test['questions'][25]['question']
print("question: ", sample_question)
question_feat = qtot([sample_question], max_len)
question_feat.shape

In [None]:
# Index of the highest score in the prediction for the single question
# embedded above (np.argmax without an axis works on the flattened output).
y_pred = np.argmax(model.predict(question_feat))

In [None]:
# inverse_transform expects a 1-d array of labels, so wrap the scalar class
# index in a list and unwrap the single decoded answer.
labelencoder.inverse_transform([y_pred])[0]

### generate the json dictionary for evaluation

In [None]:
# Build the list of {'answer', 'question_id'} records expected by the
# evaluation and dump it as JSON.
test_questions = data_test['questions']
nb_test_batches = (len(test_questions) + batch_size - 1) // batch_size

lstm_q = []
# Predict batch by batch: one model.predict call per batch instead of one per
# question, with a progress bar instead of a manual counter.
for chunk in tqdm(batch(test_questions, batch_size), total=nb_test_batches, desc="predicting"):
    # pad to max_len, the number of time steps the model was built with
    question_feat = qtot([q['question'] for q in chunk], max_len)

    # best class per question (axis=1), decoded as a 1-d array of answers
    class_ids = np.argmax(model.predict(question_feat), axis=1)
    predicted_answers = labelencoder.inverse_transform(class_ids)

    for q, answer in zip(chunk, predicted_answers):
        lstm_q.append({'answer': answer, 'question_id': q['question_id']})

with open('lstm_q.json', 'w') as outfile:
    json.dump(lstm_q, outfile)