In [71]:
import json
import numpy as np
import os
import cv2
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model,Sequential
from keras.layers import Dense, Embedding, LSTM, Input,SpatialDropout1D,Conv2D,MaxPooling2D,Flatten,concatenate
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [6]:
with open('Quest_Answers.json') as f:
    data = json.load(f)

In [14]:
img_h = 120
img_w = 160
img_d = 3
num_of_output = 28

In [34]:
image_names = os.listdir('./images')
#image_names =image_names[:100]
name_to_image = {}
for image_name in image_names:
    img = cv2.imread('./images/'+image_name)
    if img is not None:
        #image pre-processing
        #img = cv2.resize(img,(img_h,img_w))
        name_to_image[image_name[:-4]] = img

In [110]:
img = []
quest = []
y = []
key_list = list(name_to_image.keys())
for ele in data['quest_answers']:
    if ele['Image'] in key_list and name_to_image[ele['Image']] is not None:
        quest.append(ele['Question'])
        y.append(ele['Answer'])  
        img.append(name_to_image[ele['Image']])

In [111]:
img = np.array(img)
quest = np.array(quest)
labels = np.array(y)

In [115]:
label_list = open('answer_clevr.txt').read().splitlines()

In [136]:
answers = np.zeros((len(labels),num_of_output))
label_list = np.array(label_list)
for i in range(len(labels)):
    answers[i][np.argwhere(label_list == labels[i])[0]]=1


In [139]:
#ext pre-processing
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ')
tokenizer.fit_on_texts(quest)
questions = tokenizer.texts_to_sequences(quest)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
max_length_of_text = 0

for q in questions:
    max_length_of_text = max(max_length_of_text,len(q))
questions = pad_sequences(questions, maxlen=max_length_of_text)

Found 80 unique tokens.


In [144]:
# First, let's define a vision model using a Sequential model.
# This model will encode an image into a vector.
vision_model = Sequential()
vision_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(img_h, img_w, img_d)))
vision_model.add(Conv2D(64, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(128, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Flatten())

image_input = Input(shape=(img_h, img_w, img_d))
encoded_image = vision_model(image_input)

question_input = Input(shape=(max_length_of_text,), dtype='int32')
embedded_question = Embedding(input_dim=10000, output_dim=256, input_length=max_length_of_text)(question_input)
encoded_question = LSTM(256)(embedded_question)
merged = concatenate([encoded_question, encoded_image])
output = Dense(num_of_output, activation='softmax')(merged)
vqa_model = Model(inputs=[image_input, question_input], outputs=output)
print(vqa_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, 39)           0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 39, 256)      2560000     input_20[0][0]                   
__________________________________________________________________________________________________
input_19 (InputLayer)           (None, 120, 160, 3)  0                                            
__________________________________________________________________________________________________
lstm_9 (LSTM)                   (None, 256)          525312      embedding_10[0][0]               
__________________________________________________________________________________________________
sequential

In [142]:
vqa_model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['accuracy'])


In [143]:
vqa_model.fit(x=[img,questions], y=answers, batch_size=32, epochs=10,validation_split=0.3)


Train on 700 samples, validate on 300 samples
Epoch 1/10
 96/700 [===>..........................] - ETA: 2:58 - loss: 10.8202 - acc: 0.1458  

KeyboardInterrupt: 