# Visual Question Answering Challenge

## Environment

In [None]:
!python -m spacy download en_vectors_web_lg
!pip install -U spacy

In [None]:
%load_ext tensorboard
import sys, warnings
warnings.filterwarnings("ignore")
from random import shuffle, sample
import pickle as pk
import gc
import tensorflow as tf
import json
import os
from datetime import datetime
from collections import defaultdict
import random
import numpy as np
import pandas as pd
import scipy.io
import spacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Reshape
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Concatenate
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from PIL import Image

In [None]:
# Seed every random number generator used in this notebook (TensorFlow, NumPy
# and Python's `random`) from one constant so experiments are reproducible.
SEED = 4242
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Enable GPU memory growth: TensorFlow then allocates only as much GPU memory
# as it needs instead of reserving the whole device up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


## Data preprocessing

In [None]:
### Parameters ###

# Number of answer classes the network predicts
num_classes = 13

# Dataset location: image folders and annotation files all live under
# DATASET_PATH (the original code referenced an undefined INPUT_PATH here).
cwd = os.getcwd()
DATASET_PATH = os.path.join(cwd, 'dataset_vqa')
TRAIN_IMG_PATH = os.path.join(DATASET_PATH, 'train')
TEST_IMG_PATH = os.path.join(DATASET_PATH, 'test')
TRAIN_JSON_PATH = os.path.join(DATASET_PATH, 'train_data.json')
TEST_JSON_PATH = os.path.join(DATASET_PATH, 'test_data.json')

# Keys used to read each question entry of the annotation files
QUESTION_KEY = 'question'
ANSWER_KEY = 'answer'
IMAGE_KEY = 'image_filename'

# Fraction of the annotated questions used for training; the remainder is
# used for validation.
train_split = 0.8


In [None]:
# When True, the answer -> class-index mapping is fixed explicitly below;
# otherwise no mapping is defined and `classes` is None.
decide_class_indices = True

# The position of an answer string in `classes` is its class index:
# the counts '0'..'10' take indices 0..10, then 'yes' is 11 and 'no' is 12.
classes = (
    [str(count) for count in range(11)] + ['yes', 'no']
    if decide_class_indices
    else None
)


In [None]:
# Load the training annotations. The `with` block closes the file on exit, so
# no explicit close() is needed; JSON is read as UTF-8 regardless of the
# platform's default encoding.
with open(TRAIN_JSON_PATH, 'r', encoding='utf-8') as f:
    SUBSET_data = json.load(f)

# List of question entries (dicts read through QUESTION_KEY / ANSWER_KEY /
# IMAGE_KEY elsewhere in this notebook).
questions = SUBSET_data.get("questions")

# Text of every question (training and validation alike); used later to fit
# the tokenizer vocabulary.
whole_text = [question.get(QUESTION_KEY) for question in questions]

# Shuffle in place, then cut the list into training / validation parts.
random.shuffle(questions)
split_index = int(train_split * len(questions))

train_questions = questions[:split_index]
valid_questions = questions[split_index:]

# Checking that all the questions are inside the two sets
assert len(train_questions) + len(valid_questions) == len(questions)

# Checking that all the questions were successfully imported
# (259492 is the size the annotation file is expected to have).
assert len(questions) == 259492


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

# Build the embedding matrix from pre-trained word vectors.
# First fit the tokenizer on every question so each word gets an integer index.
token = Tokenizer()
token.fit_on_texts(whole_text)
seq = token.texts_to_sequences(whole_text)

# The network needs fixed-length sequences: every question is padded or
# truncated to 300 tokens (the same value is used as `input_length` of the
# Embedding layer). This is the sequence length, not the vector width.
pad_seq = pad_sequences(seq, maxlen=300)

# Keras' Tokenizer numbers words starting from 1 and reserves index 0
# (used here for padding), hence the extra row in the embedding matrix.
vocab_size = len(token.word_index) + 1
# vocab_size is an int: it must be converted before string concatenation.
print("vocab size " + str(vocab_size))

# spaCy vocabulary carrying the pre-trained word vectors; used to look up
# the vector of each word known to the tokenizer.
nlp = spacy.load('en_vectors_web_lg')

print("starting the embedding")
# Row i holds the 300-dimensional vector of the word whose tokenizer index is
# i; rows that are never assigned (index 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tqdm(token.word_index.items()):
    embedding_coeff = nlp.vocab[word].vector
    if embedding_coeff is not None:
        embedding_matrix[i] = np.array(embedding_coeff, dtype='float32')


In [None]:
from keras.utils import np_utils


def preprocess_qst(question):
    """Convert question text into padded sequences of token indices.

    Args:
        question: A single question string, or a list of question strings.

    Returns:
        The output of `pad_sequences(..., maxlen=300)`: one row of 300 token
        indices per question (a single string yields exactly one row).
    """
    # `texts_to_sequences` expects a list of texts. A bare string would be
    # iterated character by character, producing one bogus "sequence" per
    # character, so wrap it into a one-element list first.
    if isinstance(question, str):
        question = [question]

    question_tokens = token.texts_to_sequences(question)
    question_seq = pad_sequences(question_tokens, maxlen=300)

    return question_seq


def preprocess_answ(answer):
    """One-hot encode an answer string.

    Args:
        answer: Answer text; it must be one of the entries of `classes`
            (`list.index` raises ValueError otherwise).

    Returns:
        The `to_categorical` encoding, over `num_classes` classes, of the
        answer's position in `classes`.
    """
    class_id = np.array(classes.index(answer))
    return np_utils.to_categorical(class_id, num_classes)


In [None]:
from skimage.io import imread
from keras.applications.vgg16 import preprocess_input

# takes as input the question, returns corresponding image


def get_img(question, base_path):
    """Load the image a question entry refers to.

    Args:
        question: Question entry; its IMAGE_KEY value is the image file name.
        base_path: Directory that contains the image file.

    Returns:
        Whatever `imread` returns for the file at base_path/<image file name>.
    """
    return imread(os.path.join(base_path, question.get(IMAGE_KEY)))

# preprocessing of the image


def preprocess_img(img, data_aug=False, target_size=(224, 224)):
    """Turn a loaded image into a network-ready array.

    The image is converted to a 3-channel float32 array, resized to the
    fixed spatial size the image model was built with, and finally passed
    through the VGG16 `preprocess_input` function.

    Args:
        img: Image as returned by `get_img` (anything `np.asarray` accepts),
            laid out as (height, width) or (height, width, channels).
        data_aug: Accepted for interface compatibility; currently unused
            (no augmentation is applied).
        target_size: (height, width) to resize to, or None to keep the
            original size. The default matches the 224x224 input of the
            image model defined in this notebook.

    Returns:
        float32 array of shape (height, width, 3) after `preprocess_input`.
    """
    img_array = np.asarray(img, dtype='float32')

    # Grayscale images have no channel axis: replicate them to 3 channels.
    if img_array.ndim == 2:
        img_array = np.stack([img_array] * 3, axis=-1)
    # Drop any extra channel (e.g. alpha); the network expects exactly 3.
    img_array = img_array[..., :3]

    # Resize only when needed so correctly sized images pass through as-is.
    if target_size is not None and img_array.shape[:2] != tuple(target_size):
        img_array = tf.image.resize(img_array, target_size).numpy()

    img_array = preprocess_input(img_array)

    return img_array


In [None]:
# generator used to create batch of images/questions and corresponding answers

def data_generator(questions, batch_size=None, data_aug=False):
    """Endlessly yield random batches for the two-input VQA model.

    Args:
        questions: List of question entries (dicts read through QUESTION_KEY,
            ANSWER_KEY and IMAGE_KEY).
        batch_size: Number of samples per batch. When None, the global `bs`
            is looked up at call time (it is defined in a later cell than
            this function, so it cannot be used as a default value).
        data_aug: Forwarded to `preprocess_img`.

    Yields:
        Tuples `([images, question_sequences], answers)` where each of the
        three arrays has `batch_size` entries along its first axis.
    """
    if batch_size is None:
        batch_size = bs

    while True:

        # Sample positions (with replacement) rather than the entries
        # themselves: this avoids converting the whole list of dicts to a
        # NumPy array on every batch.
        batch_indices = np.random.choice(len(questions), size=batch_size)

        batch_images = []
        batch_texts = []
        batch_output = []

        # Only the sampled questions are read and preprocessed.
        for idx in batch_indices:
            question = questions[idx]

            # Image referenced by the question (always read from the training
            # image folder, as in the original code).
            img = get_img(question, TRAIN_IMG_PATH)
            batch_images.append(preprocess_img(img, data_aug=data_aug))

            # Question text: passed as a one-element list so the tokenizer
            # treats it as a single text; row 0 is that question's sequence.
            question_text = question.get(QUESTION_KEY)
            batch_texts.append(preprocess_qst([question_text])[0])

            # The expected output is the encoded answer to the question.
            answer_str = question.get(ANSWER_KEY)
            batch_output.append(preprocess_answ(answer_str))

        # One array per model input, in the same order as the model's inputs
        # (image first, question second).
        batch_x = [np.array(batch_images), np.array(batch_texts)]
        batch_y = np.array(batch_output)

        yield (batch_x, batch_y)


## Building the model

In [None]:
# ----------------------------- Network parameters ----------------------------
bs = 8                # batch size used by the data generators and fit()
img_dim = 4096        # NOTE(review): not referenced in the visible code;
                      # presumably the size of the VGG feature vector - confirm
word2vec_dim = 300    # width of the word vectors

# ----------------------------- Training parameters ---------------------------
num_epochs = 300
lr = 1e-3
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
metrics = ['accuracy']

# ----------------------------- Input image size ------------------------------
img_h = img_w = 224


In [None]:
# Image model (VGG16). The full ImageNet-pretrained network is loaded and all
# of its layers except the last two are copied into a new Sequential model, so
# that the image branch can later be combined with the language branch.
# NOTE(review): with include_top=True the two dropped layers should be the
# last fully-connected layer and the ImageNet prediction layer - confirm
# against the summary() output below.
vgg16_model = tf.keras.applications.VGG16(
    weights='imagenet', include_top=True, input_shape=(img_h, img_w, 3))
vgg16_model.summary()

image_model = Sequential()
for layer in vgg16_model.layers[:-2]:  # copy everything except the last two layers
    image_model.add(layer)

# Project the image features down to 1024 units, the same width as the
# LSTM output they are multiplied with below.
image_model.add(Dense(2048, activation='sigmoid'))
image_model.add(Dense(1024, activation='sigmoid'))
image_model.summary()

# Language model (LSTM). The embedding layer is initialised with the
# pre-computed embedding matrix and frozen (trainable=False); it reads
# sequences of 300 token indices and maps each to a 300-dimensional vector.
question_input = tf.keras.layers.Embedding(vocab_size, 300, weights=[embedding_matrix],
                                           input_length=300, trainable=False)

language_model = Sequential()
language_model.add(question_input)
language_model.add(LSTM(1024))

language_model.summary()

# VQA model = CNN + RNN (LSTM): the two 1024-unit branch outputs are fused by
# element-wise multiplication, followed by a small classifier head that ends
# in a softmax over the answer classes.
mult = tf.keras.layers.Multiply()([image_model.output, language_model.output])
x = Dropout(0.2)(mult)
x = Dense(512, activation='sigmoid')(x)
x = Dropout(0.2)(x)
out = Dense(num_classes, activation='softmax')(x)

# Inputs are ordered [image, question]; data fed to the model must follow
# the same order.
vqa_model = tf.keras.models.Model(
    [image_model.input, language_model.input], out)
vqa_model.summary()


In [None]:
# Attach the optimizer, loss and metrics defined in the parameters cell.
vqa_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


## Training

In [None]:
# Every run gets its own timestamped directory under
# <cwd>/classification_experiments/.
exps_dir = os.path.join(cwd, 'classification_experiments')
os.makedirs(exps_dir, exist_ok=True)

now = datetime.now().strftime('%b%d_%H-%M-%S')

model_name = 'CNN'

exp_dir = os.path.join(exps_dir, model_name + '_' + str(now))
os.makedirs(exp_dir, exist_ok=True)

callbacks = []

# Model checkpoint
# ----------------
ckpt_dir = os.path.join(exp_dir, 'ckpts')
os.makedirs(ckpt_dir, exist_ok=True)

# ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
#                                                   save_weights_only=True)  # False to save the model directly
# callbacks.append(ckpt_callback)

# Visualize Learning on Tensorboard
# ---------------------------------
tb_dir = os.path.join(exp_dir, 'tb_logs')
os.makedirs(tb_dir, exist_ok=True)
print(tb_dir)

# By default shows losses and metrics for both training and validation
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                             profile_batch=0,
                                             histogram_freq=1)  # if 1 shows weights histograms
callbacks.append(tb_callback)


# Early Stopping
# --------------
early_stop = True
if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10)
    callbacks.append(es_callback)


# The training generator is the first positional argument of `fit` (there is
# no `generator=` keyword). Both generators never terminate, so the number of
# batches per epoch / per validation pass must be given explicitly.
vqa_model.fit(data_generator(train_questions, batch_size=bs),
              epochs=num_epochs,
              steps_per_epoch=len(train_questions) // bs,
              validation_data=data_generator(valid_questions, batch_size=bs),
              validation_steps=len(valid_questions) // bs,
              callbacks=callbacks)


## Testing

In [None]:
import os
from datetime import datetime


def create_csv(results, results_dir='./'):
    """Write predictions to a timestamped CSV file.

    The file is named ``results_<Mon><day>_<H>-<M>-<S>.csv`` and is created
    inside `results_dir`. It starts with the header ``Id,Category`` followed
    by one ``key,value`` line per entry of `results`, in iteration order.

    Args:
        results: Mapping from sample id to predicted category.
        results_dir: Existing directory in which the file is created.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    out_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(out_path, 'w') as out_file:
        out_file.write('Id,Category\n')
        out_file.writelines(
            ','.join((str(sample_id), str(category))) + '\n'
            for sample_id, category in results.items()
        )


In [None]:
# Load the test annotations (the `with` block closes the file on exit).
with open(TEST_JSON_PATH, 'r', encoding='utf-8') as f:
    SUBSET_test = json.load(f)

test_questions = SUBSET_test.get('questions')

# NOTE(review): the original code read the id through an undefined name `ID`.
# The key below is an assumption - confirm it against test_data.json. Entries
# are indexed with [] so a wrong key fails loudly instead of silently
# collapsing every prediction under the id `None`.
QUESTION_ID_KEY = 'question_id'

# Maps question id -> predicted class index.
results = {}
for question in test_questions:

    img = get_img(question, TEST_IMG_PATH)

    # Image preprocessing; a leading batch axis of size 1 is added because
    # `predict` expects batched inputs.
    img_array = preprocess_img(img)
    img_batch = img_array[np.newaxis, ...]

    # Question text preprocessing: the text (not the whole entry) is passed
    # as a one-element list, which yields a batch holding one sequence.
    question_id = question[QUESTION_ID_KEY]
    question_text = question.get(QUESTION_KEY)
    question_batch = preprocess_qst([question_text])

    # Same input order as the model definition: [image, question].
    x_test = [img_batch, question_batch]

    prediction = int(np.argmax(vqa_model.predict(x=x_test)))
    results[question_id] = prediction

create_csv(results)
print("CSV done!")
