## Imports

In [1]:
import os
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

## System checkup

In [2]:
# Set the seed for random operations.
# This makes our experiments reproducible.
# Note: only TensorFlow's global seed is set in this cell.
SEED = 5231
tf.random.set_seed(SEED)  

# Get current working directory
cwd = os.getcwd()

# Enable GPU memory growth:
# allocate only as much GPU memory as needed
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


## Data preparation

In [3]:
# Batch size
bs = 6

# Fraction of the loaded training questions held out for validation
val_size = 0.2

# Image shape in pixels (used as the CNN input shape)
img_w = 480
img_h = 320

# Question shape
# NOTE(review): q_len is not referenced by the cells visible here -- the
# questions are padded to the measured maximum length instead; confirm.
q_len = 100

# Dictionary limits
MAX_NUM_QUESTIONS = 5000   # cap on the training/validation questions loaded
MAX_NUM_WORDS = 20000      # cap passed to the Tokenizer

# Answer classes. The position in this list is the integer label used as
# the training target; the order is lexicographic, so '10' follows '1'.
classes = ['0',     #0
           '1',     #1
           '10',    #2
           '2',     #3
           '3',     #4
           '4',     #5
           '5',     #6
           '6',     #7
           '7',     #8
           '8',     #9
           '9',     #10
           'no',    #11
           'yes']   #12

# Derived from the list so the two can never get out of sync
num_classes = len(classes)

In [4]:
import json

dataset_dir = os.path.join(cwd, 'dataset_vqa')

# --- Training / validation annotations ------------------------------------
# Each entry is a dict with (at least) 'question', 'image_filename' and
# 'answer' keys.
with open(os.path.join(dataset_dir, 'train_data.json'), 'r') as f:
    train_d = json.load(f)["questions"]

# Keep at most MAX_NUM_QUESTIONS entries
train_d = train_d[:MAX_NUM_QUESTIONS]

# All training and validation question strings, in file order
questions = [e['question'] for e in train_d]

# The first `val_size` fraction is held out for validation, the rest is used
# for training (the entries are not shuffled before the split).
num_valid = int(len(train_d) * val_size)
valid_data = train_d[:num_valid]
train_data = train_d[num_valid:]
print('Train questions: ' + str(len(train_data)))
print('Validation questions: ' + str(len(valid_data)))

# --- Test annotations ------------------------------------------------------
# Each entry is a dict with (at least) 'question' and 'image_filename' keys.
with open(os.path.join(dataset_dir, 'test_data.json'), 'r') as f:
    test_data = json.load(f)["questions"]

# All test question strings, in file order
questions_test = [e['question'] for e in test_data]
print('Test questions: ' + str(len(test_data)))

Train questions: 4000
Validation questions: 1000
Test questions: 3000


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Train/validation questions first, test questions after them; later cells
# rely on this ordering when slicing the encoded questions.
questions_all = questions + questions_test

# Create Tokenizer to convert words to integers
# (the vocabulary is fitted on train, validation and test questions together)
q_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
q_tokenizer.fit_on_texts(questions_all)
q_tokenized = q_tokenizer.texts_to_sequences(questions_all)

# Word -> integer index mapping learned by the tokenizer
q_wtoi = q_tokenizer.word_index
print('Total question words:', len(q_wtoi))

# Longest tokenized question; used as the padding length and model input size
max_q_length = max(len(sentence) for sentence in q_tokenized)
print('Max question length:', max_q_length)

Total question words: 70
Max question length: 39


In [6]:
# Pad to max question length
q_encoder_inputs = pad_sequences(q_tokenized, maxlen=max_q_length)
print("Question encoder inputs shape:", q_encoder_inputs.shape)

# Rows of q_encoder_inputs follow the order of questions_all:
#   [0, num_valid)              -> validation questions
#   [num_valid, num_trainval)   -> training questions
#   [num_trainval, end)         -> test questions
num_trainval = len(train_d)
num_valid = int(num_trainval * val_size)

q_encoder_inputs_valid = q_encoder_inputs[:num_valid]
q_encoder_inputs_train = q_encoder_inputs[num_valid:num_trainval]
# The test questions come AFTER the train/validation ones. Slicing from the
# start (as was done before) paired test images with training questions.
q_encoder_inputs_test = q_encoder_inputs[num_trainval:]

# Every split must line up one-to-one with its annotation list
assert len(q_encoder_inputs_train) == len(train_data)
assert len(q_encoder_inputs_valid) == len(valid_data)
assert len(q_encoder_inputs_test) == len(test_data)

Question encoder inputs shape: (8000, 39)


In [7]:
# Custom data generator
# data: json file with (question, image_name, answer) tuples
# batch_size: size of batches
import math
from skimage.io import imread

class VQASequence(utils.Sequence):
    """Batch generator for training/validation samples.

    Each batch is ``([images, questions], answers)`` where ``images`` are the
    RGB pictures scaled to [0, 1], ``questions`` are the encoded question
    vectors and ``answers`` are integer indices into ``classes``.

    Args:
        data: list of dicts, each with 'image_filename' and 'answer' keys.
        q: encoded questions, aligned index-by-index with ``data``.
        batch_size: number of samples per batch.
        img_dir: folder the image file names are resolved against. Defaults
            to the training image folder, relative to the working directory.

    Raises:
        ValueError: if an answer is not one of ``classes``.
    """

    def __init__(self, data, q, batch_size, img_dir='dataset_vqa/train/'):
        # (image_name, question) pairs
        self.x = list(zip([e['image_filename'] for e in data], q))
        # target class indices
        self.y = [classes.index(e['answer']) for e in data]
        self.batch_size = batch_size
        self.img_dir = img_dir

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return math.ceil(len(self.y) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``([images, questions], answers)``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        images = np.array([self.__imgtoarray__(name) for name, _ in batch_x])
        batch_questions = np.array([question for _, question in batch_x])
        return [images, batch_questions], np.array(batch_y)

    def __imgtoarray__(self, img):
        """Load image file ``img`` as an RGB array with values in [0, 1]."""
        # The context manager releases the file handle as soon as the pixel
        # data has been converted.
        with Image.open(os.path.join(self.img_dir, img)) as im:
            np_im = np.array(im.convert('RGB'))
        return np_im/255.0
        
class VQASequenceTest(utils.Sequence):
    """Batch generator for test samples (no ground-truth answers).

    Each batch is ``([images, questions], dummy_targets)`` where ``images``
    are the RGB pictures scaled to [0, 1], ``questions`` are the encoded
    question vectors and ``dummy_targets`` is an array of zeros kept only so
    the batch has the same structure as the training batches.

    Args:
        data: list of dicts, each with an 'image_filename' key.
        q: encoded questions, aligned index-by-index with ``data``.
        batch_size: number of samples per batch.
        img_dir: folder the image file names are resolved against. Defaults
            to the test image folder, relative to the working directory.
    """

    def __init__(self, data, q, batch_size, img_dir='dataset_vqa/test/'):
        # (image_name, question) pairs
        self.x = list(zip([e['image_filename'] for e in data], q))
        # placeholder targets, one per sample
        self.y = [0 for e in data]
        self.batch_size = batch_size
        self.img_dir = img_dir

    def __len__(self):
        """Number of batches (the last one may be smaller)."""
        return math.ceil(len(self.y) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``([images, questions], zeros)``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]

        images = np.array([self.__imgtoarray__(name) for name, _ in batch_x])
        batch_questions = np.array([question for _, question in batch_x])
        return [images, batch_questions], np.array(batch_y)

    def __on_epoch_end__(self):
        # NOTE(review): the hook Keras invokes on a Sequence is named
        # `on_epoch_end`; this dunder-style name is only run when called
        # explicitly. Kept unchanged for backward compatibility.
        print('end')

    def __imgtoarray__(self, img):
        """Load image file ``img`` as an RGB array with values in [0, 1]."""
        # The context manager releases the file handle as soon as the pixel
        # data has been converted.
        with Image.open(os.path.join(self.img_dir, img)) as im:
            np_im = np.array(im.convert('RGB'))
        return np_im/255.0
            

In [8]:
# One batch generator per split: each pairs the annotation dicts of the split
# with the padded question slice passed alongside, using batch size `bs`.
vqa_generator_train = VQASequence(train_data, q_encoder_inputs_train, bs)
vqa_generator_valid = VQASequence(valid_data, q_encoder_inputs_valid, bs)
# Test generator: yields placeholder targets (no answers are read)
vqa_generator_test = VQASequenceTest(test_data, q_encoder_inputs_test, bs)
#print(vqa_generator_train.__getitem__(0))

In [9]:
def plotImages(question_arr, subset):
    """Show the image of each question entry, titled with the question text.

    Args:
        question_arr: list of dicts with 'image_filename' and 'question' keys.
        subset: name of the sub-folder of ``dataset_dir`` holding the images
            (e.g. 'train' or 'test').
    """
    num = len(question_arr)
    if num == 0:
        # Nothing to draw; plt.subplots rejects a grid with zero rows
        return

    # squeeze=False always returns a 2-D array of Axes, so a single question
    # (num == 1) works too instead of yielding a bare Axes object.
    fig, axes = plt.subplots(num, 1, figsize=(30,30), squeeze=False)
    for q, ax in zip(question_arr, axes.flatten()):
        with Image.open(os.path.join(dataset_dir, subset, q["image_filename"])) as img:
            ax.imshow(img)
        ax.set_title(q["question"])
        ax.axis('off')

    plt.show()

In [10]:
#plotImages(train_data[:2], 'train')
#plotImages(test_data[:2], 'test')

## Model

In [11]:
# CNN for image encoding

# Load VGG16 Model (ImageNet weights, convolutional base only: no classifier head)
vgg = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(img_h, img_w, 3))

# When False the whole VGG16 base is frozen; when True only the first
# `freeze_until` layers are frozen and the remaining ones are trained.
finetuning = False

if finetuning:
    freeze_until = 15 # layer from which we want to fine-tune
    
    for layer in vgg.layers[:freeze_until]:
        layer.trainable = False
else:
    vgg.trainable = False

# Image encoding: VGG16 feature maps flattened into a single vector per image
image_model = models.Sequential()
image_model.add(vgg)
image_model.add(layers.Flatten())

# Image input of the final model; same shape as the VGG16 input
image_input = layers.Input(shape=(img_h, img_w, 3))

encoded_image = image_model(image_input)

In [12]:
# RNN for question encoding

# Question input: one padded integer sequence of length max_q_length
question_input = layers.Input(shape=[max_q_length])
# Embedding table has one row per tokenizer word plus one extra row for index 0
# (presumably the padding value produced by pad_sequences -- confirm)
embedded_question = layers.Embedding(input_dim=len(q_wtoi)+1, output_dim=10, input_length=max_q_length)(question_input)
# The LSTM output (128 units) is the encoded question
encoded_question = layers.LSTM(128)(embedded_question)

In [13]:
# Combine CNN and RNN to create the final model
# The encoded question and the encoded image are concatenated and fed to a
# single softmax layer with one output per answer class.
merged = layers.concatenate([encoded_question, encoded_image])
output = layers.Dense(num_classes, activation='softmax')(merged)
# Input order [image, question] matches the [images, questions] lists
# yielded by the batch generators.
vqa_model = models.Model(inputs=[image_input, question_input], outputs=output)

vqa_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 39)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 39, 10)       710         input_3[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 320, 480, 3) 0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 128)          71168       embedding[0][0]                  
______________________________________________________________________________________________

In [14]:
cwd = os.getcwd()

# Folder which will contain the results of all the runs of the network.
# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
exps_dir = os.path.join(cwd, 'classification_experiments')
os.makedirs(exps_dir, exist_ok=True)

now = datetime.now().strftime('%b%d_%H-%M-%S')

model_name = 'CNN'

# Folder which will contain the callback results of this single execution
exp_dir = os.path.join(exps_dir, model_name + '_' + str(now))
os.makedirs(exp_dir, exist_ok=True)

callbacks = []

# Callback1 - Model checkpoint (disabled unless ckpt is set to True)
ckpt = False

if ckpt:
    ckpt_dir = os.path.join(exp_dir, 'ckpts')
    os.makedirs(ckpt_dir, exist_ok=True)

    ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'), 
                                                       save_weights_only=True) # False to save the model directly
    callbacks.append(ckpt_callback)

# Callback2 - Early Stopping
# Stops when val_loss has not improved for 2 epochs and restores the best weights
early_stop = True

if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    callbacks.append(es_callback)

In [15]:
# Optimization params
# Upper bound on the number of epochs passed to training
epoch_num = 100

# Loss
# Sparse variant: targets are integer class indices, not one-hot vectors
loss = tf.keras.losses.SparseCategoricalCrossentropy()

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00005)

# Validation metrics
metrics = ['accuracy']

# Compile Model
vqa_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [16]:
# Train on the batch generators. Model.fit accepts keras.utils.Sequence
# objects directly; fit_generator is deprecated in TensorFlow 2.x.
history = vqa_model.fit(vqa_generator_train,
                        callbacks=callbacks,
                        epochs=epoch_num,
                        steps_per_epoch=len(vqa_generator_train),
                        validation_data=vqa_generator_valid,
                        validation_steps=len(vqa_generator_valid))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


## Predict

In [17]:
from datetime import datetime

def create_csv(results, results_dir='./Test_Result'):
    """Write predictions to a timestamped CSV file inside ``results_dir``.

    The file is named ``results_<Mon><day>_<HH>-<MM>-<SS>.csv`` and contains
    an ``Id,Category`` header followed by one ``key,value`` line per entry.

    Args:
        results: mapping from sample id to predicted category; keys and
            values are converted with ``str``.
        results_dir: output folder; created if it does not exist yet.
    """
    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    # Create the output folder on demand instead of failing with
    # FileNotFoundError when it is missing (an empty path means the cwd).
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in results.items():
            # str(key) so that non-string ids (e.g. ints) are accepted too
            f.write(str(key) + ',' + str(value) + '\n')

In [18]:
# Class probabilities for every test sample. Model.predict accepts
# keras.utils.Sequence objects directly; predict_generator is deprecated.
prediction = vqa_model.predict(vqa_generator_test)

In [19]:
# Index of the highest-scoring class for each predicted row
prediction_argmax = [np.argmax(scores) for scores in prediction]

In [20]:
# Map each test sample to its predicted class index.
# NOTE(review): ids are the positional index of the sample in the test set;
# if the expected 'Id' column is the entry's own question id instead,
# switch the key accordingly -- confirm against the submission format.
results = {str(idx): label for idx, label in enumerate(prediction_argmax)}

create_csv(results)