# Challenge 3 ANNDL: Image Question - Answering

### Approach

Base: the segmentation exercise from the ANNDL exercise session.
Improvement: First, some small changes (such as the learning rate, trying other loss functions, and adding my_IoU). Then I explored some well-known architectures, such as U-Net (written by hand) and others using a library (linked above). Finally, the transfer learning method gave me the best result (I used ImageNet weights, first with VGG16 and then with the Xception classifier), but the worst performance compared to the other methods.

### Directory structure

- Segmentation_Dataset/
    - training/
        - images/
            - img/
                - img1, img2, …, imgN
        - masks/
            - img/
                - mask1, mask2, ... , maskN
    - test/
        - images/
            - img/
                - img1, img2, …, imgN

### Importing Libraries

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os

from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout, Lambda
from tensorflow.keras.layers import Conv2D, Conv2DTranspose
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

# pandas under its conventional alias
import pandas as pd
from cv2 import imread
from tensorflow.keras.utils import to_categorical

# Check the GPU
'''
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)
'''

SEED = 1234
tf.random.set_seed(SEED)  
cwd = os.getcwd()

## Load files

Each question is a dictionary as the following:

{
 - 'question': ...,
 - 'image_filename': ..., 
 - 'answer': ...
 
}

where 'question' is a sentence, e.g., 'How many red objects?'; 'image_filename' is the filename of the image the question refers to; and 'answer' is the ground truth (one of {'0', '1', '10', ..., 'no', 'yes'}).

Test questions have an additional key that is a 'question_id' to uniquely identify your solution when submitting.


In [2]:
dataset_dir = os.path.join(cwd, 'dataset_vqa')

# Number of samples per mini-batch (default batch size of the training generator)
bs = 128

# Image height and width used for the model's image input
img_h, img_w = 320, 480

# Answer vocabulary: each answer string maps to its position in this list,
# which is the lexicographic ordering of the answer strings.
classes = {
    label: index
    for index, label in enumerate(
        ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'no', 'yes']
    )
}

# One class per known answer
num_classes = len(classes)

### Custom generator

In [3]:
import os
import json

# Folders holding the training and test images
train_path = '/Git/tensorflow_exercises/ANNDL_challenges/3/dataset_vqa/train/images'
test_path = '/Git/tensorflow_exercises/ANNDL_challenges/3/dataset_vqa/test/images'

# Read the JSON annotation files into dictionaries. The `with` statement
# closes each file on exit, so no explicit close() call is needed.
with open('/Git/tensorflow_exercises/ANNDL_challenges/3/dataset_vqa/test_data.json', 'r') as f:
    test_data = json.load(f)

with open('/Git/tensorflow_exercises/ANNDL_challenges/3/dataset_vqa/train_data.json', 'r') as f:
    train_data = json.load(f)

# Samples are identified by their position in the "questions" list
d = list(range(len(train_data["questions"])))
test_dict = list(range(len(test_data["questions"])))

# Question texts of both files: all training questions first, then all test questions
all_questions = [entry['question'] for entry in train_data["questions"]]
all_questions += [entry['question'] for entry in test_data["questions"]]

# Sample counts, kept under their historical names
x = len(d)
ind = len(test_dict)

# Positional 80/20 split (no shuffling): the first 80% of the training ids
# are used for training, the remaining ids for validation.
split_at = int(len(d) * 0.8)
train_dict = d[:split_at]
valid_dict = d[split_at:]


In [4]:
# Fit the tokenizer on every question collected in `all_questions`
# (that list is filled from both the training and the test JSON files).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)
vocab_size = len(tokenizer.word_index) + 1  # We add one because the Keras Tokenizer reserves index 0 and never uses it.

def string_to_BOW(train_questions):
    """Encode question strings as bag-of-words rows.

    Delegates to the module-level ``tokenizer`` and returns the result of
    its ``texts_to_matrix`` call on ``train_questions`` unchanged.
    """
    return tokenizer.texts_to_matrix(train_questions)

### Prepare TEST data

In [None]:

file_list = []
def get_train(id_number):
    """Load one *test* sample (image and question) by position.

    Despite its name, this function reads from ``test_data`` and
    ``test_path``.

    Args:
        id_number: index into ``test_data["questions"]``.

    Returns:
        Tuple ``(img, img_question)``: the image as returned by ``imread``
        and the question string.

    Raises:
        FileNotFoundError: if ``imread`` could not load the image.
    """
    image_info = test_data["questions"][id_number]
    image_file = test_path + "/" + image_info["image_filename"]
    img = imread(image_file)
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of later on.
    if img is None:
        raise FileNotFoundError("Could not read image: " + image_file)
    return img, image_info["question"]


def test_generator(files):
    """Yield the test samples one at a time, in the order given by ``files``.

    Every id is appended to the module-level ``file_list`` right before its
    sample is produced, so ``file_list`` records the generation order.

    The generator makes a single pass and then finishes; consumers must not
    request more than ``len(files)`` items.

    Args:
        files: sequence of ids accepted by ``get_train``.

    Yields:
        ``[images, questions]`` where ``images`` is an array holding the
        single image scaled by 1/255 and ``questions`` is the uint8
        bag-of-words encoding of the single question.
    """
    for input_path in files:
        file_list.append(input_path)
        input_vqa, input_question_vqa = get_train(input_path)

        # Wrap the single sample in a list so both arrays get a leading batch axis
        np_batch_input = np.array([input_vqa / 255])
        batch_question_encoded = string_to_BOW([input_question_vqa])
        np_batch_question = np.array(batch_question_encoded, dtype=np.uint8)

        yield [np_batch_input, np_batch_question]

In [None]:
def get_input(id_number):
    """Load one training sample (image, question and answer class) by position.

    Args:
        id_number: index into ``train_data["questions"]``.

    Returns:
        Tuple ``(img, img_question, img_answer)``: the image as returned by
        ``imread``, the question string, and the answer mapped to its
        integer id through ``classes``.

    Raises:
        FileNotFoundError: if ``imread`` could not load the image.
        KeyError: if the answer string is not a key of ``classes``.
    """
    image_info = train_data["questions"][id_number]
    image_file = train_path + "/" + image_info["image_filename"]
    img = imread(image_file)
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of later on.
    if img is None:
        raise FileNotFoundError("Could not read image: " + image_file)

    img_answer = classes[image_info["answer"]]
    return img, image_info["question"], img_answer

def image_generator(files, batch_size=bs):
    """Endlessly yield randomly sampled batches for training/validation.

    Args:
        files: sequence of ids accepted by ``get_input``. Because sampling is
            done without replacement, it must contain at least ``batch_size``
            entries.
        batch_size: number of samples per batch.

    Yields:
        ``([images, questions], answers)`` as NumPy arrays: the images scaled
        by 1/255, the uint8 bag-of-words encoding of the questions, and the
        integer answer ids.
    """
    while True:
        # Draw a fresh random subset of ids for every batch
        batch_paths = np.random.choice(a=files,
                                       size=batch_size,
                                       replace=False)

        batch_input = []
        batch_question = []
        batch_answer = []

        # Read in each input, perform preprocessing and get labels
        for input_path in batch_paths:
            input_vqa, input_question_vqa, answer_vqa = get_input(input_path)
            batch_input.append(input_vqa / 255)
            batch_question.append(input_question_vqa)
            batch_answer.append(answer_vqa)

        # Hand over real arrays (as test_generator does) instead of Python
        # lists, so consumers do not have to guess how to stack the samples.
        np_batch_input = np.array(batch_input)
        np_batch_question = np.array(string_to_BOW(batch_question), dtype=np.uint8)
        np_batch_answer = np.array(batch_answer)

        yield [np_batch_input, np_batch_question], np_batch_answer

### Actual generator (+ tests)

In [None]:
# Random-batch generators over the training and validation ids
train_gen = image_generator(train_dict)
valid_gen = image_generator(valid_dict)
# Generator over the test ids (one sample per yielded item)
test_gen = test_generator(test_dict)
"""
iterator=iter(test_gen)
inputs, answers = next(iterator)

print(inputs[1][1])
print(inputs[0][1])
print(answers)
print(file_list)"""

'\niterator=iter(test_gen)\ninputs, answers = next(iterator)\n\nprint(inputs[1][1])\nprint(inputs[0][1])\nprint(answers)\nprint(file_list)'

In [None]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Multiply
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model, Sequential

# Image branch: two convolution + max-pooling stages, flattened and
# projected to a 32-unit feature vector
im_input = Input(shape=(img_h, img_w, 3), dtype='float32')
x1 = Conv2D(8, 3, padding='same')(im_input)
x1 = MaxPooling2D()(x1)
x1 = Conv2D(16, 3, padding='same')(x1)
x1 = MaxPooling2D()(x1)
x1 = Flatten()(x1)
# Add a final fully-connected layer after the CNN for good measure
x1 = Dense(32, activation='tanh')(x1)

# Question branch: two dense layers over the bag-of-words vector
q_input = Input(shape=(vocab_size,), dtype='uint8')
x2 = Dense(32, activation='tanh')(q_input)
x2 = Dense(32, activation='tanh')(x2)

# Fuse the two 32-unit branches by element-wise multiplication, then classify
out = Multiply()([x1, x2])
out = Dense(32, activation='tanh')(out)
# One softmax unit per answer class
out = Dense(num_classes, activation='softmax')(out)

vqa_model = Model(inputs=[im_input, q_input], outputs=out)
vqa_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 320, 480, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 320, 480, 8)  224         input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D)    (None, 160, 240, 8)  0           conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 160, 240, 16) 1168        max_pooling2d[0][0]              
______________________________________________________________________________________________

In [None]:
%%script false 
# Alternative, larger model. This cell sits under a `%%script false` cell
# magic, i.e. it is meant to be skipped rather than executed.
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Define CNN for Image Input
vision_model = Sequential()
vision_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(img_h, img_w, 3)))
vision_model.add(Conv2D(64, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(128, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Flatten())

image_input = Input(shape=(img_h, img_w, 3))
encoded_image = vision_model(image_input)

# Define RNN for language input
# NOTE(review): the question input has width vocab_size, the same width as the
# bag-of-words vectors produced by string_to_BOW, while the Embedding layer is
# declared with input_dim=10000 and output_dim=1 - confirm this is the
# intended encoding before enabling this cell.
question_input = Input(shape=(vocab_size,) , dtype='uint8')
embedded_question = Embedding(input_dim=10000, output_dim=1, input_length=vocab_size)(question_input)
encoded_question = LSTM(256)(embedded_question)

# Combine CNN and RNN to create the final model
merged = tf.keras.layers.concatenate([encoded_question, encoded_image])
# NOTE(review): 1000 output units here, whereas `classes` in this file has
# 13 entries - confirm before enabling this cell.
output = Dense(1000, activation='softmax')(merged)
vqa_model = Model(inputs=[image_input, question_input], outputs=output)

Couldn't find program: 'false'


In [None]:
# Answers are integer class ids (see `classes`), hence the sparse variants
# of both the loss and the accuracy metric.
lr = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = 'sparse_categorical_crossentropy'
metrics = ['sparse_categorical_accuracy']

vqa_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [None]:
# Include the epoch in the file name (uses `str.format`)
checkpoint_path = "challenge_3/{epoch:02d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Make sure the target folder exists before the first checkpoint is written
os.makedirs(checkpoint_dir, exist_ok=True)

# Create a callback that saves the model's weights at the end of every epoch
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch')

# Number of full batches per epoch, derived from the actual split sizes
train_steps = len(train_dict) // bs
valid_steps = len(valid_dict) // bs

vqa_model.fit_generator(train_gen,
              steps_per_epoch=train_steps,
              epochs=10,
              verbose=1,
              # Register the checkpoint callback so the weights are actually saved
              callbacks=[cp_callback],
              validation_data=valid_gen,
              validation_steps=valid_steps,
              validation_freq=1,
              class_weight=None,
              max_queue_size=10,
              workers=1,
              use_multiprocessing=False,
              shuffle=False,
              initial_epoch=0)

Epoch 1/10
 113/1621 [=>............................] - ETA: 1:00:06 - loss: 1.5870 - sparse_categorical_accuracy: 0.3551

## Inference

In [None]:
# Run the model over test_gen for 3000 steps; test_gen yields one sample per step.
# NOTE(review): assumes the test set holds at least 3000 questions - TODO
# confirm against len(test_dict).
prediction = vqa_model.predict_generator(test_gen,
                                        verbose=1,
                                        steps=3000)

## Create the submission file

In [None]:
# Bare expression: shown as cell output because ast_node_interactivity is "all"
prediction.shape
# NOTE(review): file_list is filled by test_generator in the order samples are
# produced; sorting keeps it aligned with `prediction` only if that order was
# already ascending - confirm.
file_list.sort()

In [None]:
import os
from datetime import datetime

# Map each recorded test id to the index of its highest-scoring class.
# NOTE(review): the keys are the positional ids stored in file_list, while the
# dataset description in this notebook mentions a 'question_id' key for
# submissions - confirm which one the submission format expects.
results_dict = {file_list[i]: prediction[i].argmax() for i in range(3000)}
print(len(results_dict))
    
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write ``results`` to a timestamped CSV file inside ``results_dir``.

    The file is named ``results_<Mon><day>_<H>-<M>-<S>.csv``. It starts with
    the header ``Id,Category`` followed by one ``key,value`` line per entry
    of ``results``, in the mapping's iteration order.

    Args:
        results: mapping from id to predicted category.
        results_dir: folder the file is created in.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    target = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(target, 'w') as out:
        out.write('Id,Category\n')
        out.writelines(
            str(sample_id) + ',' + str(category) + '\n'
            for sample_id, category in results.items()
        )
            
create_csv(results_dict)