In [1]:
import os
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
import datetime



# Set the seed for easy reproducibility
SEED = 1234
tf.random.set_seed(SEED)  

# Get current working directory
cwd = os.getcwd()

# Set GPU memory growth 
# With memory growth enabled, only as much GPU memory as needed is allocated
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


For this challenge, a custom generator had to be built. I therefore built a train/validation generator that yields batches of random questions sampled from the training set, together with the corresponding images and answers. Before that, I tokenize the words contained in the training questions and then create the embedding matrix used by the first (embedding) layer of the network. The matrix is built from a good pre-trained embedding, in this case a GloVe embedding I found on Kaggle (https://www.kaggle.com/takuok/glove840b300dtxt); this way I can use it directly in a Kaggle kernel just by adding the dataset to the kernel. Regarding the embedding dimension, I didn't find the model to change much when going from, say, 50 to 70, mainly because of the small number of distinct words in the questions. 

In [2]:
import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Seed Python's `random` module, which the batch generator uses to sample questions
random.seed(1235)
# Image height/width used as the CNN input shape
img_h = 320
img_w = 480 
# Passed to the Tokenizer as `num_words`
max_words = 100
# Number of components kept from each pre-trained word vector
embedding_dim = 70
num_classes = 13
# NOTE: despite the name, this is the fraction of questions used for TRAINING;
# the generator takes the remaining tail of the question list for validation
val_split = 0.8
# Length every tokenized question is padded/truncated to
max_len = 25
    
# The 13 possible answers
classes = [ '0',
            '1',
            '10',
            '2',
            '3',
            '4',
            '5',
            '6',
            '7',
            '8',
            '9',
            'no',
            'yes'
          ]



# Fit the answer -> integer and integer -> one-hot encoders once, on the full class list,
# so that the generators only have to call `transform`
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)
integer_encoded = integer_encoder_.transform(classes)
onehot_encoder = OneHotEncoder(sparse=False)
# OneHotEncoder expects a 2-D column of integer labels
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)


def data_generator(mode, tokenizer, batch_size = 12):
    """Endlessly yield random training/validation batches for the VQA model.

    The question list in ``train_data.json`` is split by position: the first
    ``val_split`` fraction is the training pool, the remainder is the
    validation pool.

    Args:
        mode: ``'train'`` samples from the training pool, ``'validation'``
            from the validation pool; any other value samples from all
            questions.
        tokenizer: fitted Keras ``Tokenizer`` used to encode the questions.
        batch_size: number of questions per batch (sampled without
            replacement within a batch).

    Yields:
        ``([images, questions], answers)`` where ``images`` are the RGB
        images scaled to [0, 1] and stacked along a leading batch axis,
        ``questions`` are token sequences post-padded to ``max_len`` and
        ``answers`` are one-hot encoded over ``classes``.
        Stacking assumes all images share the same size -- TODO confirm.
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)

    questions = data_raw['questions']
    split_at = int(len(questions) * val_split)

    # The pool of candidate indices does not change between batches,
    # so it is computed once instead of on every iteration.
    if mode == 'validation':
        candidates = range(split_at, len(questions))
    elif mode == 'train':
        candidates = range(0, split_at)
    else:
        candidates = range(0, len(questions))

    image_dir = '/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/'

    while True:
        # Select the questions for this batch randomly
        batch_addresses = random.sample(candidates, batch_size)

        batch_input_img = []
        batch_input_txt = []
        batch_output = []

        for i in batch_addresses:
            question = questions[i]

            # Load the image as RGB and scale pixel values to [0, 1]
            img = Image.open(image_dir + question['image_filename']).convert('RGB')
            batch_input_img.append(np.true_divide(np.array(img), 255))

            batch_input_txt.append(question['question'])
            batch_output.append(question['answer'])

        # Build the batch arrays once, after all samples have been collected
        batch_x_img = np.array(batch_input_img)

        # Prepare sequences with tokens and padding
        tokenized = tokenizer.texts_to_sequences(batch_input_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        # Answers: string label -> integer label -> one-hot vector
        y_c = integer_encoder_.transform(np.array(batch_output))
        y_c = y_c.reshape(len(y_c), 1)
        batch_y = onehot_encoder_.transform(y_c)

        # If only an LSTM is used, without an Embedding layer, add a feature axis:
        #batch_x_txt = np.expand_dims(batch_x_txt, axis=-1)

        yield ([batch_x_img,batch_x_txt], batch_y )
        
#same for the test generator apart from the fact that here we do not work with batches
def test_generator():
    """Yield the test set one question at a time, in file order.

    Uses the module-level ``tokenizer`` and ``max_len``.

    Yields:
        ``([image, question], question_id)`` where ``image`` is the RGB image
        scaled to [0, 1] with a leading batch axis of size 1, ``question`` is
        the token sequence post-padded to ``max_len`` (also with a batch axis
        of size 1) and ``question_id`` is taken verbatim from the JSON file.
        The generator ends after the last question.
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
        data_raw = json.load(f)

    image_dir = '/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/'

    # Iterating over the list itself visits every question exactly once and
    # cannot index one past the end.
    for question in data_raw['questions']:
        img = Image.open(image_dir + question['image_filename']).convert('RGB')
        # Scale to [0, 1] and add the batch axis expected by model.predict
        batch_x_img = np.expand_dims(np.true_divide(np.array(img), 255), 0)

        tokenized = tokenizer.texts_to_sequences([question['question']])
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        batch_y = question['question_id']

        yield ([batch_x_img,batch_x_txt], batch_y )
           

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build the initial embedding-layer weights from a GloVe-style text file.

    Each line of the file is expected to be ``word v1 v2 ... vk`` separated by
    whitespace. For every word of ``word_index`` found in the file, the first
    ``embedding_dim`` components of its vector are copied into the row given
    by the word's index.

    Args:
        filepath: path of the embedding text file (read as UTF-8).
        word_index: mapping ``word -> row index`` (indices start at 1).
        embedding_dim: number of vector components to keep per word.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row 0 and the
        rows of words that do not appear in the file stay all-zero.

    Raises:
        ValueError: if a matching vector has fewer than ``embedding_dim``
            components.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Words that still need a vector; lets us stop reading the (large) file
    # as soon as every word has been found.
    missing = set(word_index)

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            if not missing:
                break

            parts = line.split()
            if not parts or parts[0] not in missing:
                continue

            try:
                values = np.array(parts[1:], dtype=np.float32)[:embedding_dim]
            except ValueError:
                # Some embedding files contain tokens with embedded spaces
                # (e.g. "is name@domain.com 0.1 ..."); such a line is not the
                # vector of its first word, so skip it.
                continue

            embedding_matrix[word_index[parts[0]]] = values
            # First valid occurrence wins; later lines cannot overwrite it
            missing.discard(parts[0])

    return embedding_matrix

def create_tokens(tokenizer,mode = 0):
    """Fit ``tokenizer`` on every question of the training set.

    Args:
        tokenizer: Keras ``Tokenizer`` to fit (fitted in place).
        mode: when truthy, also collect the answers so they can be analysed
            (frequencies etc.).

    Returns:
        The fitted tokenizer, or ``(tokenizer, answers)`` when ``mode`` is
        truthy.
    """
    # Go through the training data to collect all the needed words
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)

    entries = data_raw['questions']
    tot_txt = [entry['question'] for entry in entries]
    container = [entry['answer'] for entry in entries] if mode else []

    tokenizer.fit_on_texts(tot_txt)

    if not mode:
        return tokenizer
    return (tokenizer,container)

def creator(max_words):
    """Create the tokenizer, the embedding matrix and a training generator.

    Args:
        max_words: value passed to the ``Tokenizer`` as ``num_words``.

    Returns:
        ``(tokenizer, embedding_matrix, vocab_size, reader)`` where
        ``vocab_size`` is ``len(tokenizer.word_index) + 1`` and ``reader`` is
        a ``data_generator`` in ``'train'`` mode.
    """
    glove_dir = "../input/glove840b300dtxt/"

    tokenizer = create_tokens(Tokenizer(num_words=max_words,oov_token = 'OOV'))

    # The embedding file is the first entry listed in the GloVe input folder
    filepath = glove_dir + os.listdir(glove_dir)[0]

    word_index = tokenizer.word_index
    embedding_matrix = create_embedding_matrix(filepath, word_index, embedding_dim)
    vocab_size = len(word_index) + 1

    reader = data_generator('train',tokenizer)

    return tokenizer,embedding_matrix,vocab_size,reader

(tokenizer,embedding_matrix,vocab_size,reader) = creator(max_words)



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


I looked at some indicators to check whether my embedding was good enough (at first I was using a lighter, much worse embedding), such as the cosine similarity between words I knew to have similar or very different meanings.
I also saw that the dataset is really unbalanced. It might help to create a custom loss that gives more weight to some of the answers. 



In [None]:

#DIAGNOSTICS
"""
print(tokenizer.word_index)
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(embedding_matrix)
i=31
c = cos_sim[i]
np.argsort(c)
print(c[29])
"""
# the embedding looks good, since similar words have a higher cosine similarity

#plt.hist(answers, density= True) # the dataset is very unbalanced (yes and no are about equally frequent, which keeps the classification part balanced, but 85% of the counting answers are <= 3)
# 30% of the questions are classification questions: somewhat fewer than the counting ones, though not dramatically so
# What consequences does this have on the performance of a weak learner?








This is the first, naive model I used. It basically consists of an image-embedding branch, which uses the features extracted by VGG16 followed by a global max pooling, and a question-embedding branch, which consists of a word embedding layer (initialised with the embedding matrix obtained above) and two bidirectional LSTM layers.
These two feature vectors are then concatenated and, after a dense layer, fed to the dense classification layer, which outputs the result. This model works okay and gets to ~50% accuracy without spending too much time tuning the hyperparameters. 


In [None]:

def train_net(epochs,batch_size):
    """Build, train and save the baseline VQA model.

    The model concatenates global-max-pooled VGG16 image features with the
    output of two bidirectional LSTM layers run on the embedded question,
    and classifies the result over ``num_classes`` answers.

    Args:
        epochs: maximum number of training epochs (early stopping may end
            training sooner).
        batch_size: number of questions per training/validation batch.

    Returns:
        ``(history, model)``. ``history`` is ``None`` when training is
        interrupted with Ctrl-C; the model is saved to ``model.h5`` in both
        cases.
    """
    arch =  tf.keras.applications.vgg16.VGG16(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))

    # Layers before this index are frozen; the value exceeds the number of
    # layers in `arch`, so the whole backbone is frozen.
    freeze_until = 800
    for layer in arch.layers[:freeze_until]:
        layer.trainable = False

    # Image branch: one feature vector per image
    branch1 = tf.keras.layers.GlobalMaxPooling2D() (arch.output)

    # Question branch: sequences are padded to max_len by the generator, so
    # the Embedding layer must declare that same length.
    text_inputs = tf.keras.Input(shape=[max_len])

    #bidirectional to catch all the context, low dropout as it worsens the performance (like this the net already has too little power)
    emb = tf.keras.layers.Embedding(vocab_size,embedding_dim,
                               input_length=max_len,
                               weights=[embedding_matrix],
                               trainable=False)  (text_inputs)

    branch2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True))(emb)
    branch2 = tf.keras.layers.Activation('tanh')(branch2)
    branch2 = tf.keras.layers.Dropout(0.2)(branch2)
    branch2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(branch2)
    branch2 = tf.keras.layers.Activation('tanh')(branch2)
    branch2 = tf.keras.layers.Dropout(0.2)(branch2)

    #straightforward concatenation
    joint = tf.keras.layers.concatenate([branch1, branch2])
    joint = tf.keras.layers.Dense(1024, activation='relu')(joint)
    joint = tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True)(joint)
    joint = tf.keras.layers.Activation('relu')(joint)
    joint = tf.keras.layers.Dropout(0.1)(joint)

    predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)

    model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])

    model.summary()

    loss = tf.keras.losses.CategoricalCrossentropy()
    lr = 5e-4
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model.compile(loss = loss,
                       optimizer = optimizer,
                       metrics = ['accuracy'])

    callbacks=[]
    callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 10,restore_best_weights=True))
    callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))

    # Stays None if training is interrupted before fit_generator returns
    history = None
    try:
        # NOTE(review): plain Python generators combined with
        # use_multiprocessing=True may hand identical batches to several
        # workers -- confirm this is acceptable or switch to a Sequence.
        history = model.fit_generator(data_generator('train',tokenizer,batch_size),validation_data = data_generator('validation',tokenizer,batch_size), steps_per_epoch= 200, validation_steps = 100, epochs=epochs, callbacks=callbacks,  verbose=1, workers=8, use_multiprocessing=True, max_queue_size=100)
        model.save('model.h5')
    except KeyboardInterrupt:
        # Keep whatever has been learned so far
        model.save('model.h5')

    return (history,model)

epochs = 100
batch_size = 12
(history,model) = train_net(epochs,batch_size)




I then tried to improve my model. I had two main ideas: the first was to implement attention, and the second was to develop an on/off switch based on the question type (existence vs. counting), in order to train two different neural networks for the two tasks, using a regression loss for counting and a classification loss for existence. While this might work, I have no evidence for it and, in the end, as I wasn't sure, I decided not to try it, also because it would have meant simplifying the networks, as Kaggle kernels have a tight limit on RAM. In the future, I could still design a custom fuzzy loss that treats misclassifying the question type as a major error and, for counting, gives an error proportional to the distance from the correct count, keeping in mind that the loss has to be differentiable for backprop to work. Again, I don't know whether it would work.
I did, however, try attention, but here too I ran into the RAM constraint. In fact, I thought of using a top-down attention module (similar to this one: https://arxiv.org/pdf/1708.02711.pdf), but the k * n final features would probably be too many. In the end, I adapted https://github.com/adamcasson/show_ask_attend_answer, a Keras implementation of the famous show-ask-attend-answer paper (I actually found most VQA models implemented in PyTorch, and some in pure TF), which didn't really get great results, though. It should also be noted that it is a poor attention mechanism and could be improved in many ways.
Like this, without too much hyperparameter tuning, I got to ~53%.

In [4]:
import tensorflow as tf


def glimpse(attention_maps, image_features, num_glimpses=2, n1=10, n2=15):
    """Turn attention maps into attention-weighted image feature vectors.

    For each of the first ``num_glimpses`` channels of ``attention_maps``,
    the map is broadcast over the feature channels, multiplied element-wise
    with ``image_features`` and average-pooled over the whole ``(n1, n2)``
    grid. The pooled vectors of all glimpses are concatenated.

    Args:
        attention_maps: 4-D tensor whose last axis indexes the glimpses.
        image_features: 4-D image feature map with an ``(n1, n2)`` grid.
        num_glimpses: number of attention maps to use.
        n1: grid height of the feature map.
        n2: grid width of the feature map.

    Returns:
        The concatenation of one flattened, pooled feature vector per glimpse.
    """
    glimpse_list = []
    for i in range(num_glimpses):
        # Select the i'th attention map. The index is bound as a default
        # argument: a plain closure over `i` is looked up when the lambda is
        # executed, so a later re-execution of the layer would select the
        # last map for every glimpse.
        glimpse_map = tf.keras.layers.Lambda(lambda x, idx=i: x[:, :, :, idx])(attention_maps)
        # Add a channel axis so that tile() can repeat along it: (n1,n2) --> (n1,n2,1)
        glimpse_map = tf.keras.layers.Reshape((n1,n2,1))(glimpse_map)
        # Repeat the attention map over the channel dimension
        glimpse_tile = tf.keras.layers.Lambda(tile)(glimpse_map)
        # Element-wise multiplication to weight the image features
        weighted_features = tf.keras.layers.multiply([image_features, glimpse_tile])
        # Average-pool each channel over the whole grid: (n1,n2,C) --> (1,1,C)
        weighted_average = tf.keras.layers.AveragePooling2D(pool_size=(n1,n2))(weighted_features)
        weighted_average = tf.keras.layers.Flatten()(weighted_average)
        glimpse_list.append(weighted_average)

    return tf.keras.layers.concatenate(glimpse_list)

def tile(x):
    """Repeat ``x`` ``n_f`` times along its last (4th) axis.

    ``n_f`` is read from the module-level hyperparameters when called.
    """
    multiples = [1,1,1,n_f]
    return tf.keras.backend.tile(x, multiples)

def train_net(epochs,batch_size,n1,n2,dropout):
    """Build, train and save the attention-based VQA model.

    Image features from a frozen VGG16 and question features from two
    bidirectional LSTM layers are combined into ``n_gli`` attention maps;
    the attention-weighted image features are then multiplied with the
    (repeated) question features and classified over ``num_classes``.

    Uses the module-level ``n_f`` (number of feature maps of the CNN output)
    and ``n_gli`` (number of glimpses). The reshapes below require the
    question feature size (2 * 128) to equal ``n_f / 2``.

    Args:
        epochs: maximum number of training epochs.
        batch_size: number of questions per training/validation batch.
        n1: grid height of the CNN feature map.
        n2: grid width of the CNN feature map.
        dropout: dropout rate used in the question and attention branches.

    Returns:
        ``(model, history)``; when training is interrupted with Ctrl-C the
        second element is the placeholder ``2`` instead of a history object.
        The model is saved to ``model.h5`` in both cases.
    """
    # Alternatives tried (the final layer is too large for InceptionResNet):
    #arch =  tf.keras.applications.densenet.DenseNet201(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3)) #1920
    #arch = tf.keras.applications.resnet_v2.ResNet152V2(include_top=False, weights='imagenet',input_shape=(img_h, img_w, 3)) #2048
    arch =  tf.keras.applications.vgg16.VGG16(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3)) #512

    # The slice bound exceeds the number of layers, so the whole backbone is frozen
    for layer in arch.layers[:800]:
        layer.trainable = False

    branch1 = arch.output

    # Sequences are padded to max_len by the generator, so the Embedding
    # layer must declare that same length.
    text_inputs = tf.keras.Input(shape=[max_len])

    #simple embedding
    emb = tf.keras.layers.Embedding(vocab_size,embedding_dim,
                               input_length=max_len,
                               weights=[embedding_matrix],
                               trainable=False)  (text_inputs)

    branch2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(156,return_sequences=True))(emb)
    branch2 = tf.keras.layers.Activation('tanh')(branch2)
    branch2 = tf.keras.layers.Dropout(dropout)(branch2)
    branch2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(branch2)
    branch2 = tf.keras.layers.Activation('tanh')(branch2)
    branch2 = tf.keras.layers.Dropout(dropout)(branch2)

    #i need to reshape it in order to use keras tensor operations
    question_tile = tf.keras.layers.RepeatVector(n1*n2)(branch2)
    question_tile = tf.keras.layers.Reshape((n1,n2,int(n_f/2)))(question_tile)

    joint = tf.keras.layers.concatenate([branch1, question_tile])

    #here i'm actually building the attention maps by using the image and the text
    attention = tf.keras.layers.Conv2D(n_f,(1,1))(joint)
    attention_relu = tf.keras.layers.Activation('relu')(attention)
    attention_relu = tf.keras.layers.Dropout(dropout)(attention_relu)

    attention_conv = tf.keras.layers.Conv2D(n_gli, (1,1))(attention_relu)
    attention_maps = tf.keras.layers.Activation('softmax')(attention_conv)

    image_attention = glimpse(attention_maps, branch1, n_gli, n1,n2)

    #here I've tried a different approach in the end, by multiplying the text features for attention. While it makes sense in general,
    #here I've had to take into account the different dimensions and therefore to multiply the attention maps for a concatenation of
    #4 text feature vectors so I'm not really sure it makes sense, but it is the one which works best (commented you have the previous version )

    branch2_n = tf.keras.layers.RepeatVector(n_gli*2)(branch2)
    branch2_n = tf.keras.layers.Flatten() (branch2_n)
    joint2 = tf.keras.layers.multiply([image_attention,branch2_n])

    #joint2 = tf.keras.layers.concatenate([image_attention,branch2])
    #joint2 = tf.keras.layers.Dropout(dropout)(joint2)

    joint2 = tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True)(joint2)

    #dense layer in the end
    joint_fc = tf.keras.layers.Dense(1024)(joint2)
    joint_fc = tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True)(joint_fc)
    joint_fc = tf.keras.layers.Activation('relu')(joint_fc)

    predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint_fc)

    model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])

    model.summary()

    loss = tf.keras.losses.CategoricalCrossentropy()
    lr = 5e-4
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model.compile(loss = loss,
                       optimizer = optimizer,
                       metrics = ['accuracy'])

    #as i soon go OOM, I can easily restart training if needed, having saved the weights with a callback
    # Resuming is optional: start from scratch when no checkpoint dataset is attached
    weights_dir = "../input/weightsnn/"
    saved_weights = os.listdir(weights_dir) if os.path.isdir(weights_dir) else []
    if saved_weights:
        filepath = weights_dir + saved_weights[0]
        model.load_weights(filepath)
    else:
        print("No saved weights found in", weights_dir, "- training from scratch")

    callbacks=[]
    callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 10,restore_best_weights=True))

    # Checkpoints are written at the end of each epoch (the callback's default frequency)
    callbacks.append(tf.keras.callbacks.ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto'))

    callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))

    #return (model,2) (if I just have to return the model)
    try:
        # NOTE(review): plain Python generators combined with
        # use_multiprocessing=True may hand identical batches to several
        # workers -- confirm this is acceptable or switch to a Sequence.
        history = model.fit_generator(data_generator('train',tokenizer,batch_size),validation_data = data_generator('validation',tokenizer,batch_size), steps_per_epoch = 200, validation_steps = 80, epochs=epochs, callbacks=callbacks,  verbose=1, workers=8, use_multiprocessing=True, max_queue_size=100)
        model.save('model.h5')
        return (model,history)
    except KeyboardInterrupt:
        # Keep whatever has been learned so far; 2 is a placeholder for the missing history
        model.save('model.h5')
        return (model,2)
    

# Hyperparameters of the attention model; n_f and n_gli are read as globals
# by tile() and train_net()
epochs = 100
batch_size = 16
# Grid height/width of the CNN output feature map
n1 = 10
n2 = 15
n_f = 512 # it has to be the number of feature maps for last layer of the cnn network
n_gli = 2 # number of glimpses (more glimpses = more time + more imbalance in last layer + possible ram error)
dropout = 0.1
(model,history) = train_net(epochs,batch_size,n1,n2,dropout)



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 320, 480, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 320, 480, 64) 1792        input_3[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 320, 480, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 160, 240, 64) 0           block1_conv2[0][0]               
____________________________________________________________________________________________

In [6]:
#tf.keras.utils.plot_model(model)
#filepath = "../input/weightsnn/" + os.listdir("../input/weightsnn/")[0]
#model.load_weights(filepath)


Here, I get the predictions using my test generator.

In [None]:
# Predict the answer class of every test question.
# `results` maps question_id -> index of the predicted class.
test_gen = test_generator()
results = {}
count = 0 

# A for-loop ends cleanly once the generator has no more questions
for inputs, outputs in test_gen:
    pred = model.predict(inputs)
    results[outputs]=np.argmax(pred)
    print(count)
    count = count +1 
    



This snippet was useful for running diagnostics on the model, mainly in the early stages, when a fault in the training generator made the training useless.

In [None]:
#DIAGNOSTICS

with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
    test_data = json.load(f)

test_gen = test_generator()
train_gen = data_generator('train',tokenizer)
count = 0

# Inverse of tokenizer.word_index: token index -> word
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def sequence_to_text(list_of_indices):
    """Map a tokenized sentence back to its words.

    Indices that are not in the tokenizer vocabulary are returned as None.
    """
    words = [reverse_word_map.get(index) for index in list_of_indices]
    return(words)


# Print the decoded questions and the one-hot answers of 10 training batches
while count<10:
    inputs, outputs = next(train_gen)
    #print(test_data['questions'][count]['question'])
    #print(inputs[1])
    print(list(map(sequence_to_text, inputs[1])))
    print(outputs)
    count = count + 1

"""
while count<5:
    inputs, outputs = next(test_gen)
    pred = model.predict(inputs)
    print(test_data['questions'][count]['question'])
    #print(inputs[1])
    print(list(map(sequence_to_text, inputs[1])))
    count = count +1 
"""

In [None]:
#tf.keras.utils.plot_model(model, to_file='model.png')


# Plot training & validation accuracy, then training & validation loss,
# each in its own figure
for train_key, val_key, title, ylabel in (
        ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model loss', 'Loss')):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped CSV file.

    The file is named ``results_<timestamp>.csv``, is created inside
    ``results_dir`` and contains an ``Id,Category`` header followed by one
    ``key,value`` line per entry of ``results``.

    Args:
        results: mapping from question id to predicted category.
        results_dir: directory in which the file is created.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(csv_path, 'w') as f:
        f.write('Id,Category\n')
        f.writelines(
            ','.join((str(key), str(value))) + '\n'
            for key, value in results.items()
        )

In [None]:
create_csv(results)

In [None]:
#plt.hist(results.values(), density= True)
results.values()
