In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import time

import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
# Download the two feature archives from Google Drive with gdown. Per the
# logged output they are saved as ImagesFeatures.zip and ImagesFeaturesVAL.zip.
!gdown https://drive.google.com/uc?id=1qL6LnPlDBv9M0W0GZLu0_745UVE_uqXI
!gdown https://drive.google.com/uc?id=13Okseytkvh5VRQm1cRcjJ0inouMFHS1i

Downloading...
From: https://drive.google.com/uc?id=1qL6LnPlDBv9M0W0GZLu0_745UVE_uqXI
To: /content/ImagesFeatures.zip
1.26GB [00:13, 92.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=13Okseytkvh5VRQm1cRcjJ0inouMFHS1i
To: /content/ImagesFeaturesVAL.zip
635MB [00:18, 34.7MB/s]


In [None]:
!mkdir '/content/Features'

In [None]:
!unzip ImagesFeatures.zip -d /content/Features/train && rm ImagesFeatures.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547307.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547308.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547315.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547318.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547348.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547351.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547352.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547363.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547367.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_000000547369.npy  
  inflating: /content/Features/train/ImagesFeatures/COCO_train2014_00000054

In [None]:
!unzip ImagesFeaturesVAL.zip -d /content/Features/val && rm ImagesFeaturesVAL.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163611.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163798.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163775.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163679.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163684.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163640.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163728.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163759.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163782.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000016377.npy  
  inflating: /content/Features/val/ImagesFeaturesVAL/COCO_val2014_000000163628.npy  


In [None]:
# Root directory on the mounted Drive for the preprocessed VQA files: the
# vocabulary text files and the train/val .npy sample lists are read from here.
INPUT_DIR = '/content/drive/MyDrive/VQA_preprocessed'

In [None]:
class vocab:
  """Word <-> index lookup backed by a plain-text vocabulary file.

  The file holds one entry per line and an entry's index is its 0-based line
  position. Words that are not in the vocabulary resolve to the index of the
  '<unk>' entry.
  """

  def __init__(self, vocab_file):
    """Read `vocab_file` and build the forward and reverse lookup tables."""
    self.vocab = self.load_vocab(vocab_file)
    self.vocab2idx = dict(zip(self.vocab, range(len(self.vocab))))
    self.vocab_size = len(self.vocab)

  def load_vocab(self, vocab_file):
    """Return the whitespace-stripped lines of `vocab_file` as a list."""
    words = []
    with open(vocab_file, 'r') as handle:
      for raw_line in handle:
        words.append(raw_line.strip())
    return words

  def word2idx(self, word):
    """Return the index of `word`, falling back to the '<unk>' index."""
    try:
      return self.vocab2idx[word]
    except KeyError:
      return self.vocab2idx['<unk>']

  def idx2word(self, idx):
    """Return the vocabulary entry stored at position `idx`."""
    return self.vocab[idx]

In [None]:
# Question vocabulary: maps question tokens to indices (used by load_question).
# Note that the *_dir variables hold paths to text files, not directories.
question_vocab_dir = os.path.join(INPUT_DIR, 'preprocessed/Questions/question_vocabs.txt')
question_vocab = vocab(question_vocab_dir)

# Answer vocabulary: its size is the length of the one-hot answer vectors
# built by load_answer.
answer_vocab_dir = os.path.join(INPUT_DIR, 'preprocessed/Annotations/annotation_vocabs.txt')
answer_vocab = vocab(answer_vocab_dir)

In [None]:
def load_features(features_path):
  """Load and return the array stored in the .npy file at `features_path`."""
  # NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which
  # can execute arbitrary code; only point this at trusted files.
  features = np.load(features_path, allow_pickle=True)
  return features

In [None]:
# Fixed length of every encoded question vector; load_question pads up to
# this many token indices and the question Input layer uses it as its shape.
max_qu_length = 30

In [None]:
# Compiled once at definition time instead of on every call.
_TOKEN_SPLIT_RE = re.compile(r'(\W+)')
_NON_WORD_RE = re.compile(r'\W+')


def tokenizer(sentence):
    """Split a sentence into lower-cased tokens, dropping trailing punctuation.

    The sentence is split on runs of non-word characters. Whitespace-only
    pieces are discarded, while punctuation inside the sentence is kept as
    its own token. A final punctuation token (e.g. the closing '?') is
    removed.

    Args:
        sentence: the text to tokenize.

    Returns:
        A list of token strings (possibly empty).
    """
    pieces = (piece.strip() for piece in _TOKEN_SPLIT_RE.split(sentence.lower()))
    tokens = [piece for piece in pieces if piece]
    # Only strip the last token when it really is punctuation; dropping it
    # unconditionally would lose the last word of a sentence that does not
    # end with a punctuation mark.
    if tokens and _NON_WORD_RE.fullmatch(tokens[-1]):
        tokens.pop()
    return tokens

In [None]:
def load_question(question):
  """Encode a question string as a fixed-length vector of vocabulary indices.

  The question is tokenized, each token is mapped through question_vocab,
  and the result is right-padded with the '<pad>' index up to max_qu_length.
  Questions with more than max_qu_length tokens are truncated.

  Args:
    question: the question text.

  Returns:
    A 1-D NumPy integer array of length max_qu_length.
  """
  # Truncate first: assigning a longer list into the fixed-size slice below
  # would raise ValueError.
  qu_tokens = tokenizer(question)[:max_qu_length]
  qu2idx = np.full(max_qu_length, question_vocab.word2idx('<pad>'))
  qu2idx[:len(qu_tokens)] = [question_vocab.word2idx(token) for token in qu_tokens]
  return qu2idx

In [None]:
def load_answer(answer):
  """Return a one-hot vector (length answer_vocab.vocab_size) for `answer`.

  The position set to 1 is the index answer_vocab assigns to the answer
  string; answers outside the vocabulary use the '<unk>' index.
  """
  one_hot = np.zeros(answer_vocab.vocab_size)
  one_hot[answer_vocab.word2idx(answer)] = 1
  return one_hot

In [None]:
def preprocess(features_path, question, answer):
  """Turn one (features path, question, answer) sample into model-ready arrays.

  Each argument is expected to expose `.numpy()` returning UTF-8 bytes (this
  function is invoked through tf.py_function in build_dataset).

  Returns:
    A tuple (image features, question index vector, one-hot answer vector).
  """
  path_str, question_str, answer_str = (
      tensor.numpy().decode('utf-8')
      for tensor in (features_path, question, answer))

  return (load_features(path_str),
          load_question(question_str),
          load_answer(answer_str))

In [None]:
def get_tensors_ready(x,y,z):
  """Attach static shapes to one sample and regroup it as ((inputs), target).

  Applied right after the tf.py_function map in build_dataset, whose outputs
  carry no static shape information.

  Args:
    x: image features, set to shape (49, 512) to match the image Input layer.
    y: encoded question, of length max_qu_length.
    z: one-hot answer, of length answer_vocab.vocab_size.

  Returns:
    ((x, y), z), the (inputs, target) structure Keras expects from a dataset.
  """
  x.set_shape((49,512))
  # Derive these from the objects that actually produce the vectors instead
  # of hard-coding 30 / 1000, so the shapes cannot drift out of sync.
  y.set_shape((max_qu_length,))
  z.set_shape((answer_vocab.vocab_size,))
  return ((x,y),z)

In [None]:
def build_dataset(file_name, batch_size=128, shuffle=None):
  """Build a batched tf.data pipeline from a preprocessed sample list.

  Args:
    file_name: name of a .npy file under INPUT_DIR. It must contain 'train'
      or 'val', which selects the directory the image features are read
      from. Each entry is indexed as [0] image file name, [1] question,
      [2] answer.
    batch_size: number of samples per batch.
    shuffle: whether to reshuffle the samples on every pass. Defaults to
      True for the 'train' split and False otherwise. Shuffling is done here
      because Keras ignores fit(shuffle=...) for tf.data.Dataset inputs.

  Returns:
    A tf.data.Dataset yielding ((image_features, question_vector), answer).

  Raises:
    ValueError: if file_name contains neither 'train' nor 'val'.
  """
  if 'train' in file_name:
    features_path = '/content/Features/train/ImagesFeatures'
  elif 'val' in file_name:
    features_path = '/content/Features/val/ImagesFeaturesVAL'
  else:
    # Previously this fell through and failed later with an unbound variable.
    raise ValueError(
        "file_name must contain 'train' or 'val', got %r" % (file_name,))
  if shuffle is None:
    shuffle = 'train' in file_name

  data_dir = os.path.join(INPUT_DIR, file_name)
  data = np.load(data_dir, allow_pickle=True).tolist()

  features = []
  questions = []
  answers = []
  for element in data:
    # Swap the 3-character image extension for 'npy' to get the feature file.
    features.append(os.path.join(features_path, element[0][:-3] + 'npy'))
    questions.append(element[1])
    answers.append(element[2])

  dataset = tf.data.Dataset.from_tensor_slices((features, questions, answers))
  dataset = dataset.cache()
  if shuffle and features:
    # Shuffle the lightweight (path, question, answer) strings, before the
    # expensive feature loading, with a buffer covering the whole split.
    dataset = dataset.shuffle(len(features))
  dataset = dataset.map(
      lambda x, y, z: tf.py_function(func=preprocess,
                                     inp=[x, y, z],
                                     Tout=(tf.float32, tf.int32, tf.int32)),
      num_parallel_calls=tf.data.AUTOTUNE)
  dataset = dataset.map(get_tensors_ready)
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(tf.data.AUTOTUNE)
  return dataset

In [None]:
# Parse the GloVe text file: each line is a word followed by the components
# of its 300-d vector. The running sum is kept to compute the mean vector.
embedding_dict = {}
average_vec = np.zeros(300, "float32")
n = 0
with open("/content/drive/MyDrive/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      continue  # tolerate blank lines instead of failing on values[0]
    word = values[0]
    vector = np.asarray(values[1: ], "float32")
    embedding_dict[word] = vector
    average_vec += vector
    n += 1

if n:  # avoid a division by zero if the file held no vectors
  average_vec = average_vec / n

# One row per question-vocabulary entry; words without a GloVe vector keep
# an all-zero row.
vocab_size = question_vocab.vocab_size
embedding_matrix = np.zeros((vocab_size, 300))

for i, word in enumerate(question_vocab.vocab):
  emb_vector = embedding_dict.get(word)
  if emb_vector is not None:
    embedding_matrix[i] = emb_vector

# Give unknown words the average of all word embeddings. The row is looked
# up through the vocabulary rather than assumed to be index 1.
embedding_matrix[question_vocab.word2idx('<unk>')] = average_vec

In [None]:
# Image branch: flatten the (49, 512) feature map and project it to 1024-d.
im_input = tf.keras.layers.Input(shape=(49, 512))
x1 = tf.keras.layers.Flatten()(im_input)
x1 = tf.keras.layers.Dense(1024, activation='tanh')(x1)

# Question branch: GloVe-initialised (trainable) embeddings fed to an LSTM.
vocab_size = question_vocab.vocab_size
# `shape` must be a tuple; a bare int is not accepted by every Keras version.
q_input = tf.keras.layers.Input(shape=(max_qu_length,))
x2 = tf.keras.layers.Embedding(input_dim=vocab_size,
                               output_dim=300,
                               input_length=max_qu_length,
                               embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                               trainable=True)(q_input)
_, state_h, state_c = tf.keras.layers.LSTM(512, return_state=True)(x2)
# The hidden and cell states (512 each) are concatenated into a 1024-d
# vector, matching the width of the image branch.
x2 = tf.keras.layers.concatenate([state_h, state_c])

# Fuse the two branches by element-wise multiplication.
out = tf.keras.layers.Multiply()([x1, x2])

# Classifier head: one output unit per answer-vocabulary entry.
num_answers = answer_vocab.vocab_size
out = tf.keras.layers.Dense(num_answers, activation='tanh')(out)
out = tf.keras.layers.Dropout(0.5)(out)
out = tf.keras.layers.Dense(num_answers, activation='tanh')(out)
out = tf.keras.layers.Dropout(0.5)(out)
out = tf.keras.layers.Dense(num_answers, activation='softmax')(out)

# Assemble and compile; targets are one-hot vectors, hence categorical
# cross-entropy.
model = tf.keras.Model(inputs=[im_input, q_input], outputs=[out])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])



In [None]:
model_checkpoint_path = "/content/checkpoints_features/checkpoint-{epoch:02d}.h5"
training_history_path = "/content/checkpoints_features/history.log"

# Create the output directory up front so neither callback depends on it
# already existing (e.g. after a fresh runtime start).
os.makedirs(os.path.dirname(model_checkpoint_path), exist_ok=True)
os.makedirs(os.path.dirname(training_history_path), exist_ok=True)

# Save the model after every epoch; the epoch number is filled into the name.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_checkpoint_path)

# Append per-epoch metrics to a CSV log.
history_callback = tf.keras.callbacks.CSVLogger(training_history_path, append=True)

In [None]:
train_dataset = build_dataset('train.npy')
val_dataset = build_dataset('val.npy')

# No `shuffle` argument: Keras ignores it when the training data is a
# tf.data.Dataset, so any shuffling has to happen inside the dataset pipeline.
history = model.fit(train_dataset,
                    epochs=15,
                    validation_data=val_dataset,
                    callbacks=[model_checkpoint, history_callback])

Epoch 1/15




Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Reload the raw training sample list to spot-check a prediction. The path is
# built from INPUT_DIR rather than repeating the literal used for that
# constant.
data = np.load(os.path.join(INPUT_DIR, 'train.npy'), allow_pickle=True).tolist()

In [None]:
# First training sample: image file name, question text, ground-truth answer.
img_name, question, ans = data[0][:3]

In [None]:
# Show the raw question and its ground-truth answer before encoding them.
print(question, ans)

What is this photo taken looking through? net


In [None]:
# Encode the sample with the same helpers the training pipeline uses.
# NOTE: `question` and `ans` are overwritten in place, so this cell cannot be
# re-run without first re-running the cell that picks the sample.
question = load_question(question)
ans = load_answer(ans)
features_file = os.path.join('/content/Features/train/ImagesFeatures', img_name[:-3] + 'npy')
img = np.load(features_file, allow_pickle=True)

In [None]:
# Add a leading batch axis of size 1 to each input, then take the index of
# the highest-scoring answer for every row of the prediction.
img = tf.expand_dims(img, axis=0)
question = tf.expand_dims(question, axis=0)
answer_scores = model.predict((img, question))
ans = tf.argmax(answer_scores, axis=1)

In [None]:
# Map the predicted answer index back to its word.
predicted_word = answer_vocab.idx2word(ans[0])
print(predicted_word)

net
