In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# VGG16 with ImageNet weights and without its classification top (include_top=False),
# used below as the image feature extractor.
img_model = tf.keras.applications.vgg16.VGG16(weights="imagenet", include_top=False)
# Previously trained VQA model restored from a Drive checkpoint.
# NOTE(review): its architecture is not visible in this notebook; the cells below
# feed it [image features, question token ids] — confirm against the training code.
vqa_model = tf.keras.models.load_model('/content/drive/MyDrive/checkpoints_features/checkpoint-04.h5')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Image branch: a 224x224, 3-channel input is passed through the VGG16 base.
img_input = tf.keras.layers.Input(shape=(224,224,3,))
image_features = img_model(img_input)
# Reshape the VGG16 feature map to (49, 512) before handing it to the VQA model.
# NOTE(review): presumably 7x7 spatial positions x 512 channels — confirm.
# NOTE(review): the variable name is misspelled ("reshpaed"); left as-is in case
# other cells refer to it.
image_features_reshpaed = tf.keras.layers.Reshape((49, 512, ))(image_features)

# Question branch: 30 int64 token ids per example (same length as max_qu_length
# defined further down in this notebook).
question_input = tf.keras.layers.Input(shape=(30,), dtype=tf.int64)
output = vqa_model([image_features_reshpaed, question_input])

In [None]:
# Single end-to-end model: takes [image, question token ids] and returns the
# output of vqa_model, so feature extraction no longer has to be run separately.
full_model = tf.keras.models.Model(inputs=[img_input, question_input], outputs=[output])

Testing the full model:

In [None]:
import os
import re

In [None]:
INPUT_DIR = '/content/drive/MyDrive/VQA_preprocessed'

In [None]:
class vocab:
  """Bidirectional word/index lookup built from a vocabulary text file.

  The file holds one entry per line; an entry's index is its line position
  (starting at 0). After construction the instance exposes:
    vocab      -- list of entries in file order
    vocab2idx  -- dict mapping entry -> index
    vocab_size -- number of entries read
  """

  def __init__(self, vocab_file):
    """Load `vocab_file` and build both lookup directions."""
    words = self.load_vocab(vocab_file)
    self.vocab = words
    self.vocab2idx = dict((word, position) for position, word in enumerate(words))
    self.vocab_size = len(words)

  def load_vocab(self, vocab_file):
    """Return the lines of `vocab_file` as a list, each stripped of surrounding whitespace."""
    entries = []
    with open(vocab_file, 'r') as handle:
      for raw_line in handle:
        entries.append(raw_line.strip())
    return entries

  def word2idx(self, word):
    """Return the index of `word`, or the index of '<unk>' when `word` is not in the vocabulary.

    Raises KeyError if `word` is unknown and the vocabulary has no '<unk>' entry.
    """
    lookup = self.vocab2idx
    if word not in lookup:
      word = '<unk>'
    return lookup[word]

  def idx2word(self, idx):
    """Return the vocabulary entry stored at position `idx`."""
    entries = self.vocab
    return entries[idx]

In [None]:
# Path to the question vocabulary file (despite the "_dir" suffix this is a file path).
question_vocab_dir = os.path.join(INPUT_DIR, 'preprocessed/Questions/question_vocabs.txt')
# Maps question tokens to the integer ids fed to the model (see load_question).
question_vocab = vocab(question_vocab_dir)

# Path to the answer vocabulary file (again a file path, not a directory).
answer_vocab_dir = os.path.join(INPUT_DIR, 'preprocessed/Annotations/annotation_vocabs.txt')
# Used below to turn predicted output indices back into answer strings via idx2word.
answer_vocab = vocab(answer_vocab_dir)

In [None]:
def load_image(image_path):
    """Read a JPEG file and return it as a VGG16-ready tensor.

    The file at `image_path` is decoded to 3 channels, resized to 224x224 and
    passed through the VGG16 `preprocess_input` function. The result has no
    batch axis; callers add one with `tf.expand_dims` before inference.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (224, 224))
    return tf.keras.applications.vgg16.preprocess_input(resized)

In [None]:
max_qu_length = 30

In [None]:
def tokenizer(sentence):
    """Lower-case `sentence` and split it into a list of tokens.

    The text is split on runs of non-word characters; the separators are kept
    as tokens of their own (after stripping whitespace), and empty pieces are
    discarded. A trailing punctuation-only token such as the final "?" is
    removed.

    Args:
        sentence: the question text.

    Returns:
        List of lower-cased tokens without the closing punctuation token.
    """
    pieces = re.split(r'(\W+)', sentence.lower())
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    # Only drop the last token when it is pure punctuation. Unconditionally
    # slicing off the last element silently discarded the final *word* of any
    # question typed without a trailing "?".
    if tokens and re.search(r'\w', tokens[-1]) is None:
        tokens.pop()
    return tokens

In [None]:
def load_question(question):
  """Convert a question string into a fixed-length array of vocabulary ids.

  The question is tokenized, each token is mapped through `question_vocab`,
  and the result is right-padded with the '<pad>' id up to `max_qu_length`.

  Args:
    question: the question text.

  Returns:
    1-D int64 NumPy array of length `max_qu_length`.
  """
  # Keep at most max_qu_length tokens: assigning a longer list into the
  # fixed-size array below would raise ValueError for long questions.
  qu_tokens = tokenizer(question)[:max_qu_length]
  # int64 is requested explicitly to match the dtype of the model's question input.
  qu2idx = np.full(max_qu_length, question_vocab.word2idx('<pad>'), dtype=np.int64)
  qu2idx[:len(qu_tokens)] = [question_vocab.word2idx(token) for token in qu_tokens]
  return qu2idx

In [None]:
load_image('./pic4.jpg')

<tf.Tensor: shape=(224, 224, 3), dtype=float32, numpy=
array([[[ 1.41051392e+02,  1.35350708e+02,  1.29035065e+02],
        [ 1.44427063e+02,  1.37989410e+02,  1.27887505e+02],
        [ 1.48060333e+02,  1.37903687e+02,  1.27685402e+02],
        ...,
        [ 1.49432739e+02,  1.37631042e+02,  1.29423889e+02],
        [ 1.46499634e+02,  1.37833801e+02,  1.30295502e+02],
        [ 1.46929840e+02,  1.37612183e+02,  1.30711182e+02]],

       [[ 4.37697220e+01,  4.26579514e+01,  3.42346268e+01],
        [-4.65586853e+00, -4.56506348e+00, -1.68081665e+01],
        [-2.33925323e+01, -2.11152420e+01, -3.49537430e+01],
        ...,
        [ 1.54806061e+01,  1.87653580e+01,  9.42188263e+00],
        [ 1.12306252e+02,  1.04667519e+02,  9.71599808e+01],
        [ 1.44899719e+02,  1.36000031e+02,  1.28681061e+02]],

       [[ 9.64055634e+00,  1.69143906e+01,  7.25223541e+00],
        [-9.02413177e+00,  3.75221252e-01, -1.45017395e+01],
        [-3.20060501e+01, -2.26164474e+01, -3.53215561e+01],


In [None]:
img = tf.io.read_file('./pic.jpg')
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, (224, 224))
tf.keras.preprocessing.image.save_img('./pic_resized.jpg', img)

In [None]:
pic = tf.io.read_file('./pic.jpg')
pic = tf.image.decode_jpeg(pic, channels=3)
pic = tf.image.resize(pic, (224, 224))

In [None]:
pic[1][0]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([241.01562, 242.01562, 237.01562], dtype=float32)>

In [None]:
pic_preprocessed = tf.zeros((224, 224, 3), tf.float32).numpy()
pic_preprocessed[:, :, 0] = -103.94
pic_preprocessed[:, :, 1] = -117.78
pic_preprocessed[:, :, 2] = 131.32
pic_preprocessed = tf.constant(pic_preprocessed, tf.float32, pic_preprocessed.shape)

In [None]:
pic_preprocessed[0][0]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-103.94, -117.78,  131.32], dtype=float32)>

In [None]:
pic = load_image('./pic4.jpg')
que = "What color is the cat ?"
que = load_question(que)

In [None]:
pic[0][0]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([141.05139, 135.35071, 129.03506], dtype=float32)>

In [None]:
pic =  tf.expand_dims(pic, axis=0)
que = tf.expand_dims(que, axis=0)

In [None]:
# Run the combined model, then list its five highest-scoring answers together
# with each score multiplied by 100.
ans = full_model([pic, que])
answers = tf.math.top_k(ans, 5)
top_answer_ids = answers.indices[0].numpy().tolist()
for answer_id in top_answer_ids:
  print(answer_vocab.idx2word(answer_id), ans[0][answer_id].numpy() * 100)

white 43.44096779823303
black 30.361929535865784
brown 15.473364293575287
gray 3.9943531155586243
black and white 0.8459299802780151


In [None]:
pic[0][0][0]

<tf.Tensor: shape=(), dtype=float32, numpy=135.061>

In [None]:
# NOTE(review): an earlier cell already applies exactly these two expand_dims calls
# to `pic` and `que`. Each run adds another leading axis, so this cell is not
# idempotent — running both in order leaves the tensors with one axis too many.
# Confirm whether this cell is leftover from out-of-order execution.
pic =  tf.expand_dims(pic, axis=0)
que = tf.expand_dims(que, axis=0)

In [None]:
# NOTE(review): `pic_` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All). Presumably it was
# a raw, un-preprocessed image from a since-deleted cell — confirm or remove.
pic_[0][0][0].numpy()

254

In [None]:
# NOTE(review): the original author marked this call as wrong. Presumably the issue
# is that `pic` holds VGG16-preprocessed float values rather than plain 0-255
# pixels, so the saved JPEG does not round-trip them — confirm.
tf.keras.preprocessing.image.save_img('./pic_preprocessed.jpg', pic)

In [None]:
img = tf.io.read_file('./pic_preprocessed.jpg')
img = tf.image.decode_jpeg(img, channels=3)

In [None]:
que = "What color is the shirt ?"
que = load_question(que)

In [None]:
img =  tf.expand_dims(img, axis=0)
que = tf.expand_dims(que, axis=0)

In [None]:
ans = full_model([img, que])

In [None]:
ans

<tf.Tensor: shape=(1, 1000), dtype=float32, numpy=
array([[3.92575329e-03, 3.46922243e-05, 4.53988614e-05, 4.55336853e-08,
        1.88580742e-07, 4.06763852e-01, 3.81341749e-08, 3.27290922e-01,
        3.13960239e-02, 4.32271883e-03, 3.06016204e-07, 1.69157115e-08,
        1.07184807e-02, 1.36619464e-01, 2.26916187e-02, 3.62405939e-10,
        1.82436202e-02, 1.36607832e-06, 1.12206642e-07, 1.57531375e-08,
        2.09592429e-07, 5.06976416e-09, 3.52363916e-09, 4.87322249e-12,
        3.61196660e-02, 2.08684651e-04, 1.12874904e-10, 8.67669894e-11,
        2.13228879e-09, 6.82983606e-04, 7.34174832e-10, 3.52953465e-11,
        2.67377249e-11, 5.49751966e-10, 3.36674191e-08, 1.87062983e-06,
        1.09121608e-10, 6.38935260e-09, 4.17709884e-08, 9.42770514e-11,
        3.64792814e-11, 2.75157454e-05, 4.27518332e-09, 4.49542096e-12,
        4.19946076e-07, 3.39565133e-07, 3.48203272e-11, 1.58428548e-10,
        1.26536117e-11, 4.06453552e-07, 4.31839356e-11, 3.00799441e-10,
        8.303

In [None]:
ans2 = tf.argmax(ans, axis = 1)

In [None]:
ans[0][5]

<tf.Tensor: shape=(), dtype=float32, numpy=0.40676385>

In [None]:
pic = load_image('./pic.jpg')
que = "What color is the shirt ?"
que = load_question(que)

In [None]:
que

array([17342,  3332,  8292, 15834, 13890,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [None]:
pic =  tf.expand_dims(pic, axis=0)
que = tf.expand_dims(que, axis=0)

In [None]:
ans = full_model([pic, que])

In [None]:
ans = tf.argmax(ans, axis = 1)

In [None]:
print(answer_vocab.idx2word(ans[0]))

red


In [None]:
# Export the combined model in SavedModel format; the trailing "1" directory is
# the location that the later load and TFLite-conversion cells read from.
tf.saved_model.save(full_model, '/content/VQA/1/')




FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: /content/VQA/1/assets


INFO:tensorflow:Assets written to: /content/VQA/1/assets


Testing the saved model to ensure that it's working well.

In [None]:
loaded = tf.saved_model.load('/content/VQA/1/')
print(list(loaded.signatures.keys()))

['serving_default']


In [None]:
infer = loaded.signatures["serving_default"]
print(infer.structured_outputs)

{'model': TensorSpec(shape=(None, 1000), dtype=tf.float32, name='model')}


In [None]:
full_model.output_names

['model']

In [None]:
infer.structured_input_signature

((),
 {'input_3': TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name='input_3'),
  'input_4': TensorSpec(shape=(None, 30), dtype=tf.int64, name='input_4')})

In [None]:
# Call the serving signature with keyword arguments; the keywords must match the
# names printed by `infer.structured_input_signature` (image -> input_3,
# question -> input_4 in the output shown above). The result is a dict keyed by
# output name, hence the lookup with full_model.output_names[0].
# NOTE(review): "input_3"/"input_4" look like auto-generated layer names and may
# differ in another session — confirm before reusing this cell.
labeling = infer(input_3=tf.constant(pic), input_4=tf.constant(que))[full_model.output_names[0]]

In [None]:
saved_model_ans = tf.argmax(labeling, axis = 1)

In [None]:
print(answer_vocab.idx2word(saved_model_ans[0]))

red


Convert the model to tflite format:

In [None]:
# Build a TFLite converter from the SavedModel exported above (default converter
# settings; no optimizations are configured here).
converter = tf.lite.TFLiteConverter.from_saved_model('/content/VQA/1/') # path to the SavedModel directory
# convert() returns the serialized model, which is written out as bytes below.
tflite_model = converter.convert()

# Save the model.
# The relative path places the file in the notebook's current working directory.
with open('VQA_model.tflite', 'wb') as f:
  f.write(tflite_model)

Testing the tflite model:

In [None]:
interpreter = tf.lite.Interpreter(model_path="VQA_model.tflite")
interpreter.allocate_tensors()

In [None]:
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [None]:
input_details

[{'dtype': numpy.int64,
  'index': 0,
  'name': 'serving_default_input_3:0',
  'quantization': (0.0, 0),
  'quantization_parameters': {'quantized_dimension': 0,
   'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32)},
  'shape': array([ 1, 30], dtype=int32),
  'shape_signature': array([-1, 30], dtype=int32),
  'sparsity_parameters': {}},
 {'dtype': numpy.float32,
  'index': 1,
  'name': 'serving_default_input_2:0',
  'quantization': (0.0, 0),
  'quantization_parameters': {'quantized_dimension': 0,
   'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32)},
  'shape': array([  1, 224, 224,   3], dtype=int32),
  'shape_signature': array([ -1, 224, 224,   3], dtype=int32),
  'sparsity_parameters': {}}]

In [None]:
output_details

[{'dtype': numpy.float32,
  'index': 83,
  'name': 'StatefulPartitionedCall:0',
  'quantization': (0.0, 0),
  'quantization_parameters': {'quantized_dimension': 0,
   'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32)},
  'shape': array([   1, 1000], dtype=int32),
  'shape_signature': array([  -1, 1000], dtype=int32),
  'sparsity_parameters': {}}]

In [None]:
input_details[0]['shape']

array([ 1, 30], dtype=int32)

In [None]:
input1_shape = input_details[0]['shape']
input2_shape = input_details[1]['shape']
# Bind each input by the dtype its slot reports rather than by position: the
# question ids go to the integer input and the image to the float input, so the
# cell keeps working if the converter lists the inputs in a different order.
# Values are converted to NumPy arrays of the expected dtype, which is what
# Interpreter.set_tensor is documented to accept.
for detail in input_details:
  is_question_input = np.issubdtype(detail['dtype'], np.integer)
  value = que if is_question_input else pic
  interpreter.set_tensor(detail['index'], np.asarray(value, dtype=detail['dtype']))

In [None]:
interpreter.invoke()

In [None]:
tflite_results = interpreter.get_tensor(output_details[0]['index'])

In [None]:
# List the five highest-scoring answers from the TFLite output, each with its
# score multiplied by 100.
answers = tf.math.top_k(tflite_results, 5)
best_ids = answers.indices[0].numpy().tolist()
for answer_id in best_ids:
  print(answer_vocab.idx2word(answer_id), tflite_results[0][answer_id] * 100)

white 43.440988659858704
black 30.361944437026978
brown 15.473313629627228
gray 3.994360566139221
black and white 0.845930352807045


In [None]:
# `tflite_results` is the array fetched from the interpreter above; the name
# `output_data` used here previously is never assigned in this notebook and
# raised NameError on a fresh kernel.
print(tflite_results)

[[3.58787226e-03 3.32682635e-06 2.79662368e-06 1.24325743e-05
  5.85925000e-06 4.34409887e-01 1.23588256e-06 5.78067498e-03
  6.11761073e-03 3.03619444e-01 8.77866387e-06 5.95234553e-07
  1.54733136e-01 3.10215307e-03 4.20090882e-03 1.02898332e-06
  3.99436057e-02 1.86258811e-04 9.34550314e-07 5.45523449e-07
  5.48611097e-05 2.05279957e-05 4.66031088e-05 4.14282937e-08
  7.43194064e-03 5.58187626e-03 2.37372615e-06 4.02665776e-08
  2.78134194e-05 3.14639835e-03 6.51559458e-05 1.28967855e-07
  5.29214397e-08 4.22236539e-04 1.82159553e-07 1.24428872e-04
  6.02107919e-09 2.67569618e-07 2.06625380e-04 3.19923679e-08
  1.09121970e-07 8.45930353e-03 3.59997927e-07 3.32197523e-08
  4.57659451e-04 1.05843949e-03 2.08789501e-08 2.30769438e-05
  3.59998609e-07 1.45318673e-03 1.58216409e-08 1.82010552e-07
  3.68412402e-05 1.88779367e-07 9.00660723e-07 5.23898258e-09
  8.01263167e-10 2.58581867e-09 5.26705264e-07 2.79121068e-05
  6.05490325e-09 2.07812931e-08 1.94045133e-04 3.81205922e-07
  3.0521

In [None]:
# Index of the highest-scoring answer per example, taken from the interpreter
# output `tflite_results` (the previously referenced `output_data` is never
# assigned in this notebook).
tf_lite_ans = tf.argmax(tflite_results, axis = 1)

In [None]:
print(answer_vocab.idx2word(tf_lite_ans[0]))

red
