In [1]:
import tensorflow as tf
import numpy as np
import pickle

In [2]:
gpus = tf.config.list_physical_devices('GPU')
try:
    # Memory growth has to be configured identically on every GPU, so it is
    # enabled on each visible device in turn (nothing happens without a GPU).
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Raised when memory growth is requested after the GPUs have already been
    # initialized; report the problem and continue.
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only point this at artifacts produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted tokenizer and the pre-computed question token-id matrix.
tokenizer = _read_pickle('tokenizer.pickle')
question_vector = _read_pickle('question_vector.pickle')

In [12]:
# Inspect the tokenizer vocabulary: a mapping from word to integer index.
tokenizer.word_index

{'<unk>': 1,
 'the': 2,
 'is': 3,
 'what': 4,
 'are': 5,
 'this': 6,
 'in': 7,
 'a': 8,
 'on': 9,
 'how': 10,
 'many': 11,
 'of': 12,
 'color': 13,
 'there': 14,
 'man': 15,
 'does': 16,
 'people': 17,
 'picture': 18,
 'to': 19,
 'wearing': 20,
 'these': 21,
 'it': 22,
 'have': 23,
 'person': 24,
 'photo': 25,
 'do': 26,
 'where': 27,
 'or': 28,
 'kind': 29,
 'you': 30,
 'animal': 31,
 'room': 32,
 'woman': 33,
 'doing': 34,
 'they': 35,
 'be': 36,
 'animals': 37,
 'holding': 38,
 'type': 39,
 'can': 40,
 'dog': 41,
 'cat': 42,
 'any': 43,
 'at': 44,
 'for': 45,
 'he': 46,
 'water': 47,
 'his': 48,
 'train': 49,
 'that': 50,
 'see': 51,
 'food': 52,
 'an': 53,
 'shirt': 54,
 'which': 55,
 'playing': 56,
 'made': 57,
 'sport': 58,
 'bus': 59,
 'sitting': 60,
 'table': 61,
 'plate': 62,
 "man's": 63,
 'shown': 64,
 'plane': 65,
 'sign': 66,
 'taken': 67,
 'with': 68,
 'look': 69,
 'standing': 70,
 'right': 71,
 'pizza': 72,
 'all': 73,
 'left': 74,
 'background': 75,
 'boy': 76,
 'being'

In [13]:
# Display the loaded question matrix (integer token ids; the rows shown are
# padded with trailing zeros).
question_vector

array([[  3,  14,   8, ...,   0,   0,   0],
       [  3,   6, 110, ...,   0,   0,   0],
       [  3,   6,   8, ...,   0,   0,   0],
       ...,
       [  3,   6,   8, ...,   0,   0,   0],
       [ 55, 697,  57, ...,   0,   0,   0],
       [ 27,   3,   2, ...,   0,   0,   0]])

In [4]:
# Image-size and input-pipeline settings.
# NOTE(review): none of these are referenced in the cells visible here —
# confirm where they are consumed before changing them.
img_width = img_height = 224
BATCH_SIZE, BUFFER_SIZE = 32, 300

In [5]:
# Restore the training and validation datasets from disk.
# `load` is a static method of tf.data.Dataset (TFRecordDataset only inherits
# it), so it is called on the base class here; the result is the same.
# NOTE(review): despite the ".tfrecord" suffix these paths are presumably
# outputs of Dataset.save — confirm against the cell that wrote them.
train_dataset = tf.data.Dataset.load('train_dataset.tfrecord')
valid_dataset = tf.data.Dataset.load('validation_dataset.tfrecord')

In [6]:
def Build_BaseModel(seq_len=None, vocab_size=None, num_classes=1000):
    """Build and compile the two-branch (image features + question) baseline model.

    The image branch takes a (7, 7, 512) feature map, collapses it with a 7x7
    "valid" convolution and reduces it through two dense layers to a 256-d
    vector. The question branch embeds token ids into 300-d vectors and runs
    them through a 256-unit LSTM. The two 256-d vectors are fused by
    element-wise multiplication and passed to a dense classifier head.

    Args:
        seq_len: Length of each question token sequence. When omitted, falls
            back to ``question_vector.shape[1]`` from the notebook namespace.
        vocab_size: ``input_dim`` of the embedding layer. When omitted, falls
            back to ``len(tokenizer.word_index) + 1`` from the notebook
            namespace.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``tf.keras.Model`` taking ``[image_input, question_input]`` and
        compiled with the Adam optimizer, categorical cross-entropy loss and
        an accuracy metric.
    """
    # Only reach for the notebook-level artifacts when the caller did not
    # supply explicit sizes, so the builder can also be used without them.
    if seq_len is None:
        seq_len = question_vector.shape[1]
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    layers = tf.keras.layers
    initializers = tf.keras.initializers

    # NOTE: layers are created in the same order as before so that Keras'
    # auto-generated layer names stay stable.
    image_input = layers.Input(shape=(7, 7, 512))
    question_input = layers.Input((seq_len,))

    # Input 1 pathway: image features.
    # A 7x7 "valid" convolution over a 7x7 map yields a single 1x1x512 position.
    image_conv_layer1 = layers.Conv2D(
        filters=512, kernel_size=7, strides=1, padding="valid", activation='relu',
        kernel_initializer=initializers.he_normal(seed=45))(image_input)

    image_flatten = layers.Flatten()(image_conv_layer1)

    image_dense_1 = layers.Dense(
        512, activation=tf.nn.relu,
        kernel_initializer=initializers.he_uniform(seed=54))(image_flatten)

    image_dense_2 = layers.Dense(
        256, activation=tf.nn.relu,
        kernel_initializer=initializers.he_uniform(seed=32))(image_dense_1)

    # Input 2 pathway: question tokens.
    # NOTE(review): the question vectors displayed in this notebook look
    # zero-padded; mask_zero=True on the embedding would let the LSTM skip the
    # padding. Left off here to keep the model unchanged — confirm before enabling.
    question_emb = layers.Embedding(
        input_dim=vocab_size, output_dim=300, name="Embedding_Layer",
        embeddings_initializer=initializers.RandomNormal(mean=0, stddev=1, seed=23))(question_input)

    question_lstm = layers.LSTM(
        256,
        kernel_initializer=initializers.glorot_uniform(seed=26),
        recurrent_initializer=initializers.orthogonal(seed=54),
        bias_initializer=initializers.zeros())(question_emb)

    # The LSTM already returns a 2-D (batch, 256) tensor, so this Flatten does
    # not change the shape; it is kept so the "Flatten_lstm" layer name survives.
    question_flatten = layers.Flatten(name="Flatten_lstm")(question_lstm)

    # Fuse both 256-d representations by element-wise multiplication.
    image_question = layers.Multiply()([image_dense_2, question_flatten])

    image_question_dense_1 = layers.Dense(
        1000, activation=tf.nn.relu,
        kernel_initializer=initializers.he_uniform(seed=19))(image_question)

    image_question_dense_2 = layers.Dense(
        1000, activation=tf.nn.relu,
        kernel_initializer=initializers.he_uniform(seed=28))(image_question_dense_1)

    output = layers.Dense(
        num_classes, activation=tf.nn.softmax,
        kernel_initializer=initializers.glorot_normal(seed=15))(image_question_dense_2)

    # Create Model
    model = tf.keras.models.Model(inputs=[image_input, question_input], outputs=output)
    # Compile; categorical cross-entropy assumes one-hot (or probability) targets.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [7]:
# NOTE(review): l2_alpha is not read by Build_BaseModel (no regularizers are
# attached to any layer there) — confirm whether it is consumed elsewhere.
l2_alpha = 0.001
# NOTE(review): ModelName is not referenced in the visible cells; presumably
# intended for log/checkpoint naming — confirm.
ModelName = "BaselineModel"
# Build and compile the baseline model.
model = Build_BaseModel()

In [18]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 7, 7, 512)]  0           []                               
                                                                                                  
 conv2d_2 (Conv2D)              (None, 1, 1, 512)    12845568    ['input_5[0][0]']                
                                                                                                  
 input_6 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 flatten_2 (Flatten)            (None, 512)          0           ['conv2d_2[0][0]']               
                                                                                              

In [None]:
# Train for 20 epochs, evaluating on valid_dataset during training. The
# History object returned by fit() is kept so the per-epoch loss/accuracy
# can be inspected or plotted afterwards instead of being discarded.
history = model.fit(train_dataset, epochs = 20, validation_data = valid_dataset)

Epoch 1/20
  54/2062 [..............................] - ETA: 6:49 - loss: 4.8336 - accuracy: 0.2321