In [None]:
!pip install opendatasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as Keras
from tensorflow.keras.layers import Dense, Flatten, Input, Conv2D, MaxPooling2D, LSTM, AdditiveAttention, Embedding, Dropout, TimeDistributed, AdditiveAttention, Reshape
from keras.optimizers import Adam, SGD

import re
import spacy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tempfile

import opendatasets as od
import os
import json

from sklearn.model_selection import train_test_split

In [None]:
# Show the installed TensorFlow version so the run can be reproduced.
print(tf.__version__)

Problems I faced during this Project:


1. Converting every image to a NumPy array requires a lot of RAM.
> Solutions tried:
        1. OpenCV package to process images
        2. PIL Image package
        3. PySpark for in-memory processing

  Every attempt failed, until I finally found numpy.memmap.
  A memmap stores the data on disk and loads it from disk on request.

2. Creating a custom CNN for this project instead of using a pre-trained model
> Building a model for the desired input size will eat your RAM. I built the CNN with a 500*500*3 input layer and a ResNet-50 architecture. That day I found out that building the model itself requires RAM, since the neural network weights have to be stored.
> Solutions that worked:
        1. reduced the input layer dimensions
        2. removed some convolutional and pooling layers
  After reducing the input layer dimensions, the model size dropped from GBs to MBs.

3. model.fit loads the data into RAM and then trains the model. But our input data doesn't fit in RAM, so I took a different approach.
> Solutions,
          1. fit_generator - deprecated in Keras
          2. train_on_batch

4. The Attention layer's context vector doesn't fit with the Embedding layer. A context vector is produced at each timestep, but the Embedding layer (during training) embeds all of the input at once. E.g., my model takes 20 words, so the Embedding layer embeds 20 words at a time, whereas the context vector should be generated per timestep.
> Solution
          1. Placed the Attention layer after the LSTM. Most of the time, Attention layers are used after the LSTM in NLP.

5. sklearn's train_test_split requires RAM
> Solutions,
          1. using a pandas DataFrame with frac
          2. separating the last 20% of the data for testing

6. Creating the prediction model from the trained model.






















In [None]:
!rm -r ./flickr8k

In [None]:
# Download the Flickr8k dataset from Kaggle; later cells read it from ./flickr8k.
od.download('https://www.kaggle.com/datasets/adityajn105/flickr8k?resource=download&select=Images')

In [None]:
# Split every row on the literal text ".jpg," so that commas inside a caption stay in the
# caption column and the image column holds the file name without its extension.
# A multi-character separator is interpreted as a regular expression, so the dot is escaped
# (an unescaped '.' would match any character) and the Python parsing engine, which is the
# one that supports regex separators, is requested explicitly.
captionData = pd.read_csv(
  './flickr8k/captions.txt',
  sep=r'\.jpg,',
  engine='python',
  header=0,
  names=['image', 'caption'],
)
captionData.head()

In [None]:
#captionData = captionData.sample(frac=1)
#captionData.head()

In [None]:
# No. of captions
# DataFrame.count() reports the number of non-null entries per column.
captionData.count()

In [None]:
# No. of images
# Counts the entries the shell lists under ./flickr8k/Images.
!ls ./flickr8k/Images | wc -l

In [None]:
# Target size (both sides, in pixels) and channel count used for every stored image.
newDimensionL, newDimensionB, newChannel = (250, 250, 3)
# NOTE(review): padColorRGB is not referenced by any other cell visible in this notebook.
padColorRGB = (0, 0, 0)

In [None]:
# Number of non-null captions; used as the row count of the on-disk image and caption arrays.
totalDataSize = captionData['caption'].count()

In [None]:
# Working piece of code
'''
imgArrayDictionary = {}
count = 0
imageList = []

for imgFile in os.listdir('./flickr8k/Images'):
  fullPath = './flickr8k/Images/' + imgFile
  img = Keras.utils.load_img(fullPath)

  img = img.resize((newDimensionL, newDimensionB))
  imgArray = Keras.utils.img_to_array(img, dtype=np.float16) / 255

  imgArrayDictionary[imgFile] = imgArray

  count += 1
  if (count % 100 == 0):
    print(count)
'''

In [None]:
# Augmentation generators used while filling the image array:
# horizontal flip only
horizontalImgGenerator = ImageDataGenerator(
  horizontal_flip=True
)

# rotation only (rotation_range=10)
tiltedImgGenerator = ImageDataGenerator(
  rotation_range=10
)

# Combined one
imgGenerator = ImageDataGenerator(
  horizontal_flip=True,
  rotation_range=10
)

In [None]:
# Disk-backed float16 array holding one (possibly augmented) image per caption row.
tempFileName = tempfile.TemporaryFile()
imageMemFile = np.memmap(
  tempFileName,
  dtype=np.float16,
  shape=(totalDataSize, newDimensionL, newDimensionB, newChannel),
  mode='w+',
)

# Augmentation applied per row, keyed on count % 5.
# Remainder 0 has no entry: those rows keep the plain resized image.
augmenterByRemainder = {
  1: horizontalImgGenerator,  # horizontal flip
  2: tiltedImgGenerator,      # tilt
  3: imgGenerator,            # flip + tilt
  4: imgGenerator,            # flip + tilt
}

count = 0
for imageName in captionData['image']:
  resizedImage = Keras.utils.load_img('./flickr8k/Images/' + imageName + '.jpg').resize((newDimensionL, newDimensionB))
  imgArray = Keras.utils.img_to_array(resizedImage, dtype=np.float16) / 255

  augmenter = augmenterByRemainder.get(count % 5)
  if augmenter is not None:
    # flow() works on batches, so wrap the unscaled pixel data in a batch of one.
    singleImageBatch = np.zeros((1, newDimensionL, newDimensionB, newChannel))
    singleImageBatch[0] = resizedImage
    augumentedImageArray = next(augmenter.flow(singleImageBatch, batch_size=1))[0].astype('uint8')
    imgArray = Keras.utils.img_to_array(augumentedImageArray, dtype=np.float16) / 255

  imageMemFile[count] = imgArray
  count += 1

  # progress report every 500 images
  if count % 500 == 0:
    print(count)

print('Image count: ', imageMemFile.shape)

In [None]:
# Normalise every caption: lower-case it, drop everything except word characters,
# whitespace and apostrophes, then collapse whitespace runs into a single space.
# The patterns are raw strings (no invalid-escape warnings). Only a leading '^' negates a
# character class; the extra '^' in the previous pattern were literal carets, which meant
# '^' characters survived the cleaning.
unwantedCharacters = re.compile(r"[^\w\s']")
whitespaceRun = re.compile(r"\s+")

captions = [
  whitespaceRun.sub(' ', unwantedCharacters.sub('', captionText.lower()))
  for captionText in captionData['caption']
]

print('Caption count: ', len(captions))

In [None]:
# Load spaCy's small English pipeline with the listed components excluded;
# later cells use it to tokenise captions and to read token vectors.
nlp = spacy.load('en_core_web_sm', exclude=['ner', 'parser', 'attribute_ruler', 'tagger', 'lemmatizer'])

In [None]:
# Hyper parameter
############################
threshold = 3
maxPadLength = 20
############################

# Width of the special-token vectors; they share one embedding matrix with the spaCy
# token vectors, so both must have the same width.
embeddingDim = 96
# Fixed seed: the special-token embeddings are random, and without a seed every run
# would produce different vectors.
specialTokenRng = np.random.default_rng(42)

vocab = {}
sentLength = []
index = 4  # indices 0-3 are reserved for the special tokens added below

# Count every token and give each new word the next free index together with the
# vector spaCy produced for its first occurrence.
for caption in captions:
  sentLength.append(len(caption.split(' ')))
  for token in nlp(caption):
    word = token.text
    entry = vocab.get(word)
    if entry is None:
      vocab[word] = {'count': 1, 'index': index, 'embedding': token.vector}
      index += 1
    else:
      entry['count'] += 1

# indexing special token
#   here count = 10, is just a dummy value
for specialIndex, specialToken in enumerate(('<PAD>', '<SOS>', '<EOS>', '<OOV>')):
  vocab[specialToken] = {
    'count': 10,
    'index': specialIndex,
    'embedding': specialTokenRng.uniform(-1, 1, (embeddingDim,)),
  }

# Distribution of caption lengths (in space-separated words)
plt.hist(sentLength)
plt.show()

# index to word mapping
idx2word = {entry['index']: word for word, entry in vocab.items()}


In [None]:
# Persist the index -> word mapping so predictions can be decoded later.
# JSON object keys are strings, so the integer indices are written out as strings.
with open('index_to_word.json', 'w') as f:
    json.dump(idx2word, f)

In [None]:
# Disk-backed array with one fixed-length row of token ids per caption.
tempCaptionFile = tempfile.TemporaryFile()
captionMemFile = np.memmap(tempCaptionFile, mode='w+', dtype=np.uint32, shape=(totalDataSize, maxPadLength))
captionMemFile.flush()

print(captionMemFile.shape)

# Special-token ids are looked up once instead of being hard-coded as 0 / 2.
padIndex = vocab['<PAD>']['index']
sosIndex = vocab['<SOS>']['index']
eosIndex = vocab['<EOS>']['index']
oovIndex = vocab['<OOV>']['index']

count = 0
for caption in captions:
  # <SOS> + token ids (words seen fewer than `threshold` times become <OOV>) + <EOS>
  captionIndex = [sosIndex]
  for token in nlp(caption):
    word = token.text
    if word == '':
      continue
    entry = vocab[word]
    captionIndex.append(oovIndex if entry['count'] < threshold else entry['index'])
  captionIndex.append(eosIndex)

  if len(captionIndex) > maxPadLength:
    # too long: truncate and force the last slot to <EOS>
    captionRow = captionIndex[:maxPadLength]
    captionRow[-1] = eosIndex
  else:
    # short enough: right-pad with <PAD>
    captionRow = captionIndex + [padIndex] * (maxPadLength - len(captionIndex))

  captionMemFile[count] = captionRow
  count += 1

  # progress report every 500 captions
  if count % 500 == 0:
    print(count)

In [None]:
# Inspect the vocabulary entry (count, index, embedding) of one word.
vocab['family']

In [None]:
# Same inspection as the preceding cell: vocabulary entry of one word.
vocab['family']

In [None]:
# create embedding weight for embedding layer
# Row i of the matrix holds the vector of the vocabulary entry whose index is i.
embeddingMatrix = np.zeros((index+1, 96))
for vocabEntry in vocab.values():
  embeddingMatrix[vocabEntry['index']] = vocabEntry['embedding']


In [None]:
# Inspect one row of the embedding matrix.
embeddingMatrix[3000]

In [None]:
# To verify everything as expected
# Prints the cleaned caption and its encoded token ids for row 21003, then shows the stored
# image of the same row (pixels scaled back up by 255 for display).
print(captions[21003])
print(captionMemFile[21003])
sampleImage = imageMemFile[21003] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# To verify everything as expected
# Repeats the row 21003 check of the preceding cell (caption, token ids, image).
print(captions[21003])
print(captionMemFile[21003])
sampleImage = imageMemFile[21003] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# To verify everything as expected
# Prints the cleaned caption and its encoded token ids for row 3000, then shows the stored
# image of the same row (pixels scaled back up by 255 for display).
print(captions[3000])
print(captionMemFile[3000])
sampleImage = imageMemFile[3000] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# To verify everything as expected
# Repeats the row 3000 check of the preceding cell (caption, token ids, image).
print(captions[3000])
print(captionMemFile[3000])
sampleImage = imageMemFile[3000] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# To verify everything as expected
# Prints the cleaned caption and its encoded token ids for row 39374, then shows the stored
# image of the same row (pixels scaled back up by 255 for display).
print(captions[39374])
print(captionMemFile[39374])
sampleImage = imageMemFile[39374] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# To verify everything as expected
# Repeats the row 39374 check of the preceding cell (caption, token ids, image).
print(captions[39374])
print(captionMemFile[39374])
sampleImage = imageMemFile[39374] * 255

plt.imshow(sampleImage.astype(int))
plt.show()

In [None]:
# Split data for training and testing
# Ordered split: the first 80% of the rows train the model, the remaining rows test it.
# Plain slices are used so both halves remain views of the on-disk arrays.
trainDataSizePercetage = 80
limit = totalDataSize * trainDataSizePercetage // 100

# Train Data
xTrain = imageMemFile[:limit]
yTrain = captionMemFile[:limit]

# Test Data
xTest = imageMemFile[limit:]
yTest = captionMemFile[limit:]

# images and captions of each split must have the same row count
for splitImages, splitCaptions in ((xTrain, yTrain), (xTest, yTest)):
  print(splitImages.shape[0], " -> ", splitCaptions.shape[0])


In [None]:
# Alternative split: hold out every 5th row (rows 4, 9, 14, ...) for testing and train on
# all other rows. NOTE: running this cell replaces xTrain/yTrain/xTest/yTest from the
# ordered 80/20 split.
#
# Fixes compared with the previous version of this cell:
#   * np.delete() was called without `axis`, which flattens the array and removes single
#     scalars instead of whole rows, and returns an in-memory copy of the data;
#   * the removed indices (5, 10, ...) did not match the test rows (4, 9, ...);
#   * the row counts 32364 / 8091 were hard-coded instead of derived from the data;
#   * the freshly created test memmaps were immediately overwritten and never used.
testRows = np.arange(4, totalDataSize, 5)
trainRowMask = np.ones(totalDataSize, dtype=bool)
trainRowMask[testRows] = False
trainRows = np.flatnonzero(trainRowMask)

# Test split: a strided slice is a view of the existing arrays, nothing is copied.
xTest, yTest = imageMemFile[4::5], captionMemFile[4::5]
print(xTest.shape[0], " -> ", yTest.shape[0])

# Train split: the remaining rows are not a regular slice, so they are copied chunk by
# chunk into new disk-backed arrays; only one chunk is held in memory at a time.
tempTrainFileName = tempfile.TemporaryFile()
xTrain = np.memmap(tempTrainFileName, dtype=np.float16, shape=(len(trainRows), newDimensionL, newDimensionB, newChannel), mode='w+')
tempTrainCaptionFile = tempfile.TemporaryFile()
yTrain = np.memmap(tempTrainCaptionFile, mode='w+', dtype=np.uint32, shape=(len(trainRows), maxPadLength))

copyChunkSize = 256
for chunkStart in range(0, len(trainRows), copyChunkSize):
  chunkRows = trainRows[chunkStart:chunkStart + copyChunkSize]
  chunkEnd = chunkStart + len(chunkRows)
  xTrain[chunkStart:chunkEnd] = imageMemFile[chunkRows]
  yTrain[chunkStart:chunkEnd] = captionMemFile[chunkRows]

print(xTrain.shape[0], " -> ", yTrain.shape[0])

In [None]:
# Scratch experiment: compare a strided slice with np.delete on a small 1-D array.
import numpy as np

a = np.array([1,2,3,4,5,6,7,8,9,0])
xyz = [2,5,8]

# every 5th element starting at position 4
b = a[4::5]
# copy of `a` without the elements at positions 2, 5 and 8
c = np.delete(a, xyz)

print(b)
print(c)

In [None]:
# Display the scratch array; np.delete returned a copy, so `a` itself is unchanged.
a

In [None]:
# Scratch experiment: np.split at the even positions of a small array.
import numpy as np

array = np.array([1, 2, 3, 4, 5])
# positions 0, 2 and 4 are used as the split points
indices = [i for i in range(len(array)) if i % 2 == 0]
subarrays = np.split(array, indices)
print(subarrays)

In [None]:
# Scratch experiment: np.split at explicit split points 2, 5 and 7.
arr = np.array([5,7,9,11,13,19,23,27])
subarrays = np.split(arr, [2, 5, 7])

print(subarrays)

In [None]:
# Show the boundary row computed for the ordered 80/20 split.
limit

In [None]:
# Show the next free vocabulary index (4 reserved special-token slots + distinct words seen).
index

In [None]:
# Hyper parameters
############################
# rows per train_on_batch / test_on_batch call
batchSize = 15
# number of passes over the training data
epochs = 15
############################
# vocabulary size, taken from the index counter of the vocabulary-building cell
vocabSize = index

In [None]:
'''
def encoderModel(vocabSize):

  inputLayer = Input(shape=(500,500,3), batch_size=batchSize)
  conv1 = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='relu')(inputLayer)
  #conv2 = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='relu')(conv1)

  maxPool1 = MaxPooling2D(pool_size=(20,20), strides=(4,4), padding='valid')(conv1)
  conv3 = Conv2D(filters=128, kernel_size=(2,2), padding='same', activation='relu')(maxPool1)

  maxPool2 = MaxPooling2D(pool_size=(4,4), strides=(2,2), padding='valid')(conv3)
  conv5 = Conv2D(filters=256, kernel_size=(2,2), padding='same', activation='relu')(maxPool2)

  maxPool3 = MaxPooling2D(pool_size=(4,4), strides=(2,2), padding='valid')(conv5)
  conv8 = Conv2D(filters=512, kernel_size=(2,2), padding='same', activation='relu')(maxPool3)

  maxPool4 = MaxPooling2D(pool_size=(8,8), strides=(2,2), padding='valid')(conv8)
  flattenNetwork = Flatten()(maxPool4)

  dropout1 = Dropout(0.3)(flattenNetwork)
  dense1 = Dense(2048, activation='relu')(dropout1)
  dropout2 = Dropout(0.3)(dense1)
  dense2 = Dense(1024, activation='relu')(dense1)

  #outputLayer = Dense(1024, activation='softmax')

  model = Keras.Model(
      inputs = [inputLayer],
      outputs = [dense2, dense2]
  )

  return model, [model.layers[-1].output, model.layers[-1].output]

encoder, encoderHiddenStates = encoderModel(vocabSize)
encoderOptimizer = Adam(learning_rate=0.001)

print(encoder.summary())
Keras.utils.plot_model(encoder, show_shapes=True)
'''

In [None]:
'''
def decoderModel(encoderHiddenStates, vocabSize):

  deinputLayer = Input(shape=(1,), batch_size=batchSize)
  embeddingLayer = Embedding(vocabSize+1, 100, input_length=10)(deinputLayer)
  decoderOutput, decoderHidden, decoderCell = LSTM(1024, activation='relu', return_sequences=True, return_state=True)(embeddingLayer, initial_state=encoderHiddenStates)
  decoderDense = Dense(vocabSize, activation='sigmoid')
  decoderTimeDist = TimeDistributed(decoderDense)(decoderOutput)

  model = Keras.Model(
      inputs = [eninputLayer, encoderHiddenStates],
      outputs = decoderTimeDist
  )

  return model, [decoderHidden, decoderCell]

decoder, decoderStates = decoderModel(encoderHiddenStates, vocabSize)
decoderOptimizer = Adam(learning_rate=0.001)

print(decoder.summary())
Keras.utils.plot_model(decoder, show_shapes=True)
'''

In [None]:
def imageCaptionModel():
  """Build the captioning network: a CNN encoder feeding an LSTM decoder with additive attention.

  Reads the notebook globals newDimensionL, newDimensionB, vocabSize and embeddingMatrix.

  Returns:
    Tuple (imgCaptionModel, inputLayer, dense2, cNNHiddenState, deinputLayer, decoderDense,
    decoderHidden, decoderCell): the assembled Keras model followed by the tensors for the
    image input, the encoder state output, the reshaped CNN feature map, the decoder token
    input, the decoder softmax output and the LSTM hidden / cell states.
  """
  # ---- Encoder: stacked convolutions with max-pooling in between ----
  inputLayer = Input(shape=(newDimensionL, newDimensionB, 3), batch_size=None)
  conv1 = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='relu')(inputLayer)
  conv2 = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='sigmoid')(conv1)

  maxPool1 = MaxPooling2D(pool_size=(20,20), strides=(4,4), padding='valid')(conv2)
  conv3 = Conv2D(filters=128, kernel_size=(2,2), padding='same', activation='relu')(maxPool1)
  conv4 = Conv2D(filters=128, kernel_size=(2,2), padding='same', activation='sigmoid')(conv3)

  maxPool2 = MaxPooling2D(pool_size=(4,4), strides=(2,2), padding='valid')(conv4)
  conv5 = Conv2D(filters=256, kernel_size=(2,2), padding='same', activation='relu')(maxPool2)
  conv6 = Conv2D(filters=256, kernel_size=(2,2), padding='same', activation='sigmoid')(conv5)

  maxPool3 = MaxPooling2D(pool_size=(4,4), strides=(2,2), padding='valid')(conv6)
  conv7 = Conv2D(filters=512, kernel_size=(2,2), padding='same', activation='relu')(maxPool3)
  conv8 = Conv2D(filters=1024, kernel_size=(2,2), padding='same', activation='relu')(conv7)

  # Flatten the spatial grid into a sequence of 1024-wide feature vectors for attention.
  # -1 lets Keras derive the sequence length (169 for a 250x250 input) instead of
  # hard-coding it, so other image sizes keep working.
  cNNHiddenState = Reshape((-1, 1024))(conv8)

  # Dense head that turns the feature map into the decoder's initial state
  maxPool4 = MaxPooling2D(pool_size=(8,8), strides=(2,2), padding='valid')(conv8)
  flattenNetwork = Flatten()(maxPool4)

  dropout1 = Dropout(0.1)(flattenNetwork)
  dense1 = Dense(2048, activation='relu')(dropout1)
  dropout2 = Dropout(0.1)(dense1)
  dense2 = Dense(1024, activation='sigmoid')(dropout2)

  # the same vector initialises both the hidden and the cell state of the LSTM
  encoderOutputAsState = [dense2, dense2]

  # ---- Decoder ----
  # shape=(None,) accepts token sequences of any length: whole captions during training and
  # single tokens during step-by-step prediction. (The previous `shape=(1)` passed an int
  # rather than a tuple and declared a fixed length of 1.)
  deinputLayer = Input(shape=(None,), batch_size=None)
  # frozen embedding initialised from the prepared embedding matrix
  embeddingLayer = Embedding(vocabSize+1, 96, weights=[embeddingMatrix], trainable=False)(deinputLayer)

  lstmOutput, decoderHidden, decoderCell = LSTM(1024, activation='relu', return_sequences=True, return_state=True)(embeddingLayer, initial_state=encoderOutputAsState)
  # query = LSTM outputs, value/key = CNN feature vectors (both 1024 wide)
  contextVector, attentionScore = AdditiveAttention()([lstmOutput, cNNHiddenState, cNNHiddenState], return_attention_scores=True)

  # one softmax over the vocabulary per decoder timestep
  decoderDense = Dense(vocabSize, activation='softmax')(contextVector)

  imgCaptionModel = Keras.Model(
      inputs = [inputLayer, deinputLayer],
      outputs = decoderDense
  )
  return imgCaptionModel, inputLayer, dense2, cNNHiddenState, deinputLayer, decoderDense, decoderHidden, decoderCell

# Build the network and keep the intermediate tensors returned alongside the model.
imgCaptionModel, encoderInputLayer, encoderOutputLayer, cNNHiddenState, decoderInputLayer, decoderOutputLayer, decoderHidden, decoderCell  = imageCaptionModel()

# summary() prints the table itself and returns None, so wrapping it in print() only added a stray "None".
imgCaptionModel.summary()
Keras.utils.plot_model(imgCaptionModel, show_shapes=True)

In [None]:
# The metric must be an instance, not the class itself. The training loop records the second
# value returned by train_on_batch / test_on_batch as "accuracy", so an accuracy metric is
# used here instead of a second copy of the cross-entropy loss.
imgCaptionModel.compile(
  optimizer=Adam(learning_rate=0.001),
  loss='sparse_categorical_crossentropy',
  metrics=[Keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
)

In [None]:
# This is purely for training purpose
model = Keras.models.load_model('imageCaptioningModel_Epoch13.keras')

# remove this block
# due to GPU sortage, I'm doing this

In [None]:
from tqdm import tqdm

In [None]:
# Manual epoch loop built on train_on_batch / test_on_batch so that only one batch of the
# disk-backed arrays is read at a time. Trailing rows that do not fill a whole batch are skipped.
batchForTraining = (xTrain.shape[0] // batchSize)
batchForTesting = (xTest.shape[0] // batchSize)

trainingLossHistory = []
trainingAccuracyHistory = []
testingLossHistory = []
testingAccuracyHistory = []

for epoch in range(epochs):
  epochTrainingLoss, epochTrainingAccuracy, epochTestingLoss, epochTestingAccuracy = 0, 0, 0, 0
  for batch in tqdm(range(batchForTraining)):
    batchIndex = batch * batchSize
    batchLimit = batchIndex + batchSize
    captionBatch = yTrain[batchIndex:batchLimit]
    # Teacher forcing: the decoder receives tokens [0 .. n-2] and has to predict tokens
    # [1 .. n-1]. Feeding the identical sequence as input and target (as before) only asks
    # the model to repeat its input instead of predicting the next word.
    loss, accuracy = imgCaptionModel.train_on_batch(
        x=[xTrain[batchIndex:batchLimit], captionBatch[:, :-1]],
        y=captionBatch[:, 1:])
    epochTrainingLoss += loss
    epochTrainingAccuracy += accuracy

  # Average loss and accuracy of Training Dataset per epoch
  epochTrainingLoss /= batchForTraining
  epochTrainingAccuracy /= batchForTraining

  trainingLossHistory.append(epochTrainingLoss)
  trainingAccuracyHistory.append(epochTrainingAccuracy)

  for batch in tqdm(range(batchForTesting)):
    batchIndex = batch * batchSize
    batchLimit = batchIndex + batchSize
    # Evaluate on the held-out split; the previous version read xTrain / yTrain here, so the
    # reported "testing" numbers were measured on training data.
    captionBatch = yTest[batchIndex:batchLimit]
    loss, accuracy = imgCaptionModel.test_on_batch(
        x=[xTest[batchIndex:batchLimit], captionBatch[:, :-1]],
        y=captionBatch[:, 1:])
    epochTestingLoss += loss
    epochTestingAccuracy += accuracy

  # Average loss and accuracy of Testing Dataset per epoch
  epochTestingLoss /= batchForTesting
  epochTestingAccuracy /= batchForTesting

  testingLossHistory.append(epochTestingLoss)
  testingAccuracyHistory.append(epochTestingAccuracy)

  print("EPOCH: ", epoch, " Training Loss: ", epochTrainingLoss, " Training Accuracy: ", epochTrainingAccuracy, "\t\tTesting Loss: ", epochTestingLoss ," Testing Accuracy: ", epochTestingAccuracy)

  #modelName = "imageCaptioningModel_Epoch" + str(epoch) + ".keras"
  #imgCaptionModel.save(modelName)

In [None]:
# Save the trained end-to-end model to disk.
imgCaptionModel.save('imageCaptionModel.keras')

In [None]:
# Load one dataset image and bring it to the size the network expects.
imgTest = Keras.utils.load_img('./flickr8k/Images/101669240_b2d3e7f17b.jpg').resize((newDimensionL, newDimensionB))

# A default-dtype copy, scaled by 1/255, is first used for display ...
imgTestArray = Keras.utils.img_to_array(imgTest) / 255
plt.imshow(imgTestArray)
plt.show()

# ... and then replaced by the float16 version that is fed to the model.
imgTestArray = Keras.utils.img_to_array(imgTest, dtype=np.float16) / 255

In [None]:
# Add the leading batch axis expected by predict(); the sizes come from the notebook's
# image constants instead of a hard-coded 250 x 250 x 3.
print(imgTestArray.shape)
imgTestArray = imgTestArray.reshape(1, newDimensionL, newDimensionB, newChannel)
print(imgTestArray.shape)

In [None]:
# Prefer the epoch-13 checkpoint when it is present. The per-epoch save in the training loop
# is commented out, so on a fresh run that file does not exist; fall back to the model that
# is saved after training.
trainedModelPath = 'imageCaptioningModel_Epoch13.keras'
if not os.path.exists(trainedModelPath):
  trainedModelPath = 'imageCaptionModel.keras'
trainedImgCaptionModel = Keras.models.load_model(trainedModelPath)
#Keras.utils.plot_model(trainedImgCaptionModel, show_shapes=True)

In [None]:
# Encoder for prediction: maps the image input of the trained model to two outputs,
# the output of layer 'dense_1' and the output of layer 'conv2d_7'.
# NOTE(review): the layer names are the auto-generated ones; presumably 'dense_1' is the
# encoder state layer and 'conv2d_7' the last convolution - confirm against the model summary.
encoderPredictModel = Keras.Model(
  trainedImgCaptionModel.get_layer('input_1').input,
  [trainedImgCaptionModel.get_layer('dense_1').output] + [trainedImgCaptionModel.get_layer('conv2d_7').output]
)
# the same 'dense_1' output tensor is listed twice (once per LSTM state)
encoderOutputTensorAsState = [encoderPredictModel.get_layer('dense_1').output, encoderPredictModel.get_layer('dense_1').output]

encoderPredictModel.save("imageCaptionEncoderModel.keras")
#print(encoderPredictModel.summary())
#Keras.utils.plot_model(encoderPredictModel, show_shapes=True)

In [None]:
# Building the decoder model for prediction
# The decoder is re-assembled from the trained layers so that it can be run one token at a
# time: inputs are the current token, the LSTM states and the CNN feature map; outputs are
# the word distribution and the updated LSTM states.
decoderPredInput = trainedImgCaptionModel.get_layer('input_2').input
decoderPredEmbedding = trainedImgCaptionModel.get_layer('embedding')(decoderPredInput)

decoderPredLSTM = trainedImgCaptionModel.get_layer('lstm')

# Separate placeholders for the hidden and the cell state. The previous version passed the
# encoder's own output tensor twice as the two state inputs, so the model had no distinct
# inputs for h and c.
decoderStateHInput = Input(shape=(decoderPredLSTM.units,), name='decoder_state_h')
decoderStateCInput = Input(shape=(decoderPredLSTM.units,), name='decoder_state_c')
decoderPredLSTMOutput, decoderPredHidden, decoderPredCell = decoderPredLSTM(decoderPredEmbedding, initial_state=[decoderStateHInput, decoderStateCInput])

decoderPredStates = [decoderPredHidden, decoderPredCell]

# Placeholder with the shape of the 'conv2d_7' feature map (batch axis dropped); it
# receives the second output of the encoder prediction model.
encoderHiddenState = Input(shape=trainedImgCaptionModel.get_layer('conv2d_7').output.shape[1:], name='encoder_feature_map')
reshapedEncoderHiddenState = trainedImgCaptionModel.get_layer('reshape')(encoderHiddenState)

decoderPredAttention = trainedImgCaptionModel.get_layer('additive_attention')
decoderPredContextVector, decoderPredAlignmentWeight = decoderPredAttention([decoderPredLSTMOutput, reshapedEncoderHiddenState, reshapedEncoderHiddenState], return_attention_scores=True)

decoderPredOutput = trainedImgCaptionModel.get_layer('dense_2')(decoderPredContextVector)

# Same nesting as before: inputs [token, [h, c], features], outputs [prediction, [h, c]]
decoderPredictModel = Keras.Model(
    [decoderPredInput] + [[decoderStateHInput, decoderStateCInput]] + [encoderHiddenState],
    [decoderPredOutput] + [decoderPredStates]
)

# saving decoder prediction model
decoderPredictModel.save("imageCaptionDecoderModel.keras")
# summary() prints the table itself and returns None
decoderPredictModel.summary()
Keras.utils.plot_model(decoderPredictModel, show_shapes=True)

In [None]:
# Load the index -> word mapping written earlier; the keys come back as strings.
with open('index_to_word.json', 'r') as fh:
  jsonData = json.load(fh)

In [None]:
# Run the encoder on the prepared test image; it returns its two outputs in order.
encoderOutput = encoderPredictModel.predict(imgTestArray)

# first output: 'dense_1' activations, used below as the initial decoder state
encoderPrediction = encoderOutput[0]
# second output: 'conv2d_7' feature map, passed to the decoder at every step
encoderPredHiddenLayer = encoderOutput[1]

In [None]:
# Decoder Prediction initializer
# Greedy decoding: start from <SOS>, feed the most probable word back in as the next input
# and stop at <EOS> or after maxCaptionLength words.
sosTokenIndex = 1  # index given to <SOS> when the vocabulary was built
eosTokenIndex = 2  # index given to <EOS> when the vocabulary was built
maxCaptionLength = 20

wordIdx = np.full((1,), float(sosTokenIndex))
# the encoder state vector initialises both LSTM states
stateH = encoderPrediction
stateC = encoderPrediction

print("WORD INDEX: ", wordIdx)
print("stateH: ", stateH.shape)
print("stateC: ", stateC.shape)
hiddenStates = [stateH, stateC]
predictedWords = []
for i in range(maxCaptionLength):

  # the returned states are carried over into the next step
  decoderPrediction, hiddenStates = decoderPredictModel.predict([wordIdx] + [hiddenStates] + [encoderPredHiddenLayer])
  wordIdx[0] = decoderPrediction.argmax()

  # stop as soon as the model emits <EOS> instead of decoding past the caption end
  if int(wordIdx[0]) == eosTokenIndex:
    break

  predictedWord = jsonData[str(int(wordIdx[0]))]
  predictedWords.append(predictedWord)
  print("wordIdx[0]", wordIdx[0] ," ",predictedWord)

print("Predicted caption: ", ' '.join(predictedWords))

#model.predict(imgTestArray)

In [None]:
# Look up the word stored under index 4 (the first non-special vocabulary index).
jsonData['4']

In [None]:
'''
qwert = Keras.Sequential()
qwert.add(imgCaptionModel.get_layer("input_1"))
qwert.add(imgCaptionModel.get_layer("conv2d"))
qwert.add(imgCaptionModel.get_layer("conv2d_1"))
qwert.add(imgCaptionModel.get_layer("max_pooling2d"))
qwert.add(imgCaptionModel.get_layer("conv2d_2"))
qwert.add(imgCaptionModel.get_layer("conv2d_3"))
qwert.add(imgCaptionModel.get_layer("max_pooling2d_1"))
qwert.add(imgCaptionModel.get_layer("conv2d_4"))
qwert.add(imgCaptionModel.get_layer("conv2d_5"))
qwert.add(imgCaptionModel.get_layer("max_pooling2d_2"))
qwert.add(imgCaptionModel.get_layer("conv2d_6"))
qwert.add(imgCaptionModel.get_layer("conv2d_7"))
qwert.add(imgCaptionModel.get_layer("max_pooling2d_3"))
qwert.add(imgCaptionModel.get_layer("flatten"))
qwert.add(imgCaptionModel.get_layer("dropout"))
qwert.add(imgCaptionModel.get_layer("dense"))
qwert.add(imgCaptionModel.get_layer("dropout_1"))
qwert.add(imgCaptionModel.get_layer("dense_1"))
'''

In [None]:
# Building the decoder model for prediction
'''
decoderPredInput = decoderInput
decoderPredEmbedding = decoderEmbedding(decoderPredInput)

decoderPredLSTM = decoderLSTM
decoderPredLSTMOutput, decoderPredHidden, decoderPredCell = decoderPredLSTM(decoderPredEmbedding, initial_state=abc)

decoderPredStates = [decoderPredHidden, decoderPredCell]

decoderPredAttention = additiveAttention
decoderPredContextVector, decoderPredAlignmentWeight = decoderPredAttention([decoderPredLSTMOutput, cNNHiddenState, cNNHiddenState], return_attention_scores=True)

decoderPredOutput = decoderOutputDenseLayer(decoderPredContextVector)

decoderPredictModel = Keras.Model(
    [decoderPredInput] + [abc] + [cNNHiddenState],
    [decoderPredOutput] + [decoderPredStates]
)

# saving decoder prediction model
decoderPredictModel.save("imageCaptionDecoderModel.keras")
'''

In [None]:
'''
# Building the encoder model for prediction
encoderPredictModel = Keras.Model(
    inputs=encoderInput,
    outputs=[encoderOutput, cNNHiddenState]
)
abc = [encoderOutput, encoderOutput]

encoderPredictModel.save("imageCaptionEncoderModel.keras")
encoderPredictModel.summary()
'''