# Image Caption using Encoder and Decoder (CNN-RNN) # 

In this notebook we use a merge-architecture encoder-decoder recurrent neural network model for caption generation.
This involves two elements:

1. Encoder: a pre-trained convolutional neural network model that reads the image input and encodes its content into a fixed-length vector using an internal representation. The output of the encoder is a hidden state/context vector, generated by reading the input, which is passed to the decoder.
2. Decoder: model that reads the encoded image and generates the textual description output.

The merge model architecture combines the encoded form of the image input with the encoded form of the text description generated so far. The combination of these two encoded inputs is then used by a very simple decoder model to generate the next word in the sequence. In this approach the recurrent neural network is used only to encode the text generated so far.

![Merge Architecture for Encoder-Decoder Model](../images/merge_model.png)

Reference: https://machinelearningmastery.com/caption-generation-inject-merge-architectures-encoder-decoder-model/

In [1]:

import src.utils as plp
#import pandas as pd
#import numpy as np
import os
import gc

# Project root as reported by src.utils.get_project_root(); the data and model
# paths used later in this notebook are joined onto it.
PROJECT_ROOT = plp.get_project_root()

## Training  ##

In [12]:

import pyarrow.parquet as pq
import h5py

def next_dbset_batch(parquet_obj, size = 10000):
    """Lazily yield record batches of at most ``size`` rows from ``parquet_obj``.

    Only the 'image_id_idx', 'in_seq' and 'out_seq' columns are requested.
    ``parquet_obj`` must expose ``iter_batches(batch_size=..., columns=...)``
    (the caller in this notebook passes a pyarrow ``ParquetFile``).
    """
    wanted_columns = ['image_id_idx', 'in_seq', 'out_seq']
    yield from parquet_obj.iter_batches(batch_size = size, columns = wanted_columns)

def data_generator(name, batch_size):
    """Stream ``((image, in_seq), out_seq)`` samples one record at a time.

    Sequence records are read chunk-wise from a parquet file and the matching
    image array is looked up in an HDF5 file (dataset 'np_image') by the
    record's 'image_id_idx'.

    Args:
        name: selects the data split. If it contains 'valid' the validation
            files are used, otherwise the training files. May be ``bytes``
            (tf.data.Dataset.from_generator passes string args as bytes,
            e.g. b'train') or ``str``.
        batch_size: number of parquet rows to load per chunk. This is the
            read-chunk size, not the training batch size.

    Yields:
        ``((in_image, in_seq), out_seq)`` where ``in_image`` is the array read
        from the HDF5 file and ``in_seq`` / ``out_seq`` are Python lists.
    """
    # Accept both bytes (from tf.data) and str (direct calls).
    if isinstance(name, bytes):
        name = name.decode('UTF-8')

    if 'valid' in name:
        dataset_filename = 'valid_in_seq_data.parquet'
        hdf5_filename = 'validate_data.h5'
    else:
        dataset_filename = 'train_in_seq_data.parquet'
        hdf5_filename = 'train_data.h5'

    processed_dir = os.path.join(PROJECT_ROOT, 'data', 'processed')
    dataset_filepath = os.path.join(processed_dir, dataset_filename)
    hdf5_filepath = os.path.join(processed_dir, hdf5_filename)
    x_name = 'np_image'

    parquet_obj = pq.ParquetFile(dataset_filepath)
    try:
        with h5py.File(hdf5_filepath, 'r') as hdf5_file:

            # Look the dataset up once instead of on every image read.
            images = hdf5_file[x_name]

            # int(): tf.data hands the chunk size over as a numpy scalar.
            for dbset in next_dbset_batch(parquet_obj, int(batch_size)): #read the data in chunks

                dbset = dbset.to_pandas()

                # Records that share the same image are stored consecutively, so
                # the image array is only re-read when image_id_idx changes.
                # This saves HDF5 reads and speeds up feeding the training loop.
                prev_image_id_idx = -1
                in_image = None  # stays defined even if the chunk is empty

                rows = zip(dbset['image_id_idx'], dbset['in_seq'], dbset['out_seq'])
                for image_id_idx, in_seq, out_seq in rows:

                    if prev_image_id_idx != image_id_idx:
                        in_image = images[image_id_idx]
                        prev_image_id_idx = image_id_idx

                    yield ((in_image, in_seq.tolist()), out_seq.tolist())

                # Drop the chunk before loading the next one to keep peak memory down.
                del dbset, rows, in_image
                gc.collect()
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer closes
        # the generator early, so the parquet file handle is never leaked.
        parquet_obj.close()

"""
#Code to verify the code
%%time

import matplotlib.pyplot as plt
import time
start = time.time()
idx = 0
for (x,y),z in train_batch:
    
    end = time.time()
    print(f'Batch: {idx}, elapse time: {(end - start)}')
    
    i = 0
    for image in x.numpy():
        image = (image*255).astype(int)
        plt.imshow(image)
        plt.show()
        i+=1
        if i == 2 :
            break
    break
"""


In [None]:
# Detect a TPU or multiple GPUs and pick the matching distribution strategy.
# Defines: strategy, num_replicas_in_sync, is_TPU_instance_Init,
# is_Multiple_GPU_instance_Init (and, on the non-TPU path, devices and options).

import tensorflow as tf

is_TPU_instance_Init = False
is_Multiple_GPU_instance_Init = False

num_replicas_in_sync = 1

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    is_TPU_instance_Init = True

except ValueError:
    # No TPU cluster could be resolved; fall through to the GPU/CPU path.
    tpu = None

if tpu:

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy replaces the deprecated experimental alias.
    strategy = tf.distribute.TPUStrategy(tpu)

    num_replicas_in_sync = strategy.num_replicas_in_sync
    print("REPLICAS: ", strategy.num_replicas_in_sync)

else: #Check for multiple GPUs

    #Setting for multiple GPUs https://towardsdatascience.com/train-a-neural-network-on-multi-gpu-with-tensorflow-42fa5f51b8af
    #List the available GPU devices
    devices = tf.config.experimental.list_physical_devices('GPU')

    # Data-based auto-sharding policy for distributed input pipelines.
    # NOTE: this only takes effect on a dataset it is applied to via
    # dataset.with_options(options).
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    if len(devices) > 1:

        is_Multiple_GPU_instance_Init = True

        #Multiple GPUs detected: distribute the work across them.
        #MultiWorkerMirroredStrategy is used instead of MirroredStrategy to suppress
        #the warning during the run https://github.com/tensorflow/tensorflow/issues/42146
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        num_replicas_in_sync = strategy.num_replicas_in_sync

    else:

        # Zero or one GPU: use the default (no-op) strategy and do not
        # instantiate a distributed strategy that would only be discarded.
        strategy = tf.distribute.get_strategy()
        num_replicas_in_sync = 1
        print('General strategy...')

In [3]:
# The data is streamed by data_generator, so the dataset files are not opened here.
# Instead, hard-code the values learned during the pre-processing stage. They were
# originally derived from train_data as: len(train_data.iloc[0, 1]),
# len(train_data.iloc[0, 2]) and train_data.shape[0].

record_cnt = 129214  # Number of records in the training dataset
vocab_size = 10613
max_in_seq_len = 15

(max_in_seq_len, vocab_size, record_cnt)

(15, 10613, 129214)

In [None]:
from keras.optimizers import Adam

# Create the optimizer inside the distribution strategy scope; it is passed
# to model.compile() further down.
with strategy.scope():
    adam_optimizers = Adam(learning_rate=1e-4)

In [None]:
from keras.losses import CategoricalCrossentropy

# The model's output layer already applies softmax, so the loss is given
# probabilities rather than logits.
with strategy.scope():
    entropy_loss = CategoricalCrossentropy(
        from_logits=False,
    )

In [None]:
from keras.applications.xception import Xception
from keras.models import load_model
from keras.utils import plot_model
from keras.layers import Embedding
from keras.layers import Dropout
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Add

with strategy.scope():

    def define_model(vocab_size, max_length):
        """Build the (uncompiled) merge-architecture captioning model.

        Args:
            vocab_size: input dimension of the word embedding and number of
                units in the softmax output layer.
            max_length: length of the 'word_seq' input.

        Returns:
            A Model with inputs ['np_image', 'word_seq'] and the softmax
            output 'output_seq'.
        """
        # Image branch: frozen ImageNet Xception (global average pooling),
        # then dropout and a dense layer reducing the features to 256 nodes.
        image_in = Input(shape=(1024, 1024, 3), name='np_image')

        backbone = Xception(
            weights='imagenet',
            include_top=False,
            pooling='avg',
            input_tensor=Input(shape=(1024, 1024, 3)),
        )
        backbone.trainable = False  # Freeze the feature extraction layers
        pooled_features = backbone(image_in)

        image_dropped = Dropout(0.5)(pooled_features)
        image_vector = Dense(256, activation='relu')(image_dropped)

        # Text branch: embedding (index 0 is masked) -> dropout -> LSTM
        caption_in = Input(shape=(max_length,), name='word_seq')
        embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_in)
        embedded_dropped = Dropout(0.5)(embedded)
        caption_vector = LSTM(256)(embedded_dropped)

        # Merge both branches by element-wise addition and decode the next word
        merged = Add()([image_vector, caption_vector])
        hidden = Dense(256, activation='relu')(merged)
        next_word = Dense(vocab_size, activation='softmax', name='output_seq')(hidden)

        # Tie it together: [image, seq] -> [word]
        return Model(inputs=[image_in, caption_in], outputs=next_word)

    model_path = os.path.join(PROJECT_ROOT, 'data', 'processed', 'my_model.h5')

    # Resume from a previously saved model when one exists, else build a new one.
    if not os.path.isfile(model_path):
        model = define_model(vocab_size, max_in_seq_len)
    else:
        print('Loading previous saved model...')
        model = load_model(model_path)

    # (Re)compile in both cases with the loss and optimizer created above.
    model.compile(loss=entropy_loss, optimizer=adam_optimizers)

In [None]:
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()
#plot_model(model, show_shapes = True)

In [7]:
# Global batch size: 32 samples per replica (the author also noted 64),
# scaled by the number of replicas of the distribution strategy.
batch_size = 32 * num_replicas_in_sync

batch_size

32

In [13]:
#Reference used to decide the order of the pipeline steps: https://cs230.stanford.edu/blog/datapipeline/#best-practices
from tensorflow.data import Dataset

# Stream training samples from data_generator. 25000 is the number of parquet
# rows the generator loads per chunk, not the training batch size.
train_batch = Dataset.from_generator(
    data_generator,
    args=['train', 25000],
    output_signature=(
        (
            tf.TensorSpec(shape=(1024, 1024, 3), dtype=tf.float16),
            tf.TensorSpec(shape=(max_in_seq_len,), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(vocab_size,), dtype=tf.float16),
    ),
)

# Group samples into training batches, then prefetch.
train_batch = train_batch.batch(batch_size)
train_batch = train_batch.prefetch(tf.data.AUTOTUNE)

#train_batch = train_batch.prefetch(batch_size).batch(batch_size)  
  
############-----------------------------------------------------------------------------------------######################
#train_generator = data_generator(db_train, token_obj, max_length, vocab_size, 'train_data.h5') #Was too slow

#Issue with from_tensor: when the input data is defined as below, the value of the image_idx tensor could not be accessed
#from tensorflow import constant, uint16, int32, Variable
#out_seq_tensor = constant(np.array(train_data['out_seq'].values.tolist()), shape = (record_cnt, vocab_size), dtype = uint16)
#in_seq_tensor = constant(np.array(train_data['in_seq'].values.tolist()), shape = (record_cnt, max_in_seq_len), dtype = int32)
#image_idx_tensor = constant(train_data['image_id_idx'], shape = (record_cnt,), dtype = int32)
#or
#image_idx_tensor = train_data['image_id_idx']
#There were problems with the from_tensor approach:
# 1. We need to access image_id_idx inside "map" to get the numpy representation of the image by reading the hdf5 file.
#   Inside "map" the value is passed as a tensor. To get the value out of the tensor we tried various ways such as as_numpy, eval, get_static_value etc.
#   But due to eager execution it always returned the error "Tensor does not have attribute "map".
# 2. Reading the values of the dataframe column and then converting them with tolist() consumes a lot of resources. On a laptop it always fails.
# 3. Each time "map" is called we need to open the hdf5 file, read the value and close it. This adds to the computation.
# The advantage of using "map" is parallelization

#Sample to test 
#train_batch = train_batch.batch(128)
#for (x,y),z in train_batch:
    
#    print(x.shape)
#    print(y.shape)
#    print(y)
#    print(z.shape)
#    break
#The sample code above took 1min 50sec to generate a batch of 128 records. This is without prefetch

In [None]:
# Stream validation samples from data_generator ('valid' selects the validation
# files). 25000 is the number of parquet rows loaded per chunk.
valid_batch = Dataset.from_generator(
    data_generator,
    args=['valid', 25000],
    output_signature=(
        (
            tf.TensorSpec(shape=(1024, 1024, 3), dtype=tf.float16),
            tf.TensorSpec(shape=(max_in_seq_len,), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(vocab_size,), dtype=tf.float16),
    ),
)

# Group samples into batches, then prefetch.
valid_batch = valid_batch.batch(batch_size)
valid_batch = valid_batch.prefetch(tf.data.AUTOTUNE)

#valid_batch = valid_batch.prefetch(batch_size).batch(batch_size) 
           
#valid_generator = data_generator(db_valid, token_obj, max_length, vocab_size, 'validate_data.h5') #Was too slow

In [None]:
#fit the model
# train_batch is a finite, generator-backed dataset. When steps_per_epoch is
# given, Keras keeps drawing from the same iterator across epochs, so without
# repeat() the input runs out after the first epoch and training stops early.
# steps_per_epoch must also be a whole number: use the ceiling of
# record_cnt / batch_size so that one epoch is exactly one pass over the data,
# including the final partial batch.
history = model.fit(train_batch.repeat(),
                    #batch_size = GLOBAL_BATCH_SIZE
                    steps_per_epoch = (record_cnt + batch_size - 1) // batch_size,
                    epochs = 5,
                    shuffle = False, # ignored by Keras for tf.data.Dataset inputs
                    #initial_epoch = 1, # We ran one epoch previously
                    verbose = 1,
                   )
                   
"""
history = model.fit(x = train_generator,
                    steps_per_epoch = db_train.shape[0],
                    
                    validation_data = valid_generator,
                    validation_steps = db_valid.shape[0],    
                    validation_freq = 2, 
                    
                    #batch_size = , #As per doc Do not specify the batch_size if your data is in the form of  generators
                    epochs = 10,                       
                    verbose = 1,

                    max_queue_size = 50,
                    workers = 10,
                    use_multiprocessing = True,
                  )
"""