# Image Caption using Encoder and Decoder (CNN-RNN) # 

In this notebook we use a merge-architecture encoder-decoder recurrent neural network model for caption generation.
This involves two elements:

1. Encoder: a pre-trained convolutional neural network model that reads the image input and encodes its content into a fixed-length vector using an internal representation. The output of the encoder is a hidden/context vector, generated by reading the input, which is passed to the decoder.
2. Decoder: model that reads the encoded image and generates the textual description output.

The merge model architecture combines the encoded form of the image input with the encoded form of the text description generated so far. The combination of these two encoded inputs is then used by a very simple decoder model to generate the next word in the sequence. The approach uses the recurrent neural network only to encode the text generated so far.

![Merge Architecture for Encoder-Decoder Model](../images/merge_model.png)

Reference: https://machinelearningmastery.com/caption-generation-inject-merge-architectures-encoder-decoder-model/

In [1]:
import src.utils as plp
#import pandas as pd
#import numpy as np
import os
import gc

# Repository root; every data path in this notebook is built from it.
PROJECT_ROOT = plp.get_project_root()

# Folder with the pre-extracted image feature arrays (one `<image_id>.npy` per image).
_image_root_parts = (PROJECT_ROOT, 'data', 'processed', 'image_feature_extracted_using_inception')
PROJECT_IMAGE_ROOT = os.path.join(*_image_root_parts)

In [None]:
# Detect a TPU or multiple GPUs and pick the matching tf.distribute strategy.
#
# Names defined for the following cells:
#   strategy                      - distribution strategy to build models under
#   num_replicas_in_sync          - replica count (1 on a single device)
#   is_TPU_instance_Init          - True when a TPU was resolved
#   is_Multiple_GPU_instance_Init - True when more than one GPU is visible
#   options                       - tf.data.Options with DATA auto-sharding
#                                   (only defined on the non-TPU path)

import tensorflow as tf

is_TPU_instance_Init = False
is_Multiple_GPU_instance_Init = False

num_replicas_in_sync = 1

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    is_TPU_instance_Init = True

except ValueError:
    # No TPU could be resolved on this machine; fall through to the GPU check.
    tpu = None

if tpu:

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of this class.
    strategy = tf.distribute.TPUStrategy(tpu)

    num_replicas_in_sync = strategy.num_replicas_in_sync
    print("REPLICAS: ", num_replicas_in_sync)

else:  # Check for multiple GPUs

    # Settings for multiple GPUs:
    # https://towardsdatascience.com/train-a-neural-network-on-multi-gpu-with-tensorflow-42fa5f51b8af
    devices = tf.config.experimental.list_physical_devices('GPU')
    is_Multiple_GPU_instance_Init = len(devices) > 1

    # Shard by DATA to suppress the auto-shard warning during the run, see
    # https://github.com/tensorflow/tensorflow/issues/42146
    # NOTE(review): `options` is not attached to any dataset in this notebook
    # (no `.with_options(options)` call) - confirm whether that is intended.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    if is_Multiple_GPU_instance_Init:
        # Mirror the model across every visible GPU.
        # strategy = tf.distribute.MultiWorkerMirroredStrategy()
        strategy = tf.distribute.MirroredStrategy()
        num_replicas_in_sync = len(devices)
    else:
        # Single GPU or CPU only: use the default strategy instead of building
        # a MirroredStrategy that would be thrown away immediately.
        strategy = tf.distribute.get_strategy()
        num_replicas_in_sync = 1
        print('General strategy...')

In [3]:
import pyarrow.parquet as pq
import numpy as np
import gc

def next_dbset_batch(parquet_obj, size = 10000):
    """Stream the caption columns of an opened parquet file chunk by chunk.

    Args:
        parquet_obj: object exposing ``iter_batches(batch_size=..., columns=...)``
            (here a ``pyarrow.parquet.ParquetFile``).
        size: maximum number of rows per yielded batch.

    Yields:
        Each batch produced by ``parquet_obj.iter_batches``, restricted to the
        ``image_id``, ``in_seq`` and ``out_seq`` columns.
    """
    wanted_columns = ['image_id', 'in_seq', 'out_seq']
    yield from parquet_obj.iter_batches(batch_size = size, columns = wanted_columns)

# Historical note from the original author: this generator was the slowest variant
# tried, taking 1min 11s for a batch size of 90 (measured before the per-row
# `.loc` lookups were replaced by column arrays; not re-measured since).
def data_generator(name, batch_size):
    """Endlessly yield ``((in_image, in_seq), out_seq)`` samples for one data split.

    Args:
        name: split selector, either ``bytes`` (tf.data hands string args to the
            generator as bytes, e.g. ``b'train'``) or ``str``. Any value containing
            ``'valid'`` reads ``valid_in_seq_data.parquet``; everything else reads
            ``train_in_seq_data.parquet``.
        batch_size: number of parquet rows loaded into memory per read chunk.
            This is the read-chunk size, not the model's training batch size.

    Yields:
        ``((in_image, in_seq), out_seq)`` where ``in_image`` is the array loaded
        from ``<PROJECT_IMAGE_ROOT>/<image_id>.npy`` and both sequences are
        converted to plain lists.

    The generator never terminates: once the last row has been yielded the
    parquet file is re-opened and iteration starts again from the first row.
    """
    # Accept both the bytes that tf.data passes in and a plain str.
    if isinstance(name, bytes):
        name = name.decode('UTF-8')

    if 'valid' in name:
        dataset_filename = 'valid_in_seq_data.parquet'
    else:
        dataset_filename = 'train_in_seq_data.parquet'

    dataset_filepath = os.path.join(PROJECT_ROOT, 'data', 'processed', dataset_filename)

    while True:

        # Re-open on every pass so the stream restarts from the beginning.
        parquet_obj = pq.ParquetFile(dataset_filepath)

        for record_batch in next_dbset_batch(parquet_obj, int(batch_size)):  # read the data in chunks

            dbset = record_batch.to_pandas()

            # Pull each column out once; indexing plain arrays is far cheaper
            # than a `dbset.loc[idx, col]` lookup for every row.
            image_ids = dbset['image_id'].to_numpy()
            in_seqs = dbset['in_seq'].to_numpy()
            out_seqs = dbset['out_seq'].to_numpy()

            # Rows belonging to the same image are stored consecutively, so the
            # feature array only has to be reloaded when image_id changes.
            prev_image_id = -1
            in_image = None  # also keeps the `del` below safe for an empty chunk

            for image_id, in_seq, out_seq in zip(image_ids, in_seqs, out_seqs):

                if prev_image_id != image_id:
                    img_path = os.path.join(PROJECT_IMAGE_ROOT, str(image_id) + '.npy')
                    in_image = np.load(img_path)
                    prev_image_id = image_id

                yield ((in_image, in_seq.tolist()), out_seq.tolist())

            # Drop the chunk before the next one is read to keep peak memory down.
            del dbset, image_ids, in_seqs, out_seqs, in_image
            gc.collect()
                    

In [4]:
# Input sequence length and vocabulary size used for the model's tensor shapes.
max_in_seq_len, vocab_size = 15, 10613

# Row counts of the train / validation splits, used to derive steps per epoch.
train_record_cnt, valid_record_cnt = 129214, 13747

(max_in_seq_len, vocab_size, train_record_cnt)

(15, 10613, 129214)

## Hyperparameter Tuning ##

In [5]:
# Global batch size: 2048 samples per replica, scaled by the number of replicas.
# (1024 is the alternative per-replica value noted by the original author.)
batch_size = 2048 * num_replicas_in_sync

batch_size

4096

In [6]:
from tensorflow.data import Dataset

# Structure of one generated sample: ((image features, input sequence), output sequence).
train_signature = (
    (
        tf.TensorSpec(shape = (2048, ), dtype = tf.float16),
        tf.TensorSpec(shape = (max_in_seq_len,), dtype = tf.int32),
    ),
    tf.TensorSpec(shape = (vocab_size,), dtype = tf.float16),
)

# Training pipeline: generator -> fixed-size batches -> background prefetch.
train_batch = Dataset.from_generator(
    data_generator,
    args = ['train', 25000],  # 25000 is data_generator's batch_size (rows per parquet read)
    output_signature = train_signature,
)
train_batch = train_batch.batch(batch_size)
train_batch = train_batch.prefetch(tf.data.AUTOTUNE)


In [7]:
# Structure of one generated sample: ((image features, input sequence), output sequence).
valid_signature = (
    (
        tf.TensorSpec(shape = (2048, ), dtype = tf.float16),
        tf.TensorSpec(shape = (max_in_seq_len,), dtype = tf.int32),
    ),
    tf.TensorSpec(shape = (vocab_size, ), dtype = tf.float16),
)

# Validation pipeline: generator -> fixed-size batches -> background prefetch.
valid_batch = Dataset.from_generator(
    data_generator,
    args = ['valid', 25000],  # 25000 is data_generator's batch_size (rows per parquet read)
    output_signature = valid_signature,
)
valid_batch = valid_batch.batch(batch_size)
valid_batch = valid_batch.prefetch(tf.data.AUTOTUNE)


In [None]:
#Use keras for Distributed Tuning

#Use "export" to set environment variables in bash
#export KERASTUNER_TUNER_ID = "chief"
#export KERASTUNER_ORACLE_IP = "127.0.0.1"
#export KERASTUNER_ORACLE_PORT = "8000"

#Use "%env" to set environment variables in a Jupyter notebook

#%env KERASTUNER_TUNER_ID = "chief"
#%env KERASTUNER_ORACLE_IP = "127.0.0.1"
#%env KERASTUNER_ORACLE_PORT = "8000"
## %env GRPC_VERBOSITY = 'debug'  #environment variable to see detailed error message. #Did not help with the error "RuntimeError: Failed to bind to address "127.0.0.1":"8000"; set GRPC_VERBOSITY=debug environment variable to see detailed error message."

In [None]:
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.utils import plot_model
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.layers import Dropout
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Add
import keras_tuner

with strategy.scope():  
    
    class HyperParamTunerModel(keras_tuner.HyperModel):
        """keras_tuner hypermodel for the merge-architecture caption network.

        Tuned hyperparameters (names as registered with `hp`): dence_units,
        img_dropout, emb_units, emb_dropout, merge_units and lr.
        """
        
        def __init__(self, max_length, vocab_size):
            """Store the fixed model dimensions.

            Args:
                max_length: length of the `word_seq` input.
                vocab_size: embedding input size and width of the softmax output.
            """
            
            super(HyperParamTunerModel, self).__init__()
            
            self.max_length = max_length
            self.vocab_size = vocab_size
            
        # define the model
        def build(self, hp):
            """Build and compile one candidate model from the values sampled in `hp`.

            Returns the compiled Keras model taking [np_image, word_seq] as inputs
            and producing a softmax over the vocabulary.
            """

            # Image branch: 2048 CNN features -> dropout -> Dense(dence_units).

            image_extract = Input(shape = (2048,), name = 'np_image')
            # Shared by the image Dense layer and the LSTM so that Add() can merge them.
            dence_units = hp.Int("dence_units", min_value = 256, max_value = 1024, step = 32)
            
            fe1 = Dropout(hp.Float("img_dropout", min_value = 0.5, max_value = 0.7, step = 0.05))(image_extract)
            fe2 = Dense(units = dence_units, activation = 'relu')(fe1)

            # LSTM sequence model
            inputs_caption = Input(shape = (self.max_length,), name = 'word_seq')

            se1 = Embedding(self.vocab_size, 
                            hp.Int("emb_units", min_value = 256, max_value = 1024, step = 32), 
                            mask_zero = True)(inputs_caption)
            se2 = Dropout(hp.Float("emb_dropout", min_value = 0.5, max_value = 0.7, step = 0.05))(se1)
            se3 = LSTM(dence_units)(se2)

            # Merging both models
            decoder1 = Add()([fe2, se3])
            decoder2 = Dense(hp.Int("merge_units", min_value = 256, max_value = 1024, step = 32), activation = 'relu')(decoder1)

            outputs = Dense(self.vocab_size, activation = 'softmax', name = 'output_seq')(decoder2)

            # tie it together [image, seq] [word]
            model = Model(inputs = [image_extract, inputs_caption], outputs = outputs)

            #define optimizers
            lr = hp.Float("lr", min_value = 1e-4, max_value = 1e-1, sampling = "log")#lr = 0.0001 
            # The sampled value is scaled by the replica count, so with more than one
            # replica the optimizer's rate differs from the "lr" the tuner records.
            lr = lr * num_replicas_in_sync
            adam_optimizers  = Adam(learning_rate = lr)

            #define loss
            entropy_loss = CategoricalCrossentropy(from_logits = False)

            model.compile(loss = entropy_loss, optimizer = adam_optimizers)
            return model
        
        
        def fit(self, hp, model, *args, **kwargs):
            """Train `model`, forwarding all arguments to `model.fit` unchanged.

            `hp` is accepted for the HyperModel interface but not used here.
            """
            
            return model.fit(
                            *args,
                            # No fit-time hyperparameter is tuned; kwargs pass straight through.
                            **kwargs,
                        )     
    


In [None]:
# Random search over the HyperParamTunerModel space, minimising validation loss.
search_result_dir = os.path.join(PROJECT_ROOT, "data", "hyper_param_search_result")
hyper_model = HyperParamTunerModel(max_in_seq_len, vocab_size)

tuner = keras_tuner.RandomSearch(
    hypermodel = hyper_model,
    objective = "val_loss",
    max_trials = 10,
    seed = 44,
    executions_per_trial = 1,
    distribution_strategy = strategy,
    overwrite = True,
    directory = search_result_dir,
    project_name = "image_caption_hyper_param_search",
)


In [None]:
#Start the search
import math

# One epoch covers each split once, rounded up to whole batches.
train_steps = math.ceil(train_record_cnt/batch_size)
valid_steps = math.ceil(valid_record_cnt/batch_size)

# Abandon a trial as soon as val_loss fails to improve for one epoch.
stop_early = EarlyStopping(monitor = 'val_loss', patience = 1)

tuner.search(
    x = train_batch,
    steps_per_epoch = train_steps,
    epochs = 25,
    shuffle = False,
    verbose = 1,
    validation_data = valid_batch,
    validation_steps = valid_steps,
    callbacks = [stop_early],
)

In [None]:
# Hyperparameters of the single best trial found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

In [None]:
# Print every tuned hyperparameter of the best trial.
# The best trial is fetched once instead of re-querying the tuner for each name.
best_trial_hps = tuner.get_best_hyperparameters()[0]

for h_param in ['dence_units', 'img_dropout', 'emb_units', 'emb_dropout', 'merge_units', 'lr']:
    print(h_param, best_trial_hps.get(h_param))

In [None]:
# Zip the hyperparameter search output.
# A single archive was easier to download from Kaggle.
# The helper below is disabled: it is kept inside a string literal and never runs.

"""
import zipfile

def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))
    
zip_output_path = os.path.join(PROJECT_ROOT, "data", "hyper_param_search_result.zip")
zip_folder_path = os.path.join(PROJECT_ROOT, "data", "hyper_param_search_result")
zipf = zipfile.ZipFile(zip_output_path, 'w', zipfile.ZIP_DEFLATED)
zipdir(zip_folder_path, zipf)
zipf.close()  

#!cd '/kaggle/working'
#!ls
##Output url link to download
#from IPython.display import FileLink
#FileLink(r'hyper_param_search_result.zip')
"""

#### Hyperparameter Tuning Results: ####

<table>
    <tr>
        <td>Trial</td>
        <td>dence_units</td>
        <td>img_dropout</td>
        <td>emb_units</td>
        <td>emb_dropout</td>
        <td>merge_units</td>
        <td>lr</td>
        <td>val_loss</td>
    </tr>
    <tr>
        <td>0</td>
        <td>576</td>
        <td>0.55</td>
        <td>928</td>
        <td>0.55</td>
        <td>672</td>
        <td>0.00015872740971656704</td>
        <td>4.634195327758789</td>
    </tr>
    <tr>
        <td>1</td>
        <td>640</td>
        <td>0.55</td>
        <td>992</td>
        <td>0.6</td>
        <td>960</td>
        <td>0.0001866463763857711</td>
        <td>4.594254016876221</td>
    </tr>
    <tr>
        <td>2</td>
        <td>992</td>
        <td>0.5</td>
        <td>704</td>
        <td>0.55</td>
        <td>480</td>
        <td>0.003044123781552541</td>
        <td>4.705258369445801</td>
    </tr>
    <tr>
        <td>3</td>
        <td>960</td>
        <td>0.6</td>
        <td>608</td>
        <td>0.60</td>
        <td>320</td>
        <td>0.00016764139674983655</td>
        <td>4.707552909851074</td>
    </tr>
    <tr>
        <td>4</td>
        <td>832</td>
        <td>0.65</td>
        <td>960</td>
        <td>0.55</td>
        <td>320</td>
        <td>0.0006909496810943752</td>
        <td>4.562685012817383</td>
    </tr>
    <tr>
        <td>5</td>
        <td>928</td>
        <td>0.60</td>
        <td>544</td>
        <td>0.65</td>
        <td>896</td>
        <td>0.0002729031103694528</td>
        <td>4.625041961669922</td>
    </tr>
    <tr>
        <td>6</td>
        <td>480</td>
        <td>0.60</td>
        <td>352</td>
        <td>0.60</td>
        <td>832</td>
        <td>0.0003969777071876592</td>
        <td>4.620487213134766</td>
    </tr>
    <tr>
        <td>7</td>
        <td>864</td>
        <td>0.55</td>
        <td>576</td>
        <td>0.5</td>
        <td>416</td>
        <td>0.0001780580297315219</td>
        <td>4.705492973327637</td>
    </tr>    
    <tr>
        <td>8</td>
        <td>576</td>
        <td>0.70</td>
        <td>672</td>
        <td>0.65</td>
        <td>544</td>
        <td>0.00038336436721221595</td>
        <td>4.599141597747803</td>
    </tr>
    <tr>
        <td>9</td>
        <td>544</td>
        <td>0.60</td>
        <td>352</td>
        <td>0.60</td>
        <td>384</td>
        <td>0.08652674373905335</td>
        <td>6.403066635131836</td>
    </tr>
</table>

**Observation:**

The top 2 parameter sets with the lowest validation loss:

<table>
    <tr>
        <td>Trial</td>
        <td>dence_units</td>
        <td>img_dropout</td>
        <td>emb_units</td>
        <td>emb_dropout</td>
        <td>merge_units</td>
        <td>lr</td>
        <td>val_loss</td>
    </tr>
    <tr>
        <td>1</td>
        <td>640</td>
        <td>0.55</td>
        <td>992</td>
        <td>0.6</td>
        <td>960</td>
        <td>0.0001866463763857711</td>
        <td>4.594254016876221</td>
 </tr>
<tr>
        <td>4</td>
        <td>832</td>
        <td>0.65</td>
        <td>960</td>
        <td>0.55</td>
        <td>320</td>
        <td>0.0006909496810943752</td>
        <td>4.562685012817383</td>
 </tr>
</table>

Let's continue testing these candidates by training for more epochs and checking how the validation loss evolves.


## Training ##
Train further with the top 3 best parameter sets.

### Model ###

In [8]:
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.utils import plot_model
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.layers import Dropout
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Add
import math

def define_model(max_length, vocab_size, dence_units, img_dropout, emb_units, emb_dropout, merge_units, lr):
    """Build and compile the merge-architecture caption model under `strategy`.

    Args:
        max_length: length of the `word_seq` input.
        vocab_size: embedding input size and width of the softmax output.
        dence_units: units of the image Dense layer and of the LSTM (they must
            match because both outputs are summed by `Add`).
        img_dropout: dropout rate applied to the 2048-dim image features.
        emb_units: embedding dimension for the word sequence.
        emb_dropout: dropout rate applied to the embedded sequence.
        merge_units: units of the Dense layer following the merge.
        lr: learning rate passed to Adam as given.

    Returns:
        The compiled Keras model taking [np_image, word_seq] as inputs.
    """
    # Variables have to be created while the strategy scope is active. Wrapping
    # only the `def` in `with strategy.scope():` does not achieve that, because
    # the scope has already exited by the time the function is called.
    with strategy.scope():

        # Image branch: 2048 CNN features -> dropout -> Dense(dence_units).
        image_extract = Input(shape = (2048,), name = 'np_image')

        fe1 = Dropout(img_dropout)(image_extract)
        fe2 = Dense(units = dence_units, activation = 'relu')(fe1)

        # LSTM sequence model
        inputs_caption = Input(shape = (max_length,), name = 'word_seq')

        se1 = Embedding(vocab_size,
                        emb_units,
                        mask_zero = True)(inputs_caption)
        se2 = Dropout(emb_dropout)(se1)
        se3 = LSTM(dence_units)(se2)

        # Merging both models
        decoder1 = Add()([fe2, se3])
        decoder2 = Dense(merge_units, activation = 'relu')(decoder1)

        outputs = Dense(vocab_size, activation = 'softmax', name = 'output_seq')(decoder2)

        # tie it together [image, seq] [word]
        model = Model(inputs = [image_extract, inputs_caption], outputs = outputs)

        # define optimizers
        # NOTE(review): the tuner's build() multiplies the sampled lr by
        # num_replicas_in_sync, whereas here lr is used unscaled - confirm which
        # of the two is intended.
        #lr = lr * num_replicas_in_sync
        adam_optimizers = Adam(learning_rate = lr)

        # define loss
        entropy_loss = CategoricalCrossentropy(from_logits = False)

        model.compile(loss = entropy_loss, optimizer = adam_optimizers)

    return model

* **Trial: 1**
    * dence_units: 640 
    * img_dropout: 0.55
    * emb_units: 992
    * emb_dropout: 0.6 
    * merge_units: 960
    * lr: 0.0001866463763857711

In [21]:
# Trial 1: build the model from its hyperparameter set, then train for 25 epochs.
trial_params = dict(
    dence_units = 640,
    img_dropout = 0.55,
    emb_units = 992,
    emb_dropout = 0.6,
    merge_units = 960,
    lr = 0.0001866463763857711,
)
train_model = define_model(max_in_seq_len, vocab_size, **trial_params)

# One epoch covers each split once, rounded up to whole batches.
train_steps = math.ceil(train_record_cnt/batch_size)
valid_steps = math.ceil(valid_record_cnt/batch_size)

history = train_model.fit(
    x = train_batch,
    steps_per_epoch = train_steps,
    epochs = 25,
    shuffle = False,
    verbose = 1,
    validation_data = valid_batch,
    validation_steps = valid_steps,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [23]:
import pandas as pd

# Persist the per-epoch loss history of this run as a parquet file.
result_dir = os.path.join(PROJECT_ROOT, 'data', 'encoder-decoder-neural network')
result_file = 'result_dence_units_640,img_dropout_55,emb_units_992,emb_dropout_60,merge_units_960,lr_0_0001866463763857711.parquet'

hist_df = pd.DataFrame(history.history)
hist_df.to_parquet(os.path.join(result_dir, result_file))

In [24]:
# Show the first rows of the training history (one row per epoch).
hist_df.head()

Unnamed: 0,loss,val_loss
0,7.410568,6.40634
1,6.080339,5.978259
2,5.653186,5.565533
3,5.263391,5.275986
4,4.917656,5.024098


**Observation:**

### Model is overfitting on train data ###

* **Trial: 2**
    * dence_units: 832 
    * img_dropout: 0.65
    * emb_units: 960
    * emb_dropout: 0.55 
    * merge_units: 320
    * lr: 0.0006909496810943752

In [9]:
# Trial 2: build the model from its hyperparameter set, then train for 50 epochs.
trial_params = dict(
    dence_units = 832,
    img_dropout = 0.65,
    emb_units = 960,
    emb_dropout = 0.55,
    merge_units = 320,
    lr = 0.0006909496810943752,
)
train_model = define_model(max_in_seq_len, vocab_size, **trial_params)

# One epoch covers each split once, rounded up to whole batches.
train_steps = math.ceil(train_record_cnt/batch_size)
valid_steps = math.ceil(valid_record_cnt/batch_size)

history = train_model.fit(
    x = train_batch,
    steps_per_epoch = train_steps,
    epochs = 50,
    shuffle = False,
    verbose = 1,
    validation_data = valid_batch,
    validation_steps = valid_steps,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [10]:
import pandas as pd

# Persist the per-epoch loss history of this run as a parquet file.
result_dir = os.path.join(PROJECT_ROOT, 'data', 'encoder-decoder-neural network')
result_file = 'result_dence_units_832,img_dropout_65,emb_units_960,emb_dropout_55,merge_units_320,lr_0_0006909496810943752.parquet'

hist_df = pd.DataFrame(history.history)
hist_df.to_parquet(os.path.join(result_dir, result_file))

**Observation:**

### Model is overfitting on train data ###

* **Trial: 3**
    * dence_units: 576 
    * img_dropout: 0.70
    * emb_units: 672
    * emb_dropout: 0.65
    * merge_units: 544
    * lr: 0.00038336436721221595

In [None]:
# Trial 3: build the model from its hyperparameter set, then train for 50 epochs.
trial_params = dict(
    dence_units = 576,
    img_dropout = 0.70,
    emb_units = 672,
    emb_dropout = 0.65,
    merge_units = 544,
    lr = 0.00038336436721221595,
)
train_model = define_model(max_in_seq_len, vocab_size, **trial_params)

# One epoch covers each split once, rounded up to whole batches.
train_steps = math.ceil(train_record_cnt/batch_size)
valid_steps = math.ceil(valid_record_cnt/batch_size)

history = train_model.fit(
    x = train_batch,
    steps_per_epoch = train_steps,
    epochs = 50,
    shuffle = False,
    verbose = 1,
    validation_data = valid_batch,
    validation_steps = valid_steps,
)

In [None]:
import pandas as pd

# Persist the per-epoch loss history of this run as a parquet file.
result_dir = os.path.join(PROJECT_ROOT, 'data', 'encoder-decoder-neural network')
result_file = 'result_dence_units_576,img_dropout_70,emb_units_672,emb_dropout_65,merge_units_544,lr_0_00038336436721221595.parquet'

hist_df = pd.DataFrame(history.history)
hist_df.to_parquet(os.path.join(result_dir, result_file))

In [None]:
# Save the most recently trained model next to the training results.
model_path = os.path.join(PROJECT_ROOT, 'data', 'encoder-decoder-neural network', 'my_model.h5')
train_model.save(model_path)