# CNN + LSTM Hypermodel V2
The second revision of this hypermodel removes the sequence length and vocabulary size from the tuning, so that only the performance of the pre-trained CNN models is evaluated.

In [1]:
# general imports
import tensorflow as tf
import os
from pickle import load
from math import ceil
from numpy.random import choice

# model imports
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Embedding, Add
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.losses import SparseCategoricalCrossentropy
from keras.regularizers import L2
from keras.utils import plot_model

# pre-trained cnn imports 
from keras.applications import VGG16, VGG19, ResNet50, ResNet152, ResNet50V2, ResNet152V2, InceptionV3, InceptionResNetV2, EfficientNetB0, EfficientNetB1, EfficientNetB2, NASNetMobile, ConvNeXtTiny, ConvNeXtBase, DenseNet121, DenseNet201   # type: ignore
from keras.applications import vgg16, vgg19, resnet, resnet_v2, inception_v3, inception_resnet_v2, efficientnet, nasnet, convnext, densenet

# tuning imports
from keras_tuner import HyperModel, RandomSearch

# custom function imports
from functions.dataset_loading import load_flicker8k_split
from functions.text_processing import create_vocab_mappings
from functions.image_processing import load_image, display_image
from functions.training import data_generator
from functions.model_evaluation import generate_and_evaluate_caption

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\james\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# --- Data / batching configuration ---
BATCH_SIZE = 64      # samples per batch; also used to derive the steps per epoch
SEQ_LENGTH = 20      # caption sequence length handed to the hypermodel
VOCAB_SIZE = 4096    # vocabulary size handed to the hypermodel (embedding rows, softmax width)

# --- Model dimensions ---
EMBED_DIM = 300      # embedding width handed to the hypermodel
HIDDEN_DIM = 1024    # NOTE(review): not passed to MyHyperModel below, whose hidden_dim defaults to 1024

### Loading Flickr8k pre-processed data

In [3]:
# Load the pre-processed Flicker8K artefacts from disk
DIR = 'preprocessed_data/flicker8k/'


def _read_pickle(filename):
    """Unpickle and return the object stored at DIR + filename."""
    # pickle can execute arbitrary code while loading; only use trusted files here
    with open(DIR + filename, 'rb') as handle:
        return load(handle)


caption_map = _read_pickle('caption_map.pkl')
feature_map = _read_pickle('feature_map.pkl')
embedding_matrix = _read_pickle('embedding_matrix_glove_300.pkl')
vocab = _read_pickle('vocab.pkl')

# Image lists of the train / validation / test split
train_images, val_images, test_images = load_flicker8k_split()

# Batches per epoch for training and validation
# (the factor 5 is presumably the number of captions per image — TODO confirm)
STEPS = ceil(len(train_images) * 5 / BATCH_SIZE)
VAL_STEPS = ceil(len(val_images) * 5 / BATCH_SIZE)

# Lookup tables built from the vocabulary
word_to_idx, idx_to_word = create_vocab_mappings(vocab)

In [4]:
# Extracting the image data for train, validation and test images
IMAGE_DIR = 'datasets/flicker8k/Flicker8k_images'
IMAGE_SIZE = (224, 224)


def _load_resized_images(image_names, image_dir=IMAGE_DIR, size=IMAGE_SIZE):
    """Read, decode and resize a collection of JPEG images.

    Args:
        image_names: iterable of file names located inside `image_dir`.
        image_dir: directory that contains the image files.
        size: target (height, width) handed to `tf.image.resize`.

    Returns:
        dict mapping every file name to its resized 3-channel image tensor.
    """
    images = {}
    for name in image_names:
        raw_bytes = tf.io.read_file(os.path.join(image_dir, name))
        decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
        images[name] = tf.image.resize(decoded, size)
    return images


# Re-load the split so that re-running this cell always starts from the full lists
train_images, val_images, test_images = load_flicker8k_split()

# Only the first 500 training images are used; validation and test lists stay complete.
# NOTE(review): STEPS was computed from the full training list before this truncation —
# confirm that steps_per_epoch is still what is intended.
train_images = train_images[:500]

train_image_data = _load_resized_images(train_images)
val_image_data = _load_resized_images(val_images)
test_image_data = _load_resized_images(test_images)

In [1]:
# Creating training and validation datasets
# Each call receives the image-name list, the caption map, the pre-loaded image tensors
# and the batch size. data_generator is imported from functions.training, so the import
# cell must have been executed first (the recorded NameError comes from skipping it).
# NOTE(review): whether data_generator yields batches indefinitely is not visible here —
# confirm it can supply steps_per_epoch * epochs batches.
train_data = data_generator(train_images, caption_map, train_image_data, BATCH_SIZE)
val_data = data_generator(val_images, caption_map, val_image_data, BATCH_SIZE)

NameError: name 'data_generator' is not defined

## Image Captioning Hypermodel

In [6]:
class MyHyperModel(HyperModel):
    """CNN + LSTM captioning hypermodel in which only the CNN encoder is tuned.

    Every call to `build` picks one pre-trained CNN from `cnn_configs`, freezes it and
    wires it into the same decoder: a frozen embedding layer, an LSTM, an additive merge
    with the projected image features and a softmax over the vocabulary.

    Args:
        seq_len: length of the caption input sequence.
        vocab_size: number of embedding rows and width of the softmax output.
        embed_dim: width of the embedding vectors.
        embedding_matrix: weights loaded into the frozen embedding layer
            (presumably of shape (vocab_size, embed_dim) — TODO confirm).
        image_shape: shape of the image input; also passed to the CNN as `input_shape`.
        hidden_dim: units of the image projection, the LSTM and the hidden dense layer.
        dropout: rate shared by all dropout layers.
        seed: random seed handed to every dropout layer.
    """

    def __init__(self, seq_len, vocab_size, embed_dim, embedding_matrix, image_shape= (224, 224, 3), hidden_dim= 1024, dropout= 0.3, seed = 17):
        super().__init__()
        self.image_shape = image_shape
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.dropout = dropout 
        self.seed = seed
        
        # Search space: name -> (model constructor, matching preprocess_input function)
        self.cnn_configs = {
            'vgg16': (VGG16, vgg16.preprocess_input),
            'vgg19': (VGG19, vgg19.preprocess_input),
            'resnet50': (ResNet50, resnet.preprocess_input),
            'resnet152': (ResNet152, resnet.preprocess_input),
            'resnet50v2': (ResNet50V2, resnet_v2.preprocess_input),
            'resnet152v2': (ResNet152V2, resnet_v2.preprocess_input),
            'inception': (InceptionV3, inception_v3.preprocess_input),
            'inceptionresnet': (InceptionResNetV2, inception_resnet_v2.preprocess_input),
            'efficientb0': (EfficientNetB0, efficientnet.preprocess_input),
            'efficientb1': (EfficientNetB1, efficientnet.preprocess_input),
            'efficientb2': (EfficientNetB2, efficientnet.preprocess_input),
            'nasnetmobile': (NASNetMobile, nasnet.preprocess_input),
            'densenet121': (DenseNet121, densenet.preprocess_input),
            'densenet201': (DenseNet201, densenet.preprocess_input), 
            'convnexttiny': (ConvNeXtTiny, convnext.preprocess_input),
            'convnextbase': (ConvNeXtBase, convnext.preprocess_input)
        }   
        
        # Frozen embedding layer initialised from the supplied matrix.
        # It is created once here, so every model returned by `build` reuses this instance.
        self.embedding_layer = Embedding(
            self.vocab_size,
            embed_dim,
            input_length=self.seq_len,
            trainable=False,
            weights=[embedding_matrix],
            name='sequence_embedding'
        )     
                
    def build(self, hp):
        """Build and compile the captioning model for one tuner trial.

        Args:
            hp: hyperparameter container supplied by the tuner; only the 'cnn'
                choice is registered on it.

        Returns:
            A compiled `Model` taking `[image_input, caption_input]` and producing a
            softmax over `vocab_size` classes.
        """
        # CNN choice
        cnn_choice = hp.Choice('cnn', list(self.cnn_configs.keys()))
        CNN, preprocess_func = self.cnn_configs[cnn_choice]
        
        # ImageNet weights, no classification head, global average pooling; kept frozen
        feature_extractor = CNN(
            weights='imagenet',
            include_top=False,
            input_shape=self.image_shape,
            pooling='avg'
        )
        feature_extractor.trainable = False
                
        # Image path: model-specific preprocessing -> frozen CNN -> dense projection
        image_input = Input(shape=self.image_shape, name='image_input')
        processed_image = preprocess_func(image_input)
        image_features = feature_extractor(processed_image)        
        features_dense = Dense(self.hidden_dim, activation = 'relu', name='image_dense', kernel_regularizer=L2(1e-4))(image_features)

        # Caption path: frozen embedding -> dropout -> LSTM returning the full sequence
        caption_input = Input(shape=(self.seq_len,), name='caption_input')
        caption_embedding = self.embedding_layer(caption_input)
        caption_dropout = Dropout(self.dropout, seed=self.seed, name='embedding_dropout')(caption_embedding)
        lstm = LSTM(self.hidden_dim, return_sequences=True, name='lstm_layer', kernel_regularizer=L2(1e-4))(caption_dropout)        
        
        # Combine image and sequence
        # NOTE(review): the image projection and the LSTM sequence output have different
        # ranks, so this relies on Add broadcasting the image vector over the timesteps — confirm.
        merging_layer = Add()([features_dense, lstm])
        merge_dropout = Dropout(self.dropout, seed=self.seed)(merging_layer)

        # Prediction layers        
        dense_layer = Dense(self.hidden_dim, activation='relu', kernel_regularizer=L2(1e-4))(merge_dropout)
        prediction_dropout = Dropout(self.dropout, seed=self.seed)(dense_layer)
        output = Dense(self.vocab_size, activation='softmax', kernel_regularizer=L2(1e-4))(prediction_dropout)
        
        decoder = Model(inputs = [image_input, caption_input] , outputs = output)       
        decoder.compile(optimizer= Adam(1e-3), loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])
        
        return decoder


In [7]:
# Hypermodel with fixed sequence length, vocabulary size and embedding configuration
hypermodel = MyHyperModel(SEQ_LENGTH, VOCAB_SIZE, EMBED_DIM, embedding_matrix)

# Random search over the hypermodel's search space, scored on validation loss
tuner = RandomSearch(
    hypermodel,
    project_name='cnn_lstm_architecture_search_v2.2',
    directory='keras_tuning',
    overwrite=True,
    max_trials=200,
    objective='val_loss',
)

# TensorBoard callback that writes the graph and per-epoch histograms
tensorboard = TensorBoard(
    update_freq='epoch',
    write_graph=True,
    histogram_freq=1,
    log_dir='./tensorboard_logs/architecture_search_v2.2',
)

# Run the search; every trial trains for 3 epochs
tuner.search(
    train_data,
    validation_data=val_data,
    validation_steps=VAL_STEPS,
    steps_per_epoch=STEPS,
    epochs=3,
    callbacks=[tensorboard],
    verbose=1,
)

Trial 16 Complete [00h 02m 22s]
val_loss: 2.6741182804107666

Best val_loss So Far: 2.597045421600342
Total elapsed time: 00h 54m 44s
Results summary
Results in keras_tuning\cnn_lstm_architecture_search_v2.2
Showing 5 best trials
Objective(name="val_loss", direction="min")

Trial 006 summary
Hyperparameters:
cnn: densenet121
Score: 2.597045421600342

Trial 010 summary
Hyperparameters:
cnn: inception
Score: 2.5986948013305664

Trial 003 summary
Hyperparameters:
cnn: densenet201
Score: 2.603595018386841

Trial 008 summary
Hyperparameters:
cnn: resnet152
Score: 2.6454343795776367

Trial 011 summary
Hyperparameters:
cnn: resnet50
Score: 2.649353504180908


In [8]:
# Print a summary of the search results, requesting 5 trials.
# NOTE(review): the recorded output below says "Showing 10 best trials", so it was
# produced by an earlier run with a different argument — re-run to refresh it.
tuner.results_summary(5)

Results summary
Results in keras_tuning\cnn_lstm_architecture_search_v2.2
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 006 summary
Hyperparameters:
cnn: densenet121
Score: 2.597045421600342

Trial 010 summary
Hyperparameters:
cnn: inception
Score: 2.5986948013305664

Trial 003 summary
Hyperparameters:
cnn: densenet201
Score: 2.603595018386841

Trial 008 summary
Hyperparameters:
cnn: resnet152
Score: 2.6454343795776367

Trial 011 summary
Hyperparameters:
cnn: resnet50
Score: 2.649353504180908

Trial 007 summary
Hyperparameters:
cnn: nasnetmobile
Score: 2.6657721996307373

Trial 015 summary
Hyperparameters:
cnn: efficientb1
Score: 2.6741182804107666

Trial 014 summary
Hyperparameters:
cnn: inceptionresnet
Score: 2.678245782852173

Trial 005 summary
Hyperparameters:
cnn: convnextbase
Score: 2.7098047733306885

Trial 004 summary
Hyperparameters:
cnn: efficientb0
Score: 2.711036443710327


## Training outputs 
1. ([0.6405022734089714, 0.4463027082229372, 0.29934462206575563, 0.20871405332039702], 0.4145237453375769) v1
2. ([0.6418743284682035, 0.442693660623719, 0.29415405024196956, 0.20058639500348188], 0.4080181609786798) v2


In [None]:
from functions.model_evaluation import batch_generate_captions, evaluate_captions

# Generate captions for the validation images and score them against the references.
# SEQ_LENGTH is the sequence-length constant defined in the configuration cell; the
# previously used name SEQ_LEN does not exist in this notebook and raised a NameError.
# NOTE(review): val_feature_map, val_caption_map and best_decoder are not defined in the
# visible notebook — create or load them before running this cell.
# NOTE(review): the meaning of the trailing 1024 depends on batch_generate_captions'
# signature, which is not visible here — confirm.
predicted_captions, true_captions = batch_generate_captions(val_images, val_feature_map, val_caption_map, idx_to_word, best_decoder, SEQ_LENGTH, 1024)
evaluate_captions(predicted_captions, true_captions)

- ([0.6211531810024086, 0.410840029257982, 0.26646794915971794, 0.1811533502178472], 0.38122496568020725) batch = 64
- ([0.5993318816719131,
  0.3865770367462274,
  0.24482775805037613,
  0.16440581561248588],
 0.3609836026638695) batch = 32
 - ([0.6344307633588684,
  0.4343686956410259,
  0.28711750048084483,
  0.19787981935572438],
 0.3971223813690533) batch = 256
- ([0.6298820729985007,
  0.4312138237301613,
  0.2866155554360642,
  0.19793620298236123],
 0.39902906753744116) previous with more training 
 - ([0.6428814404792379,
  0.4456909514391105,
  0.29858524736881525,
  0.2072303796603672],
 0.4091058296690576) batch = 256, 20 epochs, model saved
 - ([0.6428814404792379,
  0.4456909514391105,
  0.29858524736881525,
  0.2072303796603672],
 0.4091058296690576) previous model with more training 
 - ([0.6390607878281732,
  0.4406290889734613,
  0.2962323225086467,
  0.2063178068601112],
 0.40974129391739644) changed hid dim to 512
 - ([0.6568510950992849,
  0.46559990385495154,
  0.3192417928203884,
  0.22491635004582852],
 0.42799321255907147) added dense layer after concat 

In [None]:
# decoder.save('models/lstm_decoder_for_cnn_encoder.keras')