In [None]:
import tensorflow as tf
import numpy as np
import random
import datetime

from pickle import load
from math import ceil
from numpy.random import choice
from keras import Model
from keras.applications import ResNet152V2
from keras.layers import Input, Dense, LSTM, Embedding, Add, Dropout
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping, TensorBoard
import matplotlib.pyplot as plt

from functions.text_processing import create_vocab_mappings
from functions.training import data_generator
from functions.model_evaluation import evaluate_captions, generate_caption
from functions.testing import generate_and_evaluate_test_caption
from keras.utils import plot_model

from keras_tuner import HyperParameters, Hyperband

from keras.models import Model
from keras.layers import Input, Dense, LSTM, Dropout, Embedding, Add, LayerNormalization, GRU, Bidirectional
from keras.optimizers import Adam, RMSprop
from keras.losses import SparseCategoricalCrossentropy
from keras.regularizers import L1, L2, L1L2
from keras_tuner import HyperModel, HyperParameters

from keras.initializers import HeUniform, Orthogonal, GlorotUniform




In [2]:
# --- Data location ---------------------------------------------------------
DIR = 'preprocessed_data/coco/'

# --- Model dimensions ------------------------------------------------------
SEQ_LEN = 15          # caption sequence length fed to the decoder
FEATURE_SIZE = 2048   # length of the image feature vector
EMB_DIM = 300         # can only be values 100, 200, 300
HIDDEN_DIM = 512
DROPOUT = 0.4

# --- Training / evaluation schedule ----------------------------------------
EPOCHS = 100
BATCH_SIZE = 512
EVAL_BATCH_SIZE = 1024

# --- Reproducibility -------------------------------------------------------
# Seed Python, NumPy and TensorFlow with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
def _read_pickle(filename):
    """Deserialize one preprocessed artifact stored under ``DIR``.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project's own preprocessing step.
    """
    with open(DIR + filename, 'rb') as handle:
        return load(handle)


# Caption and image-feature lookups for both splits.
train_caption_map = _read_pickle('train_caption_map.pkl')
train_feature_map = _read_pickle('train_feature_map.pkl')
val_caption_map = _read_pickle('val_caption_map.pkl')
val_feature_map = _read_pickle('val_feature_map.pkl')

# Pretrained embedding weights and the vocabulary they belong to.
embedding_matrix = _read_pickle('embedding_matrix.pkl')
vocab = _read_pickle('vocab.pkl')

# Image identifiers are the keys of the caption maps.
train_images = list(train_caption_map.keys())
val_images = list(val_caption_map.keys())

VOCAB_SIZE = len(vocab)

# Batches per epoch. The factor 5 presumably reflects five captions per image
# -- TODO confirm against the preprocessing step.
STEPS = ceil(len(train_images) * 5 / BATCH_SIZE)
VAL_STEPS = ceil(len(val_images) * 5 / BATCH_SIZE)

word_to_idx, idx_to_word = create_vocab_mappings(vocab)

# Batch generators consumed by the tuner search and the final fit.
train_data = data_generator(train_images, train_caption_map, train_feature_map, BATCH_SIZE)
val_data = data_generator(val_images, val_caption_map, val_feature_map, BATCH_SIZE)

# Model 

In [8]:
class MyHyperModel(HyperModel):
    """Keras Tuner hypermodel that builds the LSTM caption decoder.

    The decoder takes an image feature vector and a token sequence, projects
    both to ``hidden_dim``, adds them, and predicts a softmax distribution
    over the vocabulary. ``build`` samples dropout, L2 regularisation, layer
    normalisation, optimizer and learning rate from the supplied
    ``HyperParameters``.

    Args:
        feature_size: Length of the image feature vector fed to ``image_input``.
        seq_len: Number of tokens in each caption input sequence.
        vocab_size: Row count of the embedding and width of the softmax output.
        hidden_dim: Units in the dense projections and in the LSTM.
        seed: Value passed to ``tf.random.set_seed`` at the start of each build.
        embedding_matrix: Optional pretrained embedding weights of shape
            ``(vocab_size, embedding_dim)``. When given, the embedding layer is
            initialised from it and frozen. When ``None``, a trainable
            embedding of width ``embedding_dim`` is created instead.
        embedding_dim: Embedding width used only when ``embedding_matrix`` is
            ``None``.
    """

    def __init__(self, feature_size=1536, seq_len=15, vocab_size=4096, hidden_dim=1024, seed=17,
                 embedding_matrix=None, embedding_dim=300):
        super().__init__()
        self.feature_size = feature_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.seed = seed
        # Previously never assigned, so build() failed with AttributeError.
        self.embedding_matrix = embedding_matrix
        self.embedding_dim = embedding_dim

    def _make_embedding_layer(self):
        """Return the token embedding layer (frozen pretrained, or trainable)."""
        matrix = self.embedding_matrix
        if matrix is None:
            # No pretrained weights supplied: learn the embedding from scratch.
            return Embedding(self.vocab_size, self.embedding_dim, input_length=self.seq_len,
                             name='sequence_embedding')

        if len(matrix.shape) != 2 or matrix.shape[0] != self.vocab_size:
            raise ValueError(
                f"embedding_matrix must have shape (vocab_size={self.vocab_size}, dim); "
                f"got {tuple(matrix.shape)}"
            )
        return Embedding(self.vocab_size, matrix.shape[1], input_length=self.seq_len,
                         trainable=False, weights=[matrix], name='sequence_embedding')

    def build(self, hp):
        """Build and compile one decoder for the hyperparameters in ``hp``.

        Args:
            hp: ``keras_tuner.HyperParameters`` to sample from.

        Returns:
            A compiled ``Model`` with inputs ``[image_input, caption_input]``
            and a softmax output of width ``vocab_size``.

        Raises:
            ValueError: If ``embedding_matrix`` is set but is not a 2-D array
                with ``vocab_size`` rows.
        """
        tf.random.set_seed(self.seed)

        # --- Search space ---
        dropout_rate = hp.Float('dropout', min_value=0, max_value=0.5, step=0.1)
        reg_type = hp.Choice('regularizer_type', values=['none', 'l2'])
        reg_factor = hp.Choice('regularization_factor', values=[1e-6, 1e-4, 1e-2])
        use_layer_norm = hp.Boolean('use_layer_norm')

        # reg_factor is sampled on every trial but only used when reg_type == 'l2'.
        regularizer = L2(reg_factor) if reg_type == 'l2' else None

        optimizer_choice = hp.Choice('optimizer', values=['adam', 'rmsprop'])
        learning_rate = hp.Choice('learning_rate', values=[1e-5, 1e-4, 1e-3, 1e-2])

        # Embedding layer
        embedding_layer = self._make_embedding_layer()

        # Image layers
        # shape must be a tuple; `(n)` without the trailing comma is just an int.
        image_input = Input(shape=(self.feature_size,), name='image_input')
        image_dropout = Dropout(dropout_rate, name='image_dropout')(image_input)
        x_img = Dense(self.hidden_dim, activation='relu', kernel_initializer=HeUniform(),
                      kernel_regularizer=regularizer, name='image_dense')(image_dropout)
        if use_layer_norm:
            x_img = LayerNormalization()(x_img)

        # Sequence layers
        caption_input = Input(shape=(self.seq_len,), name='caption_input')
        caption_embedding = embedding_layer(caption_input)
        caption_dropout = Dropout(dropout_rate, name='embedding_dropout')(caption_embedding)

        x_caption = LSTM(self.hidden_dim, return_sequences=True, kernel_initializer=Orthogonal(),
                         kernel_regularizer=regularizer, recurrent_regularizer=regularizer,
                         name='lstm_layer')(caption_dropout)
        if use_layer_norm:
            x_caption = LayerNormalization()(x_caption)

        # Combine image and sequence
        # NOTE(review): x_img has no time axis while the LSTM returns one output
        # per timestep, so this Add relies on the merge layer broadcasting the
        # image projection across timesteps -- confirm this is intended.
        merging_layer = Add()([x_img, x_caption])
        merge_dropout = Dropout(dropout_rate)(merging_layer)

        # Prediction layers
        x_pred = Dense(self.hidden_dim, activation='relu', kernel_initializer=HeUniform(),
                       kernel_regularizer=regularizer)(merge_dropout)
        if use_layer_norm:
            x_pred = LayerNormalization()(x_pred)
        prediction_dropout = Dropout(dropout_rate)(x_pred)

        output = Dense(self.vocab_size, activation='softmax', kernel_initializer=GlorotUniform(),
                       kernel_regularizer=regularizer)(prediction_dropout)

        decoder = Model(inputs=[image_input, caption_input], outputs=output)

        optimizer = Adam(learning_rate) if optimizer_choice == 'adam' else RMSprop(learning_rate)
        decoder.compile(optimizer=optimizer, loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])

        return decoder


In [None]:
hypermodel = MyHyperModel(
    feature_size=FEATURE_SIZE,
    seq_len=SEQ_LEN,
    vocab_size=VOCAB_SIZE,
)
# build() reads `self.embedding_matrix` to initialise the frozen embedding
# layer, so attach the pretrained matrix loaded above to the hypermodel.
hypermodel.embedding_matrix = embedding_matrix

# NOTE(review): hidden_dim is left at the class default (1024) although
# HIDDEN_DIM = 512 is defined in the config cell -- confirm which is intended.

tuner = Hyperband(
    hypermodel=hypermodel,
    executions_per_trial=3,
    objective='val_loss',
    directory='tuning',
    project_name='lstm_decoder',
)

tuner.search_space_summary()


In [None]:
# Stop a run once validation loss has not improved for 5 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# NOTE(review): Hyperband presumably schedules the per-trial epoch budget
# itself, which would make `epochs` below ineffective -- confirm.
search_options = dict(
    epochs=1,
    steps_per_epoch=STEPS,
    validation_data=val_data,
    validation_steps=VAL_STEPS,
    callbacks=[early_stopping],
    verbose=1,
)
tuner.search(train_data, **search_options)

In [12]:
# Fetch the tuner's best model(s); the result is indexed as a sequence in the next cell.
best_decoders = tuner.get_best_models()

In [13]:
# Keep the first entry of `best_decoders`.
# NOTE(review): `best_decoder` is not used by the later cells shown here, which
# rebuild `model` from the best hyperparameters instead.
best_decoder = best_decoders[0]

In [None]:
# Get the top 5 hyperparameter sets found by the search.
best_hps = tuner.get_best_hyperparameters(5)
# Build a fresh model from the best one (first entry).
model = hypermodel.build(best_hps[0])

# Stop once validation loss stalls for 3 epochs (restoring the best weights)
# and write TensorBoard logs to a timestamped run directory.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1),
    TensorBoard(
        log_dir='./logs/lstm_decoder/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
        histogram_freq=1,
        write_graph=True,
        update_freq='epoch',
    ),
]

# Train on the training generator and validate on the validation generator.
# The epoch budget comes from the config cell instead of a repeated literal.
training = model.fit(
    train_data,
    epochs=EPOCHS,
    steps_per_epoch=STEPS,
    validation_data=val_data,
    validation_steps=VAL_STEPS,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10, 8))

# One entry per panel: (history key, label, marker, training colour, validation colour).
panel_specs = [
    ('loss', 'Loss', 'o', 'b', 'r'),
    ('accuracy', 'Accuracy', 's', 'g', 'orange'),
]

# Draw the training and validation curves of each metric on its own axis.
for ax, (metric, label, marker, train_color, val_color) in zip(axes, panel_specs):
    ax.plot(training.history[metric], marker=marker, color=train_color, label=f'Training {label}')
    ax.plot(training.history[f'val_{metric}'], marker=marker, color=val_color, label=f'Validation {label}')
    ax.set_title(f'Model {label}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(label)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
%tensorboard --logdir logs

In [None]:
# Draw one random validation image and look up its features and reference captions.
test_image = choice(val_images)
test_features, true_captions = val_feature_map[test_image], val_caption_map[test_image]

# Generate a caption with the retrained model and score it against the references.
generate_and_evaluate_test_caption(
    'datasets/coco/val2014/',
    test_image,
    test_features,
    idx_to_word,
    model,
    SEQ_LEN,
    true_captions,
)

In [None]:
# Show the tuner's summary of the search results.
tuner.results_summary()

## Training outputs 
1. ([0.6405022734089714, 0.4463027082229372, 0.29934462206575563, 0.20871405332039702], 0.4145237453375769) v1
2. ([0.6418743284682035, 0.442693660623719, 0.29415405024196956, 0.20058639500348188], 0.4080181609786798) v2


In [None]:
from functions.model_evaluation import batch_generate_captions, evaluate_captions

# Caption every validation image with the retrained model, then score the
# predictions against the reference captions. The last argument is presumably
# the generation batch size; it now uses the config constant (same value, 1024)
# instead of a repeated literal.
predicted_captions, true_captions = batch_generate_captions(
    val_images,
    val_feature_map,
    val_caption_map,
    idx_to_word,
    model,
    SEQ_LEN,
    EVAL_BATCH_SIZE,
)
evaluate_captions(predicted_captions, true_captions)

- ([0.6211531810024086, 0.410840029257982, 0.26646794915971794, 0.1811533502178472], 0.38122496568020725) batch = 64
- ([0.5993318816719131,
  0.3865770367462274,
  0.24482775805037613,
  0.16440581561248588],
 0.3609836026638695) batch = 32
 - ([0.6344307633588684,
  0.4343686956410259,
  0.28711750048084483,
  0.19787981935572438],
 0.3971223813690533) batch = 256
- ([0.6298820729985007,
  0.4312138237301613,
  0.2866155554360642,
  0.19793620298236123],
 0.39902906753744116) previous with more training 
 - ([0.6428814404792379,
  0.4456909514391105,
  0.29858524736881525,
  0.2072303796603672],
 0.4091058296690576) batch = 256, 20 epochs, model saved
 - ([0.6428814404792379,
  0.4456909514391105,
  0.29858524736881525,
  0.2072303796603672],
 0.4091058296690576) previous model with more training 
 - ([0.6390607878281732,
  0.4406290889734613,
  0.2962323225086467,
  0.2063178068601112],
 0.40974129391739644) changed hid dim to 512
 - ([0.6568510950992849,
  0.46559990385495154,
  0.3192417928203884,
  0.22491635004582852],
 0.42799321255907147) added dense layer after concat 
 - ([0.6592448939264582,
  0.4687748184498835,
  0.3229394024298345,
  0.22790701045314818],
 0.43095061728517275) new cnn
 - ([0.6587324526306322,
  0.4695537527436097,
  0.32331068826761733,
  0.22825250420482918],
 0.4321113122628307) new cnn training

In [None]:
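# Uncomment to save the retrained decoder (the fitted model is bound to `model`; there is no notebook-level `decoder`):
# model.save('models/lstm_decoder_for_cnn_encoder.keras')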