In [1]:
# One-time setup: uncomment to install KerasTuner into this kernel's environment.
# %pip install -q keras-tuner

In [2]:
import os
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
from official.nlp import optimization
import keras_tuner as kt


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [3]:
# Define ITM_DataLoader to load the data
class ITM_DataLoader:
    """Load the image-text-matching splits as batched ``tf.data`` datasets.

    Each split file is read as one sample per line with three tab-separated
    fields: image file name, caption, and a label string. The caption is looked
    up in a pickled mapping of caption -> sentence embedding, and the image is
    decoded lazily inside the ``tf.data`` pipeline.

    After construction the instance exposes ``train_ds``, ``val_ds`` and
    ``test_ds``. Every element is
    ``({'image_input': image, 'text_embedding': embedding}, label)`` where the
    label is one-hot: ``[1, 0]`` for "match" and ``[0, 1]`` otherwise.
    """

    BATCH_SIZE = 16
    IMAGE_SIZE = (224, 224)
    IMAGE_SHAPE = (224, 224, 3)
    SENTENCE_EMBEDDING_SHAPE = 384
    AUTOTUNE = tf.data.AUTOTUNE
    # NOTE(review): machine-specific default. Pass ``data_path`` to ``__init__``
    # to load the data from another location.
    DATA_PATH = r'D:\_GITHUB_\Image-Text-Matching\data'
    # Raw string: "\i" is not a valid escape sequence and triggers a warning
    # on recent Python versions. The resulting value is unchanged.
    IMAGES_PATH = DATA_PATH + r"\images"

    def __init__(self, data_path=None):
        """Load the sentence embeddings and build the three dataset splits.

        Args:
            data_path: Optional directory holding the split files, the
                embeddings pickle and the ``images`` sub-directory. When
                omitted, the class-level ``DATA_PATH`` / ``IMAGES_PATH`` are
                used.
        """
        if data_path is not None:
            self.DATA_PATH = data_path
            self.IMAGES_PATH = os.path.join(data_path, "images")

        self.train_data_file = os.path.join(self.DATA_PATH, "flickr8k.TrainImages.txt")
        self.dev_data_file = os.path.join(self.DATA_PATH, "flickr8k.DevImages.txt")
        self.test_data_file = os.path.join(self.DATA_PATH, "flickr8k.TestImages.txt")
        self.sentence_embeddings_file = os.path.join(
            self.DATA_PATH, "flickr8k.cmp9137.sentence_transformers.pkl"
        )
        self.sentence_embeddings = self.load_sentence_embeddings()
        # Only the training split is shuffled: Keras ignores ``fit(shuffle=...)``
        # for tf.data inputs, so without this every epoch would replay the
        # samples in file order.
        self.train_ds = self.load_classifier_data(self.train_data_file, shuffle=True)
        self.val_ds = self.load_classifier_data(self.dev_data_file)
        self.test_ds = self.load_classifier_data(self.test_data_file)

    def load_sentence_embeddings(self):
        """Return the unpickled object stored in ``sentence_embeddings_file``.

        The rest of the class uses it as a mapping from caption text to an
        embedding vector.
        """
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # an embeddings file from a trusted source.
        with open(self.sentence_embeddings_file, "rb") as f:
            return pickle.load(f)

    def _load_image(self, img_path):
        """Read a JPEG, resize it to ``IMAGE_SIZE`` and divide pixel values by 255."""
        img = tf.io.read_file(img_path)
        img = tf.image.decode_jpeg(img, channels=3)
        # tf.image.resize already returns float32, so no extra cast is needed.
        img = tf.image.resize(img, self.IMAGE_SIZE)
        return img / 255.0

    def process_input(self, img_path, dense_vector, text, label):
        """Build a feature dict that also carries the caption and file name.

        Args:
            img_path: Path of the JPEG image to load.
            dense_vector: Sentence embedding of the caption.
            text: Caption text, passed through unchanged.
            label: Label, passed through unchanged.

        Returns:
            ``(features, label)`` where ``features`` has the keys
            ``image_input``, ``text_embedding``, ``caption`` and ``file_name``.
        """
        features = {
            "image_input": self._load_image(img_path),
            "text_embedding": dense_vector,
            "caption": text,
            "file_name": img_path,
        }
        return features, label

    def load_classifier_data(self, data_files, shuffle=False):
        """Build a batched, prefetched dataset from one split file.

        Args:
            data_files: Path of a text file with one
                ``image_name<TAB>caption<TAB>label`` record per line.
            shuffle: When true, reshuffle the samples on every iteration
                (intended for the training split).

        Returns:
            A ``tf.data.Dataset`` yielding
            ``({'image_input', 'text_embedding'}, one_hot_label)`` batches of
            ``BATCH_SIZE``.

        Raises:
            ValueError: If a non-blank line does not have exactly three
                tab-separated fields, or if no usable sample is found.
        """
        print("LOADING data from " + str(data_files))
        print("=========================================")
        image_paths = []
        embeddings_data = []
        labels = []

        # get image, text, label of image_files
        with open(data_files) as f:
            for line_no, line in enumerate(f, start=1):
                line = line.rstrip("\r\n")
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines

                fields = line.split("\t")
                if len(fields) != 3:
                    raise ValueError(
                        f"{data_files}:{line_no}: expected 3 tab-separated fields "
                        f"(image, caption, label), got {len(fields)}"
                    )
                img_name, text, raw_label = fields
                img_path = os.path.join(self.IMAGES_PATH, img_name.strip())

                # get binary labels from match/no-match answers; strip so that
                # stray whitespace cannot silently turn "match" into no-match
                label = [1, 0] if raw_label.strip() == "match" else [0, 1]

                # get sentence embeddings (of textual captions)
                if text not in self.sentence_embeddings:
                    print(f"Warning: No embedding found for text '{text}'. Skipping.")
                    continue

                image_paths.append(img_path)
                embeddings_data.append(self.sentence_embeddings[text])
                labels.append(label)

        print(f"Loaded {len(image_paths)} data samples.")

        if not image_paths:
            # Empty lists would become float tensors and fail later inside
            # tf.io.read_file with an unrelated dtype error.
            raise ValueError(f"No usable samples found in {data_files}")

        def parse_function(img_path, embedding, label):
            return {'image_input': self._load_image(img_path), 'text_embedding': embedding}, label

        # Create a TensorFlow dataset
        dataset = tf.data.Dataset.from_tensor_slices((image_paths, embeddings_data, labels))
        if shuffle:
            # Shuffle before decoding so the buffer holds paths and embeddings,
            # not decoded images; a full-size buffer gives a uniform shuffle.
            dataset = dataset.shuffle(len(image_paths), reshuffle_each_iteration=True)
        dataset = dataset.map(parse_function, num_parallel_calls=self.AUTOTUNE)
        dataset = dataset.batch(self.BATCH_SIZE).prefetch(self.AUTOTUNE)

        return dataset

In [4]:
# Function to create the ITM model with dynamic hyperparameters
def build_itm_model(hp):
    """Build and compile the two-branch image-text matching classifier.

    Args:
        hp: KerasTuner hyperparameter container. Four values are drawn from it:
            ``num_projection_layers``, ``projection_dims``, ``dropout_rate``
            and ``learning_rate``.

    Returns:
        A compiled ``keras.Model`` taking the inputs ``image_input`` and
        ``text_embedding`` and producing a 2-way softmax.
    """
    # Search space
    n_proj_layers = hp.Int('num_projection_layers', min_value=1, max_value=3, step=1)
    proj_units = hp.Int('projection_dims', min_value=64, max_value=256, step=64)
    drop_prob = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    lr = hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='log')

    def project(tensor):
        # One projection step: Dense(relu) followed by Dropout.
        hidden = layers.Dense(proj_units, activation='relu')(tensor)
        return layers.Dropout(drop_prob)(hidden)

    # Vision branch: two Conv/MaxPool stages, then a dense encoding
    img_input = keras.Input(shape=(224, 224, 3), name="image_input")
    vision_encoded = img_input
    for n_filters in (32, 64):
        vision_encoded = layers.Conv2D(n_filters, 3, activation='relu')(vision_encoded)
        vision_encoded = layers.MaxPooling2D(pool_size=(2, 2))(vision_encoded)
    vision_encoded = layers.Flatten()(vision_encoded)
    vision_encoded = layers.Dense(proj_units, activation='relu')(vision_encoded)

    # Text branch: the input is a 384-dimensional sentence embedding
    text_input = keras.Input(shape=(384,), name="text_embedding")
    text_encoded = layers.Dense(proj_units, activation='relu')(text_input)

    # Stack the projection steps on both branches (vision first, then text)
    for _ in range(n_proj_layers):
        vision_encoded = project(vision_encoded)
        text_encoded = project(text_encoded)

    # Fuse both branches and classify as match / no match
    fused = layers.Concatenate()([vision_encoded, text_encoded])
    class_probs = layers.Dense(2, activation='softmax')(fused)

    model = keras.Model(inputs=[img_input, text_input], outputs=class_probs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [5]:
# Function to run Hyperband optimization
def run_hyperband_optimization(train_ds, val_ds, max_epochs=10, final_epochs=50,
                               directory='hyperband_optimization',
                               project_name='itm_hyperopt'):
    """Search hyperparameters with Hyperband, then retrain the best configuration.

    Args:
        train_ds: Training dataset passed to the tuner and to the final fit.
        val_ds: Validation dataset used for the ``val_accuracy`` objective.
        max_epochs: Hyperband's maximum number of epochs for a single trial.
        final_epochs: Number of epochs for the final fit of the best
            configuration.
        directory: Directory in which the tuner stores its state.
        project_name: Sub-directory name for this search. An existing project
            in ``directory`` is reloaded by the tuner instead of restarted.

    Returns:
        ``(model, history)``: a freshly built model trained with the best
        hyperparameters, and the ``History`` returned by its ``fit`` call.
    """
    tuner = kt.Hyperband(build_itm_model,
                         objective='val_accuracy',
                         max_epochs=max_epochs,
                         directory=directory,
                         project_name=project_name)

    # No ``epochs`` argument here: Hyperband sets the epoch count of every
    # trial itself (``tuner/epochs``), so a value passed to search() is ignored.
    tuner.search(train_ds, validation_data=val_ds,
                 callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)])

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print(f"Best hyperparameters: {best_hps.values}")

    # Rebuild from scratch so the final model does not inherit trial weights.
    model = tuner.hypermodel.build(best_hps)
    history = model.fit(train_ds, validation_data=val_ds, epochs=final_epochs)
    return model, history

In [6]:
# Main execution: building the loader reads all three splits from disk
data_loader = ITM_DataLoader()

# Show the element specs of the datasets used for tuning
print(f"Training dataset: {data_loader.train_ds}")
print(f"Validation dataset: {data_loader.val_ds}")

LOADING data from D:\_GITHUB_\Image-Text-Matching\data\flickr8k.TrainImages.txt
Loaded 19386 data samples.
LOADING data from D:\_GITHUB_\Image-Text-Matching\data\flickr8k.DevImages.txt
Loaded 1164 data samples.
LOADING data from D:\_GITHUB_\Image-Text-Matching\data\flickr8k.TestImages.txt
Loaded 1161 data samples.
Training dataset: <PrefetchDataset element_spec=({'image_input': TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), 'text_embedding': TensorSpec(shape=(None, 384), dtype=tf.float32, name=None)}, TensorSpec(shape=(None, 2), dtype=tf.int32, name=None))>
Validation dataset: <PrefetchDataset element_spec=({'image_input': TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), 'text_embedding': TensorSpec(shape=(None, 384), dtype=tf.float32, name=None)}, TensorSpec(shape=(None, 2), dtype=tf.int32, name=None))>


In [7]:
if __name__ == "__main__":
    # Tune on train/dev, then retrain the best configuration
    best_model, history = run_hyperband_optimization(
        train_ds=data_loader.train_ds,
        val_ds=data_loader.val_ds,
    )

    # Report the retrained model's metrics on the held-out test split
    loss, accuracy = best_model.evaluate(data_loader.test_ds)
    print(f"Final model accuracy: {accuracy}, Loss: {loss}")

Reloading Tuner from hyperband_optimization\itm_hyperopt\tuner0.json

Search: Running Trial #5

Value             |Best Value So Far |Hyperparameter
2                 |2                 |num_projection_layers
192               |64                |projection_dims
0.2               |0.3               |dropout_rate
1.3425e-05        |0.00016316        |learning_rate
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2
 111/1212 [=>............................] - ETA: 9:50 - loss: 0.6961 - accuracy: 0.4994