In [1]:
import os

import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, Bidirectional, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

In [2]:
# Define directories
# Every location is derived from BASE_DIR so the whole project can be
# relocated by editing this single line.
BASE_DIR = r'C:\Users\RippleNova\img_VGG16'
IMAGES_DIR = os.path.join(BASE_DIR, 'dataset', 'ficker8k_images')   # input images
CAPTIONS_FILE = os.path.join(BASE_DIR, 'dataset', 'captions.txt')   # caption text file
WORKING_DIR = os.path.join(BASE_DIR, 'model')                       # where the trained model is saved

In [3]:
# VGG16 feature extraction
# Instantiate the full VGG16 network, then wrap it in a new Model that takes
# the same inputs but stops at the second-to-last layer. That truncated
# network is what the extraction loop below calls `model`.
vgg16_full = VGG16()
model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

In [6]:
# Extract features
# Push every file found in IMAGES_DIR through the truncated VGG16 and store
# the prediction under the file name up to its first dot.
features = {}
for img_name in os.listdir(IMAGES_DIR):
    # Load at the 224x224 size requested here and convert to a numeric array.
    pixels = img_to_array(load_img(os.path.join(IMAGES_DIR, img_name), target_size=(224, 224)))
    # Add a leading batch axis of size 1, then apply VGG16's own preprocessing.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    features[img_name.split('.')[0]] = model.predict(batch, verbose=0)

In [7]:
# Load and preprocess captions
# Read the entire captions file into memory as a single UTF-8 string; the
# next cell splits it into lines.
captions_file_path = CAPTIONS_FILE
with open(captions_file_path, mode='r', encoding='utf-8') as captions_fh:
    captions_doc = captions_fh.read()

In [8]:
# Map captions to image IDs
# Builds {image_id: [caption, ...]} from the raw captions text.
# NOTE(review): lines are split on '.', so everything after the first dot
# (e.g. a file extension and field separator, if lines begin with a file
# name) ends up inside the caption, and any later dots turn into spaces.
# Confirm this matches the layout of the captions file.
mapping = {}
for line in captions_doc.split('\n'):
    # Skip empty / one-character lines.
    if len(line) < 2:
        continue
    # Text before the first dot is the id; the remaining pieces form the caption.
    image_id, *caption_parts = line.split('.')
    caption = ' '.join(caption_parts)
    mapping.setdefault(image_id, []).append(caption)

In [9]:
# Cleaning captions
import re

def clean(mapping):
    """Normalise every caption stored in ``mapping``, in place.

    For each caption string the function lower-cases it, removes the code
    points U+0964, U+0951 and U+0966-U+096F (Devanagari danda, udatta sign
    and digits), removes ASCII digits, collapses whitespace runs to a single
    space, trims the ends, and finally wraps the text as
    ``'startseq <text> endseq'``.

    Args:
        mapping: dict whose values are lists of caption strings; the lists
            are overwritten element by element.

    Returns:
        None. The result is the mutated ``mapping``.
    """
    devanagari_marks = re.compile(u'[\u0964\u0951\u0966-\u096F]+')
    ascii_digits = re.compile(r'[0-9]')
    whitespace_runs = re.compile(r'\s+')

    for captions_list in mapping.values():
        for idx, raw in enumerate(captions_list):
            text = raw.lower()
            text = devanagari_marks.sub('', text)
            text = ascii_digits.sub('', text)
            text = whitespace_runs.sub(' ', text).strip()
            # The markers are added unconditionally, even to an empty caption.
            captions_list[idx] = 'startseq ' + text + ' endseq'

# clean() rewrites the caption lists inside `mapping` in place and always adds
# the 'startseq'/'endseq' markers, so running this cell a second time without
# rebuilding `mapping` wraps every caption twice.
clean(mapping)

In [10]:
# Tokenize captions
# Flatten the per-image caption lists into one corpus.
all_captions = [caption for key in mapping for caption in mapping[key]]

# Fit the word index on the whole corpus. One extra slot is added on top of
# the number of distinct words (Tokenizer indices start at 1, leaving 0 free
# for padding).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption, counted in whitespace-separated words.
max_length = max(map(lambda text: len(text.split()), all_captions))

In [11]:
# Split data into training and test sets
# The first 80% of image ids (in `mapping` insertion order, no shuffling)
# become the training set; the remainder is held out for testing.
image_ids = [*mapping]
split = int(0.80 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

In [14]:
# Embedding layer (train from scratch)
embedding_dim = 256

# Trainable lookup table: one `embedding_dim`-wide vector per vocabulary
# index, for input sequences of `max_length` tokens.
# NOTE(review): the model cell further down builds its own Embedding and does
# not reference `embedding_layer`; confirm whether this object is still needed.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_length,
    trainable=True,
)

In [34]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into one sample per predicted word: the tokens
    seen so far (padded to ``max_length``) form the text input, and the next
    token, one-hot encoded over ``vocab_size`` classes, is the target. The
    image feature vector is repeated for each of those samples.

    Requires ``pad_sequences`` and ``to_categorical`` to be imported at
    notebook level.

    Args:
        data_keys: sequence of image ids; iterated over again and again.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; element ``[0]`` of that
            array is used as the image vector.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: length every input token sequence is padded to.
        vocab_size: number of classes used for the one-hot target.
        batch_size: number of *images* (not samples) gathered per batch, so
            the number of rows in a batch varies with caption lengths.

    Yields:
        ``([X1, X2], y)`` where ``X1`` holds image vectors, ``X2`` the padded
        token prefixes and ``y`` the one-hot next-token targets, all as
        NumPy arrays with one row per sample.
    """
    X1, X2, y = [], [], []
    n = 0  # images accumulated into the current batch
    while True:
        for key in data_keys:
            n += 1
            # Same vector for every sample of this image: look it up once.
            image_vector = features[key][0]
            for caption in mapping[key]:
                seq = tokenizer.texts_to_sequences([caption])[0]
                # One (prefix -> next token) pair per position after the first.
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                    X1.append(image_vector)
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                # Start collecting the next batch from scratch.
                X1, X2, y = [], [], []
                n = 0

In [38]:
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM, Bidirectional, Attention, add, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Define model parameters
# NOTE(review): these literals overwrite the `max_length` and `vocab_size`
# that the tokenizer cell computed from the data. Presumably they are the
# values that cell produced - confirm, otherwise the padding length and the
# one-hot width used by data_generator will not describe the real corpus.
max_length = 32
vocab_size = 13990

# Encoder (Image features)
# 4096-wide image vector -> dropout -> 512-wide ReLU projection with an L2
# weight penalty.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(fe1)

# Decoder (Text features)
# Sequence of `max_length` token ids -> 256-wide embeddings (index 0 is
# masked as padding via mask_zero=True) -> dropout -> bidirectional LSTM that
# returns an output for every timestep (return_sequences=True).
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True, trainable=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = Bidirectional(LSTM(512, return_sequences=True))(se2)

# Attention layer
# The same tensor is passed as both list elements, i.e. the LSTM outputs
# attend over themselves.
attention = Attention()([se3, se3])

# Apply GlobalAveragePooling1D
# Collapses the timestep axis so a single vector per sample remains.
attention = GlobalAveragePooling1D()(attention)

# Match dimensions of attention to fe2
# Both branches must be 512 wide for the element-wise add below.
attention = Dense(512, activation='relu')(attention)

# Combine features
# Sum the image and text branches, pass through one more 512-unit ReLU layer
# and finish with a softmax over the `vocab_size` classes.
decoder1 = add([fe2, attention])
decoder2 = Dense(512, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE(review): `model` named the VGG16 feature extractor earlier in the
# notebook; after this assignment it refers to the captioning network, so the
# feature-extraction cell can no longer be re-run without rebuilding VGG16.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# Categorical cross-entropy pairs with the softmax output above.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=1e-5))



In [39]:
# summary() prints the layer table itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line.
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_21 (InputLayer)       [(None, 32)]                 0         []                            
                                                                                                  
 embedding_10 (Embedding)    (None, 32, 256)              3581440   ['input_21[0][0]']            
                                                                                                  
 dropout_19 (Dropout)        (None, 32, 256)              0         ['embedding_10[0][0]']        
                                                                                                  
 bidirectional_9 (Bidirecti  (None, 32, 1024)             3149824   ['dropout_19[0][0]']          
 onal)                                                                                      

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define early stopping
# Stop once the training loss has not improved for 3 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

# Training parameters
epochs = 30
batch_size = 32

# Calculate number of steps per epoch
# Integer division; never let it reach 0, which fit() cannot use.
steps = max(1, len(train) // batch_size)

# Training
# data_generator loops over `train` forever, so a single generator can feed
# every epoch.
generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)

# One fit() call spanning all epochs: EarlyStopping resets its patience
# counter and best-weights snapshot whenever a new fit() begins, so calling
# fit(epochs=1) inside a Python loop would keep it from ever triggering.
model.fit(generator, epochs=epochs, steps_per_epoch=steps, verbose=1, callbacks=[early_stopping])


Epoch 1/30
 35/202 [====>.........................] - ETA: 2:07:28 - loss: 18.4161

In [None]:
# Save model
# Create the output folder first so a long training run is not lost to a
# missing-directory error, and let os.path.join choose the separator instead
# of gluing '/' onto a Windows path by hand.
os.makedirs(WORKING_DIR, exist_ok=True)
model.save(os.path.join(WORKING_DIR, 'model_with_attention.h5'))

In [None]:
# Evaluate BLEU scores
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
smooth = SmoothingFunction().method4
actual, predicted = list(), list()

# For every held-out image collect the tokenised reference captions and the
# tokenised model prediction, in matching order.
# NOTE(review): predict_caption is not defined in the cells shown here; it
# must be defined before this cell runs, and is assumed to return a string.
for key in test:
    captions = mapping[key]
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    actual.append([caption.split() for caption in captions])
    predicted.append(y_pred.split())

# BLEU scores
# BLEU-n weights the 1..n-gram precisions equally, and the weights must sum
# to 1 - hence exact thirds for BLEU-3 rather than 0.33.
bleu_weights = (
    (1.0, 0, 0, 0),
    (0.5, 0.5, 0, 0),
    (1 / 3, 1 / 3, 1 / 3, 0),
    (0.25, 0.25, 0.25, 0.25),
)
for order, weights in enumerate(bleu_weights, start=1):
    score = corpus_bleu(actual, predicted, weights=weights, smoothing_function=smooth)
    print("BLEU-%d: %f" % (order, score))