In [None]:
import os
import pickle
import re

import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# Dataset root: the cells below read the 'Images' folder and 'captions.txt' from here
BASE_DIR = '/kaggle/input/flickr8k'
# Output directory: the feature cache ('features.pkl') and the saved model are written here
WORKING_DIR = '/kaggle/working'

In [None]:
# Feature extractor: a stock VGG16 cut off at its second-to-last layer, so that
# predict() returns that layer's activations instead of the final layer's output.
vgg16_full = VGG16()
image_model = Model(
    inputs=vgg16_full.inputs,
    outputs=vgg16_full.layers[-2].output,
)

In [None]:
# Run every file in the Images folder through the truncated VGG16 and keep the
# result in a dict keyed by image ID (the file name up to its first '.').
image_features = {}
image_directory = os.path.join(BASE_DIR, 'Images')

for image_name in tqdm(os.listdir(image_directory)):
    # Load at the 224x224 size requested from load_img and convert to an array
    pixels = img_to_array(
        load_img(os.path.join(image_directory, image_name), target_size=(224, 224))
    )

    # Add a leading batch axis of size 1, then apply the VGG16 preprocessing
    batch = preprocess_input(np.expand_dims(pixels, axis=0))

    # The stored value keeps the batch axis; consumers index [0] to drop it
    image_features[image_name.partition('.')[0]] = image_model.predict(batch, verbose=0)

In [None]:
# Store image features in a pickle file.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way; a bare open() left the handle to the garbage collector.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(image_features, f)

In [None]:
# Reload the cached image features written by the previous cell
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'rb') as features_file:
    features = pickle.load(features_file)

In [None]:
# Read the captions file as one string, skipping its first (header) line.
# An explicit encoding keeps the read independent of the platform locale, and
# next(f, None) tolerates an empty file instead of raising StopIteration.
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r', encoding='utf-8') as f:
    next(f, None)
    captions_doc = f.read()

In [None]:
# Build {image ID: [caption, ...]} from the raw captions text.
# Each line is split on ','; the first field is the image file name and the
# remaining fields are re-joined with spaces (so commas inside a caption
# become spaces). Lines shorter than two characters are ignored.
image_captions = {}
for line in tqdm(captions_doc.split('\n')):
    if len(line) >= 2:
        file_name, *caption_parts = line.split(',')
        image_captions.setdefault(file_name.split('.')[0], []).append(" ".join(caption_parts))

In [None]:
# Number of distinct image IDs that have at least one caption
print(f"Total number of images in dataset: {len(image_captions)}")

In [None]:
def clean_captions(image_captions):
    """Normalise every caption in place and wrap it in sequence markers.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace, has runs of whitespace collapsed, loses its
    single-character words, and is finally wrapped as
    ``'startseq <words> endseq'``.

    Args:
        image_captions: dict mapping an image ID to a list of caption
            strings. The lists are modified in place.

    Returns:
        None.
    """
    for captions in image_captions.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            # str.replace() matches its argument literally, so regex-looking
            # patterns passed to it never fire; re.sub() applies them for real.
            # Whitespace is kept here so that words are not glued together.
            caption = re.sub(r'[^a-z\s]', '', caption)
            caption = re.sub(r'\s+', ' ', caption)
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption

In [None]:
# Captions of one sample image before cleaning
image_captions['1000268201_693b08cb0e']

In [None]:
# Clean all captions in place. NOTE: re-running this cell wraps the captions in
# startseq/endseq a second time; rebuild image_captions first if you need to re-run.
clean_captions(image_captions)

In [None]:
# The same sample image's captions after cleaning
image_captions['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into a single list of captions
all_captions = [
    caption
    for caption_list in image_captions.values()
    for caption in caption_list
]

In [None]:
# Total number of captions across all images
print(f"Total number of preprocessed captions: {len(all_captions)}")

In [None]:
# Fit the word -> integer index vocabulary on every caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words; presumably index 0 is left free
# for padding (the Embedding layer below uses mask_zero=True) — confirm against
# the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Length of the longest caption, measured in tokenizer tokens.
# The generator pads and truncates the output of tokenizer.texts_to_sequences(),
# so the limit has to be computed on those same sequences: the tokenizer can
# produce more tokens than str.split() produces words (it also splits on
# punctuation), and an under-estimated limit would silently truncate prefixes.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(all_captions))
max_length

In [None]:
# Split the image IDs 90% / 10% into train and test, in dictionary order (no shuffling)
image_ids = [*image_captions]
split = int(0.9 * len(image_ids))
train_ids, test_ids = image_ids[:split], image_ids[split:]

In [None]:
def data_generator(data_keys, image_captions, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches of (image, caption prefix) -> next word.

    Every caption of every key is tokenised and expanded into one sample per
    position: the tokens before the position form the input prefix and the
    token at the position is the target.

    Args:
        data_keys: iterable of image IDs to draw samples from.
        image_captions: dict mapping image ID to a list of caption strings.
        features: dict mapping image ID to a feature array; row ``[0]`` of
            that array is used as the image input.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length every prefix is right-padded / truncated to.
        vocab_size: number of classes for the one-hot target.
        batch_size: number of samples per yielded batch.

    Yields:
        ``((images, prefixes), targets)`` as float32 tensors, each with
        ``batch_size`` rows. Samples left over at the end of a pass are not
        dropped; they are carried into the first batch of the next pass.
    """
    def _as_float_tensor(rows):
        # Stack the collected rows and hand them to TensorFlow as float32
        return tf.convert_to_tensor(np.array(rows), dtype=tf.float32)

    image_rows, prefix_rows, target_rows = [], [], []

    while True:
        for key in data_keys:
            for token_ids in tokenizer.texts_to_sequences(image_captions[key]):
                for cut in range(1, len(token_ids)):
                    # Right-pad the prefix so that real tokens come first
                    prefix = pad_sequences(
                        [token_ids[:cut]],
                        maxlen=max_length,
                        padding='post',
                        truncating='post',
                    )[0]
                    target = to_categorical([token_ids[cut]], num_classes=vocab_size)[0]

                    image_rows.append(features[key][0])
                    prefix_rows.append(prefix)
                    target_rows.append(target)

                    if len(image_rows) != batch_size:
                        continue

                    yield (
                        (_as_float_tensor(image_rows), _as_float_tensor(prefix_rows)),
                        _as_float_tensor(target_rows),
                    )

                    image_rows, prefix_rows, target_rows = [], [], []

In [None]:
# Image encoding model: a 4096-value image feature vector -> dropout -> 256-unit dense projection
image_input = Input(shape=(4096,))
encoded_image = Dropout(0.4)(image_input)
encoded_image = Dense(256, activation='relu')(encoded_image)

# Caption encoding model: padded token IDs -> 256-dim embedding -> dropout -> LSTM summary vector.
# mask_zero=True makes the embedding emit a mask for ID 0, i.e. the padding value.
caption_input = Input(shape=(max_length,))
caption_embedding = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
caption_dropout = Dropout(0.4)(caption_embedding)
encoded_caption = LSTM(256)(caption_dropout)

# Decoder model: element-wise sum of the two 256-dim encodings, then a dense
# layer and a softmax over the vocabulary that scores the next word.
merged_features = add([encoded_image, encoded_caption])
decoder_dense = Dense(256, activation='relu')(merged_features)
output_layer = Dense(vocab_size, activation='softmax')(decoder_dense)

# Compile the model; categorical cross-entropy matches the one-hot targets
# produced by data_generator.
caption_generator = Model(inputs=[image_input, caption_input], outputs=output_layer)
caption_generator.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
epochs = 25
batch_size = 64

# data_generator emits one sample per (caption prefix -> next word) pair and
# yields a batch every `batch_size` SAMPLES, not every `batch_size` images.
# steps_per_epoch therefore has to be derived from the number of samples;
# deriving it from len(train_ids) makes each "epoch" cover only a small
# fraction of the training data.
train_samples = sum(
    max(len(seq) - 1, 0)
    for key in train_ids
    for seq in tokenizer.texts_to_sequences(image_captions[key])
)
if train_samples == 0:
    # With nothing to emit the generator would spin forever without yielding
    raise ValueError("No training samples: every training caption has fewer than two tokens")
steps = max(1, train_samples // batch_size)

# Shape of one image feature vector (the stored arrays carry a leading batch axis)
feature_shape = next(iter(image_features.values()))[0].shape

# Configure the dataset; the signature must match what data_generator yields
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_ids, image_captions, image_features, tokenizer, max_length, vocab_size, batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(batch_size, *feature_shape), dtype=tf.float32),
            tf.TensorSpec(shape=(batch_size, max_length), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(batch_size, vocab_size), dtype=tf.float32)
    )
)

# Switch the cuDNN kernel off on every LSTM layer before training
# (presumably to avoid cuDNN's restrictions on masked input — TODO confirm).
for layer in caption_generator.layers:
    if isinstance(layer, tf.keras.layers.LSTM):
        layer.use_cudnn = False

# The generator never ends, so steps_per_epoch defines where an epoch stops
caption_generator.fit(dataset, epochs=epochs, steps_per_epoch=steps, verbose=1)

In [None]:
# Save the trained model to the working directory. Despite the file name, this
# is simply the model after the last epoch; no checkpoint selection happens above.
caption_generator.save(os.path.join(WORKING_DIR, 'best_model.h5'))

In [None]:
def idx_to_word(index, tokenizer):
    """Return the word stored under ``index`` in the tokenizer, or None.

    Args:
        index: integer token ID (a NumPy integer works as well).
        tokenizer: object exposing a ``word_index`` dict (word -> ID) and,
            optionally, the reverse ``index_word`` dict (ID -> word).

    Returns:
        The matching word, or ``None`` when no word has that ID.
    """
    # Prefer the reverse map: one dict lookup instead of scanning the whole
    # vocabulary for every decoded token.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(index)
    # Fallback for tokenizer-like objects that only provide word_index
    return next((word for word, i in tokenizer.word_index.items() if i == index), None)

In [None]:
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the caption generated so far is tokenised,
    padded and fed to the model together with the image features; the
    highest-scoring word is appended. Decoding stops after ``max_length``
    steps, when ``'endseq'`` is predicted, or when the predicted index has no
    word.

    Args:
        model: model whose ``predict`` takes ``[image, sequence]``.
        image: image feature array (assumed to carry a leading batch axis of
            size 1, as stored by the feature-extraction cell — confirm).
        tokenizer: the tokenizer the model was trained with.
        max_length: padded sequence length the model expects.

    Returns:
        The caption string, beginning with ``'startseq'`` and without a
        trailing ``'endseq'``.
    """
    caption = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([caption])[0]
        # Pad on the right, exactly as the training generator does, so that
        # inference inputs have the same layout the model was trained on
        # (pad_sequences pads on the left by default).
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post', truncating='post')
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None or word == 'endseq':
            break
        caption += f" {word}"
    return caption

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Sentinel tokens added by clean_captions / predict_caption. They are not part
# of the caption text and must not take part in scoring: the references carry
# both of them while a prediction starts with 'startseq' and never contains
# 'endseq', which skews the n-gram matches and the brevity penalty.
SEQUENCE_MARKERS = {'startseq', 'endseq'}


def _strip_markers(caption):
    """Split a caption on whitespace and drop the startseq/endseq sentinels."""
    return [word for word in caption.split() if word not in SEQUENCE_MARKERS]


# Score greedy predictions for the held-out images against their reference captions
actual, predicted = [], []
for image_id in tqdm(test_ids):
    captions = image_captions[image_id]
    caption_prediction = predict_caption(caption_generator, image_features[image_id], tokenizer, max_length)
    actual.append([_strip_markers(caption) for caption in captions])
    predicted.append(_strip_markers(caption_prediction))

print(f"BLEU-1: {corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)):.4f}")
print(f"BLEU-2: {corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)):.4f}")

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

def generate_caption(image_name):
    """Print the reference and predicted captions of an image and show it.

    Args:
        image_name: file name of an image inside ``BASE_DIR/Images``; the part
            before its first '.' is used as the image ID.

    Returns:
        None. The captions are printed and the image is drawn with matplotlib.
    """
    image_id = image_name.partition('.')[0]
    image = Image.open(os.path.join(BASE_DIR, 'Images', image_name))

    print('---Actual-Caption---')
    for reference in image_captions[image_id]:
        print(reference)

    prediction = predict_caption(caption_generator, image_features[image_id], tokenizer, max_length)
    print('---Predicted-Caption---')
    print(prediction)

    plt.imshow(image)

In [None]:
# Qualitative check: reference vs. predicted captions for a sample image
generate_caption("1009434119_febe49276a.jpg")

In [None]:
# Qualitative check on a second sample image
generate_caption("111497985_38e9f88856.jpg")