In [21]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def extract_image_features(image_folder, model):
    features = {}
    for img_name in os.listdir(image_folder):
        img_path = os.path.join(image_folder, img_name)
        image = load_img(img_path, target_size=(224, 224))
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        feature = model.predict(image, verbose=0)
        features[img_name] = feature.flatten()
    return features

# Load pre-trained VGG16 model
cnn_model = VGG16(weights="imagenet", include_top=False, pooling="avg")

# Extract features
image_folder = r"images"
image_features = extract_image_features(image_folder, cnn_model)


In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer

captions_dict = {
    "img1.jpg": ["A dog running in the field.", "A dog is playing outside."],
    "img2.jpg": ["A girl on a swing.", "A child enjoying a swing in the park."],
    # "img3.jpg": ["A man riding a bicycle on a city street.", "A cyclist passing through an urban area."],
    # "img4.jpg": ["A cat sitting on a wooden fence.", "A furry cat resting on a backyard fence."],
    # "img5.jpg": ["A group of people hiking in the mountains.", "Hikers enjoying a scenic mountain trail."],
    # "img6.jpg": ["A plane flying in a clear blue sky.", "An airplane soaring high above the clouds."],
    # "img7.jpg": ["A bowl of fresh fruit on a wooden table.", "An assortment of fruits arranged in a bowl."],
    # "img8.jpg": ["A boy playing with a soccer ball in the park.", "A child kicking a football outdoors."],
    # "img9.jpg": ["A beach with waves crashing on the shore.", "The ocean meeting the sandy beach under a bright sky."],
    # "img10.jpg": ["A couple enjoying a romantic dinner by candlelight.", "Two people dining with a cozy ambiance."],
    # "img11.jpg": ["A city skyline during sunset.", "The sun setting behind tall skyscrapers."],
    # "img12.jpg": ["A horse grazing in a grassy field.", "A brown horse eating grass in a meadow."],
}

# Tokenize captions
tokenizer = Tokenizer()
captions_list = [item for sublist in captions_dict.values() for item in sublist]
tokenizer.fit_on_texts(captions_list)
vocab_size = len(tokenizer.word_index) + 1


In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def create_sequences(tokenizer, max_length, desc, photo_features, vocab_size):
    X1, X2, y = [], [], []
    seq = tokenizer.texts_to_sequences([desc])[0]
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X1.append(photo_features)
        X2.append(in_seq)
        y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

max_length = 35
X1, X2, y = [], [], []
for img_name, captions in captions_dict.items():
    for caption in captions:
        if img_name in image_features:
            in_img, in_seq, out_word = create_sequences(
                tokenizer, max_length, caption, image_features[img_name], vocab_size
            )
            X1.append(in_img)
            X2.append(in_seq)
            y.append(out_word)

X1, X2, y = np.vstack(X1), np.vstack(X2), np.vstack(y)


In [24]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM, add

def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(512,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    return model

model = define_model(vocab_size, max_length)


In [25]:
model.fit([X1, X2], y, epochs=20, verbose=2)


Epoch 1/20
1/1 - 9s - loss: 11.3996 - 9s/epoch - 9s/step
Epoch 2/20
1/1 - 0s - loss: 8.1842 - 104ms/epoch - 104ms/step
Epoch 3/20
1/1 - 0s - loss: 7.0386 - 105ms/epoch - 105ms/step
Epoch 4/20
1/1 - 0s - loss: 8.0407 - 100ms/epoch - 100ms/step
Epoch 5/20
1/1 - 0s - loss: 6.4132 - 96ms/epoch - 96ms/step
Epoch 6/20
1/1 - 0s - loss: 8.6730 - 98ms/epoch - 98ms/step
Epoch 7/20
1/1 - 0s - loss: 6.3474 - 94ms/epoch - 94ms/step
Epoch 8/20
1/1 - 0s - loss: 8.6556 - 97ms/epoch - 97ms/step
Epoch 9/20
1/1 - 0s - loss: 6.5646 - 105ms/epoch - 105ms/step
Epoch 10/20
1/1 - 0s - loss: 5.9771 - 101ms/epoch - 101ms/step
Epoch 11/20
1/1 - 0s - loss: 4.9481 - 93ms/epoch - 93ms/step
Epoch 12/20
1/1 - 0s - loss: 5.5328 - 99ms/epoch - 99ms/step
Epoch 13/20
1/1 - 0s - loss: 5.5039 - 100ms/epoch - 100ms/step
Epoch 14/20
1/1 - 0s - loss: 6.5969 - 103ms/epoch - 103ms/step
Epoch 15/20
1/1 - 0s - loss: 4.8293 - 102ms/epoch - 102ms/step
Epoch 16/20
1/1 - 0s - loss: 6.0918 - 97ms/epoch - 97ms/step
Epoch 17/20
1/1 - 0s

<keras.src.callbacks.History at 0x12ac11b0d30>

In [26]:
def generate_caption(model, tokenizer, photo_features, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += " " + word
        if word == "endseq":
            break
    return in_text

In [27]:
def extract_single_image_features(img_path, model):
    image = load_img(img_path, target_size=(224, 224))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.vgg16.preprocess_input(image)
    feature = model.predict(image, verbose=0)
    return feature.flatten()

test_image = r"images\img1.jpg"
test_features = extract_single_image_features(test_image, cnn_model)
generated_caption = generate_caption(model, tokenizer, test_features.reshape(1, -1), max_length)
print("Generated Caption:", generated_caption)

Generated Caption: startseq running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running running


In [28]:
from nltk.translate.bleu_score import sentence_bleu

reference_captions = [
    ["a", "dog", "running", "in", "the", "field"],
    ["a", "dog", "is", "playing", "outside"]
]
generated_caption_words = generated_caption.split()[1:-1]  # Exclude 'startseq' and 'endseq'
bleu_score = sentence_bleu(reference_captions, generated_caption_words)
print("BLEU Score:", bleu_score)


BLEU Score: 0.41412387656655203


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
