In [None]:
import os

import numpy as np
import tensorflow as tf
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Parameters
# vocab_size is used below as the one-hot width (to_categorical num_classes),
# the Embedding input size and the softmax output width, so it must be larger
# than the highest token index the tokenizer produces.
vocab_size = 10000  # Adjust based on your dataset
# max_length is the padded length of every caption prefix, the width of the
# text Input layer, and the maximum number of generation steps.
max_length = 35     # Set to a reasonable max length for captions

def extract_image_features(image_folder, model):
    """Return a dict mapping each file name in *image_folder* to its feature vector.

    Every regular file in the folder is loaded at 224x224, converted to an
    array, run through the VGG16 input preprocessing and passed to
    ``model.predict``. The flattened prediction is stored under the bare
    file name (not the full path).

    Args:
        image_folder: Directory that contains the image files.
        model: Object exposing a Keras-style ``predict(batch, verbose=0)``.

    Returns:
        ``{file name: 1-D numpy array}``; empty when the folder has no files.
    """
    features = {}
    # Sorted so the processing order does not depend on the filesystem.
    for img_name in sorted(os.listdir(image_folder)):
        img_path = os.path.join(image_folder, img_name)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as images.
        if not os.path.isfile(img_path):
            continue
        image = img_to_array(load_img(img_path, target_size=(224, 224)))
        image = np.expand_dims(image, axis=0)
        # Same preprocessing as extract_single_image_features, so the features
        # seen during training match the ones computed at inference time.
        image = tf.keras.applications.vgg16.preprocess_input(image)
        features[img_name] = model.predict(image, verbose=0).flatten()
    return features

# Load the pre-trained VGG16 model without top layers
# include_top=False drops the classifier head and pooling="avg" applies global
# average pooling, so each image yields a single flat vector; define_model
# below expects 512 values per image.
cnn_model = VGG16(weights="imagenet", include_top=False, pooling="avg")

# Extract image features
# Runs at import/cell-execution time over every entry in the folder; the
# resulting dict is keyed by file name and looked up with captions_dict keys.
image_folder = r"img" # Replace with the path to your image folder
image_features = extract_image_features(image_folder, cnn_model)

# Example captions dataset (load from file or define manually)
# Keys are image file names (matched against the keys of image_features);
# each value is the list of reference captions for that image.
captions_dict = {
    "img1.jpeg": ["A dog running in the field.", "A dog is playing outside."],
    "img2.jpeg": ["A girl on a swing.", "A child enjoying a swing in the park."],
    # Add more image-caption pairs here...
}

# Prepare captions data
# NOTE(review): captions_list is a list of lists (one inner list per image).
# Per the Keras Tokenizer documentation, when `texts` contains lists each list
# entry is treated as one token, so every whole caption string becomes a
# single vocabulary entry instead of being split into words. Flatten to a
# list of caption strings if word-level tokens are intended - TODO confirm.
# NOTE(review): generate_caption seeds decoding with 'startseq' and stops on
# 'endseq', but neither marker appears in these captions, so the tokenizer
# never learns them - confirm whether captions should be wrapped with them.
captions_list = list(captions_dict.values())
# Tokenize captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions_list)

# Prepare training sequences
def create_sequences(tokenizer, max_length, desc, photo_features, vocab_size):
    """Expand one description into next-token prediction training samples.

    The description is encoded with ``tokenizer.texts_to_sequences``. For every
    position ``i >= 1`` one sample is produced: the image features, the first
    ``i`` tokens padded to ``max_length``, and token ``i`` one-hot encoded over
    ``vocab_size`` classes.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Padded length of each input token sequence.
        desc: Description to encode (whatever ``texts_to_sequences`` accepts
            as a single entry).
        photo_features: Feature vector of the image the description belongs to.
        vocab_size: Number of classes for the one-hot targets.

    Returns:
        Tuple ``(X1, X2, y)`` of numpy arrays with one row per sample. When the
        encoded description has fewer than two tokens there are no samples and
        the arrays have zero rows but keep their column widths, so they can
        still be stacked with results from other descriptions.
    """
    seq = tokenizer.texts_to_sequences([desc])[0]
    feature_array = np.asarray(photo_features)

    if len(seq) < 2:
        # np.array([]) would have shape (0,), which np.vstack cannot combine
        # with 2-D batches; return correctly shaped empty batches instead.
        return (
            np.empty((0,) + feature_array.shape, dtype=feature_array.dtype),
            np.empty((0, max_length), dtype="int32"),
            np.empty((0, vocab_size), dtype="float32"),
        )

    X1, X2, y = [], [], []
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X1.append(photo_features)
        X2.append(in_seq)
        y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Prepare data for training
# For every captioned image that also has extracted features, build its
# (image features, padded prefix, one-hot next token) samples and collect them.
# `caption` is the whole value stored in captions_dict for the image, i.e. the
# list of its captions.
X1, X2, y = [], [], []
for img_name, caption in captions_dict.items():
    if img_name not in image_features:
        continue
    in_img, in_seq, out_word = create_sequences(
        tokenizer, max_length, caption, image_features[img_name], vocab_size
    )
    # An entry that encodes to fewer than two tokens yields no samples; its
    # empty arrays would make np.vstack fail, so leave them out.
    if len(out_word) == 0:
        continue
    X1.append(in_img)
    X2.append(in_seq)
    y.append(out_word)

if not y:
    # np.vstack([]) raises an opaque ValueError; say what actually went wrong.
    raise ValueError(
        "No training samples were produced: check that the keys of "
        "captions_dict match file names in the image folder and that the "
        "captions encode to at least two tokens."
    )

X1, X2, y = np.vstack(X1), np.vstack(X2), np.vstack(y)

# Model definition
def define_model(vocab_size, max_length, feature_dim=512):
    """Build and compile the merge-style captioning model.

    The image branch applies dropout and a 256-unit dense layer to the image
    feature vector. The text branch embeds the padded token sequence (index 0
    is masked), applies dropout and runs a 256-unit LSTM. Both 256-wide
    outputs are added, passed through a dense layer and a softmax over the
    vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the padded token sequences fed to the text branch.
        feature_dim: Width of the image feature vector. Defaults to 512, the
            value previously hard-coded, so existing callers are unaffected.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, token_sequence]``
        and trained with categorical cross-entropy and the Adam optimizer.
    """
    # Image feature branch
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    # Partial caption branch; mask_zero makes the LSTM ignore padding.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Both branches end in 256 units so they can be merged by addition.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    return model

# Initialize model
# Uses the module-level vocab_size / max_length so the network matches the
# shapes produced by create_sequences.
model = define_model(vocab_size, max_length)

# Train model
# Fits on the full stacked arrays for 20 epochs with no validation data;
# verbose=2 prints one summary line per epoch.
model.fit([X1, X2], y, epochs=20, verbose=2)

# Caption generation
def generate_caption(model, tokenizer, photo_features, max_length):
    """Greedily decode a caption for one image.

    Starting from the seed word 'startseq', the text so far is encoded and
    padded to ``max_length``, the model is asked for the next-token
    distribution, and the highest-scoring index is mapped back to a word.
    Decoding stops after ``max_length`` steps, when the index has no word in
    ``tokenizer.index_word``, or right after 'endseq' has been appended.

    Returns:
        The space-joined words, including the leading 'startseq' and, if it
        was produced, the trailing 'endseq'.
    """
    words = ['startseq']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo_features, padded], verbose=0)
        best_index = np.argmax(scores)
        next_word = tokenizer.index_word.get(best_index, None)
        if next_word is None:
            # The predicted index is not in the tokenizer's vocabulary.
            break
        words.append(next_word)
        if next_word == "endseq":
            break
    return " ".join(words)

# Test with a new image
def extract_single_image_features(img_path, model):
    """Return the flattened ``model`` prediction for the image at *img_path*.

    The image is loaded at 224x224, turned into a one-image batch and run
    through the VGG16 input preprocessing before prediction.
    """
    pixels = img_to_array(load_img(img_path, target_size=(224, 224)))
    batch = np.expand_dims(pixels, axis=0)
    batch = tf.keras.applications.vgg16.preprocess_input(batch)
    return model.predict(batch, verbose=0).flatten()
# Build the path with os.path.join so it also resolves on non-Windows systems.
test_image = os.path.join("img", "test.jpeg")  # Replace with the path to a test image
test_features = extract_single_image_features(test_image, cnn_model)
# generate_caption expects a batch, hence the reshape to a single row.
raw_caption = generate_caption(model, tokenizer, test_features.reshape(1, -1), max_length)

# Remove the 'startseq' / 'endseq' markers, then keep the first non-empty
# sentence. Indexing the '.'-split text at a fixed position would raise
# IndexError whenever the generated text contains no period.
caption_body = " ".join(
    word for word in raw_caption.split() if word not in ("startseq", "endseq")
)
sentences = [part.strip() for part in caption_body.split(".") if part.strip()]
generated_caption = sentences[0] if sentences else ""
print("Generated Caption:", generated_caption)

# Replace with actual reference captions for the test image
reference_captions = [
    ["a", "dog", "running", "in", "the", "field"],
    ["a", "dog", "is", "playing", "outside"]
]

# Lower-case so the hypothesis is compared in the same case as the references.
generated_caption_words = generated_caption.lower().split()
print(generated_caption_words)

if generated_caption_words:
    # Short captions often share no 3- or 4-grams with the references; without
    # smoothing NLTK warns and the score collapses towards zero.
    bleu_score = sentence_bleu(
        reference_captions,
        generated_caption_words,
        smoothing_function=SmoothingFunction().method1,
    )
else:
    # Nothing was generated, so there is nothing to score.
    bleu_score = 0.0
print("BLEU Score:", bleu_score)

Epoch 1/20
1/1 - 8s - loss: 10.7247 - 8s/epoch - 8s/step
Epoch 2/20
1/1 - 0s - loss: 8.2840 - 122ms/epoch - 122ms/step
Epoch 3/20
1/1 - 0s - loss: 5.4325 - 114ms/epoch - 114ms/step
Epoch 4/20
1/1 - 0s - loss: 3.4694 - 110ms/epoch - 110ms/step
Epoch 5/20
1/1 - 0s - loss: 0.9020 - 110ms/epoch - 110ms/step
Epoch 6/20
1/1 - 0s - loss: 0.1790 - 109ms/epoch - 109ms/step
Epoch 7/20
1/1 - 0s - loss: 0.1708 - 106ms/epoch - 106ms/step
Epoch 8/20
1/1 - 0s - loss: 0.0214 - 107ms/epoch - 107ms/step
Epoch 9/20
1/1 - 0s - loss: 0.0017 - 104ms/epoch - 104ms/step
Epoch 10/20
1/1 - 0s - loss: 0.0082 - 109ms/epoch - 109ms/step
Epoch 11/20
1/1 - 0s - loss: 0.0047 - 103ms/epoch - 103ms/step
Epoch 12/20
1/1 - 0s - loss: 0.0020 - 108ms/epoch - 108ms/step
Epoch 13/20
1/1 - 0s - loss: 0.1092 - 105ms/epoch - 105ms/step
Epoch 14/20
1/1 - 0s - loss: 0.0094 - 107ms/epoch - 107ms/step
Epoch 15/20
1/1 - 0s - loss: 0.0043 - 107ms/epoch - 107ms/step
Epoch 16/20
1/1 - 0s - loss: 0.0125 - 108ms/epoch - 108ms/step
Epoch 

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
