# In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.layers import Add
from tensorflow.keras.utils import to_categorical
import numpy as np

# Image encoder: an ImageNet-pretrained VGG16 with its classification head
# attached (other backbones such as ResNet or Inception can be swapped in).
image_model = VGG16(weights='imagenet', include_top=True)

# Take the activations of the second-to-last layer, i.e. the layer feeding the
# final classifier, as the image descriptor. The captioning model below
# declares a 4096-wide image input, so this layer is expected to emit 4096
# features per image.
penultimate_layer = image_model.layers[-2]
new_input, hidden_layer = image_model.input, penultimate_layer.output

# Sub-model mapping a preprocessed image batch to those descriptor vectors.
image_features_extract_model = Model(inputs=new_input, outputs=hidden_layer)

# Define the image preprocessing function
def preprocess_image(image_path):
    """Load one image from disk and prepare it as a VGG16 input batch.

    Args:
        image_path: Location of the image file to read.

    Returns:
        A batch containing the single image, resized to 224x224 and passed
        through VGG16's ``preprocess_input`` (shape ``(1, 224, 224, 3)``
        with ``load_img``'s default RGB colour mode).
    """
    image_utils = tf.keras.preprocessing.image
    pixels = image_utils.img_to_array(
        image_utils.load_img(image_path, target_size=(224, 224))
    )
    # The network consumes batches, so add a leading batch axis of size 1.
    batch = pixels[np.newaxis, ...]
    return preprocess_input(batch)

# Define the maximum caption length and vocabulary size.
# max_caption_length bounds the number of decoding steps performed when a
# caption is generated for a new image.
max_caption_length = 20
# vocab_size is handed to the Tokenizer as num_words and also sizes the
# Embedding input and the softmax output of the captioning model.
vocab_size = 10000

# Load your dataset with image paths and corresponding captions.
# TODO: `dataset` is not defined in the code shown here; it must be an
# iterable of (image_path, caption) pairs before the loop below runs.

# Extract image features and preprocess captions
image_features = {}        # image_path -> feature vector from the extractor
captions = []              # caption text, one entry per dataset pair
caption_image_paths = []   # image path of each caption, aligned with `captions`

# Loop over your dataset
for image_path, caption in dataset:
    # An image may come with several captions; run the CNN once per path.
    if image_path not in image_features:
        img = preprocess_image(image_path)
        features = image_features_extract_model.predict(img, verbose=0)
        image_features[image_path] = features[0]
    captions.append(caption)
    caption_image_paths.append(image_path)

if not captions:
    raise ValueError("dataset yielded no (image_path, caption) pairs")

# Tokenize captions
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(captions)
# Unpadded token ids, one list per caption; used to build training pairs so
# that padding never becomes a prediction target.
token_sequences = tokenizer.texts_to_sequences(captions)
max_sequence_length = max(len(seq) for seq in token_sequences)
# Padded copy kept under the original name (0 is the padding id).
sequences = pad_sequences(token_sequences, maxlen=max_sequence_length, padding='post')

# Prepare the data for training.
# Every caption of k tokens yields k-1 samples: (image, first i tokens) -> token i.
# The three lists are filled in lockstep so row j of each refers to the same sample.
X_image = []
X = []
y = []

for image_path, token_ids in zip(caption_image_paths, token_sequences):
    caption_features = image_features[image_path]
    for i in range(1, len(token_ids)):
        X_image.append(caption_features)
        X.append(token_ids[:i])
        y.append(token_ids[i])

if not X:
    raise ValueError("every caption has fewer than two tokens; no training pairs")

# NOTE: X_image repeats the feature vector once per sample, so memory grows
# with the total number of caption tokens.
X_image = np.array(X_image)
# Prefixes have different lengths; pad them to the fixed width the model's
# caption input expects (same 'post' padding as used at generation time).
X = pad_sequences(X, maxlen=max_sequence_length, padding='post')
# Targets stay as integer class ids; paired with the sparse loss below this
# avoids materialising a (num_samples, vocab_size) one-hot matrix.
y = np.array(y)

# Build the caption generation model using LSTM
input_image_features = Input(shape=(X_image.shape[1],))
image_embedding = Dense(256, activation="relu")(input_image_features)
input_caption = Input(shape=(max_sequence_length,))
caption_embedding = Embedding(input_dim=vocab_size, output_dim=256)(input_caption)
# The image embedding is rank 2 and the caption embedding rank 3; this relies
# on Keras' merge layers broadcasting the image vector across time steps.
# NOTE(review): verify on your Keras version. Padding steps are not masked,
# so the LSTM also consumes them; consider mask_zero=True on the Embedding.
decoder_input = Add()([image_embedding, caption_embedding])
lstm = LSTM(256)(decoder_input)
output = Dense(vocab_size, activation="softmax")(lstm)

captioning_model = Model(inputs=[input_image_features, input_caption], outputs=output)

# Compile the model (sparse loss: targets are integer word ids)
captioning_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Train the model: image features and caption prefixes in, next word id out
captioning_model.fit([X_image, X], y, batch_size=32, epochs=10)

# To generate captions for a new image, you can use the trained model.
# Greedy decoding: repeatedly feed the caption so far and append the most
# probable next word, for at most max_caption_length steps.
new_image = preprocess_image("path_to_new_image.jpg")
new_features = image_features_extract_model.predict(new_image)
# The tokenizer's OOV token doubles as the seed and as the stop signal.
# NOTE(review): no dedicated start/end markers are visible in this file;
# confirm whether the training captions carry them.
input_caption = ["<OOV>"]
for _ in range(max_caption_length):
    # Encode the whole caption so far as ONE sequence so the batch size stays
    # 1 and matches new_features. Every word here came from the tokenizer's
    # own vocabulary, so the lookup cannot miss.
    token_ids = [tokenizer.word_index[word] for word in input_caption]
    # pad_sequences keeps the most recent tokens if the caption outgrows
    # max_sequence_length (default truncating='pre').
    sequence = pad_sequences([token_ids], maxlen=max_sequence_length, padding='post')
    probabilities = captioning_model.predict([new_features, sequence], verbose=0)
    next_word_index = int(np.argmax(probabilities[0]))
    # Index 0 is the padding id and has no word; .get() returns None for it
    # instead of raising KeyError.
    next_word = tokenizer.index_word.get(next_word_index)
    if next_word is None or next_word == "<OOV>":
        break
    input_caption.append(next_word)

# Drop the seed token before printing.
caption = " ".join(input_caption[1:])
print("Generated Caption:", caption)
