In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
import numpy as np
from PIL import Image
import os

In [7]:
def load_image(image_path):
    img = Image.open(image_path)
    img = img.resize((224, 224))  # Resize image to a fixed size
    img = np.array(img) / 255.0   # Normalize the image
    print("Image Shape :",img.shape) 
    return img



# Example image path and captions
image_path = "C:\\Rohit\\Projects\\Image Describer\\sample_data\\36979.jpg"
captions = [
    "Several men play cards while around a green table .",
    "A group of people sitting around a table playing a card game .",
    "Men playing cards at a green table in a casino ."
]

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
caption_sequences = tokenizer.texts_to_sequences(captions)
max_caption_length = max([len(seq) for seq in caption_sequences])

caption_sequences = pad_sequences(caption_sequences, maxlen=max_caption_length, padding='post')

print(f"Max Caption Length: {max_caption_length}")
print(f"Vocab Size: {vocab_size}")



Max Caption Length: 12
Vocab Size: 20


In [9]:
# Create a model with VGG16 as the image feature extractor
def create_model(vocab_size, max_caption_length):
    # Input for the image
    image_input = Input(shape=(224, 224, 3))
    cnn = tf.keras.applications.VGG16(include_top=False, input_tensor=image_input)
    cnn.trainable = False
    features = cnn.output
    features = tf.keras.layers.Flatten()(features)
    features = Dense(256, activation='relu')(features)
    
    # Input for the caption
    caption_input = Input(shape=(max_caption_length,))
    x = Embedding(vocab_size, 256)(caption_input)
    x = LSTM(256, return_sequences=False)(x)

    # Combine image and caption
    combined = tf.keras.layers.add([features, x])
    combined = Dense(256, activation='relu')(combined)
    output = Dense(vocab_size, activation='softmax')(combined)

    # Define model
    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model


In [10]:
def data_generator(images, captions, batch_size):
    while True:
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i + batch_size]
            batch_captions = captions[i:i + batch_size]
            
            # For each image, we pair it with all captions
            images_out = []
            captions_out = []
            labels_out = []

            for img in batch_images:
                for caption in batch_captions:
                    images_out.append(img)
                    captions_out.append(caption)
                    # Create the one-hot encoded labels
                    label = np.zeros(vocab_size)
                    label[caption[-1]] = 1
                    labels_out.append(label)

            yield [np.array(images_out), np.array(captions_out)], np.array(labels_out)

# Create the model
model = create_model(vocab_size, max_caption_length)
model.summary()

In [11]:
image = load_image(image_path)
image_data = np.expand_dims(image, axis=0)  # Add batch dimension

# Use the same image for multiple captions in the batch
image_data_batch = np.repeat(image_data, len(captions), axis=0)

# Train the model using multiple captions for a single image
model.fit(data_generator(image_data_batch, caption_sequences, batch_size=2),
          steps_per_epoch=len(caption_sequences) // 2, epochs=10)


Image Shape : (224, 224, 3)


TypeError: `output_signature` must contain objects that are subclass of `tf.TypeSpec` but found <class 'list'> which is not.

In [None]:
# Save the model and tokenizer
model.save('model/image_captioning_model_multiple_captions.h5')
with open('model/tokenizer_multiple_captions.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)