In [64]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [65]:

json_file_path = 'C:\\Rohit\\Projects\\Image Data Set\\Flickr30k\\annotations.json'
image_directory = 'C:\\Rohit\\Projects\\Image Data Set\\Flickr30k\\images'


with open(json_file_path, 'r') as f:
    data = json.load(f)

print(f"Total Images : {len(data.items())}")

for image_name, captions in data.items():
    print(f"Image: {image_name}")
    print(f"Captions: {captions['comments']}")
    break


Total Images : 31764
Image: 1000092795.jpg
Captions: ['Two young guys with shaggy hair look at their hands while hanging out in the yard .', 'Two young  White males are outside near many bushes .', 'Two men in green shirts are standing in a yard .', 'A man in a blue shirt standing in a garden .', 'Two friends enjoy time spent together .']


In [66]:
def preprocess_image(image_path, target_size=(299, 299)):
    img = Image.open(image_path).convert('RGB')
    img = img.resize(target_size)
    img_array = np.array(img) / 255.0 
    return img_array


In [67]:

all_captions = []
for captions in data.values():
    all_captions.extend(captions['comments'])


tokenizer = Tokenizer(oov_token='<UNK>', lower=True)
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

captions_sequences = {}
for image_name, captions in data.items():
    captions_sequences[image_name] = tokenizer.texts_to_sequences(captions['comments'])


max_caption_length = 100  # Initially was set as 20 changed to 100

for image_name in captions_sequences:
    captions_sequences[image_name] = pad_sequences(captions_sequences[image_name], maxlen=max_caption_length, padding='post')


In [68]:

image_files = list(data.keys())

train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

print(f"Training images: {len(train_images)}, Validation images: {len(val_images)}")


Training images: 25411, Validation images: 6353


In [69]:
import os
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf

def data_generator(image_files, captions_sequences, image_directory, batch_size=32, max_caption_length=20):
    while True:
        for i in range(0, len(image_files), batch_size):
            batch_images = image_files[i:i + batch_size]
            images, captions_input, captions_target = [], [], []
            
            for image_name in batch_images:
                # Load and preprocess the image
                image_path = os.path.join(image_directory, image_name)
                image = preprocess_image(image_path)  # Your image preprocessing function
                images.append(image)
                
                # Randomly select a caption for this image
                caption_seq = captions_sequences[image_name]  # List of possible captions for the image
                random_caption = caption_seq[np.random.randint(0, len(caption_seq))]  # Random caption
                
                # Prepare input (caption excluding the last word) and target (caption shifted by one word)
                caption_input = random_caption[:-1]  # Input to the model (all words except the last one)
                caption_target = random_caption[1:]  # Target output (shifted by one word)
                
                captions_input.append(caption_input)
                captions_target.append(caption_target)
            
            # Pad captions to ensure consistent sequence length
            captions_input = pad_sequences(captions_input, maxlen=max_caption_length, padding='post')
            captions_target = pad_sequences(captions_target, maxlen=max_caption_length, padding='post')

            # Convert to TensorFlow tensors and yield the batch
            yield [tf.convert_to_tensor(np.array(images), dtype=tf.float32), 
                   tf.convert_to_tensor(np.array(captions_input), dtype=tf.int32)], \
                  tf.convert_to_tensor(np.array(captions_target), dtype=tf.int32)

train_generator = data_generator(train_images, captions_sequences, image_directory, batch_size=4)
train_batch = next(train_generator)
print(f"Image batch shape: {train_batch[0][0].shape}, Caption input batch shape: {train_batch[0][1].shape}, Caption target batch shape: {train_batch[1].shape}")


Image batch shape: (4, 299, 299, 3), Caption input batch shape: (4, 20), Caption target batch shape: (4, 20)


In [70]:
def create_model(vocab_size, max_caption_length):
    # Image input
    image_input = Input(shape=(299, 299, 3))
    image_flatten = Flatten()(image_input)

    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    embedding_layer = Embedding(vocab_size, 256)(caption_input)
    lstm_layer = LSTM(256)(embedding_layer)

    # Combine both inputs
    combined = Dense(512, activation='relu')(image_flatten)
    combined = Dropout(0.5)(combined)
    combined = Dense(512, activation='relu')(combined)

    # Output layer
    output = Dense(vocab_size, activation='softmax')(combined)

    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy')
    return model

# Create the model
model = create_model(vocab_size, max_caption_length)
model.summary()


In [71]:
def create_model(vocab_size, max_caption_length):
    # Image input
    image_input = Input(shape=(299, 299, 3))
    image_flatten = Flatten()(image_input)

    # Caption input
    caption_input = Input(shape=(max_caption_length,))
    embedding_layer = Embedding(vocab_size, 256)(caption_input)
    lstm_layer = LSTM(256)(embedding_layer)

    # Combine both inputs
    combined = Dense(512, activation='relu')(image_flatten)
    combined = Dropout(0.5)(combined)
    combined = Dense(512, activation='relu')(combined)

    # Output layer
    output = Dense(vocab_size, activation='softmax')(combined)

    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy')
    return model

# Create the model
model = create_model(vocab_size, max_caption_length)
model.summary()


In [72]:
# Define batch size and number of epochs
batch_size = 32
epochs = 10
steps_per_epoch = len(train_images) // batch_size

# Train the model
#train_generator = data_generator(train_images, captions_sequences, image_directory, batch_size=4)
#train_batch = next(train_generator)

history = model.fi  t(train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    validation_data=data_generator(val_images,captions_sequences,image_directory, batch_size),
                    validation_steps=len(val_images) // batch_size)


TypeError: `output_signature` must contain objects that are subclass of `tf.TypeSpec` but found <class 'list'> which is not.

In [None]:
# Save the trained model
model.save('image_captioning_model.h5')
print("Model Created Succesfully")