In [10]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


In [11]:

json_file_path = 'C:\\Rohit\\Projects\\Image Data Set\\Flickr30k\\annotations.json'
image_directory = 'C:\\Rohit\\Projects\\Image Data Set\\Flickr30k\\images'


with open(json_file_path, 'r') as f:
    data = json.load(f)

print(f"Total Images : {len(data.items())}")

for image_name, captions in data.items():
    print(f"Image: {image_name}")
    print(f"Captions: {captions['comments']}")
    break


Total Images : 31764
Image: 1000092795.jpg
Captions: ['Two young guys with shaggy hair look at their hands while hanging out in the yard .', 'Two young  White males are outside near many bushes .', 'Two men in green shirts are standing in a yard .', 'A man in a blue shirt standing in a garden .', 'Two friends enjoy time spent together .']


In [12]:
def preprocess_image(image_path, target_size=(299, 299)):
    img = Image.open(image_path).convert('RGB')
    img = img.resize(target_size)
    img_array = np.array(img) / 255.0 
    return img_array


In [13]:

all_captions = []
for captions in data.values():
    all_captions.extend(captions['comments'])


tokenizer = Tokenizer(oov_token='<UNK>', lower=True)
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

captions_sequences = {}
for image_name, captions in data.items():
    captions_sequences[image_name] = tokenizer.texts_to_sequences(captions['comments'])


max_caption_length = 20  # Initially was set as 20 changed to 100

for image_name in captions_sequences:
    captions_sequences[image_name] = pad_sequences(captions_sequences[image_name], maxlen=max_caption_length, padding='post')


In [14]:

image_files = list(data.keys())

train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

print(f"Training images: {len(train_images)}, Validation images: {len(val_images)}")


Training images: 25411, Validation images: 6353


In [15]:
from tensorflow.keras.utils import to_categorical

def data_generator(image_files, captions_sequences, image_directory, batch_size=32, max_caption_length=20):
    while True:
        for i in range(0, len(image_files), batch_size):
            batch_images = image_files[i:i + batch_size]
            images, captions_input, captions_target = [], [], []
            
            for image_name in batch_images:
                # Load and preprocess the image
                image_path = os.path.join(image_directory, image_name)
                image = preprocess_image(image_path)  # Your image preprocessing function
                images.append(image)
                
                # Randomly select a caption for this image
                caption_seq = captions_sequences[image_name]  # List of possible captions for the image
                random_caption = caption_seq[np.random.randint(0, len(caption_seq))]  # Random caption
                
                # Prepare input (caption excluding the last word) and target (caption shifted by one word)
                caption_input = random_caption[:-1]  # Input to the model (all words except the last one)
                caption_target = random_caption[1:]  # Target output (shifted by one word)

                # Append the input and target captions
                captions_input.append(caption_input)
                captions_target.append(caption_target)
            
            # Pad captions to ensure consistent sequence length to max_caption_length
            captions_input = pad_sequences(captions_input, maxlen=max_caption_length, padding='post')
            captions_target = pad_sequences(captions_target, maxlen=max_caption_length, padding='post')

            # Convert to categorical if needed
            captions_target = to_categorical(captions_target, num_classes=vocab_size)

            # Convert to numpy arrays and yield the batch
            yield (np.array(images), np.array(captions_input)), np.array(captions_target)


In [16]:
def create_tf_dataset(image_files, captions_sequences, image_directory, batch_size=32, max_caption_length=20):
    # Define the output signature for the dataset
    output_signature = (
        (tf.TensorSpec(shape=(None, 299, 299, 3), dtype=tf.float32),  # Images
         tf.TensorSpec(shape=(None, max_caption_length), dtype=tf.int32)),  # Input Captions
        tf.TensorSpec(shape=(None, max_caption_length), dtype=tf.int32)  # Target Captions
    )

    # Wrap the generator using tf.data.Dataset.from_generator
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(image_files, captions_sequences, image_directory, batch_size, max_caption_length),
        output_signature=output_signature
    )

    return dataset


In [17]:
# Import necessary libraries
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate
from tensorflow.keras.applications import InceptionV3  # Example CNN
from tensorflow.keras.optimizers import Adam

# Define the create_model function
def create_model(vocab_size, embedding_dim, max_caption_length):
    # Input for image features (using a CNN model like InceptionV3)
    image_input = Input(shape=(299, 299, 3))  # Example shape for preprocessed image
    cnn_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')  # Example CNN
    image_features = cnn_model(image_input)  # Extract features from the image

    # Input for captions
    caption_input = Input(shape=(max_caption_length,))  # Shape of input captions
    
    # Embedding layer for captions
    caption_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(caption_input)
    
    # LSTM layer for caption generation
    lstm_out = LSTM(256, return_sequences=True)(caption_embedding)
    
    # Concatenate image features with LSTM output
    lstm_out = Flatten()(lstm_out)  # Flatten LSTM output to match image features shape
    combined = Concatenate()([image_features, lstm_out])
    
    # Dense layer for output (one output per time step)
    output = Dense(vocab_size, activation='softmax')(combined)
    
    # Create the model
    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')  # Change to sparse_categorical_crossentropy
    return model

# Define parameters
vocab_size = 18317  # Adjust this based on your actual vocabulary size
embedding_dim = 256  # You can choose the embedding size as needed
max_caption_length = 20  # Set this to the maximum length of your captions

# Create the model
model = create_model(vocab_size, embedding_dim, max_caption_length)

# Display the model summary
model.summary()


ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[4691200,18317] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu [Op:StatelessRandomUniformV2] name: 

In [9]:
# Set batch size and number of epochs
batch_size = 4
epochs = 10
# Create the TensorFlow datasets for training and validation
train_dataset = create_tf_dataset(train_images, captions_sequences, image_directory, batch_size=batch_size)
val_dataset = create_tf_dataset(val_images, captions_sequences, image_directory, batch_size=batch_size)

# Train the model
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset
)


Epoch 1/10




ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 20), output.shape=(None, 18317)

In [None]:
# Save the trained model
model.save('image_captioning_model.h5')
print("Model Created Succesfully")