In [4]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np

# Load the ImageNet-pretrained ResNet50 without its classification head so it
# acts as a feature extractor.
# pooling='avg' applies global average pooling to the last convolutional
# feature map, so every image yields a flat 2048-dimensional vector. Without
# it, include_top=False returns a 4-D feature map (7x7x2048 per 224x224 image),
# which cannot be fed to the caption model's Input(shape=(2048,)).
cnn_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Turn an image file on disk into CNN features
def extract_image_features(image_path):
    """Run one image through ``cnn_model`` and return the resulting features.

    Args:
        image_path: Path of the image file to load.

    Returns:
        Whatever ``cnn_model.predict`` produces for a batch containing this
        single image.
    """
    # Load the file resized to 224x224.
    pil_image = image.load_img(image_path, target_size=(224, 224))
    # Convert to an array and prepend a batch axis so the model sees a batch of one.
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    # Apply the ResNet50 input preprocessing before predicting.
    return cnn_model.predict(preprocess_input(batch))


In [5]:
# Toy caption corpus used to build the vocabulary
captions = [
    "a cat is sitting",
    "a dog is running",
    "a bird is flying",
]

# Learn the word -> integer index from the corpus
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(captions)

# Encode every caption as a list of word indices
sequences = tokenizer.texts_to_sequences(captions)

# The longest caption sets the padded width shared by all sequences
max_caption_length = max(map(len, sequences))
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_caption_length,
)

# One extra slot on top of the number of distinct words
# (the Tokenizer never assigns index 0 to a word)
vocab_size = 1 + len(tokenizer.word_index)

In [6]:
# Define the caption model.
# NOTE: although the notebook refers to this as an "RNN", the network below is a
# plain feed-forward stack of Dense layers with no recurrent layers. It maps one
# 2048-dimensional image feature vector to a single softmax distribution over
# the vocabulary, i.e. one word per image rather than a word sequence.
caption_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2048,)),  # Image features
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(vocab_size, activation='softmax')  # Output layer with vocab size
])

# Compile the model
caption_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the model summary. summary() prints the table itself and returns None,
# so wrapping it in print() emitted a stray "None" line after the table.
caption_model.summary()

# One-hot encode the captions by hand with NumPy.
# Result axes: (caption, word position, vocabulary index). Positions past the
# end of a caption are left as all-zero rows.
# NOTE(review): these targets are 3-D; confirm they match the output shape of
# the model they are passed to before using them as training labels.
one_hot_shape = (len(sequences), max_caption_length, vocab_size)
one_hot_captions = np.zeros(one_hot_shape)
for caption_idx, token_ids in enumerate(sequences):
    for position, token_id in enumerate(token_ids):
        # Mark the vocabulary slot of the word found at this position
        one_hot_captions[caption_idx, position, token_id] = 1

None


In [7]:
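# Train the model (left commented out: `image_features` is not defined anywhere in this notebook).
# To enable it, supply an array of image features with shape (num_samples, 2048) and labels whose
# shape matches the model's (num_samples, vocab_size) output. The 3-D `one_hot_captions` array
# built earlier does not match that output shape as-is.
# caption_model.fit(image_features, one_hot_captions, epochs=10, batch_size=32)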

# Caption generation from image features (stub)
def generate_caption_rnn(image_features):
    """Return a caption for the given image features.

    This is a stand-in implementation: ``image_features`` is accepted but not
    used, and a fixed placeholder string is returned for every input.

    Args:
        image_features: Features extracted from an image (currently ignored).

    Returns:
        The constant string ``"Placeholder caption"``.
    """
    return "Placeholder caption"

In [8]:
# End-to-end captioning: image file in, caption string out
def generate_caption(image_path):
    """Produce a caption for the image stored at ``image_path``.

    Chains the two pipeline stages: ``extract_image_features`` turns the file
    into features, and ``generate_caption_rnn`` turns those into a caption.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The caption returned by ``generate_caption_rnn``.
    """
    return generate_caption_rnn(extract_image_features(image_path))

# Demo: caption a single image.
# Point image_path at a real image file before running this cell.
image_path = 'example_image.png'
predicted_caption = generate_caption(image_path)
print(f"Generated Caption: {predicted_caption}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Generated Caption: Placeholder caption
