# Image Captioning with CNN-LSTM

In [3]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [4]:
# Load and preprocess the pre-trained CNN model (InceptionV3)
def preprocess_image(image_path):
    image = tf.keras.utils.load_img(image_path, target_size=(299, 299))  # Resize image
    image = tf.keras.utils.img_to_array(image)  # Convert to array
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    image = tf.keras.applications.inception_v3.preprocess_input(image)  # Preprocess for InceptionV3
    return image

In [5]:
# Load InceptionV3 and extract features
inception_model = InceptionV3(weights="imagenet")
cnn_model = Model(inception_model.input, inception_model.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
[1m96112376/96112376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1us/step


In [6]:
# Function to extract image features
def extract_features(image_path, model):
    image = preprocess_image(image_path)
    features = model.predict(image)
    return features

In [7]:
# Define the LSTM-based language model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))  # CNN features
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    inputs2 = Input(shape=(max_length,))  # Text input
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

In [8]:
# Dummy Dataset for Text Captions
captions = {
    "image1.jpg": ["a dog running in a park", "a canine playing outdoors"],
    "image2.jpg": ["a cat sitting on a couch", "a feline relaxing indoors"],
}

In [9]:
# Tokenize captions
tokenizer = Tokenizer()
all_captions = [caption for captions_list in captions.values() for caption in captions_list]
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1


In [10]:
# Prepare data for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, descriptions, photo_features):
    X1, X2, y = [], [], []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = tf.keras.utils.to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(photo_features[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [14]:
# Example image features
photo_features = {
    "image1.jpg": extract_features("C:/Users/swaro/30 day challenge/Day21/image1.jpg", cnn_model),
    "image2.jpg": extract_features("C:/Users/swaro/30 day challenge/Day21/image2.jpg", cnn_model),
}

X1, X2, y = create_sequences(tokenizer, max_length, captions, photo_features)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step


In [15]:
# Define the model
model = define_model(vocab_size, max_length)
model.compile(optimizer="adam", loss="categorical_crossentropy")


In [16]:
# Train the model (dummy training for demonstration)
model.fit([X1, X2], y, epochs=10, batch_size=2)

Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - loss: 3.6764
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.8143
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 2.7490
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 2.6901
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 2.2940
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 2.3747
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.4419
Epoch 8/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 2.3096
Epoch 9/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 2.1294
Epoch 10/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 2.1165


<keras.src.callbacks.history.History at 0x2a75c34aa90>

In [17]:
# Caption Generation
def generate_caption(model, tokenizer, photo_feature, max_length):
    input_text = "startseq"
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([input_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_feature, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None or word == "endseq":
            break
        input_text += " " + word
    return input_text.split(" ")[1:]

In [21]:
# Test with an example
test_image_feature = extract_features("C:/Users/swaro/30 day challenge/Day21/test image.jpg", cnn_model)
caption = generate_caption(model, tokenizer, test_image_feature, max_length)
print("Generated Caption:", " ".join(caption))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step
Generated Caption: cat cat couch couch couch couch
