<a href="https://colab.research.google.com/github/Priya-dhar/CODESOFT--Image-Captioning-AI/blob/main/Image-Captioning-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Extract features from an image using ResNet50
# The extractor is built lazily on first use and then reused, so the ImageNet
# weights are not reloaded on every call.
_feature_extractor = None


def _get_feature_extractor():
    """Return the shared ResNet50 feature extractor, building it on first use.

    The extractor is ResNet50 with ImageNet weights whose output is taken
    from the second-to-last layer instead of the final classification layer.
    """
    global _feature_extractor
    if _feature_extractor is None:
        base_model = ResNet50(weights='imagenet')
        _feature_extractor = Model(
            inputs=base_model.input,
            outputs=base_model.layers[-2].output,
        )
    return _feature_extractor


def extract_image_features(image_path):
    """Load an image from disk and return its ResNet50 features.

    Args:
        image_path: Path of the image file. The image is resized to
            224x224 when it is loaded.

    Returns:
        The array returned by ``predict`` for a batch holding this single
        image. The captioning model in this file feeds it to an input of
        shape ``(2048,)``, so it is expected to be ``(1, 2048)``.
    """
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    # The network predicts on batches, so add a leading batch axis.
    img_array = np.expand_dims(img_array, axis=0)
    # Apply the ResNet50-specific input preprocessing.
    img_array = preprocess_input(img_array)

    return _get_feature_extractor().predict(img_array)

# Step 2: Dummy tokenizer (normally trained on a caption dataset)
tokenizer = Tokenizer()
# Hand-assigned toy vocabulary: ids run 1..7 in the order the words are listed.
tokenizer.word_index = {
    word: idx
    for idx, word in enumerate(
        ['startseq', 'a', 'dog', 'running', 'in', 'park', 'endseq'],
        start=1,
    )
}
# Length every caption prefix is padded to before it is fed to the model.
max_length = 6
# One more than the number of words, because word ids start at 1 and id 0 is
# left unassigned.
vocab_size = 1 + len(tokenizer.word_index)

# Step 3: Build the captioning model
def create_caption_model(vocab_size, max_length):
    """Build the two-input network that predicts the next caption word.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the (padded) caption-prefix input.

    Returns:
        An uncompiled, untrained ``tf.keras.Model`` taking
        ``[image_features, caption_prefix]`` and returning a probability
        distribution over ``vocab_size`` word ids.
    """
    layers = tf.keras.layers

    # Image branch: project the 2048-d feature vector down to 256 units.
    photo_in = tf.keras.Input(shape=(2048,))
    photo_vec = layers.Dense(256, activation='relu')(photo_in)

    # Text branch: embed the word ids (id 0 is masked) and summarise the
    # prefix with an LSTM.
    words_in = tf.keras.Input(shape=(max_length,))
    words_emb = layers.Embedding(vocab_size, 256, mask_zero=True)(words_in)
    words_vec = layers.LSTM(256)(words_emb)

    # Decoder: element-wise sum of both branches, then a hidden layer and a
    # softmax over the vocabulary.
    fused = layers.add([photo_vec, words_vec])
    fused = layers.Dense(256, activation='relu')(fused)
    probs = layers.Dense(vocab_size, activation='softmax')(fused)

    return tf.keras.Model(inputs=[photo_in, words_in], outputs=probs)

# Step 4: Generate a caption using greedy search
def generate_caption(model, photo, tokenizer, max_length):
    """Generate a caption for one image with greedy decoding.

    Starting from the ``startseq`` marker, the model is asked for the most
    probable next word, which is appended to the running text; this repeats
    for at most ``max_length`` steps.

    Args:
        model: Model whose ``predict([photo, sequence])`` returns scores over
            word ids.
        photo: Image features passed to the model as its first input.
        tokenizer: Object providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> id.
        max_length: Padded prefix length and maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces, without the
        ``startseq``/``endseq`` markers. Empty string if nothing was generated.
    """
    start_token, end_token = 'startseq', 'endseq'

    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {idx: w for w, idx in tokenizer.word_index.items()}

    text = start_token
    words = []
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)

        prediction = model.predict([photo, sequence], verbose=0)
        word = index_to_word.get(int(np.argmax(prediction)))

        # Stop at the end marker, or when the predicted id has no word
        # (for example id 0, which is not in the vocabulary).
        if not word or word == end_token:
            break

        # The model still sees every predicted word in its next input ...
        text += ' ' + word
        # ... but the start marker is never part of the returned caption.
        # Collecting words here, rather than str.replace on the final text,
        # avoids leftover double spaces and mangled words that merely
        # contain the marker as a substring.
        if word != start_token:
            words.append(word)

    return ' '.join(words)

# Step 5: Run everything together (You can modify the image path)
if __name__ == "__main__":
    # Demo: caption a single image end to end.
    image_path = "ima.webp"  # Replace with your own image
    # ResNet50 features for the image, used as the model's image input.
    features = extract_image_features(image_path)

    # NOTE(review): the model is built here and used straight away; no
    # training and no weight loading happens in between, so the caption comes
    # from freshly initialised weights. Train the model or load saved weights
    # before relying on the output.
    model = create_caption_model(vocab_size, max_length)
    caption = generate_caption(model, features, tokenizer, max_length)

    print("Generated Caption:", caption)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Generated Caption: park park park park park park
