<a href="https://colab.research.google.com/github/Sanmuga/Image-caption-generator/blob/main/Image_caption_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tensorflow

In [None]:
pip install numpy pandas

In [None]:
pip install Pillow

In [None]:
# Nothing to install here: `os` and `sys` are part of the Python standard
# library. The PyPI project named `os-sys` is an unrelated third-party package
# and is not imported anywhere in this notebook.

In [None]:
pip install numpy Pillow tensorflow tqdm

In [None]:
pip install zipfile36

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Image encoder: start from InceptionV3 with ImageNet weights, then rebuild it
# so that it stops at its second-to-last layer. predict() on the result returns
# that layer's activations, which are used as the image feature vector.
inception_model = InceptionV3(weights='imagenet')
inception_model = Model(
    inputs=inception_model.input,
    outputs=inception_model.layers[-2].output,
)

# Load image filenames and caption texts from input text files
def load_data_from_text_file(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened (for example FileNotFoundError).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        # Iterate the handle directly; readlines() would build a second,
        # throwaway list of the raw lines first.
        return [line.strip() for line in file]

# ---------------------------------------------------------------------------
# Load the training image list and the caption annotations
# ---------------------------------------------------------------------------
# Blank lines are dropped so they are never treated as image paths.
# NOTE(review): the entries are handed to load_img() unchanged, so relative
# names resolve against the current working directory -- confirm the image
# files actually live there.
image_filenames = [
    name
    for name in load_data_from_text_file('/content/Flickr_8k.trainImages.txt')
    if name
]
caption_lines = load_data_from_text_file('/content/Flickr8k.token.txt')

# Annotation lines are assumed to follow the Flickr8k token layout
# "<image name>#<caption number><TAB><caption text>" -- TODO confirm against
# the actual file. Only the text after the tab is caption material; the part
# before '#' identifies the image, which is what ties a caption to its
# feature vector.
training_images = set(image_filenames)
captions = []             # caption text wrapped in 'start' ... 'end'
caption_image_names = []  # image described by each caption (parallel list)
for line in caption_lines:
    image_id, separator, text = line.partition('\t')
    if not separator:
        continue  # blank or malformed line: no tab, hence no caption text
    image_name = image_id.split('#', 1)[0]
    if image_name not in training_images:
        continue  # caption belongs to an image outside the training list
    # generate_caption() seeds decoding with 'start' and stops on 'end', so
    # the model has to see both markers during training.
    captions.append('start ' + text + ' end')
    caption_image_names.append(image_name)

if not captions:
    raise ValueError(
        'No caption in Flickr8k.token.txt matches an image listed in '
        'Flickr_8k.trainImages.txt'
    )

# ---------------------------------------------------------------------------
# Tokenize caption texts
# ---------------------------------------------------------------------------
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding id

sequences = tokenizer.texts_to_sequences(captions)
max_seq_length = max(len(seq) for seq in sequences)
# Whole captions, right-padded. Not used for training below (training works on
# caption prefixes); kept so the padded captions remain available by this name.
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')


# ---------------------------------------------------------------------------
# Prepare image features
# ---------------------------------------------------------------------------
def preprocess_image(image_path):
    """Load an image file and turn it into a one-image InceptionV3 input batch.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 299x299, converted to an array, passed through
        InceptionV3's preprocess_input, with a leading batch axis added.
    """
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(299, 299))
    img = tf.keras.preprocessing.image.img_to_array(img)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    img = np.expand_dims(img, axis=0)
    return img


# One feature row per training image. Concatenating the per-image predictions
# along the batch axis keeps the result two-dimensional even when there is a
# single image (a bare squeeze() would drop the batch axis in that case).
image_features = np.concatenate(
    [
        inception_model.predict(preprocess_image(image_filename), verbose=0)
        for image_filename in image_filenames
    ],
    axis=0,
)
# Row of `image_features` that belongs to each image name.
feature_row = {name: row for row, name in enumerate(image_filenames)}

# ---------------------------------------------------------------------------
# Create captioning model
# ---------------------------------------------------------------------------
input_image_features = tf.keras.layers.Input(shape=(2048,))
input_caption = tf.keras.layers.Input(shape=(max_seq_length,))

image_features_dropout = tf.keras.layers.Dropout(0.5)(input_image_features)
image_features_embedding = tf.keras.layers.Dense(256, activation='relu')(image_features_dropout)

caption_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=256)(input_caption)
caption_lstm = tf.keras.layers.LSTM(256)(caption_embedding)

decoder_input = tf.keras.layers.add([image_features_embedding, caption_lstm])
output = tf.keras.layers.Dense(vocab_size, activation='softmax')(decoder_input)

captioning_model = Model(inputs=[input_image_features, input_caption], outputs=output)
# The model emits one distribution over the vocabulary per sample, so each
# target is a single word id. The sparse loss takes those ids directly, which
# avoids building a one-hot array of size samples x vocab_size.
captioning_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

# ---------------------------------------------------------------------------
# Prepare training samples: (image, caption prefix) -> next word
# ---------------------------------------------------------------------------
# A caption with k tokens yields k - 1 samples: every proper prefix is an
# input and the token that follows it is the target. This is exactly the
# question generate_caption() asks the model at each decoding step.
sample_rows = []      # row of `image_features` for each sample
sample_prefixes = []  # caption prefix (list of word ids) for each sample
sample_targets = []   # id of the word that follows the prefix
for image_name, seq in zip(caption_image_names, sequences):
    row = feature_row[image_name]
    for split in range(1, len(seq)):
        sample_rows.append(row)
        sample_prefixes.append(seq[:split])
        sample_targets.append(seq[split])

sample_rows = np.array(sample_rows)
# Default (left) padding, the same call generate_caption() makes at inference.
sample_prefixes = pad_sequences(sample_prefixes, maxlen=max_seq_length)
sample_targets = np.array(sample_targets)


def training_batches(batch_size):
    """Yield shuffled ((image features, caption prefix), next word) batches forever.

    Image features are looked up per batch instead of being repeated for every
    sample up front, which would need samples x 2048 floats of memory.

    Args:
        batch_size: Maximum number of samples per yielded batch.
    """
    sample_count = len(sample_targets)
    while True:
        order = np.random.permutation(sample_count)  # fresh shuffle every pass
        for begin in range(0, sample_count, batch_size):
            picked = order[begin:begin + batch_size]
            yield (image_features[sample_rows[picked]], sample_prefixes[picked]), sample_targets[picked]


# ---------------------------------------------------------------------------
# Train the model
# ---------------------------------------------------------------------------
batch_size = 32
# The generator never ends, so tell fit() how many batches make up one pass.
steps_per_epoch = int(np.ceil(len(sample_targets) / batch_size))
captioning_model.fit(training_batches(batch_size), steps_per_epoch=steps_per_epoch, epochs=10)

# Generate captions for new images
def generate_caption(image_filename, captioning_model, tokenizer):
    """Greedily decode a caption for one image.

    Relies on the module-level `inception_model`, `preprocess_image` and
    `max_seq_length`.

    Args:
        image_filename: Path of the image to caption.
        captioning_model: Model that takes [image features, padded word-id
            sequence] and returns scores over the vocabulary.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and
            `index_word`.

    Returns:
        The caption text. It begins with the seed word 'start'; the 'end'
        marker is not included.
    """
    # Encode the image once; the same (1, n_features) vector is fed to the
    # decoder at every step.
    image_feature = inception_model.predict(
        preprocess_image(image_filename), verbose=0
    ).reshape(1, -1)

    initial_caption = 'start'
    for _ in range(max_seq_length):
        sequence = tokenizer.texts_to_sequences([initial_caption])[0]
        sequence = pad_sequences([sequence], maxlen=max_seq_length)
        scores = captioning_model.predict([image_feature, sequence], verbose=0)
        predicted_word_index = int(np.argmax(scores))
        predicted_word = tokenizer.index_word.get(predicted_word_index)
        # No word for this id (e.g. 0, the padding id): stop decoding. Appending
        # a placeholder would not help, because the tokenizer would discard it
        # again and the model would receive the identical input on every
        # remaining step.
        if predicted_word is None or predicted_word == 'end':
            break
        initial_caption += ' ' + predicted_word
    return initial_caption

# Example usage: caption a single image with the trained model and print it.
new_image_filename = '/content/99171998_7cc800ceef.jpg'
generated_caption = generate_caption(
    image_filename=new_image_filename,
    captioning_model=captioning_model,
    tokenizer=tokenizer,
)
print("Generated Caption:", generated_caption)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


FileNotFoundError: ignored