# In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, add
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.translate.bleu_score import corpus_bleu
import cv2
import matplotlib.pyplot as plt

def preprocess_image(img_path):
    """Load the image at ``img_path`` as a one-image batch for InceptionV3.

    The image is loaded at 299x299, converted to an array, given a leading
    batch axis, and passed through InceptionV3's ``preprocess_input``.

    Args:
        img_path: path of the image file to load.

    Returns:
        The preprocessed array with a leading batch axis of size 1.
    """
    loaded = image.load_img(img_path, target_size=(299, 299))
    pixels = image.img_to_array(loaded)
    # Add the batch axis in front: (...) -> (1, ...).
    batch = pixels[np.newaxis, ...]
    return tf.keras.applications.inception_v3.preprocess_input(batch)

def extract_features(img_path, model):
    """Return ``model``'s prediction for the preprocessed image at ``img_path``.

    Args:
        img_path: path of the image file to encode.
        model: object exposing ``predict(batch, verbose=...)``.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    return model.predict(preprocess_image(img_path), verbose=0)

def load_dataset(df):
    """Split ``df`` into its ``image`` and ``caption`` columns.

    Args:
        df: DataFrame containing ``image`` and ``caption`` columns.

    Returns:
        A ``(images, captions)`` tuple holding each column's ``.values``.
    """
    image_col, caption_col = (df[name].values for name in ('image', 'caption'))
    return image_col, caption_col

def create_tokenizer(captions):
    """Build a ``Tokenizer`` and fit it on ``captions``.

    Args:
        captions: the texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)
    return fitted

def max_length(captions):
    """Return the word count of the longest caption.

    Words are whatever ``caption.split()`` yields (whitespace-separated).

    Args:
        captions: iterable of caption strings.

    Returns:
        The largest number of words found in any single caption.

    Raises:
        ValueError: if ``captions`` is empty.
    """
    word_counts = [len(text.split()) for text in captions]
    return max(word_counts)

def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One input takes a 2048-wide feature vector, the other a sequence of
    ``max_length`` token ids. The two branches are each reduced to 256 units,
    summed, and decoded into a softmax over ``vocab_size`` classes.

    Args:
        vocab_size: number of embedding rows and of output classes.
        max_length: length of the token-id input sequence.

    Returns:
        The Keras ``Model``, compiled with categorical cross-entropy and Adam.
    """
    # Image-feature branch: dropout, then a 256-unit projection.
    feature_input = tf.keras.layers.Input(shape=(2048,))
    feature_dropped = Dropout(0.5)(feature_input)
    feature_dense = Dense(256, activation='relu')(feature_dropped)

    # Text branch: embedding (id 0 masked), dropout, then an LSTM summary.
    token_input = tf.keras.layers.Input(shape=(max_length,))
    embedded = Embedding(vocab_size, 256, mask_zero=True)(token_input)
    embedded_dropped = Dropout(0.5)(embedded)
    text_summary = LSTM(256)(embedded_dropped)

    # Decoder: element-wise sum of both branches, then two dense layers.
    merged = add([feature_dense, text_summary])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[feature_input, token_input], outputs=word_probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    return captioner

def train_model(model, tokenizer, max_length, df, photos, epochs=20):
    """Train ``model`` to predict each caption's next word from its prefix.

    Every caption is tokenized and expanded into (prefix, next-word) pairs.
    All pairs are gathered into batched arrays and fitted in a single
    ``model.fit`` call, so each input carries a matching leading sample axis.

    Args:
        model: compiled model accepting ``[image_features, padded_prefix]``.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_length: length every prefix is padded to.
        df: DataFrame with ``image`` and ``caption`` columns.
        photos: mapping from a value of the ``image`` column to that image's
            feature array.
        epochs: number of passes over all training pairs.

    Returns:
        None. Nothing is fitted when the captions yield no training pairs.
    """
    # Loop-invariant: one class per known word plus the extra index 0.
    vocab_size = len(tokenizer.word_index) + 1

    feature_rows, prefix_rows, next_words = [], [], []
    for image_name, caption in zip(df['image'], df['caption']):
        # Flatten so a (1, N) feature array stacks into (samples, N) below.
        photo = np.ravel(photos[image_name])
        seq = tokenizer.texts_to_sequences([caption])[0]
        for j in range(1, len(seq)):
            feature_rows.append(photo)
            prefix_rows.append(seq[:j])
            next_words.append(seq[j])

    if not next_words:
        return

    image_batch = np.stack(feature_rows)
    prefix_batch = pad_sequences(prefix_rows, maxlen=max_length)
    target_batch = tf.keras.utils.to_categorical(next_words, num_classes=vocab_size)

    # batch_size=1 with shuffle=False keeps the one-update-per-pair, in-order
    # schedule of the per-sample loop this replaces.
    model.fit(
        [image_batch, prefix_batch],
        target_batch,
        epochs=epochs,
        batch_size=1,
        shuffle=False,
        verbose=0,
    )

def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for ``photo``, one word at a time.

    Decoding starts from ``'startseq'`` and repeatedly appends the word whose
    class has the highest predicted score. It stops after ``max_length``
    steps, when ``'endseq'`` is produced, or when the predicted id has no
    word in the tokenizer.

    Args:
        model: model whose ``predict([photo, sequence])`` returns class scores.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        photo: image feature array passed to the model as its first input.
        max_length: padded sequence length and maximum number of steps.

    Returns:
        The generated text, including the leading ``'startseq'`` and, when
        predicted, the trailing ``'endseq'``.
    """
    words = ['startseq']
    for _ in range(max_length):
        token_ids = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([token_ids], maxlen=max_length)
        scores = model.predict([photo, padded], verbose=0)
        word_id = int(np.argmax(scores))
        # The predicted id may have no entry in index_word (presumably id 0,
        # the padding slot); stop instead of raising KeyError.
        word = tokenizer.index_word.get(word_id)
        if word is None:
            break
        words.append(word)
        if word == 'endseq':
            break
    return ' '.join(words)

# Example Usage
def main():
    """Run the example end to end: featurize, train, and caption one image.

    Builds a four-row image/caption DataFrame, extracts features for each
    image with InceptionV3 (output of its second-to-last layer), trains the
    captioning model, and prints a generated caption for ``image1.jpg``.
    """
    # Load the dataset into a DataFrame
    data = {
        'image': ['dif.png', 'Designer (4).jpeg', 'Designer (5).jpeg','Designer (6).jpeg' ],
        'caption': ['a dog is running in a field', 'a dog is playing outside', 'a cat is sitting on a sofa', 'a cat is relaxing indoors']
    }
    df = pd.DataFrame(data)

    # generate_caption seeds decoding with 'startseq' and stops on 'endseq',
    # so both markers must be in the training captions and the vocabulary.
    df['caption'] = 'startseq ' + df['caption'] + ' endseq'

    # Load InceptionV3 model
    base_model = InceptionV3(weights='imagenet')
    model = Model(base_model.input, base_model.layers[-2].output)

    # Load dataset
    images, all_captions = load_dataset(df)

    # Create tokenizer
    tokenizer = create_tokenizer(all_captions)
    vocab_size = len(tokenizer.word_index) + 1
    max_length_ = max_length(all_captions)

    # Define the captioning model
    caption_model = define_model(vocab_size, max_length_)

    # Extract features for all images
    photos = {img: extract_features(img, model) for img in images}

    # Train the model
    train_model(caption_model, tokenizer, max_length_, df, photos, epochs=20)

    # Generate caption for a test image
    test_image ='image1.jpg'
    features = extract_features(test_image, model)
    caption = generate_caption(caption_model, tokenizer, features, max_length_)
    print(caption)

# Run the example only when this file is executed as a script. The name must
# be "__main__" with double underscores, otherwise main() is never called.
if __name__ == "__main__":
    main()