In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize

In [None]:
from tensorflow.keras.applications import VGG16

# Load VGG16 with pre-trained ImageNet weights and use it as a feature extractor.
# The full network ends in a 1000-way softmax ("predictions"), but the captioning
# network built by define_model takes a 4096-d image vector, so expose the output
# of the last fully connected layer before the classifier ("fc2") instead.
_vgg16 = VGG16(weights='imagenet', include_top=True)
model = Model(inputs=_vgg16.inputs, outputs=_vgg16.get_layer('fc2').output)


In [None]:
from PIL import Image

# Open the three sample image files with PIL.
_sample_image_paths = (
    '/content/image1.jfif',
    '/content/image2.jfif',
    '/content/image3.jfif',
)
img1 = Image.open(_sample_image_paths[0])
img2 = Image.open(_sample_image_paths[1])
img3 = Image.open(_sample_image_paths[2])


In [None]:
def preprocess_image(img_path):
    """Load an image file and turn it into a one-image batch for VGG16.

    The file is loaded at 224x224, converted to an array, given a leading
    batch axis, and passed through the VGG16 ``preprocess_input`` function.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The preprocessed array returned by ``preprocess_input``.
    """
    pil_image = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    return preprocess_input(batch)


In [None]:
def load_captions(image_ids, captions):
    """Lower-case and word-tokenize every caption string.

    Args:
        image_ids: Accepted for signature compatibility; not used.
        captions: Iterable of caption strings.

    Returns:
        A list with one token list per caption, in input order.
    """
    token_lists = []
    for text in captions:
        token_lists.append(word_tokenize(text.lower()))
    return token_lists


In [None]:
def tokenize_captions(captions):
    """Lower-case and word-tokenize a flat iterable of caption strings.

    Returns:
        A list with one token list per caption, in input order.
    """
    return list(word_tokenize(text.lower()) for text in captions)

In [None]:
def create_vocabulary(tokenized_captions):
    """Collect the distinct tokens found in a flat list of token lists.

    Args:
        tokenized_captions: Iterable of token iterables (one per caption).

    Returns:
        A set containing every token that occurs in any caption.
    """
    return set().union(*tokenized_captions)


In [None]:
def create_word_index_mappings(vocabulary):
    """Build word -> index and index -> word lookup tables.

    Words are numbered in sorted order so the mapping is the same on every
    run. Numbering straight from a set follows its iteration order, which for
    strings can change between interpreter sessions, so indices stored with a
    saved model would stop matching a freshly built mapping.

    Indices start at 1; index 0 is never assigned to a word.

    Args:
        vocabulary: Iterable of hashable, mutually comparable words
            (typically a set of strings). Duplicates are ignored.

    Returns:
        Tuple ``(word_to_index, index_to_word)`` of dictionaries.
    """
    ordered_words = sorted(set(vocabulary))
    word_to_index = {word: idx for idx, word in enumerate(ordered_words, start=1)}
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    return word_to_index, index_to_word

In [None]:
def extract_image_features(image_path, feature_model=None):
    """Run one image through a feature-extraction model.

    Args:
        image_path: Path of the image file; it is loaded and preprocessed by
            ``preprocess_image``.
        feature_model: Object with a ``predict`` method to run the image
            through. When omitted, the notebook-level global ``model`` is
            looked up at call time. That global is rebound to the captioning
            network later in this notebook, so pass the image model
            explicitly once that has happened.

    Returns:
        Whatever ``feature_model.predict`` returns for the one-image batch.
    """
    if feature_model is None:
        feature_model = model
    img_array = preprocess_image(image_path)
    return feature_model.predict(img_array)

In [None]:
# Prepare data for training
def prepare_data(image_ids, captions, max_seq_length, word_to_index):
    """Build next-word training samples from images and their captions.

    Each caption is split on whitespace and mapped to word indices (words
    missing from ``word_to_index`` are dropped). For every prefix of the
    index sequence, one sample is emitted: the image features, the padded
    prefix, and a one-hot vector for the word that follows the prefix.

    Image features are extracted lazily and at most once per image, not once
    per sample, because they do not depend on the caption position.

    Args:
        image_ids: Sequence of image IDs; features are read from
            ``images/<id>.jpg``.
        captions: Sequence parallel to ``image_ids``; ``captions[i]`` is an
            iterable of caption strings for ``image_ids[i]``.
        max_seq_length: Length every input prefix is padded to.
        word_to_index: Mapping from word to integer index.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: image features, padded input
        sequences, and one-hot next-word targets.

    NOTE(review): captions are split with ``str.split`` here, while the
    notebook builds its vocabulary with ``word_tokenize(caption.lower())``;
    captions with punctuation or capitals may lose words — confirm.
    NOTE(review): each X1 entry is the raw ``extract_image_features`` result,
    which presumably carries a leading batch axis of 1 — confirm it matches
    the captioning model's image input shape before training.
    """
    # One-hot width: one class per word, plus index 0 which no word uses.
    num_classes = len(word_to_index) + 1
    X1, X2, y = [], [], []
    for i, image_id in enumerate(image_ids):
        features = None  # filled on first use so images without samples are never loaded
        for caption in captions[i]:
            seq = [word_to_index[word] for word in caption.split() if word in word_to_index]
            for j in range(1, len(seq)):
                if features is None:
                    features = extract_image_features(f"images/{image_id}.jpg")
                in_seq = pad_sequences([seq[:j]], maxlen=max_seq_length)[0]
                out_seq = to_categorical([seq[j]], num_classes=num_classes)[0]
                X1.append(features)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [None]:
def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    Image branch: a 4096-d feature vector -> Dropout(0.5) -> Dense(256, relu).
    Text branch: a word-index sequence of length ``max_length`` ->
    Embedding(vocab_size, 256, mask_zero=True) -> Dropout(0.5) -> LSTM(256).
    The two 256-d outputs are added, passed through Dense(256, relu), and
    mapped to a softmax over ``vocab_size`` words.

    The model is compiled with categorical cross-entropy and the Adam
    optimizer, and its summary is printed.

    Args:
        vocab_size: Size of the embedding table and of the output softmax.
        max_length: Length of the input word-index sequences.

    Returns:
        The compiled Keras ``Model``.
    """
    # Image feature branch
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Partial-caption branch; mask_zero=True makes index 0 act as padding
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merge both branches and predict the next word
    decoder1 = tf.keras.layers.add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line.
    model.summary()
    return model

In [None]:
# Example dataset: each image ID maps to its list of example captions.
_example_captions_by_image = {
    'image1': ["a black dog is running on the grass"],
    'image2': ["a brown horse is grazing in the field"],
    'image3': ["a person is riding a bicycle on the road"],
}
image_ids = list(_example_captions_by_image)          # IDs, in insertion order
captions = list(_example_captions_by_image.values())  # caption lists, parallel to image_ids



In [None]:
def create_vocabulary(tokenized_captions):
    """Collect the distinct tokens from per-image lists of tokenized captions.

    Args:
        tokenized_captions: Iterable with one entry per image; each entry is
            an iterable of token iterables (one per caption).

    Returns:
        A set containing every token that occurs in any caption.
    """
    return {
        token
        for caption_list in tokenized_captions
        for caption in caption_list
        for token in caption
    }


In [None]:
def tokenize_captions(captions):
    """Lower-case and word-tokenize captions grouped per image.

    Args:
        captions: Iterable with one entry per image; each entry is an
            iterable of caption strings.

    Returns:
        A list with the same nesting: one list per image, holding one token
        list per caption.
    """
    return [
        [word_tokenize(text.lower()) for text in caption_list]
        for caption_list in captions
    ]


In [None]:
import nltk
# Download the 'punkt' tokenizer data; presumably required by word_tokenize,
# which the caption-tokenizing functions call — confirm for your NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Tokenize captions: one list per image, each holding one token list per caption.
# (The following cell repeats this same assignment.)
tokenized_captions = tokenize_captions(captions)


In [None]:
# Tokenize captions: one list per image, each holding one token list per caption
tokenized_captions = tokenize_captions(captions)

# Create vocabulary: the set of distinct tokens across all captions
vocabulary = create_vocabulary(tokenized_captions)

# Map words to indices and vice versa (indices start at 1; 0 is not assigned to a word)
word_to_index, index_to_word = create_word_index_mappings(vocabulary)


In [None]:
# Longest caption, measured in tokens. tokenized_captions is nested (one list of
# token lists per image), so the length must be taken on the inner token lists;
# measuring the outer entries would count captions per image instead.
max_seq_length = max(
    len(tokens)
    for caption_list in tokenized_captions
    for tokens in caption_list
)


In [None]:
# For each image's caption list: join the captions into one string, split it on
# whitespace, and keep the indices of the words present in word_to_index.
# `seq` is overwritten on every iteration and not used inside the loop; after
# the loop, `caption` and `seq` keep the values from the last image.
for caption in captions:
    seq = [word_to_index[word] for word in " ".join(caption).split() if word in word_to_index]
    # Further processing using seq


In [None]:
# Recompute the index sequence for `caption`, which is not assigned in this cell:
# it is the loop variable left over from the previous cell (the last caption list).
seq = [word_to_index[word] for word in " ".join(caption).split() if word in word_to_index]


In [None]:
# Build the captioning model. The vocabulary size is len(vocabulary) + 1 because
# word indices start at 1, leaving index 0 unassigned.
# NOTE(review): this rebinds the global `model`, which until now held the VGG16
# network that extract_image_features looks up at call time — confirm intended.
model = define_model(len(vocabulary) + 1, max_seq_length)


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_11 (InputLayer)       [(None, 1)]                  0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, 4096)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 1, 256)               4608      ['input_11[0][0]']            
                                                                                                  
 dropout_4 (Dropout)         (None, 4096)                 0         ['input_10[0][0]']            
                                                                                            

In [None]:
# Report the Python type of each training array.
print(type(X1), type(X2), type(y))

# Show the shapes only when none of the three is still an Ellipsis placeholder.
if not any(isinstance(arr, type(...)) for arr in (X1, X2, y)):
    print(X1.shape, X2.shape, y.shape)


<class 'ellipsis'> <class 'ellipsis'> <class 'ellipsis'>


In [None]:
# Save the current `model` to an .h5 file in the working directory.
# NOTE(review): no training (fit) call is visible before this point, so this
# presumably saves untrained weights — confirm.
model.save('image_captioning_model.h5')

  saving_api.save_model(


In [None]:
import os

# Report whether the saved model file is present on disk.
print(
    "Model saved successfully."
    if os.path.exists('image_captioning_model.h5')
    else "Error: Model not saved."
)


Model saved successfully.


In [None]:
from tensorflow.keras.models import load_model

# Load the saved model back from the .h5 file written earlier in the notebook
loaded_model = load_model('image_captioning_model.h5')


In [None]:
# Display the reloaded model's layer summary to check that it loaded
loaded_model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_8 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_7 (InputLayer)        [(None, 4096)]               0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 256)               4608      ['input_8[0][0]']             
                                                                                                  
 dropout_2 (Dropout)         (None, 4096)                 0         ['input_7[0][0]']             
                                                                                            