In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import pickle

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.image import img_to_array,load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Dense,Dot,Dropout,Attention,RepeatVector,Bidirectional,Lambda,LSTM,concatenate,Embedding,Activation,Multiply,Input

In [3]:
from PIL import Image
from math import ceil
from tqdm.notebook import tqdm
from collections import defaultdict
from nltk.translate.bleu_score import corpus_bleu

In [4]:
vgg16=VGG16()
vgg16=Model(inputs=vgg16.inputs,outputs=vgg16.layers[-2].output)
print(vgg16.summary())

None


In [None]:
image_features={}
img_dir=os.path.join('Images')
for img_name in tqdm(os.listdir(img_dir)):
	img_path = os.path.join(img_dir, img_name)
	image = load_img(img_path, target_size=(224, 224))
	image = img_to_array(image)
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	image = preprocess_input(image)
	image_feature = vgg16.predict(image, verbose=0)
	image_id = img_name.split('.')[0]
	image_features[image_id] = image_feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [None]:
with open ('img_features.pkl','wb') as f:
    pickle.dump(image_features,f)

In [5]:
import os

if os.path.getsize('img_features.pkl') > 0:
    with open('img_features.pkl', 'rb') as f:
        loaded_features = pickle.load(f)
else:
    raise ValueError("img_features.pkl is empty. Please ensure the file is written correctly before loading.")

In [6]:
with open('captions.txt','rb') as f:
    caption_doc= f.read()

In [7]:
mapping=defaultdict(list)
for line in tqdm(caption_doc.split(b'\n')):
    if(len(line)<2):
        continue
    line=line.decode('utf-8')
    image_id,caption=line.split(',',1)
    image_id=image_id.split('.')[0]
    mapping[image_id].append(caption)   

  0%|          | 0/40456 [00:00<?, ?it/s]

In [8]:
def clean(mapping):
    for key,values in mapping.items():
        for i in range(len(values)):
            caption=values[i]
            caption=''.join(char for char in caption if char.isalpha() or char.isspace())
            caption=' '.join(caption.split())
            caption='startseq ' + ' '.join(word for word in caption.split() if len(word)>1) + ' endseq'
            caption=caption.lower()
            values[i]=caption

In [9]:
clean(mapping)

In [10]:
all_captions=[caption for captions in mapping.values() for caption in captions]

In [11]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
with open('tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)

In [12]:
with open('tokenizer.pkl','rb') as f:
    tokenizer=pickle.load(f)

In [13]:
max_len=max(len(caption.split()) for caption in all_captions)
vocab_size=len(tokenizer.word_index)+1
print("Maximum Caption Length : ",max_len)
print("Vocabulary Size : ",vocab_size)

Maximum Caption Length :  34
Vocabulary Size :  8768


In [14]:
image_ids= list(mapping.keys())
split=int(0.9 * len(image_ids))
train_image_ids = image_ids[:split]
test_image_ids = image_ids[split:]

In [15]:
def data_generator(data_keys, mapping, features, tokenizer, max_len, vocab_size, batch_size):
	X1, X2, y = list(), list(), list()
	batch_count = 0

	while True:
		for image_id in data_keys: 
			captions = mapping[image_id]
			for caption in captions:
				caption_seq = tokenizer.texts_to_sequences([caption])[0]
				for i in range(1, len(caption_seq)):
					in_seq, out_seq = caption_seq[:i], caption_seq[i]
					in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
					out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
					X1.append(features[image_id][0])
					X2.append(in_seq)
					y.append(out_seq)
					batch_count += 1
					if batch_count == batch_size:
						X1, X2, y = np.array(X1), np.array(X2), np.array(y)
						yield (X1, X2), y
						X1, X2, y = list(), list(), list()
						batch_count = 0


In [16]:
# Define a model using Bidirectional LSTM and an attention mechanism

# Image feature extractor model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Sequence model
inputs2 = Input(shape=(max_len,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)


from tensorflow.keras.layers import GlobalAveragePooling1D
se3_pooled = GlobalAveragePooling1D()(se3)

decoder1 = concatenate([fe2, se3_pooled])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Define the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()


In [17]:
epochs=20
batch_size=32
steps_per_epoch = ceil(len(train_image_ids)/ batch_size)
validation_steps = ceil(len(test_image_ids) / batch_size)

In [None]:
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_generator = data_generator(train_image_ids, mapping, loaded_features, tokenizer, max_len, vocab_size, batch_size)
    test_generator = data_generator(test_image_ids, mapping, loaded_features, tokenizer, max_len, vocab_size, batch_size)
    
    model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch,
          validation_data=test_generator, validation_steps=validation_steps,
          verbose=1)

Epoch 1/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 152ms/step - loss: 6.7588 - val_loss: 6.0801
Epoch 2/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 153ms/step - loss: 4.9882 - val_loss: 6.0547
Epoch 3/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 156ms/step - loss: 4.5898 - val_loss: 6.4006
Epoch 4/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 162ms/step - loss: 4.1983 - val_loss: 6.7044
Epoch 5/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 159ms/step - loss: 3.9328 - val_loss: 6.2390
Epoch 6/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 166ms/step - loss: 3.6469 - val_loss: 6.2735
Epoch 7/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 165ms/step - loss: 3.4499 - val_loss: 6.3059
Epoch 8/20
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 159ms/step - loss: 3.3006 - val_loss: 6.7724
Epoch 9/20
[1m2

In [None]:
model.save('mymodel.keras')

In [18]:
from tensorflow.keras.models import load_model
model=load_model('mymodel.keras')

In [31]:
epochs=20
for epoch in range(epochs):
    print(f"Epoch {epoch+1+40}/{epochs+40}")
    train_generator = data_generator(train_image_ids, mapping, loaded_features, tokenizer, max_len, vocab_size, batch_size)
    test_generator = data_generator(test_image_ids, mapping, loaded_features, tokenizer, max_len, vocab_size, batch_size)
    
    model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch,
          validation_data=test_generator, validation_steps=validation_steps,
          verbose=1)

Epoch 41/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 142ms/step - loss: 0.8802 - val_loss: 11.0034
Epoch 42/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 168ms/step - loss: 0.9475 - val_loss: 10.9863
Epoch 43/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 176ms/step - loss: 0.8867 - val_loss: 10.3320
Epoch 44/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 206ms/step - loss: 0.8823 - val_loss: 11.1477
Epoch 45/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 193ms/step - loss: 0.9058 - val_loss: 10.7241
Epoch 46/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 181ms/step - loss: 0.8695 - val_loss: 12.0035
Epoch 47/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 184ms/step - loss: 0.8042 - val_loss: 11.8522
Epoch 48/60
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 189ms/step - loss: 0.7929 - val_loss: 11.9657


In [32]:
model.save('mymodel2.keras')

In [34]:
from tensorflow.keras.models import load_model
model=load_model('mymodel2.keras')

In [35]:
print(model)
model.summary()

<Functional name=functional_3, built=True>


In [36]:
def get_word_from_index(index, tokenizer):
    return next((word for word, idx in tokenizer.word_index.items() if idx == index), None)

In [37]:
def predict_caption(model, image_feature, tokenizer, max_len):
	in_text = 'startseq'
	for _ in range(max_len):
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		sequence = pad_sequences([sequence], maxlen=max_len)
		yhat = model.predict([image_feature, sequence], verbose=0)
		yhat = np.argmax(yhat)
		word = get_word_from_index(yhat, tokenizer)
		if word is None or word == 'endseq':
			break
		in_text += ' ' + word
	return in_text.replace('startseq', '').replace('endseq', '').strip()

In [38]:
# Extract features for a single image using the VGG16 model
def extract_image_feature(img_path, model):
	image = load_img(img_path, target_size=(224, 224))
	image = img_to_array(image)
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	image = preprocess_input(image)
	feature = model.predict(image, verbose=0)
	return feature

In [39]:
predict_caption(
	model,
	extract_image_feature(os.path.join('Images', '3739833689_a0038545bd.jpg'), vgg16),
	tokenizer,
	max_len
)

'young boy is standing through the grass'

In [40]:
tf.__version__

'2.19.0'