In [1]:
import os
import string
import numpy as np
from pickle import dump, load
from tqdm import tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM, add, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu

In [2]:
with open('Resnet_features.pkl', 'rb') as file:
    features = load(file)

In [3]:
def load_doc(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text


doc = load_doc('/content/preprocess_dis.txt')
descriptions = dict()
for line in doc.split('\n'):
    tokens = line.split()

    if len(tokens) < 2:
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    image_desc= " ".join(image_desc)
    #image_id = image_id.split('.')[0]
    if image_id not in descriptions:
        descriptions[image_id] = []
    descriptions[image_id].append(image_desc)

In [5]:
image_ids = list(descriptions.keys())
split= int(len(image_ids)*0.90)
train = image_ids[:split]
test = image_ids[split:]

In [6]:
def data_generator(img_ids, tokenizer, max_length, descriptions, img, vocab_size, batch_size):
  X1, X2, y = list(), list(), list()
  n=0
  while True:
    for key in img_ids:
        for desc in descriptions[key]:
            n+=1
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post', truncating='post')[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(img[key][0])
                X2.append(in_seq)
                y.append(out_seq)

        if (n>=batch_size):
            X1, X2, y = np.array(X1), np.array(X2), np.array(y)
            yield (X1, X2), y
            n=0
            X1, X2, y = list(), list(), list()

In [7]:
with open('tokenizer.pkl', 'rb') as file:
    tokenizer = load(file)
vocab_size = len(tokenizer.word_index) + 1
max_len = 31


In [8]:

# Feature encoding
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Sequence encoding
inputs2 = Input(shape=(max_len,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256, return_sequences=True)(se2)  

# Add multihead attention after the LSTM
attention = MultiHeadAttention(num_heads=4, key_dim=256)(se3, se3)  
attention_output = GlobalAveragePooling1D()(attention)


decoder1 = add([fe2, attention_output])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Compile the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer=Adam())


In [9]:
model.summary()

In [10]:
epochs=20
batch_size=64
steps = len(train) // batch_size

for i in range(epochs):
    data = data_generator(train, tokenizer, max_len, descriptions, features, vocab_size, batch_size)
    model.fit(data, epochs=1, steps_per_epoch= steps, verbose=1)



[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 133ms/step - loss: 6.3968
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 157ms/step - loss: 4.9495
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 128ms/step - loss: 4.3410
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 123ms/step - loss: 3.9222
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 123ms/step - loss: 3.6418
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 138ms/step - loss: 3.3987
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 122ms/step - loss: 3.1783
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 129ms/step - loss: 3.0008
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 124ms/step - loss: 2.8540
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 124ms/step - loss: 2.7018
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 124

In [16]:
model.save("img_caption_model_Attention.keras")

In [21]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=31, padding='post', truncating='post')
        photo_reshaped = photo.reshape(1, photo.shape[1])
        yhat = model.predict([photo_reshaped,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

In [None]:
def evaluate_model(model, img_ids, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()

    # Step through each image and generate predictions
    for key in tqdm(img_ids):
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        references = [d.split() for d in descriptions[key]]
        actual.append(references)
        predicted.append(yhat.split())

    # Calculate BLEU score for cumulative n-grams up to 4-grams
    bleu_1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu_2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu_3 = corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0))
    bleu_4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

    print(f'BLEU-1: {bleu_1:.4f}')
    print(f'BLEU-2: {bleu_2:.4f}')
    print(f'BLEU-3: {bleu_3:.4f}')
    print(f'BLEU-4: {bleu_4:.4f}')

In [None]:
evaluate_model(model, test, descriptions, features, tokenizer, 31)

100%|██████████| 810/810 [07:56<00:00,  1.70it/s]


BLEU-1: 0.5212
BLEU-2: 0.2860
BLEU-3: 0.1663
BLEU-4: 0.0885


In [14]:
from tensorflow.keras.models import load_model

In [19]:
model= load_model("/content/img_caption_model_Attention.keras")

In [4]:
print(descriptions['1001773457_577c3a7d70'])

['startseq black dog and spotted dog are fighting endseq', 'startseq black dog and tricolored dog playing with each other on the road endseq', 'startseq black dog and white dog with brown spots are staring at each other in the street endseq', 'startseq two dogs of different breeds looking at each other on the road endseq', 'startseq two dogs on pavement moving toward each other endseq']


In [23]:
photo = features['1001773457_577c3a7d70']  # extracted image features
description = generate_desc(model, tokenizer, photo, max_len)
print(description)

startseq two white dogs are playing with each other endseq
