In [1]:
import os
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

from keras.models import Model
import pickle as pk
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import keras
from tqdm import tqdm

In [2]:
model = ResNet50V2(
    include_top=False,
    weights="imagenet",
    pooling = 'avg'
                   )

print(model.summary())

None


In [2]:
#load captions from features
dir = "D:/internship_project/internship_project/Image_Captioning/dataset/captions.txt"
with open(dir, 'r') as cap:
    captions = cap.read()

In [3]:
#Feature extraction from images
from tqdm import tqdm
def feature_gather():
    feature_info = {}
    dir = "D:/internship_project/internship_project/Image_Captioning/dataset/Images"
    for i in tqdm(os.listdir(dir)):
        
        image = load_img(os.path.join(dir, i), target_size=(224, 224))
        image = img_to_array(image)/255.0
        image = image.reshape((1, 224, 224, 3))        
        image = preprocess_input(image)
        temp = model.predict(image, verbose=0)
        feature_info[i.split('.')[0]] = temp
        # print(i.split('.')[0])
    return feature_info
        


In [5]:
feature_gathered = feature_gather()

In [5]:

#Saving the features, what I extract from images
pk.dump(feature_gathered, open('feature_gathered.pkl', 'wb'))

In [5]:
feature_gathered = pk.load(open('feature_gathered.pkl', 'rb'))

### Preprocess captions

In [6]:

import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = 'startseq ' + text + ' endseq'
    return text

#In the dataset for 1 images, there are some captions
def preprocess_captions(captions):
    
    find_captions = {}
    gather_all_captions = list()
    maximum_length = 0

    for cap in tqdm(captions.split('\n')):
        # print(cap)
        # print()
        try:
            temp = cap.split(',')
            id = temp[0].split('.')[0]
            if id not in find_captions:
                find_captions[id] = list()

            CleanText = clean_text(temp[1])
#             print(CleanText)
            find_captions[id].append(CleanText)
            gather_all_captions.append(CleanText)

            if len(CleanText.split()) > maximum_length:
                maximum_length = len(CleanText.split())
        except:
            continue

    return find_captions, gather_all_captions, maximum_length


find_captions, gather_all_captions, maximum_length = preprocess_captions(captions)


100%|████████████████████████████████████████████████████████████████████████| 40456/40456 [00:00<00:00, 132834.48it/s]


In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(gather_all_captions)

vocab_size = len(tokenizer.word_index) + 1

vocab_size

8588

In [8]:
train, test = train_test_split(list(find_captions.keys()), test_size=0.10, random_state=42)

In [9]:
print(len(train))
print(len(test))
print(train[:5])

7282
810
['2715289538_d77c8d0a85', '394136487_4fc531b33a', '2378356400_f6bde5d9b3', '2985679744_75a7102aab', '2926595608_69b22be8d4']


### Create a LSTM model

In [10]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout, add, Input, Embedding, LSTM

def lstm_model_creation(maximum_length, vocab_size):
    input_feature_img = Input(shape=(2048, ), name="image")
    in_1_img = Dropout(0.4)(input_feature_img)
    in_2_img = Dense(256, activation='relu')(in_1_img)

    input_feature_captions = Input(shape=(maximum_length,), name="text")
    embd = Embedding(vocab_size, 256, mask_zero=True)
    in_1_cap = embd(input_feature_captions)
    in_2_cap = Dropout(0.4)(in_1_cap)
    in_3_cap = LSTM(256)(in_2_cap)

    de1 = add([in_2_img, in_3_cap])
    de2 = Dense(256, activation='relu')(de1)
    de3 = Dropout(0.3)(de2) 
    outputs = Dense(vocab_size, activation='softmax')(de3)

    model_input = [input_feature_img, input_feature_captions]
    lstm_model = Model(inputs=model_input, outputs=outputs)

    return lstm_model

In [11]:
lstm_model = lstm_model_creation(maximum_length, vocab_size)

In [12]:
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy'
)

### Run the model

In [12]:


def data_normalization(train, feature_gathered, find_captions, maximum_length, vocab_size, batch_size, tokenizer):
    
    img_feature = []
    input_seq = []
    output_seq =  []
    count = 0
    while True:
        for image_id in train:
            count += 1
            caps = find_captions[image_id]
            
            for cap in caps:
            
                sequence = tokenizer.texts_to_sequences([cap])[0]
              
                length_of_seq = len(sequence)
                for each_seq in range(1, length_of_seq):
                 
                    input_of_each_seq =sequence[:each_seq]
                    input_of_each_seq = pad_sequences([input_of_each_seq], maxlen=maximum_length)[0]
                    output_of_each_seq = sequence[each_seq]

                    img_feature.append(feature_gathered[image_id][0])
                    input_seq.append(input_of_each_seq)
                    output_seq.append(to_categorical([output_of_each_seq], num_classes=vocab_size)[0])
            if count == batch_size:
                yield {"image": np.array(img_feature), "text": np.array(input_seq)}, np.array(output_seq)
                count = 0
                img_feature = []
                input_seq = []
                output_seq =  []
                

#Training the model

batch_size = 32
steps = len(train) // batch_size

for i in range(15):
    normalization = data_normalization(train, feature_gathered, find_captions, maximum_length, vocab_size, batch_size, tokenizer)
    lstm_model.fit(normalization, epochs=1, steps_per_epoch=steps, verbose=1)
    


[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 2s/step - loss: 5.7240
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m498s[0m 2s/step - loss: 3.9533
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m524s[0m 2s/step - loss: 3.6067
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m559s[0m 2s/step - loss: 3.4234
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 3s/step - loss: 3.2954
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m666s[0m 3s/step - loss: 3.1956
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m746s[0m 3s/step - loss: 3.1099
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m834s[0m 4s/step - loss: 3.0359
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m932s[0m 4s/step - loss: 2.9690
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1037s[0m 5s/step - loss: 2.9126
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1131s[0m 5s/step - loss: 2.86

In [14]:

keras.saving.save_model(lstm_model, 'my_model.keras')

In [8]:
#Load model

model = keras.saving.load_model('my_model.keras')

In [33]:
# model.summary()

In [9]:
def find_index_to_word(predicted_word, tokenizer):
    
    for word, index in tokenizer.word_index.items():
        if index == predicted_word:
            find_word = word
            break
        else:
            find_word = 'can not found'
    
    if find_word == 'can not found':
        return 0
    else:
        return find_word
    

In [10]:

def avoid_multiple_words(caption):
    temp = ""
    temp2 = []
    for i in caption:
        if i != " ":
            temp += i
        elif i == " ":
            temp2.append(temp)
            temp=""

    sentence = []
    for i in temp2:
        if i not in sentence:
            sentence.append(i)
            
    final_text = ''
    for i in sentence:
        if i == 'startseq':
            continue
        final_text+=i
        final_text+=" "
            
    return(final_text)

In [13]:

def generate_caption(img):
    image_id = img.split('.')
    image_id = image_id[0]

    img = feature_gathered[image_id]
    
    full_text = 'startseq'
    for i in range(maximum_length):

        seq = tokenizer.texts_to_sequences([full_text])[0]
        seq = pad_sequences([seq], maxlen=maximum_length)
        pred_word = np.argmax(model.predict([img, seq], verbose=0))
    
        find_word = find_index_to_word(pred_word, tokenizer)

        if find_word == 0:
            print("Sorry, Can not find word..!")
            break
        elif find_word == 'endseq':
            break

        full_text += " " + find_word
        
        if find_word is None:
            break
        # append word as input for generating next word
        full_text += " " + find_word
        # stop if we reach end tag
        if find_word == 'endseq':
            break
        
    temp = avoid_multiple_words(full_text)
        
    return temp




In [15]:
#Enter image name:
caption = generate_caption('35506150_cbdb630f4f.jpg')
print(caption)

cushion speaking pointed four them 
