# **Image Caption Generation** 

In [None]:
import os
import numpy as np 
import pickle
from tqdm.notebook import tqdm # giving us a UI on how much data is processed

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding, Dropout, add

I use `pad_sequences` to bring all caption sequences to the same length. Captions vary in length (some have 5 words, others around 10), so without padding the text inputs to the model would have unequal lengths. Padding avoids this problem.
I use `plot_model` to render the whole model architecture as an image, which gives a clear representation of it.
`tqdm` creates progress bars for long-running loops such as data processing, training machine learning models, nested Python loops, and downloading data from the internet.

In [None]:
# BASE_DIR holds the dataset (the 'Images' folder and 'captions.txt');
# WORKING_DIR receives generated artifacts (features pickle, saved model).
# Raw strings keep the Windows backslashes literal: without the r-prefix,
# sequences such as "\P", "\I", "\d" and "\w" are invalid escape sequences
# that recent Python versions warn about. The resulting values are unchanged.
BASE_DIR = r'E:\Personal Project\Image Caption Generator\data'
WORKING_DIR = r'E:\Personal Project\Image Caption Generator\working'

## Extract Image Feature

In [None]:
# Instantiate VGG16 with its default arguments.
model = VGG16()

# Restructure the model: keep the same inputs but take the output of the
# second-to-last layer, so the final prediction layer is dropped.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
model.summary()

Here we restructure the model because we don't need the prediction layer of VGG16. We only need the rest of the model, so we rebuild it with the output of the second-to-last layer as the new output.

In [None]:
# Build a dict that maps each image id to the feature array the model produces
features = dict()
directory = os.path.join(BASE_DIR, 'Images')

# Visit every entry in the images directory; tqdm renders a progress bar
for img in tqdm(os.listdir(directory)):
    # Read the file, resized to 224x224
    imgpath = f"{directory}/{img}"
    image = load_img(imgpath, target_size=(224, 224))
    # Turn the image into a numpy array
    image = img_to_array(image)
    # Prepend a batch axis of size 1 in front of the existing three axes
    image = image.reshape((1,) + image.shape[:3])
    # Apply the VGG16-specific input preprocessing
    image = preprocess_input(image)
    # Run the model; verbose=0 silences per-call output
    feature = model.predict(image, verbose=0)
    # The image id is the file name up to its first dot
    image_id = img.partition('.')[0]
    # Remember the features under that id
    features[image_id] = feature

`verbose=0` suppresses the progress output that `model.predict` would otherwise print for every image, keeping the notebook output clean.

In [None]:
# Store the extracted features in a pickle file.
# The with-block closes the file handle (flushing the data to disk) even if
# pickle.dump raises; the bare open() used before never closed it explicitly.
with open(os.path.join(WORKING_DIR, 'features.pickle'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# Read the saved features back from the pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files you created yourself.
features_path = os.path.join(WORKING_DIR, 'features.pickle')
with open(features_path, 'rb') as pickle_file:
    features = pickle.load(pickle_file)

## Load the Captions Data

In [None]:
# Read captions.txt into one string, discarding its first line
# (presumably a header row - confirm against the dataset file).
captions_path = os.path.join(BASE_DIR, 'captions.txt')
with open(captions_path, 'r') as captions_file:
    next(captions_file)
    caption = captions_file.read()

In [None]:
# Display the raw captions text (everything in captions.txt after its first line).
caption

In [None]:
# Build a dict: image id -> list of caption strings for that image
mapping = {}
for row in tqdm(caption.split('\n')):
    # Rows shorter than two characters (e.g. blank lines) carry no caption
    if len(row) < 2:
        continue
    # Break the row into its comma-separated fields
    fields = row.split(',')
    # The first field is the image file name; keep the part before its first dot
    image_id = fields[0].split('.')[0]
    # Everything after the first comma is caption text; commas inside the
    # caption split it into several fields, which are rejoined with spaces
    cap = " ".join(fields[1:])
    # Start a list the first time an image id is seen, then record the caption
    mapping.setdefault(image_id, []).append(cap)

In [None]:
# Number of distinct image ids that received at least one caption.
len(mapping)

## Preprocessing Text

In [None]:
def clean(map):
    """Normalise every caption stored in *map*, in place.

    Each caption is lower-cased, stripped of every character that is not a
    letter a-z or whitespace, reduced to its words longer than one character,
    and wrapped in the ``startseq`` / ``endseq`` markers.

    Args:
        map: dict of image id -> list of caption strings. The name shadows the
            built-in ``map``; it is kept so existing callers are unaffected.

    Returns:
        None. The caption lists inside *map* are modified in place. Calling
        this twice on the same dict adds the markers twice.
    """
    import re  # local import so the cell does not depend on the import cell

    # str.replace() treats its argument as literal text, so the regex patterns
    # passed to it before never matched anything; use a real regex instead.
    # Whitespace is kept here, otherwise all words would fuse into one.
    not_letter_or_space = re.compile(r'[^a-z\s]')

    # Operate on the argument, not on the module-level `mapping`.
    for captions in map.values():
        for i, caption in enumerate(captions):
            caption = not_letter_or_space.sub('', caption.lower())
            # split() already collapses runs of whitespace; one-character
            # words (e.g. "a") are dropped.
            words = [word for word in caption.split() if len(word) > 1]
            # The start/end markers tell the model where a caption begins and stops.
            captions[i] = 'startseq ' + ' '.join(words) + ' endseq'

In [None]:
# Captions of one sample image before the text is preprocessed
mapping['1000268201_693b08cb0e']

In [None]:
# Preprocess all captions in place, then show the same sample image's captions again
clean(mapping)
mapping['1000268201_693b08cb0e']

Here we can see that single-character words such as "a" have been removed. This is similar in spirit to stop-word removal, but a full stop-word list would also remove longer words that carry little weight in the sentence.

In [None]:
# Flatten the per-image caption lists into a single list of all captions.
# A comprehension keeps its loop variables local, so the module-level
# `caption` (the raw text read from captions.txt) is no longer overwritten
# and the mapping cell can still be re-run afterwards.
all_captions = [
    text
    for image_captions in mapping.values()
    for text in image_captions
]

In [None]:
# Peek at the first ten captions.
all_captions[:10]

## Tokenization of Captions

In [None]:
# Fit a Tokenizer on every caption so each word gets an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of indexed words.
# NOTE(review): presumably index 0 is left free for padding - confirm against the Tokenizer docs.
voc_size = len(tokenizer.word_index)+1

In [None]:
# Show the vocabulary size computed from the tokenizer's word index.
voc_size

In [None]:
# Word count of the longest caption (split on whitespace); used later as the padding length
max_len = max(map(len, map(str.split, all_captions)))

In [None]:
# Show the length (in words) of the longest caption.
max_len

## Train Test Split

In [None]:
# The first 90% of image ids (in mapping order) are used for training, the rest for testing
image_ids = list(mapping)
split = int(0.90 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

Next we create a data generator that produces image features and captions one batch at a time, so they can be fed to the model incrementally during training. Building every training sample up front would consume too much RAM.

After tokenizing, each caption is split into (X, y) pairs: at first X holds only the first token of the sentence and y is the second token; then that token is appended to X and y becomes the next token of the sentence, and so on until the last token has been used as y.

In [None]:
# Data generator: builds training batches on demand so the full sample set
# never has to be held in memory (avoids the session crashing)
def data_generator(data_keys, mapping, features, tokenizer, max_len, voc_size, batch_size):
    """Yield training batches forever, one batch per ``batch_size`` images.

    For every caption of every image, each prefix of the encoded caption
    becomes one sample: the padded prefix is the text input and the token
    that follows it, one-hot encoded, is the target.

    Args:
        data_keys: image ids to iterate over; each is looked up in both
            ``mapping`` and ``features``.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; ``features[key][0]`` is
            used as the image input of every sample of that image.
        tokenizer: object whose ``texts_to_sequences`` encodes a caption.
        max_len: length every input sequence is padded to.
        voc_size: number of classes used for the one-hot targets.
        batch_size: number of images (not samples) collected per batch.

    Yields:
        ``[X1, X2], y`` as numpy arrays with one row per sample: X1 the image
        features, X2 the padded input sequences, y the one-hot next tokens.

    Raises:
        ValueError: if ``data_keys`` is empty or ``batch_size`` is below 1.
            Without this check the loop below would spin forever without
            ever yielding a batch.
    """
    if len(data_keys) == 0:
        raise ValueError('data_keys must contain at least one image id')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive number of images')

    X1, X2, y = [], [], []
    n = 0  # images collected for the batch currently being built

    while True:
        for key in data_keys:
            n += 1
            for caption in mapping[key]:
                # encode the caption as a sequence of integers
                sequence = tokenizer.texts_to_sequences([caption])[0]
                # every prefix predicts the token that follows it
                for i in range(1, len(sequence)):
                    in_sequence, out_sequence = sequence[:i], sequence[i]
                    # pad the prefix so all inputs share a common length
                    in_sequence = pad_sequences([in_sequence], maxlen=max_len)[0]
                    # one-hot encode the target token
                    out_sequence = to_categorical([out_sequence], num_classes=voc_size)[0]

                    X1.append(features[key][0])
                    X2.append(in_sequence)
                    y.append(out_sequence)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                # start collecting the next batch
                X1, X2, y = [], [], []
                n = 0
        # Images left over at the end of a pass are not dropped: they stay in
        # the buffers and are completed by the first images of the next pass.

Here X1 (image features) and X2 (padded input sequences) are the model inputs and y is the target output. The for loop keeps collecting samples until n equals the batch size; once it does, the batch is yielded and X1, X2, y and n are reset for the next batch.

## Model Creation

In [None]:
# ----- Encoder -----
# Image branch: 4096-value feature vector -> dropout -> 256-unit dense layer
image_input = Input(shape=(4096,))
image_dropped = Dropout(0.4)(image_input)
image_encoded = Dense(256, activation='relu')(image_dropped)

# Text branch: padded token sequence -> embedding -> dropout -> LSTM
text_input = Input(shape=(max_len,))
# mask_zero=True because the sequences are padded
text_embedded = Embedding(voc_size, 256, mask_zero=True)(text_input)
text_dropped = Dropout(0.4)(text_embedded)
text_encoded = LSTM(256)(text_dropped)

# ----- Decoder -----
# Sum the two 256-wide encodings, then map them to a softmax over voc_size classes
merged = add([image_encoded, text_encoded])
hidden = Dense(256, activation='relu')(merged)
outputs = Dense(voc_size, activation="softmax")(hidden)

# Note: this rebinds `model`, which until now referred to the VGG16 feature extractor
model = Model(inputs=[image_input, text_input], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Render the architecture, including tensor shapes, as an image
plot_model(model, show_shapes=True)

## Train the model

In [None]:
# Training configuration
epochs, batch_size = 2000, 32
# Batches drawn from the generator per epoch (weights are updated after each one)
steps = len(train) // batch_size

# Each iteration creates a fresh generator and runs a single-epoch fit on it
for _epoch in range(epochs):
    generator = data_generator(
        train, mapping, features, tokenizer, max_len, voc_size, batch_size
    )
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)


## Save the model

In [None]:
# Save the trained model into WORKING_DIR. The path is built with
# os.path.join, as the features pickle path is, instead of by string
# concatenation with a hard-coded '/'.
model.save(os.path.join(WORKING_DIR, 'image_caption_generator.h5'))

## Generation Captions for the Image

Initially we have to convert the index to the word.

In [None]:
def idx_to_word(integer,tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals *integer*.

    The word index is scanned in its iteration order and the first match is
    returned; ``None`` is returned when no word has that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    return next(matches, None)

In [None]:
def predict_caption(model,image,tokenizer,max_length):
    """Generate a caption for *image* one word at a time.

    Starting from ``'startseq'``, the text produced so far is encoded and
    padded to *max_length*, the model predicts the next word from the image
    and that sequence, and the highest-scoring word is appended. Generation
    stops after *max_length* words, when the predicted index has no word, or
    once ``'endseq'`` has been appended.

    Returns:
        The generated words joined by single spaces, including the leading
        ``'startseq'`` and, if it was predicted, the trailing ``'endseq'``.
    """
    words = ['startseq']
    for _ in range(max_length):
        # encode everything generated so far and pad it to the model's input length
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], max_length)
        # score every vocabulary entry, then pick the index with the highest score
        scores = model.predict([image, padded], verbose=0)
        best_index = np.argmax(scores)
        next_word = idx_to_word(best_index, tokenizer)
        # an index without a word ends generation
        if next_word is None:
            break
        # the new word becomes part of the input for the next step
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)

## Validate the Text Data

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# actual:    per test image, the list of its tokenised reference captions
# predicted: per test image, the tokenised generated caption
actual, predicted = list(), list()

for key in tqdm(test):
    captions = mapping[key]  # reference captions for this image
    y_pred = predict_caption(model, features[key], tokenizer, max_len)

    # corpus_bleu is given lists of tokens, so split on whitespace
    act_cap = [caption.split() for caption in captions]
    y_pred = y_pred.split()

    actual.append(act_cap)
    predicted.append(y_pred)

# Calculate BLEU score.
# The keyword is `weights`; the previous spelling `weigths` raised a
# TypeError, so no score was ever printed.
print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

## Visualize the Results

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# File name of the image to inspect, inside BASE_DIR/Images.
# It is empty here and must be filled in before running this cell.
image_name = ""
# Same id convention as everywhere else: file name up to the first dot
image_id = image_name.split('.')[0]
img_path = os.path.join(BASE_DIR, "Images", image_name)
image = Image.open(img_path)

# Reference captions for this image
captions = mapping[image_id]
print('--------------Actual-------------')
for caption in captions:
    print(caption)

# Look the image up in the `features` dict. The previous code indexed
# `feature`, the single array left over from the extraction loop, which
# cannot be indexed by an image id.
ypred = predict_caption(model, features[image_id], tokenizer, max_len)
print('--------------Predicted-------------')
print(ypred)
plt.imshow(image)