# **Generating a caption for a given image is a challenging problem in the deep learning domain. In this article, we will use different techniques of computer vision and NLP to recognize the context of an image and describe it in a natural language like English. We will build a working model of the image caption generator by using a CNN (for the image features) together with an LSTM (for the text).**
# For training our model I'm using the **Flickr8K** dataset. It consists of about 8000 unique images, and each image is mapped to five different sentences that describe the image.

# Step 1: Import the required libraries

In [1]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [2]:
# Dataset root: the cells below read 'Images' and 'captions.txt' from here.
BASE_DIR = '/kaggle/input/flickr8k'
# Output root: the cached features and the trained model are written here.
WORKING_DIR = '/kaggle/working'

# Extract Image Features

In [3]:
# Load VGG16 with its default constructor arguments.
model = VGG16()
# Re-wire the network so it ends at the second-to-last layer: that layer's
# output is used as the image feature vector instead of the final layer's.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
model.summary()

In [4]:
# Extract one feature array per image file; `model` must still be the
# truncated VGG16 extractor when this cell runs.
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory)):
    # Load the file resized to 224x224 and convert it to a numeric array.
    pixels = img_to_array(load_img(f"{directory}/{img_name}", target_size=(224, 224)))
    # Prepend a batch axis of size 1 so a single image can be predicted.
    batch = pixels.reshape((1, *pixels.shape))
    # Apply the VGG16-specific input preprocessing before prediction.
    batch = preprocess_input(batch)
    # Key by the file name up to its first dot (same rule as the caption ids).
    features[img_name.split('.')[0]] = model.predict(batch, verbose=0)
    

In [5]:
# Cache the extracted features on disk. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [6]:
# Reload the feature dictionary from the pickle written by the previous cell.
# NOTE: pickle.load executes arbitrary code; only load files you created.
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'rb') as f:
    features = pickle.load(f)

## Load the Captions Data

In [7]:
# Read the whole captions file into one string, minus its first line.
captions_path = os.path.join(BASE_DIR, 'captions.txt')
with open(captions_path, 'r') as f:
    next(f)  # discard the first line (presumably the CSV header row)
    captions_doc = f.read()

# The captions file is in CSV format with one entry per line (lines are separated by a newline, "\n"): each line consists of the name of the image file, followed by a comma and the description of the image. Here we need to map each image to its descriptions by storing them in a dictionary.

In [8]:
# Build {image id: [caption, ...]} from the comma-separated caption lines.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # Ignore lines shorter than two characters (e.g. the empty last line).
    if len(line) < 2:
        continue
    # First field is the image file name; everything after it is caption text.
    image_file, *caption_parts = line.split(',')
    # Keep the file name up to its first dot as the image id.
    image_id = image_file.split('.')[0]
    # A caption containing commas was split into pieces; rejoin with spaces.
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))

In [9]:
# Number of distinct image ids that have at least one caption.
len(mapping)

## Pre-process text data

# One of the main steps in NLP is to remove noise so that the machine can detect the patterns easily in the text. Noise will be present in the form of special characters such as hashtags, punctuation and numbers. All of which are difficult for computers to understand if they are present in the text. So we need to remove these for better results. Additionally, you can also remove stop words and perform Stemming and Lemmatization by using NLTK library.

In [10]:
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, reduced to its words longer than one
    character, and wrapped in the ``start`` / ``end`` marker words.

    Args:
        mapping: dict mapping an image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    import re  # local import: the notebook's import cell does not load ``re``

    for captions in mapping.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            # str.replace() treats its first argument literally, so a regex
            # pattern passed to it never matches; re.sub() does the removal.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() already collapses any run of whitespace.
            words = [word for word in caption.split() if len(word) > 1]
            # The tags must be separate words: the caption generator seeds
            # its input with 'start' and stops on the token 'end'.
            captions[i] = ' '.join(['start', *words, 'end'])
            
            

# Load the images
# 
# Here we need to map the images in the training set to their corresponding descriptions which are present in our descriptions variable. Create a list of names of all training images and then create an empty dictionary and map the images to their descriptions using image name as key and a list of descriptions as its value. while mapping the descriptions add unique words at the beginning and end to identify the start and end of the sentence.

In [11]:
# Show the captions of one sample image before they are cleaned.
mapping['1000268201_693b08cb0e']

In [12]:
# Clean every caption; clean() rewrites the lists stored in mapping in place.
clean(mapping)

In [13]:
# Show the same sample image's captions after cleaning, for comparison.
mapping['1000268201_693b08cb0e']

In [14]:
# Flatten the per-image caption lists into a single list of captions.
all_captions = [caption for captions in mapping.values() for caption in captions]

In [15]:
# Total number of captions across all images.
len(all_captions)

In [16]:
# Preview the first ten captions.
all_captions[:10]

In [17]:
#tokenize the text
# Fit the word -> integer index vocabulary on every caption.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# +1 because Keras Tokenizer indices start at 1; index 0 is left for padding.
vocab_size= len(tokenizer.word_index) +1

In [18]:
# Vocabulary size: number of distinct words known to the tokenizer, plus one.
vocab_size

In [19]:
# Length, in whitespace-separated words, of the longest caption string.
# It is used later as the padded input sequence length.
max_length = max(len(caption.split()) for caption in all_captions)
max_length


## Train test split

In [20]:
# Split the image ids in mapping order: first 90% to train, the rest to test.
image_ids = list(mapping)
split = int(len(image_ids) * 0.90)
train, test = image_ids[:split], image_ids[split:]

# GloVe stands for global vectors for word representation. It is an unsupervised learning algorithm developed by Stanford for generating word embeddings by aggregating a global word-word co-occurrence matrix from a corpus. Note that this notebook does not load pre-trained GloVe vectors; its embedding layer is trained from scratch. Also, we have about 8000 images and each image has 5 captions associated with it, which means we have about 40000 captions in total. Because there are so many examples, we use a data generator to feed the input to our model in batches rather than giving it all at one time; the next cell defines that generator.
# 
# Also, we are going to use an embedding matrix to store the relations between words in our vocabulary. An embedding matrix is a linear mapping of the original space to a real-valued space where entities will have meaningful relationships.

In [21]:
#create data generator to get data in batch (avoid session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches built from the captions of ``data_keys``.

    Every caption is expanded into (prefix, next word) pairs: for each
    position ``i`` the first ``i`` word indices, padded to ``max_length``,
    are the text input and the one-hot encoded word at ``i`` is the target.

    Args:
        data_keys: iterable of image ids to cycle over.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; element ``[0]`` is used.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: padded length of the text input.
        vocab_size: number of classes for the one-hot target.
        batch_size: number of images (not samples) gathered per yield.

    Yields:
        ``([image_features, padded_prefixes], one_hot_targets)`` as arrays.
    """
    image_rows, text_rows, target_rows = [], [], []
    images_seen = 0
    # The buffers are not reset between passes, so images left over at the
    # end of data_keys are carried into the next pass.
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                encoded = tokenizer.texts_to_sequences([caption])[0]
                # One sample per split point: words before it -> word at it.
                for cut in range(1, len(encoded)):
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
                    image_rows.append(features[key][0])
                    text_rows.append(prefix)
                    target_rows.append(target)
            if images_seen == batch_size:
                yield [np.array(image_rows), np.array(text_rows)], np.array(target_rows)
                image_rows, text_rows, target_rows = [], [], []
                images_seen = 0
                
        
    

## Model Creation


# For defining the structure of our model, we will be using the Keras Model from Functional API. It has three major steps:

# Processing the sequence from the text
# Extracting the feature vector from the image
# Decoding the output by merging the above two layers (the code below combines them with an element-wise `add` rather than a concatenation)

In [22]:
#encoder model
#image feature layers
# Takes a 4096-value image feature vector, applies dropout, and projects it
# to 256 units.
input1= Input(shape=(4096,))
fe1= Dropout(0.4)(input1)
fe2= Dense(256, activation='relu')(fe1)

#sequence feature layers
# Takes a sequence of max_length word indices, embeds each index into 256
# dimensions (mask_zero=True masks index 0), and summarises it with an LSTM.
input2= Input(shape=(max_length,))
se1= Embedding(vocab_size, 256, mask_zero=True)(input2)
se2= Dropout(0.4)(se1)
se3= LSTM(256)(se2)

#decoder model
# Element-wise sum of the two 256-unit branches, followed by a dense layer
# and a softmax over the vocabulary.
decoder1= add([fe2,se3])
decoder2= Dense(256, activation= 'relu')(decoder1)
outputs= Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# VGG16 feature extractor; re-run the extractor cell before extracting again.
model= Model(inputs=[input1, input2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer= 'adam')

#plot the model
plot_model(model, show_shapes=True)

# For training our model I'm using the Adam optimizer with categorical cross-entropy as the loss function. I'm training the model for 15 epochs, which will be enough for predicting the output. In case you have more computational power (number of GPUs), you can train it by decreasing the batch size and increasing the number of epochs.

In [23]:
# Train the caption model, calling model.fit once per epoch.
epochs = 15
batch_size = 64
# Batches drawn per epoch; each batch covers batch_size training images.
steps = len(train) // batch_size

for epoch in range(epochs):
    # A fresh generator restarts iteration from the first training image.
    generator = data_generator(
        train, mapping, features, tokenizer, max_length, vocab_size, batch_size
    )
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [24]:
# Persist the trained caption model as an .h5 file in the working directory.
model.save(f'{WORKING_DIR}/best_model.h5')

## Generate Captions for the images

In [25]:
def idx_to_word(integer, tokenizer):
    """Return the word that ``tokenizer.word_index`` maps to ``integer``.

    The vocabulary is scanned in its iteration order and the first match is
    returned; ``None`` is returned when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [26]:
#generate caption for an image
def predict_caption(model, image, tokenizer, max_lenght):
    """Greedily generate a caption for one image.

    Starting from the seed text ``'start'``, the model is asked repeatedly
    for the most probable next word, which is appended to the text. This
    continues until the word ``'end'`` is produced, the predicted index has
    no word in the tokenizer, or ``max_lenght`` words have been generated.

    Args:
        model: trained caption model taking ``[image, sequence]`` inputs.
        image: feature array of the image, as stored in ``features``.
        tokenizer: tokenizer used to encode the text generated so far.
        max_lenght: maximum number of words to generate and the padded
            input length. The misspelt name is kept so existing keyword
            callers keep working.

    Returns:
        The generated text, including the leading ``'start'`` and, when it
        was reached, the trailing ``'end'``.
    """
    in_text = 'start'
    # Use the argument rather than a module-level ``max_length`` so the
    # function honours what the caller passed in.
    for _ in range(max_lenght):
        # Encode the text generated so far and pad it to the model's input length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_lenght)
        # Greedy decoding: take the index with the highest predicted score.
        yhat = model.predict([image, sequence], verbose=0)
        word = idx_to_word(np.argmax(yhat), tokenizer)
        # An index with no vocabulary entry cannot extend the caption.
        if word is None:
            break
        in_text += " " + word
        if word == 'end':
            break
    return in_text
        

In [27]:
from nltk.translate.bleu_score import corpus_bleu

# Compare the predicted caption of every test image with its reference captions.
actual, predicted = [], []

for key in tqdm(test):
    # All reference captions of this image, split into words.
    references = [caption.split() for caption in mapping[key]]
    # Predicted caption for the same image, split the same way.
    hypothesis = predict_caption(model, features[key], tokenizer, max_length).split()

    actual.append(references)
    predicted.append(hypothesis)

# corpus_bleu takes the list of reference lists first, then the hypotheses.
# The weights select unigrams only (BLEU-1) or unigrams and bigrams (BLEU-2).
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

## Visualize the results

In [28]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the stored and predicted captions of an image, then display it.

    Args:
        image_name: file name of an image inside ``BASE_DIR/Images``; the
            part before the first dot is used as the image id.
    """
    image_id = image_name.split('.')[0]
    picture = Image.open(os.path.join(BASE_DIR, "Images", image_name))
    references = mapping[image_id]

    print('---------------------Actual-------------------')
    for reference in references:
        print(reference)

    # Caption produced by the trained model from the cached image features.
    prediction = predict_caption(model, features[image_id], tokenizer, max_length)
    print('----------------------Predicted----------------------')
    print(prediction)
    plt.imshow(picture)
    

In [29]:
# Demo: print the actual and predicted captions for one image and show it.
generate_caption("1020651753_06077ec457.jpg")