#### Max_Length
Since there are 5 captions for each image and we have preprocessed and encoded them in below format

"startseq " + caption + " endseq"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pickle import load,dump
from numpy import array

In [2]:
# load the saved dictionary
with open("descriptions.pkl","rb") as f:
    descriptions = load(f)

In [3]:
descriptions["2394267183_735d2dc868"]

['startseq a dog goes through an obstacle course while a person looks on  endseq',
 'startseq a dog is going through a slalom style obstacle course  endseq',
 'startseq a dog performs a slalom like obstacle while the owner walks along side  endseq',
 'startseq a dog plays with a man by running around poles  endseq',
 'startseq the woman is training a white dog to zigzag through metal poles  endseq']

In [4]:
# train descriptions dictionary creation
train_descriptions = dict()
with open("data/Flickr8k_text/Flickr_8k.trainImages.txt","r") as f:
    data = f.read()
    
try:
    for el in data.split("\n"):
        tokens = el.split(".")
        image_id = tokens[0]
        if image_id in descriptions:
            train_descriptions[image_id] = descriptions[image_id]
                    
except Exception as e:
    print("Exception got :- \n",e)

In [5]:
#save the file
dump(train_descriptions,open("train_descriptions.pkl","wb"))

Vocablury Preparation

In [6]:
from nltk import FreqDist

In [7]:
# creating corpus
corpus = ""

with open("train_descriptions.pkl","rb") as f:
    train_descriptions = load(f)
for ec in train_descriptions.values():
    for el in ec:
        corpus += " "+el
    

In [8]:
total_words = corpus.split()
vocabulary = set(total_words)
print("The size of vocablury is {}".format(len(vocabulary)))

The size of vocablury is 7373


In [9]:
# creating frequecny distribution of words
freq_dist = FreqDist(total_words)
freq_dist.most_common(5)

[('a', 46784),
 ('startseq', 30000),
 ('endseq', 30000),
 ('in', 14094),
 ('the', 13509)]

In [10]:
#removing least common words from vocabulary
for ew in list(vocabulary):
    if(freq_dist[ew]<10):
        vocabulary.remove(ew)

In [11]:
VOCAB_SIZE = len(vocabulary)+1
print("Total unique words after remooving less frequent word from our corpus = {}".format(VOCAB_SIZE))

Total unique words after remooving less frequent word from our corpus = 1665


In [12]:
train_descriptions

{'2513260012_03d33305cf': ['startseq a black dog is running after a white dog in the snow  endseq',
  'startseq black dog chasing brown dog through snow endseq',
  'startseq two dogs chase each other across the snowy ground  endseq',
  'startseq two dogs play together in the snow  endseq',
  'startseq two dogs running through a low lying body of water  endseq'],
 '2903617548_d3e38d7f88': ['startseq a little baby plays croquet  endseq',
  'startseq a little girl plays croquet next to a truck  endseq',
  'startseq the child is playing croquette by the truck  endseq',
  'startseq the kid is in front of a car with a put and a ball  endseq',
  'startseq the little boy is playing with a croquet hammer and ball beside the car  endseq'],
 '3338291921_fe7ae0c8f8': ['startseq a brown dog in the snow has something hot pink in its mouth  endseq',
  'startseq a brown dog in the snow holding a pink hat  endseq',
  'startseq a brown dog is holding a pink shirt in the snow  endseq',
  'startseq a dog 

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [14]:
caption_list = []
for el in train_descriptions.values():
    for ec in el:
        caption_list.append(ec)
print("The total caption present = {}".format(len(caption_list)))

The total caption present = 30000


In [15]:
token = Tokenizer(num_words=VOCAB_SIZE)
token.fit_on_texts(caption_list)

1. **Caution**: tokenizer considers only top words provided by num_words while converting to sequences. i.e. if word is not present the it skips the word.
2. But toke.index_word keeps all the record of words in whole vocabulary.So, remoove unnecessary words while using token.index_words as a dictionary.

In [16]:
# index to words are assigned according to frequency. i.e the most frequent word has index of 1
ix_to_word = token.index_word

In [17]:
for k in list(ix_to_word):
    if k>=1665:
        ix_to_word.pop(k, None)

In [18]:
word_to_ix = dict()
for k,v in ix_to_word.items():
    word_to_ix[v] = k

In [19]:
print(len(word_to_ix))
print(len(ix_to_word))

1664
1664


In [20]:
# finding the max_length caption
MAX_LENGTH = 0
temp = 0
for ec in caption_list:
    temp = len(ec.split())
    if(MAX_LENGTH<=temp):
        MAX_LENGTH = temp

In [21]:
print("Maximum caption has length of {}".format(MAX_LENGTH))

Maximum caption has length of 39


In [22]:
# please refer https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8
# data generator, intended to be used in a call to model.fit_generator()

def data_generator(descriptions, photos, MAX_LENGTH,VOCAB_SIZE, num_photos_per_batch):
    X1, X2, y = list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in train_descriptions.items():
            n+=1
            # retrieve the photo feature
            photo = photos[key]
            for desc in desc_list:
                seq = token.texts_to_sequences([desc])
                seq = seq[0]
                for i in range(1,len(seq)):
                    in_seq , op_seq = seq[:i],seq[i]
                    #converting input sequence to fix length
                    in_seq = pad_sequences([in_seq],maxlen=MAX_LENGTH,padding="post")[0]
                    # converting op_seq to vocabulary size
                    op_seq = to_categorical([op_seq],num_classes=VOCAB_SIZE)[0]
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(op_seq)
            # yield the batch data
            if n==num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n=0


1. We are using glove_vector file for word embedding as it is already avaialable with us.
2. Each sequence will be encoded [34] ==> [34x300]
3. For words which are not in glove_vector we are setting the vector to zero

In [23]:
# make sure you have the glove_vectors file
with open('glove_vectors', 'rb') as f:
    glove = load(f)
    glove_words =  set(glove.keys())

In [24]:
EMBEDDING_SIZE = 300

# Get 300-dim dense vector for each of the words in vocabulary
embedding_matrix = np.zeros((VOCAB_SIZE,EMBEDDING_SIZE))
embedding_matrix.shape

(1665, 300)

In [25]:
EMBEDDING_SIZE = 300

# Get 300-dim dense vector for each of the words in vocabulary
embedding_matrix = np.zeros(((VOCAB_SIZE),EMBEDDING_SIZE))

for word, i in word_to_ix.items():
    embedding_vector = np.zeros(300)
    if word in glove_words:
        embedding_vector = glove[word]
        embedding_matrix[i] = embedding_vector
    else:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector


In [26]:
# save the embedding matrix to file
with open("embedding_matrix.pkl","wb") as f:
    dump(embedding_matrix,f)

In [27]:
embedding_matrix.shape

(1665, 300)

In [28]:
# save the tokenizer to file
with open("token.pkl","wb") as f:
    dump(token,f)

**Training the model**

In [29]:
# load image feature extracted file
with open("train_image_extracted.pkl","rb") as f:
    train_image_extracted = load(f)

In [30]:
# reload model saved in final model ipynb
#https://stackoverflow.com/questions/42763094/how-to-save-final-model-using-keras

from keras.models import model_from_json
from keras.models import load_model

In [37]:
with open("model.json","r") as f:
    model = model_from_json(f.read())

print("Model loaded successfully")

Model loaded successfully


In [38]:
# loading the weights of the model
model.load_weights("model_weights/model_19.h5")

In [39]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [40]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 39, 300)      499500      input_1[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 2048)         0           input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 

Total Epochs = 30

For 1st 20 epoch , batch size = 5 images ====> stpes per epoch = 1200

In [None]:
epochs = 20
batch_size = number_pics_per_bath = 5
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
len(train_descriptions)

In [37]:
for i in range(epochs):
    if epochs<11:
    generator = data_generator(train_descriptions,train_image_extracted,MAX_LENGTH,VOCAB_SIZE,number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('model_weights/model_demo' + str(i) + '.h5')


Epoch 1/1


AttributeError: 'float' object has no attribute 'eval'

For last 10 epochs,

modifying learning rate to 0.0001 and batch size = 10 , steps per epoch = 600

In [35]:
batch_size = number_pics_per_bath = 10
steps = len(train_descriptions)//number_pics_per_bath
epochs = 10
# modifying learning rate to 0.0001
#https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8
model.optimizer.lr = 0.0001

In [41]:
for i in range(epochs):
    generator = data_generator(train_descriptions,train_image_extracted,MAX_LENGTH,VOCAB_SIZE,number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [42]:
model.save_weights('model_weights/model_final.h5')