<h2> Do not run all the cells if you already have the files</h2>

<h3> Importing libraries</h3>


In [1]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from pickle import load

Using TensorFlow backend.


<h3>Extract the features from each photo in our directory</h3>

In [None]:
def extract_features(directory):
    # load the model
    model = VGG16()
    # modify the model
    model.layers.pop()
    "We dont need the last layer of  prediction layer"
    model = Model(inputs = model.inputs, outputs = model.layers[-1].output)
    print(model.summary())
    # Extracting features from each photo
    features = dict()
    for name in listdir(directory):
        # load an image 
        filename = directory + '/' + name
        image = load_img(filename, target_size(224,224))
        # convert the image pixels to a numpy array
        image = img_to_array(iamge)
        # reshape the data for the model
        image = image.reshape((1,image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        image = preprocess_input(image)
        # get the features
        features = model.predict(image, verbose = 0)
        # get image id
        image_id = name.split('.')[0]
        # store features
        features[image_id] = feature
        # showing the file names
        print('--> %s ' % name)
    return features

# extract features from all images
directory = 'YOUR DIRECTORY'
features = extract_features(directory)
print('extracted features len: %d' % len(features))
# Save those to a file
dump(features, open('features.pkl', 'wb'))    

<h3>Preparing the Text Data</h3>

In [2]:
# Load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all the text
    text = file.read()
    # Close the file
    file.close()
    return text

filename= "D:\Ongoing work\PROJECTS\Image Captioning\Flickr8k.token.txt"

# Load descriptions
doc = load_doc(filename)


In [3]:
# Extract description for images
def load_descriptions(doc):
    mapping = dict()
    # Process lines
    for line in doc.split('\n'):
        # Split the line by whitespace
        tokens = line.split()
        if (len(line) < 2):
            continue
        # Take the first toekn as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # Remove filename from image id
        image_id = image_id.split('.')[0]
        # Convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
        # store description
        mapping[image_id].append(image_desc)
    return mapping

#Parse descriptinos
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
    

Loaded: 8092 


<h3>Next we need to clean the description text.</h3> <br>
<p>Convert all the words to lowercase<br>
Remove all punctuation<br>
Remove all the words that are one character or less in length like 'a' <br>
Remove all the words with numbers in them<br></p>

In [4]:
import string

def clean_descriptions(descriptions):
    # Prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            #tokenize
            desc = desc.split()
            #Convert to lower case
            desc = [word.lower() for word in desc]
            #Remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # Remove hanging 's' and 'a' 
            desc = [word for word in desc if len(word) > 1]
            # Remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # Store as string
            desc_list[i] = ' '.join(desc)
    
# Clean description
clean_descriptions(descriptions)

In [5]:
# Lets find out the size of our vocabulary
# means lets convert the loaded discriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # Build a list of all desctiption strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

# Summarizing the vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocavulary Size : %d' % len(vocabulary))

Vocavulary Size : 8763


<h3>!! Do not run this cell if you already have the descriptions.txt file !! </h3>
<p> It will take time to generate 'descriptions.txt' file</p>

In [None]:
# Saving our filtered image id and descriptions in a file
def save_descriptions(descriptions, filename):
    counter = 1
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
        data = '\n'.join(lines)
        print(lines)
        file = open(filename, 'w')
        file.write(data)
        file.close()
# Saving descriptions
save_descriptions(descriptions, 'descriptions.txt')

<h2> Developing the Deep Learning Model </h2>

In [6]:
from numpy import array
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input,Dense, LSTM, Embedding, Dropout,GRU
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint


<h3> Loading the data</h3>

In [7]:
# Load doc to memory

def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all the texts 
    text = file.read()
    #Close the file
    file.close()
    return text

# Load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # Process line by line
    for line in doc.split('\n'):
        # Skip empty lines
        if (len(line) < 2):
            continue
        # Get image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

doc = load_doc('descriptions.txt')

In [8]:
# get the clean descriptions from our dataset
def load_clean_descriptions(filename, dataset):
    # Load the document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # Split line by white space
        tokens = line.split()
        # Split id from desctiptions just like before
        image_id, image_desc = tokens[0], tokens[1:]
        # Skip images not in the set
        if image_id in dataset:
            # Create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # Wrap descriptions in tokens
            desc = 'startseq' + ' '.join(image_desc) + ' endseq'
            # Store 
            descriptions[image_id].append(desc)
    return descriptions


In [9]:
# Load photo features
def load_photo_features(filename, dataset):
    # loading features
    all_features = load(open(filename, 'rb'))
    features = {k: all_features[k] for k in dataset}
    return features

In [10]:
# Load training dataset

filename= 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# Descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: Train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('photos: train = %d ' %len(train_features))


Dataset: 6000
Descriptions: Train=6000
photos: train = 6000 


In [13]:
# Convert a dictionary of clean descriptions to a list of descriptions
from keras.preprocessing.text import Tokenizer

def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# Fit a tokenizer given caption descriptions 
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size %d' % vocab_size)

Vocabulary Size 4142


In [12]:
# get the lenght of the description having max len
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

max_length = max_length(train_descriptions)
print(max_length)

29


<h4> Using data Generator for loading data to model</h4>

In [14]:
def data_generator(descriptions, photos, tokenizer, max_length):
    # Loop over all images
    while 1:
        for key, desc_list in descriptions.items():
            # getting the photo features
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            yield [[in_img, in_seq], out_word]            
            

In [15]:
# Creating sequences of image, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # Split one sequence into multiple X, y pairs
        for i in range(1, len(seq)):
            # Split into output and input pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
            # encode output sequenc
            out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
            # Store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)
            
    

In [19]:
# Testing the data generator
generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(53, 4096)
(53, 39)
(53, 7793)


<h3> Defining the captioning model</h3>

In [20]:
def define_model(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(Inputs1)
    fe2 = Dense(256, activation = 'relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length))
    se1 = Embedding(vocab_size, 256, mask_zero = True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation = 'relu')(decoder1)
    outputs = Dense(vocab_size, activation= 'softmax')(decoder2)
    # adding it together [img, seq][word]
    model = Model(inputs=[inputs1, inputs2], outputs= outputs)
    # compile the model
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    # summarize model
    model.summary()
    return model

In [21]:
# load training dataset (6K)
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
print(max_length)

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7793
39


In [None]:
# Fit the training model and dump its parameters

# define the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoch
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
    # create the data generator
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # save model
    model.save('model_' + str(i) + '.h5')

<h3>Evaluate the model</h3>

In [22]:
# mapping an integer to a word (opposite of tokenization)
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None


In [24]:

# generating a description for an image
def generate_desc(model, tokenizer,photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    # interate over the whole length of sequence
    for i in range(max_length):
        #integer encode inptut sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo, sequence], verbose = 0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if it cannot map the word
        if word is None:
            break
        
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if it predict the end of sequence
        if word == 'endseq':
            break
    return in_text
    

<h3>Using BLEU Score to evaluate the model</h3>

In [29]:
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu
from keras.models import load_model
import itertools

In [31]:
# def evaluate_model(model, descriptions, photos, tokenizer, max_length):
#     actual, predicted = list(), list()
#     count = 1 # just to see the progress
#     # go through the whole set
#     for key, desc_list in descriptions.items():
#         print(count)
#         # generate descriptions
#         yhat = generate_desc(model, tokenizer, photos[key], max_length)
#         # store actual and predicted
#         original_desc = [d.split() for d in desc_list]
#         actual.append(original_desc)
#         predicted.append(yhat.split())
#         count += 1
#     # Calculating the BLUE score
#     # Calculating bleu score
#     print('BLEU-1 :-> %f ' %corpus_bleu(actual, predicted, weights = (1.0,0,0,0)))
#     print('BLEU-2 :-> %f ' %corpus_bleu(actual, predicted, weights = (0.5,0.5,0,0)))
#     print('BLEU-3 :-> %f ' %corpus_bleu(actual, predicted, weights = (0.3,0.3,0.3,0)))
#     print('BLEU-4 :-> %f ' %corpus_bleu(actual, predicted, weights = (0.25,0.25,0.25,0.25)))

In [48]:
# EVALUATE THE SKILL OF THE MODEL
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = list(), list()
	# step over the whole set
	for key, desc in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		actual.append(i.split() for i in desc)
		predicted.append(i.split() for i in yhat)
		print('Actual:    %s' % desc[1])
		print('Predicted: %s' % yhat)
		if len(actual) >= 5:
			break
	# calculate BLEU score
	bleu = corpus_bleu(actual, predicted)
	return bleu

['w']

In [45]:
# Preparing the test set
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.testImages.txt'
test= load_set(filename)
print('Data set : %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test= %d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test= %d' % len(test_features))


Data set : 1000
Descriptions: test= 1000
Photos: test= 1000


<h2>TESTING</h2>

In [50]:
train_results, test_results = list(), list()

for i in range(2):
    # define the model
    model = load_model('model_19.h5')
    # evaluate model on training data and testing data
#     train_score = evaluate_model(model, train_descriptions, train_features, tokenizer, max_length)
    test_score = evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)
#     print(train_score)
    print(test_score)
    # store
#     train_results.append(train_score)
    test_results.append(test_score)
    print('>%d: train=%f test=%f' % ((i+1), train_score, test_score))

    

    
    

Actual:    startseqA blond woman is on the street hailing a taxi . endseq
Predicted: startseq man in black shorts is standing next to a park endseq
Actual:    startseqA boy smiles for the camera at a beach . endseq
Predicted: startseq donut in water endseq
Actual:    startseqA man and a woman are sitting on a dock together . endseq
Predicted: startseq brother in a red shirt is riding a skateboard on a bench endseq
Actual:    startseqa large white dog lying on the floor . endseq
Predicted: startseq a dog running through the grass endseq
Actual:    startseqA little boy in orange shorts playing with a toy . endseq
Predicted: startseq lines to man is standing on a bench endseq


TypeError: object of type 'generator' has no len()

In [None]:
# Preparing the test set
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.testImages.txt'
test= load_set(filename)
print('Data set : %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test= %d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test= %d' % len(test_features))

# load the model
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\model_19.h5'

# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

In [34]:
# Preparing the test set
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.testImages.txt'
test= load_set(filename)
print('Data set : %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test= %d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test= %d' % len(test_features))

# load the model
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\model_19.h5'

# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

Data set : 1000
Descriptions: test= 1000
Photos: test= 1000


AttributeError: 'list' object has no attribute 'split'

In [None]:
from keras.preprocessing.text import Tokenizer
from pickle import dump

In [None]:
# Convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [None]:
# Loading the training dataset
filename = 'D:\Ongoing work\PROJECTS\Image Captioning\Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset : %d' %len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('descriptions: train: %d' % len(train_descriptions))
#prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))