In [25]:
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
from keras.utils import to_categorical, plot_model
from nltk.translate.bleu_score import corpus_bleu
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.applications.vgg16 import VGG16
from keras.models import load_model
from IPython.display import display
from keras.layers.merge import add
from numpy import array, argmax
from keras.models import Model
from pickle import dump, load
from PIL import Image
import string


In [2]:
def extract_features(directory):
    model = VGG16()    #Load Model(Model used to extract features from image)
    model.layers.pop()    #Make model editable
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)    #Remove last layer because its used to classify images.
    print(model.summary())
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        print(image.shape)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))    #Reshape for model
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)    #Getting Features
        image_id = name.split('.')[0]
        features[image_id] = feature
        print('>%s' % name)
    return features

In [3]:
def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(line) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]    #First token is ImageID and second token is complete description
        image_id = image_id.split('.')[0]
        image_desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = list()
        mapping[image_id].append(image_desc)
    return mapping

In [4]:
def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]    #Convert to LowerCase
            desc = [w.translate(table) for w in desc]     #Remove Punctuation
            desc = [word for word in desc if word.isalpha()]    #Keep only Alphabets
            desc_list[i] =  ' '.join(desc)

In [5]:
def to_vocabulary(descriptions):
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

In [6]:
def save_descriptions(descriptions, filename):
    lines = []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [7]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [8]:
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

In [9]:
# Load Cleaned Descriptions
def load_clean_descriptions(filename, dataset):
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

In [10]:
# Load Photo Features
def load_photo_features(filename, dataset):
    all_features = load(open(filename, 'rb'))
    features = {k: all_features[k] for k in dataset}    # Filter Features
    return features

In [11]:
# Dictionary of Clean Descriptions -> List of Descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [12]:
# Fit a Tokenizer Given Caption Descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [46]:
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        # Encode the Sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            # Split into Input and Output Pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad Input Sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # Encode Output Sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

In [14]:
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [15]:
# Define the Captioning Model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(4096,))    # Feature Extractor Model
    dropoutlayer1 = Dropout(0.5)(inputs1)
    dropoutlayer2 = Dense(256, activation='relu')(dropoutlayer1)
    inputs2 = Input(shape=(max_length,))
    embedding1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    dropoutlayer3 = Dropout(0.5)(embedding1)
    long_short_term_memory_layer = LSTM(256)(dropoutlayer3)
    decoder1 = add([dropoutlayer2, long_short_term_memory_layer])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [16]:
def data_generator(descriptions, photos, tokenizer, max_length):
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            yield [[in_img, in_seq], out_word]

In [None]:
#Extract Features using VGG Model
directory = r'Flicker8k_Dataset'
features = extract_features(directory)

In [None]:
#Store the features
print('Extracted Features: %d' % len(features))
dump(features, open(r'features.pkl', 'wb'))

In [17]:
#Load Descriptions
text_filepath = r'Flickr8k_text\Flickr8k.token.txt'
doc = open(text_filepath, 'r').read()


In [18]:
#Parse Descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [19]:
#Construct a Vocabulary of words
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 9630


In [20]:
# save descriptions
save_descriptions(descriptions, r'descriptions.txt')

In [21]:
# load training dataset (6K)
filename = r'Flickr8k_text\Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(r'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features(r'features.pkl', train)
print('Photos: train=%d' % len(train_features))

Dataset: 6000
Descriptions: train=6000
Photos: train=6000


In [22]:
# Prepare Tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 7378


In [23]:
# Determine the Maximum Sequence Length
max_length1 = max_length(train_descriptions)
print('Description Length: %d' % max_length1)

# Define the Model
model = define_model(vocab_size, max_length1)

epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length1)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(r'Models\model_' + str(i) + '.h5')

Dataset: 6000
Descriptions: train=6000
Photos: train=6000
Vocabulary Size: 7378
Description Length: 40
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 256)      1888768     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_1[0][0]                    
______

In [26]:
# Map an Integer to a Word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [41]:
# Generate a Description for an Image
def generate_desc(model, tokenizer, photo, maximum):
    in_text = 'startseq'
    for i in range(maximum):
        # Integer Encode Input Sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=maximum)
        # Predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # Map integer to word
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        # Stop if we Predict the End of the Sequence
        if word == 'endseq':
            break
    return in_text

In [42]:
# Evaluate Model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    for key, desc_list in descriptions.items():
        # Generate Description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # Calculate BLEU(Bilingal Evaluation Understudy) score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [43]:
# Load Test Set
filename = r'Flickr8k_text\Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))

# Descriptions
test_descriptions = load_clean_descriptions(r'descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))

# Photo Features
test_features = load_photo_features(r'features.pkl', test)
print('Photos: test=%d' % len(test_features))

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


In [44]:
# Load the Model
filename = r'Models\model_19.h5'
model = load_model(filename)
# Evaluate Model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length1)

BLEU-1: 0.500167
BLEU-2: 0.269452
BLEU-3: 0.176267
BLEU-4: 0.081760


In [48]:
# load training dataset (6K)
filename = r'Flickr8k_text\Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(r'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# save the tokenizer
dump(tokenizer, open(r'tokenizer.pkl', 'wb'))

Dataset: 6000
Descriptions: train=6000


# RUN FROM HERE IF YOU HAVE TRAINED A MODEL BEFORE AND HAVE SAVED THE TOKENIZER

In [61]:
# Load the Tokenizer
tokenizer = load(open(r'tokenizer.pkl', 'rb'))
# pre-define the max sequence length (from training)
max_length = 40

In [62]:
model = load_model(r'Models\model_19.h5')

In [63]:
def extract_features(filename):
    model = VGG16()
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    image = load_img(filename, target_size=(224, 224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    return feature

In [64]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

In [80]:
photo = extract_features(r'C:\Users\aksha\Desktop\example2.jpeg')

In [81]:
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

startseq a man in a black shirt and a black shirt is standing in front of a large building endseq
