# Image Caption Generator

### Prepare

In [None]:
import numpy as np
import os
import string
from pickle import dump, load
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

from keras.models import Model
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint


from nltk.translate.bleu_score import corpus_bleu
from keras.models import load_model

In [None]:
def preprocess_img(img_name):
    """Load an image file and turn it into a one-item VGG16 input batch.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis of size 1, and run through the VGG16 ``preprocess_input``.
    """
    pixels = img_to_array(load_img(img_name, target_size=(224, 224)))
    # Prepend the batch dimension: (h, w, c) -> (1, h, w, c).
    batch = pixels.reshape((1,) + pixels.shape)
    return preprocess_input(batch)

In [None]:
def prepare_image(img_dir):
    """Extract VGG16 features for every file in *img_dir* and pickle them.

    Each file is preprocessed with ``preprocess_img`` and passed through a
    VGG16 network truncated at its second-to-last layer. The results are
    stored in a dict keyed by the file name up to its first '.', and that
    dict is written to ``features.pkl`` in the current directory.

    Args:
        img_dir: Directory whose entries are all treated as image files.
    """
    base = VGG16()
    # Use the second-to-last layer's output as the image feature.
    model = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    # summary() prints by itself and returns None; wrapping it in print()
    # would emit an extra "None" line.
    model.summary()

    features = {}
    for name in tqdm(os.listdir(img_dir)):
        image = preprocess_img(os.path.join(img_dir, name))
        # Same id rule as the caption files: text before the first '.'.
        image_id = name.split('.')[0]
        features[image_id] = model.predict(image, verbose=0)

    print('Extracted Features: %d' % len(features))
    # Context manager guarantees the pickle is flushed and closed.
    with open('features.pkl', 'wb') as handle:
        dump(features, handle)

# Run feature extraction over the dataset directory (writes features.pkl).
prepare_image('Flickr8k_Dataset')

In [None]:
def load_doc(filename):
    """Return the entire contents of the text file *filename* as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The file's full text.
    """
    # The context manager closes the file even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
def prepare_text(filename):
    """Clean the raw caption file and write the result to ``descriptions.txt``.

    Every usable input line starts with an image file-name token followed by
    the caption words. Captions are grouped by image id (the first token up
    to its first '.'), lower-cased, stripped of punctuation, and reduced to
    purely alphabetic words longer than one character. The output file holds
    one ``"<image_id> <caption>"`` line per caption.

    Args:
        filename: Path of the raw caption (token) file.
    """
    text = load_doc(filename)

    # Load: group captions by image id.
    desc = {}
    for line in text.split('\n'):
        tokens = line.split()
        # Skip too-short lines, and whitespace-only lines that would
        # otherwise raise IndexError on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        image_id = tokens[0].split('.')[0]
        desc.setdefault(image_id, []).append(' '.join(tokens[1:]))

    # Clean: normalise every caption in place.
    table = str.maketrans('', '', string.punctuation)
    for desc_list in desc.values():
        for i, caption in enumerate(desc_list):
            words = (word.lower().translate(table) for word in caption.split())
            desc_list[i] = ' '.join(
                w for w in words if len(w) > 1 and w.isalpha()
            )

    # Save: one "<id> <caption>" line per caption.
    lines = [key + ' ' + d for key, desc_list in desc.items() for d in desc_list]
    with open('descriptions.txt', 'w') as file:
        file.write('\n'.join(lines))
    
# Clean the raw caption file (writes descriptions.txt).
prepare_text('Flickr8k_text/Flickr8k.token.txt')

### Preprocess

In [None]:
def load_datatset(filename):
    """Return the set of image identifiers listed in *filename*.

    One entry is read per non-empty line; the identifier is the text before
    the line's first '.'.
    """
    return {
        entry.split('.')[0]
        for entry in load_doc(filename).split('\n')
        if entry
    }

In [None]:
def load_text(filename, dataset):
    """Load the cleaned captions for the images in *dataset*.

    Each non-blank line of *filename* is ``"<image_id> <caption words...>"``.
    Captions whose id is in *dataset* are wrapped in ``startseq`` /
    ``endseq`` markers and grouped by image id.

    Args:
        filename: Path of the cleaned descriptions file.
        dataset: Collection of image ids to keep.

    Returns:
        Dict mapping image id to its list of wrapped caption strings.
    """
    doc = load_doc(filename)
    descriptions = {}
    for line in doc.split('\n'):
        tokens = line.split()
        # Blank lines (e.g. a trailing newline) have no id token; skip them
        # instead of failing on tokens[0].
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            caption = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, []).append(caption)
    return descriptions

In [None]:
def load_img_feat(filename, dataset):
    """Load pickled image features and keep only the ids in *dataset*.

    Args:
        filename: Path of a pickle file holding a dict of id -> feature.
        dataset: Iterable of image ids to select.

    Returns:
        Dict containing exactly the entries of the pickled dict whose keys
        are in *dataset*.

    Raises:
        KeyError: If an id in *dataset* is missing from the pickled dict.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted source.
    with open(filename, 'rb') as handle:
        all_features = load(handle)
    return {k: all_features[k] for k in dataset}

In [None]:
def to_lines(descriptions):
    """Flatten a mapping of image id -> caption list into one caption list.

    Args:
        descriptions: Dict whose values are lists of caption strings.

    Returns:
        A new list with every caption, in dict order then per-image order.
    """
    # Build the result directly rather than using a comprehension purely for
    # its append() side effect.
    return [d for desc_list in descriptions.values() for d in desc_list]

def create_tokenizer(descriptions):
    """Return a ``Tokenizer`` fitted on every caption in *descriptions*."""
    fitted = Tokenizer()
    fitted.fit_on_texts(to_lines(descriptions))
    return fitted

def get_max_length(descriptions):
    """Return the word count of the longest caption in *descriptions*."""
    word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)

In [None]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    """Expand captions into (photo, padded prefix, one-hot next word) samples.

    For every caption in *desc_list* and every split point after its first
    token, one sample is produced: the tokens before the split (padded to
    *max_length*) as input and the token at the split (one-hot over
    *vocab_size* classes) as target. *photo* is repeated for each sample.

    Returns:
        Three NumPy arrays: photos, padded input sequences, one-hot targets.
    """
    images, prefixes, targets = [], [], []
    for caption in desc_list:
        encoded = tokenizer.texts_to_sequences([caption])[0]
        # Split point `cut` pairs the first `cut` tokens with the next one.
        for cut, next_word in enumerate(encoded[1:], start=1):
            padded = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
            one_hot = to_categorical([next_word], num_classes=vocab_size)[0]
            images.append(photo)
            prefixes.append(padded)
            targets.append(one_hot)
    return np.array(images), np.array(prefixes), np.array(targets)

In [None]:
# Training split: image ids, their cleaned captions, and their precomputed features.
train = load_datatset('Flickr8k_text/Flickr_8k.trainImages.txt')
print('Train Dataset: %d' % len(train))
train_desc = load_text('descriptions.txt', train)
print('Train Descriptions: %d' % len(train_desc))
train_feat = load_img_feat('features.pkl', train)
print('Train Images: %d' % len(train_feat))

# Evaluation split; note that the ids are read from the *dev* image list.
test = load_datatset('Flickr8k_text/Flickr_8k.devImages.txt')
print('Test Dataset: %d' % len(test))
test_desc = load_text('descriptions.txt', test)
print('Test Descriptions: %d' % len(test_desc))
test_feat = load_img_feat('features.pkl', test)
print('Test Images: %d' % len(test_feat))

In [None]:
# Fit the tokenizer on the training captions only and persist it so the
# generation step can reload the same vocabulary.
tokenizer = create_tokenizer(train_desc)
# Context manager guarantees the pickle is flushed and closed.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
# +1 leaves room for index 0, which no word uses (it serves as padding).
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
max_length = get_max_length(train_desc)
print('Max description Length: %d' % max_length)

In [None]:
# NOTE(review): create_sequences iterates its third argument as a list of
# caption strings and appends its fourth argument once per sample, i.e. it
# expects ONE image's captions and feature. Here the whole id->captions and
# id->feature dicts are passed instead, so the dict keys get tokenised as if
# they were captions. The resulting arrays are not referenced anywhere else in
# this notebook (training uses data_generator) -- confirm whether this cell
# can be removed or should loop per image.
X1_train, X2_train, y_train = create_sequences(tokenizer, max_length, train_desc, train_feat, vocab_size)
X1_test, X2_test, y_test = create_sequences(tokenizer, max_length, test_desc, test_feat, vocab_size)

### Training 

In [None]:
def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One input takes a 4096-value image feature, the other a token sequence of
    length *max_length*. Both are projected to 256 units, summed, and decoded
    into a softmax over *vocab_size* words. The model summary is printed
    before the compiled model is returned.
    """
    # Image branch: dropout, then a 256-unit projection.
    image_input = Input(shape=(4096,))
    image_drop = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_drop)

    # Text branch: masked embedding, dropout, then an LSTM.
    text_input = Input(shape=(max_length,))
    text_embed = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_drop = Dropout(0.5)(text_embed)
    text_state = LSTM(256)(text_drop)

    # Decoder: sum both branches and predict the next word.
    merged = add([image_dense, text_state])
    hidden = Dense(256, activation='relu')(merged)
    word_probs = Dense(vocab_size, activation='softmax')(hidden)

    # Inputs are [image, sequence]; the output is the next-word distribution.
    captioner = Model(inputs=[image_input, text_input], outputs=word_probs)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    captioner.summary()
    return captioner

In [None]:
def data_generator(tokenizer, max_length, descriptions, photos, vocab_size):
    """Endlessly yield the training samples of one image at a time.

    Cycles over *descriptions* forever; each yield is
    ``([photo_array, sequence_array], target_array)`` built by
    ``create_sequences`` from one image's captions and the first row of its
    stored feature.
    """
    while True:
        for key in descriptions:
            in_img, in_seq, out_word = create_sequences(
                tokenizer, max_length, descriptions[key], photos[key][0], vocab_size
            )
            yield [in_img, in_seq], out_word

In [None]:
# Build the captioning network sized for this vocabulary and caption length.
model = define_model(vocab_size, max_length)

In [None]:
# Train one epoch per fit call so a checkpoint can be saved after each epoch.
epochs = 20
# The generator yields one batch per image, so one epoch = one pass over the
# training images.
steps = len(train_desc)
for i in range(epochs):
    # A fresh generator each epoch restarts iteration from the first image.
    generator = data_generator(tokenizer, max_length, train_desc, train_feat, vocab_size)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps)
    model.save('model_' + str(i) + '.h5')

### Evaluation

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose tokenizer index equals *integer*, else ``None``."""
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

def generate_desc(model, tokenizer, max_length, photo):
    """Greedily generate a caption for *photo*, one word per step.

    Starts from ``startseq`` and, for at most *max_length* steps, feeds the
    text so far to the model and appends the highest-scoring word. Stops
    early when the predicted index has no word or the word is ``endseq``.

    Returns:
        The generated text, including the ``startseq`` marker (and
        ``endseq`` when it was produced).
    """
    caption = ['startseq']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([' '.join(caption)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)

        best_id = np.argmax(model.predict([photo, padded], verbose=0))
        next_word = word_for_id(best_id, tokenizer)
        if next_word is None:
            break
        caption.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(caption)

In [None]:
def evaluate_model(model, tokenizer, max_length, descriptions, photos):
    """Print corpus BLEU-1..4 of generated captions against the references.

    For every image id in *descriptions*, a caption is generated from
    ``photos[id]`` and compared with that image's reference captions (all
    split on whitespace).

    Args:
        model: Trained captioning model passed to ``generate_desc``.
        tokenizer: Tokenizer passed to ``generate_desc``.
        max_length: Maximum caption length passed to ``generate_desc``.
        descriptions: Dict of image id -> list of reference captions.
        photos: Dict of image id -> image feature.
    """
    actual, predicted = [], []
    for key, desc_list in descriptions.items():
        y_pred = generate_desc(model, tokenizer, max_length, photos[key])
        actual.append([d.split() for d in desc_list])
        predicted.append(y_pred.split())

    # Cumulative n-gram weights; each tuple must sum to 1 so the score is a
    # proper geometric mean. BLEU-3 therefore uses thirds, not 0.3.
    bleu_weights = (
        ('BLEU-1', (1.0, 0, 0, 0)),
        ('BLEU-2', (0.5, 0.5, 0, 0)),
        ('BLEU-3', (1 / 3, 1 / 3, 1 / 3, 0)),
        ('BLEU-4', (0.25, 0.25, 0.25, 0.25)),
    )
    for label, weights in bleu_weights:
        print('%s: %f' % (label, corpus_bleu(actual, predicted, weights=weights)))

In [None]:
# Evaluate the checkpoint written after the final training epoch.
model_name = 'model_{}.h5'.format(epochs-1)
model = load_model(model_name)
evaluate_model(model, tokenizer, max_length, test_desc, test_feat)

### Generation

In [None]:
def extract_features(img_name):
    """Return the truncated-VGG16 feature for the single image *img_name*.

    A new VGG16 network, cut at its second-to-last layer, is built on every
    call and applied to the preprocessed image.
    """
    base = VGG16()
    extractor = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    return extractor.predict(preprocess_img(img_name), verbose=0)

# NOTE: unpickling can execute arbitrary code; only load a tokenizer file
# that this notebook produced.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
model = load_model(model_name)
img_name = 'a.jpg'
# Features keep their leading batch axis, as generate_desc feeds them to predict().
image_feature = extract_features(img_name)
predict_desc = generate_desc(model, tokenizer, max_length, image_feature)
print("\n", predict_desc, "\n")

In [None]:
# Show the input image with the generated caption as its title.
img = plt.imread(img_name)
plt.imshow(img)
# Strip the sequence markers so only the caption words are displayed.
predict_desc_clean = predict_desc.replace('startseq', '')
predict_desc_clean = predict_desc_clean.replace('endseq', '')
plt.title(predict_desc_clean)
plt.show()

In [None]:
'''
Inspiration
1. https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
'''