In [2]:
import tensorflow as tf
import nltk
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input
from sklearn.utils import shuffle
import numpy as np
import os
import json
import time
from tqdm import tqdm

# Fail fast unless the exact TensorFlow release this notebook was written against is installed.
assert tf.__version__ == "2.0.0"

# Dataset root. The functions below append "annotations/", "train2017/" and "glove_embeddings"
# directly to this string, so it must end with a path separator.
PATH = "/media/jintoboy/Main Storage/Image Captioning/"


def load_image(image_path):
    """Read a JPEG, centre-crop it to a square, resize to 224x224 and apply VGG16 preprocessing.

    Args:
        image_path: scalar string tensor (or str) naming the JPEG file.

    Returns:
        (preprocessed_image, image_path); the path is passed through so callers can
        pair each image with the file it came from.
    """
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    # Use the run-time shape: when this function is traced by Dataset.map the static
    # img.shape is (None, None, 3), so comparing img.shape[0] with img.shape[1]
    # compared None with None and the crop was silently skipped.
    dims = tf.shape(img)
    height, width = dims[0], dims[1]
    side = tf.minimum(height, width)
    # Offsets are zero along the shorter axis, so a square image passes through unchanged.
    img = tf.image.crop_to_bounding_box(img,
                                        (height - side) // 2,
                                        (width - side) // 2,
                                        side, side)
    img = tf.image.resize(img, (224, 224))
    # NOTE(review): feature files cached while the crop was being skipped came from
    # stretched images; presumably they should be regenerated - confirm.
    return preprocess_input(img), image_path


def img_ids_and_captions_from_json(PATH):
    """Return parallel lists of image file paths and captions from the training annotations.

    Args:
        PATH: dataset root (string ending in a separator); the annotation file is read
            from PATH + "annotations/captions_train2017.json".

    Returns:
        (img_locations, captions): one entry per annotation, in file order. Image paths
        are built from the zero-padded 12-digit image id under PATH + "train2017/".
    """
    # partially adapted from https://www.tensorflow.org/tutorials/text/image_captioning
    annotation_file = PATH + "annotations/captions_train2017.json"
    with open(annotation_file, "r") as handle:
        records = json.load(handle)["annotations"]

    img_locations = ["{}train2017/{:012d}.jpg".format(PATH, record["image_id"])
                     for record in records]
    captions = [record["caption"] for record in records]
    return img_locations, captions


def cache_vgg_features(img_ids):
    """Run every distinct image through VGG16 (no top) and save its features next to the image.

    Each feature map is reshaped to (positions, channels) and written with np.save using
    the image path as the file name, so the result lands at "<image path>.npy".

    Args:
        img_ids: iterable of image file paths; duplicates are processed once.
    """
    # adapted from https://www.tensorflow.org/tutorials/text/image_captioning
    distinct_paths = sorted(set(img_ids))
    dataset = (tf.data.Dataset.from_tensor_slices(distinct_paths)
               .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
               .batch(16))
    vgg = VGG16(include_top=False)

    for image_batch, path_batch in tqdm(dataset):
        features = vgg(image_batch)
        batch_size, channels = features.shape[0], features.shape[3]
        # Collapse the two spatial axes into one.
        features = tf.reshape(features, (batch_size, -1, channels))
        for feature_map, path_tensor in zip(features, path_batch):
            np.save(path_tensor.numpy().decode("utf-8"), feature_map.numpy())


def get_glove_embeddings(dimensions):
    """Load GloVe vectors and extend them with three extra dimensions for special tokens.

    Every GloVe vector gets three trailing zeros. "<unk>" and "<no_glove>" both map to the
    mean of all loaded vectors; "<start>", "<end>" and "<pad>" are one-hot in the first,
    second and third extra dimension respectively.

    Args:
        dimensions: GloVe vector size; selects "glove.6B.<dimensions>d.txt" under
            PATH + "glove_embeddings".

    Returns:
        dict mapping token -> numpy vector of length dimensions + 3.
    """
    # 400000 unique tokens in vocabulary
    # partially adapted from https://keras.io/examples/pretrained_word_embeddings/
    glove_file = os.path.join("{}glove_embeddings".format(PATH),
                              "glove.6B.{}d.txt".format(dimensions))
    embeddings_index = {}
    with open(glove_file) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            # add 3 dimensions for <start>, <end>, and <pad>
            embeddings_index[word] = np.fromstring(coefs + " 0 0 0", 'f', sep=' ')

    padded_dims = dimensions + 3

    # Mean of all GloVe vectors, shared by both unknown-word tokens.
    unk_vector = np.zeros((padded_dims,))
    for vector in embeddings_index.values():
        unk_vector += vector
    unk_vector /= len(embeddings_index)
    embeddings_index["<unk>"] = unk_vector
    embeddings_index["<no_glove>"] = unk_vector

    # Each special token owns one of the three extra dimensions.
    for extra_dim, token in enumerate(("<start>", "<end>", "<pad>")):
        one_hot = np.zeros((padded_dims,))
        one_hot[dimensions + extra_dim] = 1
        embeddings_index[token] = one_hot

    return embeddings_index


def get_glove_embedding_matrix(processed_glove_indices, tokenizer, num_words, raw_embedding_dims):
    """Build the (num_words, raw_embedding_dims + 3) embedding matrix for the tokenizer vocabulary.

    Args:
        processed_glove_indices: dict token -> vector of length raw_embedding_dims + 3.
        tokenizer: object exposing word_index (token -> integer index).
        num_words: number of rows; tokens whose index is >= num_words are ignored.
        raw_embedding_dims: GloVe vector size before the three extra dimensions.

    Returns:
        numpy array whose row i is the vector of the token with index i. Tokens missing
        from processed_glove_indices use the "<unk>" vector when one exists, otherwise the
        row stays zero.
    """
    # partially adapted from https://keras.io/examples/pretrained_word_embeddings/
    embedding_matrix = np.zeros((num_words, raw_embedding_dims + 3))
    fallback = processed_glove_indices.get("<unk>")
    for word, i in tokenizer.word_index.items():
        if i >= num_words:
            continue
        # A plain .get(word) yields None for an unknown word, and assigning None into the
        # float matrix corrupts the row (or raises), so fall back explicitly.
        vector = processed_glove_indices.get(word, fallback)
        if vector is not None:
            embedding_matrix[i] = vector

    return embedding_matrix


def save_preprocessed_captions(captions, glove_embedding_index):
    """Tokenize captions, mask non-GloVe words, and write them one per line to disk.

    Each caption is tokenized with nltk, lower-cased, and any token absent from
    glove_embedding_index is replaced by "<no_glove>". The result is wrapped in
    "<start> ... <end>" and written to PATH + "annotations/tokenized_captions.txt".

    Args:
        captions: list of caption strings; it is overwritten in place with the
            processed captions.
        glove_embedding_index: mapping whose keys are the known GloVe tokens.
    """
    glove_words = set(glove_embedding_index.keys())
    for i in range(len(captions)):
        lowered = (token.lower() for token in nltk.tokenize.word_tokenize(captions[i]))
        tokens = [token if token in glove_words else "<no_glove>" for token in lowered]
        captions[i] = "<start> {} <end>".format(" ".join(tokens))
    # The with-block closes the file even if a write fails part-way through.
    with open("{}annotations/tokenized_captions.txt".format(PATH), "w") as captions_file:
        for caption in captions:
            print(caption, file=captions_file)


def retrieve_tokenized_captions_from_file():
    """Read PATH + "annotations/tokenized_captions.txt" and return its lines without the newline."""
    # The with-block guarantees the handle is closed even if reading raises.
    with open("{}annotations/tokenized_captions.txt".format(PATH), "r") as file:
        return [line.rstrip('\n') for line in file]


def save_class_labels_from_capped_sequences(capped_seqs, stop_word_tokenizer_indices):
    """Write each sequence's distinct non-stop-word token indices to the class-label file.

    Args:
        capped_seqs: iterable of token-index sequences.
        stop_word_tokenizer_indices: container of token indices to leave out.

    Output goes to PATH + "annotations/class_labels.txt", one space-separated line per
    sequence (an empty line when nothing is left).
    """
    train_classes = [list(set([token for token in sequence if token not in stop_word_tokenizer_indices]))
                     for sequence in capped_seqs]
    # The with-block closes the file even if a write fails part-way through.
    with open("{}annotations/class_labels.txt".format(PATH), "w") as labels_file:
        for labels in train_classes:
            print(" ".join(map(str, labels)), file=labels_file)


def retrieve_class_labels_from_capped_sequences():
    """Read PATH + "annotations/class_labels.txt" and return one integer array per line."""
    # The with-block guarantees the handle is closed even if parsing raises.
    with open("{}annotations/class_labels.txt".format(PATH), "r") as file:
        return [np.fromstring(line.rstrip('\n'), int, sep=' ') for line in file]


def labels_to_one_hot(labels, num_labels):
    """Sum the one-hot rows of `labels` (depth num_labels) along the first axis."""
    per_label_rows = tf.one_hot(labels, depth=num_labels)
    return tf.reduce_sum(per_label_rows, axis=0)


def one_hot_to_labels(one_hot):
    """Return, for each row of a 2-D indicator tensor, the column indices of its non-zero entries.

    Returns:
        list of numpy arrays, built from a RaggedTensor grouped by row index.
    """
    nonzero_positions = tf.where(tf.not_equal(one_hot, 0))
    row_ids, label_ids = nonzero_positions[:, 0], nonzero_positions[:, 1]
    ragged = tf.RaggedTensor.from_value_rowids(label_ids, row_ids)
    return [row.numpy() for row in list(ragged)]


def img_classifier_func(img_name, labels):
    """Load the cached features for img_name (bytes path) and encode its labels.

    Relies on the module-level num_tokens for the encoding depth.
    """
    feature_path = img_name.decode('utf-8') + '.npy'
    features = np.load(feature_path)
    return features, labels_to_one_hot(labels, num_tokens)


def create_partial_image_classifier(num_tokens):
    """Build the (uncompiled) multi-label classifier on top of cached VGG features.

    Input is (49, 512): VGG output is 7x7x512, but the cached files are 49 x 512.
    The features are flattened, passed through 50% dropout, and mapped to num_tokens
    sigmoid outputs.
    """
    # model architecture inspired by https://arxiv.org/pdf/1511.05284.pdf, https://arxiv.org/pdf/1606.07770.pdf
    vgg_input = tf.keras.layers.Input(shape=(49, 512))
    hidden = tf.keras.layers.Flatten()(vgg_input)
    hidden = tf.keras.layers.Dropout(0.5)(hidden)
    token_scores = tf.keras.layers.Dense(num_tokens, activation="sigmoid")(hidden)

    return tf.keras.models.Model([vgg_input], token_scores)



def compile_classifier_only_model(classifier_model):
    """Compile classifier_model with binary cross-entropy and Adam (lr 1e-5); returns the same model."""
    classifier_model.compile(loss="binary_crossentropy",
                             optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001))
    return classifier_model


def train_image_classifier(image_classifier_model, dataset, epochs):
    """Run train_on_batch over every (imgs, labels) batch of dataset, `epochs` times."""
    for _ in range(epochs):
        for batch in dataset:
            imgs, labels = batch
            image_classifier_model.train_on_batch([imgs], labels)


def create_partial_lstm_language_model(num_tokens, max_sequence_length, embedding_dims, embedding_matrix):
    """Build the (uncompiled) LSTM language model with tied input/output embeddings.

    Args:
        num_tokens: vocabulary size (rows of the embedding matrix, width of the output).
        max_sequence_length: not used inside this function.
        embedding_dims: raw embedding size; the model works with embedding_dims + 3.
        embedding_matrix: array used both to initialise the frozen Embedding layer and,
            transposed, to map hidden vectors back to token scores.

    Returns:
        Keras Model taking [token id sequence, per-step label embedding] and returning a
        softmax distribution over tokens for every time step.
    """
    embed_matrix = tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)

    # Variable-length sequence of token ids.
    sequence_input = tf.keras.layers.Input(shape=(None,))
    # Frozen embedding lookup initialised from embedding_matrix.
    embedded_sequences = tf.keras.layers.Embedding(input_dim=num_tokens,
                                                   output_dim=embedding_dims + 3,
                                                   embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                                   trainable=False)(sequence_input)

    # Second input: a vector in embedding space that is added to every embedded token.
    labels_input = tf.keras.layers.Input(shape=(None, embedding_dims + 3, ))


    element_wise_sum = tf.keras.layers.Add()([embedded_sequences, labels_input])

    lstm_output = tf.keras.layers.LSTM(128,
                                       return_sequences=True,
                                       dropout=0.5)(element_wise_sum)
    # noise_shape has a 1 on the time axis, so one dropout mask is shared by all time steps.
    dropout = tf.keras.layers.Dropout(0.5, noise_shape=(None, 1, 128))(lstm_output)
    dense = tf.keras.layers.Dense(embedding_dims + 3)(dropout)
    relu = tf.keras.layers.ReLU()(dense)
    # "Un-embed": multiply by the transposed embedding matrix to score every token;
    # the second positional argument of Lambda is its output_shape.
    un_embed = tf.keras.layers.Lambda(lambda x: tf.linalg.matmul(x, tf.transpose(embed_matrix)), num_tokens)(relu)
    softmax = tf.keras.layers.Softmax()(un_embed)

    model = tf.keras.models.Model([sequence_input, labels_input], softmax)
    return model
# model architecture inspired by https://arxiv.org/pdf/1511.05284.pdf, https://arxiv.org/pdf/1606.07770.pdf




def compile_language_only_model(language_model):
    """Compile language_model with sparse categorical cross-entropy and Adam (lr 1e-5); returns it."""
    def loss(labels, probabilities):
        # The model already ends in a softmax, so the default (non-logit) form is used.
        return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)

    language_model.compile(loss=loss,
                           optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001))
    return language_model


def train_language_model(language_model, dataset, embedding_matrix, num_tokens, max_sequence_length, epochs):
    """Train the language model alone on (before, after) sequence batches for `epochs` passes.

    The label-embedding input is an all-zero tensor of shape
    (1, max_sequence_length - 1, embedding width), obtained by multiplying zeros with
    the embedding matrix.
    """
    zero_scores = tf.zeros((1, max_sequence_length - 1, num_tokens))
    embedding_tensor = tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)
    embed_input = tf.matmul(zero_scores, embedding_tensor)
    for _ in range(epochs):
        for batch in dataset:
            before, after = batch
            language_model.train_on_batch([before, embed_input], after)


def img_caption_func(img_name, caption):
    """Load cached features for img_name (bytes path) and split the caption for next-token training.

    Returns:
        (features, caption without its last token, caption without its first token).
    """
    feature_path = img_name.decode('utf-8') + '.npy'
    return np.load(feature_path), caption[:-1], caption[1:]


def create_image_captioning_model(img_classifier, language_model, embedding_matrix, embedding_dimensions):
    """Chain the image classifier into the language model and compile the joint captioning model.

    The classifier's per-token scores are projected into embedding space by multiplying
    with embedding_matrix, and that vector is fed to the language model as its second input.

    Args:
        img_classifier: Keras model producing per-token scores for an image.
        language_model: Keras model taking [token sequence, label embedding].
        embedding_matrix: token-by-embedding matrix used for the projection.
        embedding_dimensions: raw embedding size (the projection has this + 3 columns).

    Returns:
        Compiled Keras Model taking [image features, token sequence].
    """
    image = img_classifier.input
    image_labels = img_classifier.output
    # The second positional argument of Lambda is its output_shape.
    image_labels_embedded = tf.keras.layers.Lambda(
        lambda x: tf.matmul(x, tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)),
        embedding_dimensions + 3)(image_labels)
    # Only a length-1 axis is inserted here (no explicit tiling).
    image_labels_embedded_tiled = tf.expand_dims(image_labels_embedded, axis=1)
    sequence_input = language_model.input[0]
    caption_output = language_model([sequence_input, image_labels_embedded_tiled])

    model = tf.keras.models.Model([image, sequence_input], caption_output)
    def loss(labels, probabilities):
        return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(loss=loss, optimizer=optimizer)
    return model
# model architecture inspired by https://arxiv.org/pdf/1511.05284.pdf, https://arxiv.org/pdf/1606.07770.pdf



def train_image_captioning_model(image_captioning_model, dataset, epochs):
    """Run train_on_batch over every (image, front, back) batch of dataset, `epochs` times."""
    for _ in range(epochs):
        for batch in dataset:
            image, front, back = batch
            image_captioning_model.train_on_batch([image, front], back)
   

def dataset_mapping_function(classifier_img_id, classifier_label, caption_img_id, caption_sequence, language_sequence):
    """Materialise one training example for the three jointly trained models.

    Loads the cached features for both image ids (bytes paths), encodes the classifier
    labels using the module-level num_tokens, and splits both sequences into
    (all but last, all but first) pairs.

    Returns:
        (classifier_img, label_one_hot, caption_img, caption_front, caption_back,
         language_front, language_back)
    """
    def load_features(raw_id):
        return np.load(raw_id.decode('utf-8') + '.npy')

    classifier_img = load_features(classifier_img_id)
    caption_img = load_features(caption_img_id)
    label_one_hot = labels_to_one_hot(classifier_label, num_tokens)
    return (classifier_img, label_one_hot, caption_img,
            caption_sequence[:-1], caption_sequence[1:],
            language_sequence[:-1], language_sequence[1:])


def _save_curve_figure(curves, title, ylabel, filename):
    """Plot the given (x, y, label) curves on one 8x8 figure and save it to filename."""
    fig = plt.figure(figsize=(8, 8))
    for x_values, y_values, label in curves:
        plt.plot(x_values, y_values, label=label)
    plt.title(title)
    plt.xlabel("Iteration")
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(filename)
    # Close explicitly: this runs repeatedly during training and open figures accumulate.
    plt.close(fig)


def graph_bleu_curves(valid_bleu, car_mentions, classifier_loss, caption_loss, language_loss, iterations):
    """Write the three training-progress figures to the current directory.

    Args:
        valid_bleu: BLEU values, one per entry of iterations.
        car_mentions: car-mention rates, one per entry of iterations.
        classifier_loss, caption_loss, language_loss: loss histories; each has one entry
            fewer than iterations (there is no loss for the initial point), so they are
            plotted against iterations[1:].
        iterations: x-axis values.

    Files written: "BLEU Curves.png", "Car Curves.png", "Cross-Entropy Curves.png".
    """
    _save_curve_figure([(iterations, valid_bleu, "Validation BLEU")],
                       "Validation BLEU Curves", "BLEU Score", "BLEU Curves.png")

    _save_curve_figure([(iterations, car_mentions, "Car(s) mentions in Captions")],
                       "Percentage of captions that correctly mention a car/cars",
                       "Percentage", "Car Curves.png")

    # Each label now gets its own series; previously classifier_loss was drawn three times.
    loss_iterations = iterations[1:]
    _save_curve_figure([(loss_iterations, classifier_loss, "Classifier Cross-Entropy"),
                        (loss_iterations, language_loss, "Language Model Cross-Entropy"),
                        (loss_iterations, caption_loss, "Caption Model Cross-Entropy")],
                       "Cross-Entropy Curves", "Cross-Entropy", "Cross-Entropy Curves.png")


def evaluate_caption_model(spiciness, image_captioning_model, valid_img_ids, valid_captions, max_sequence_length):
    """Sample captions for the given images and score them.

    Uses the module-level tokenizer to map between tokens and ids.

    Args:
        spiciness: divisor applied to the model output before sampling.
        image_captioning_model: model called with [image features, token ids].
        valid_img_ids: image paths; features are loaded from "<path>.npy".
        valid_captions: reference captions, each a sequence of token strings.
        max_sequence_length: generated sequences have this many tokens including <start>.

    Returns:
        (mean unigram BLEU, fraction of generated captions containing "car" or "cars").
        Both are 0.0 when no images are given.
    """
    valid_imgs = np.array([np.array(np.load(img_id + '.npy')) for img_id in valid_img_ids])
    if len(valid_imgs) == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0, 0.0

    pad_id = tokenizer.word_index["<pad>"]
    indices = np.ones((len(valid_imgs), 1), dtype=np.int32) * tokenizer.word_index["<start>"]
    for _ in range(1, max_sequence_length):
        # NOTE(review): only the last generated token is fed back, whereas
        # generate_captions feeds the whole prefix - confirm this is intentional.
        next_indices = indices[:, -1:]
        # NOTE(review): the language model ends in a softmax, but tf.random.categorical
        # is documented to take logits - confirm whether a log is missing here.
        predicted = image_captioning_model([valid_imgs, next_indices]) / spiciness
        # text generation partially adapted from https://www.tensorflow.org/tutorials/text/text_generation
        predicted_ids = []
        for prediction in predicted:
            predicted_id = tf.random.categorical(prediction, num_samples=1)[-1, 0].numpy()
            predicted_ids.append([predicted_id])
        indices = np.hstack((indices, np.array(predicted_ids)))

    tokens = [[tokenizer.index_word[token_id] for token_id in row if token_id != pad_id]
              for row in indices]
    num_car_in_caption = [1 if "car" in set(sentence) or "cars" in set(sentence) else 0
                          for sentence in tokens]

    total_score = 0
    for i in range(len(tokens)):
        # sentence_bleu takes a LIST of reference token lists. Passing the bare token
        # list made every token string count as its own character-level reference.
        total_score += nltk.translate.bleu_score.sentence_bleu([list(valid_captions[i])],
                                                               tokens[i],
                                                               weights=(1, 0, 0, 0))
    return total_score / len(tokens), sum(num_car_in_caption) / len(tokens)


def simultaneous_train(epochs, image_classifier_model, language_model, image_captioning_model, dataset, embedding_matrix, num_tokens, max_sequence_length, valid_img_ids, valid_captions):
    """Train the classifier, language model and captioning model together, batch by batch.

    After every batch the function saves a checkpoint through the module-level `manager`,
    evaluates the captioning model on 128 randomly drawn validation examples (with
    replacement) and rewrites the progress figures.

    Args:
        epochs: number of passes over dataset.
        image_classifier_model, language_model, image_captioning_model: compiled models.
        dataset: yields (classifier_img, label_one_hot, caption_img, caption_front,
            caption_back, language_front, language_back) batches.
        embedding_matrix, num_tokens, max_sequence_length: used to build the all-zero
            label embedding fed to the language model.
        valid_img_ids, valid_captions: parallel sequences of validation image paths and
            reference captions.
    """
    i = 0
    validation_bleu = [0]
    car_mentions = [0]
    classifier_losses = []
    language_model_losses = []
    caption_model_losses = []
    iterations = [0]

    # Loop-invariant: the language-only step always receives the same zero embedding.
    embed_input = tf.matmul(tf.zeros((1, max_sequence_length - 1, num_tokens)),
                            tf.convert_to_tensor(embedding_matrix, dtype=tf.float32))

    for epoch in range(epochs):
        for classifier_img, label_one_hot, caption_img, caption_front, caption_back, language_front, language_back in dataset:
            i += 1
            classifier_loss = image_classifier_model.train_on_batch(classifier_img, label_one_hot)
            language_model_loss = language_model.train_on_batch([language_front, embed_input], language_back)
            caption_loss = image_captioning_model.train_on_batch([caption_img, caption_front], caption_back)

            manager.save()

            classifier_losses.append(classifier_loss)
            language_model_losses.append(language_model_loss)
            caption_model_losses.append(caption_loss)

            random_valid_indices = np.random.randint(0, len(valid_img_ids), 128)
            # Index element-wise instead of converting to numpy arrays: the captions have
            # different lengths, and np.array on ragged lists is rejected by newer NumPy.
            sampled_img_ids = [valid_img_ids[j] for j in random_valid_indices]
            sampled_captions = [valid_captions[j] for j in random_valid_indices]
            bleu_score, percent_car_mentions = evaluate_caption_model(0.001, image_captioning_model, sampled_img_ids, sampled_captions, max_sequence_length)
            validation_bleu.append(bleu_score)
            car_mentions.append(percent_car_mentions)
            iterations.append(i)
            graph_bleu_curves(validation_bleu, car_mentions, classifier_losses, caption_model_losses, language_model_losses, iterations)



# Load every (image path, caption) pair from the training annotation file under PATH.
train_img_ids, train_captions = img_ids_and_captions_from_json(PATH)
# 27548 unique tokens in captions, 22128 present in glove embedding vocabulary

# cache_vgg_features(train_img_ids)  # takes around 26 minutes with a GTX 1080 Ti to cache all images!!!
raw_embedding_dims = 50
glove_index = get_glove_embeddings(raw_embedding_dims)
# save_preprocessed_captions(train_captions, glove_index)  # takes a couple minutes to process
# The raw captions are replaced by the processed lines read back from tokenized_captions.txt.
train_captions = retrieve_tokenized_captions_from_file()

# Keep the 2500 most frequent tokens; only tab/newline are filtered so the <...> markers survive.
top_n_words = 2500
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_n_words, lower=True, oov_token="<unk>",
                                                  filters='\t\n')
tokenizer.fit_on_texts(train_captions)
tokenizer.word_index["<pad>"] = 0  # index 1 is for oov tokens, 0 is for the <pad> token
tokenizer.index_word[0] = "<pad>"

# Convert captions to id sequences and right-pad all of them to the longest caption.
train_sequences = tokenizer.texts_to_sequences(train_captions)
max_sequence_length = max(len(sequence) for sequence in train_sequences)
capped_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                                 maxlen=max_sequence_length,
                                                                 padding="post")
# token preprocessing partially adapted from https://www.tensorflow.org/tutorials/text/image_captioning

# Tokens that must never become classifier labels: English stop words, single punctuation
# characters, the possessive "'s" and the special markers. The backslash is written as
# "\\" because "\]" is not a valid escape sequence (the runtime string is unchanged).
stop_words = nltk.corpus.stopwords.words("english") + list("!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ '") + ["\'s",
                                                                                                   "<no_glove>",
                                                                                                   "<start>",
                                                                                                   "<end>",
                                                                                                   "<unk>",
                                                                                                   "<pad>"]
# Tokenizer ids of those stop words that are present in the vocabulary.
stop_word_indices = set(tokenizer.word_index.get(stop_word) for stop_word in stop_words
                        if tokenizer.word_index.get(stop_word) is not None)

# save_class_labels_from_capped_sequences(capped_sequences, stop_word_indices)  # takes ~80 seconds to process
# Labels are read back from class_labels.txt rather than recomputed.
train_labels = retrieve_class_labels_from_capped_sequences()
num_tokens = top_n_words
max_num_labels_per_image = max(len(labels) for labels in train_labels)

# Pad every label list to a common length with -1 (10 slots beyond the longest list).
train_padded_labels = tf.keras.preprocessing.sequence.pad_sequences(train_labels, maxlen=max_num_labels_per_image + 10,
                                                              padding="post", value=-1)

# Embedding matrix with one row per tokenizer index below num_tokens.
glove_matrix = get_glove_embedding_matrix(glove_index, tokenizer, num_tokens, raw_embedding_dims)

# Batch size and shuffle-buffer size used when the tf.data pipelines are built.
BATCH_SIZE = 1024
BUFFER_SIZE = 1000
# 591753 images, 118287 are unique
# image_classifier_dataset = tf.data.Dataset.from_tensor_slices((train_img_ids, train_padded_labels))
# image_classifier_dataset = image_classifier_dataset.map(lambda item1, item2:
#                                                         tf.numpy_function(img_classifier_func,
#                                                                           [item1, item2],
#                                                                           [tf.float32, tf.float32]),
#                                                         num_parallel_calls=tf.data.experimental.AUTOTUNE)
# image_classifier_dataset = image_classifier_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# image_classifier_dataset = image_classifier_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# compile_classifier_only_model returns the model it was given, so both names refer to
# the same object.
partial_img_classifier = create_partial_image_classifier(num_tokens)
img_classifier = compile_classifier_only_model(partial_img_classifier)
# train_image_classifier(img_classifier, image_classifier_dataset, 1)

# Likewise, language_model and partial_language_model are the same object.
partial_language_model = create_partial_lstm_language_model(num_tokens, max_sequence_length, raw_embedding_dims, glove_matrix)
language_model = compile_language_only_model(partial_language_model)

# BATCH_SIZE = 256
# langauge_model_dataset = tf.data.Dataset.from_tensor_slices(capped_sequences)
# langauge_model_dataset = langauge_model_dataset.map(lambda sequence: (sequence[:-1], sequence[1:]))
# langauge_model_dataset = langauge_model_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# langauge_model_dataset = langauge_model_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


# train_language_model(language_model, langauge_model_dataset, glove_matrix, num_tokens, max_sequence_length, 1)


# img_caption_dataset = tf.data.Dataset.from_tensor_slices((train_img_ids, capped_sequences))
# img_caption_dataset = img_caption_dataset.map(lambda item1, item2:
#                                               tf.numpy_function(img_caption_func,
#                                                                 [item1, item2],
#                                                                 [tf.float32, tf.int32, tf.int32]),
#                                               num_parallel_calls=tf.data.experimental.AUTOTUNE)
# img_caption_dataset = img_caption_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# img_caption_dataset = img_caption_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)




# Joint model: built from the same classifier and language-model objects created above.
image_captioning_model = create_image_captioning_model(partial_img_classifier, partial_language_model,
                                                       glove_matrix, raw_embedding_dims)
# train_image_captioning_model(image_captioning_model, img_caption_dataset, 1)

# Captioning training set: only captions that mention neither "car" nor "cars".
caption_img_ids, caption_sequences = list(zip(*[(id, caption) for id, caption in zip(train_img_ids, capped_sequences) 
                                                 if tokenizer.word_index["car"] not in set(caption) 
                                                 and tokenizer.word_index["cars"] not in set(caption)]))
caption_img_ids, caption_sequences = list(caption_img_ids), list(caption_sequences)

# The classifier and the language model draw from ALL captions; copies are taken so the
# shuffles below do not reorder train_img_ids / capped_sequences.
classifier_img_ids = train_img_ids.copy()
language_model_sequences = capped_sequences.copy()
num_subset_rows = len(caption_img_ids)

# Shuffle (image, labels) pairs together and keep as many rows as the captioning set has.
# NOTE(review): no random seed is set, so the selected subsets differ between runs.
_ = list(zip(classifier_img_ids, train_padded_labels))
np.random.shuffle(_)
classifier_img_ids, classifier_labels = list(zip(*_[:num_subset_rows]))
classifier_img_ids, classifier_labels = list(classifier_img_ids), list(classifier_labels)
# Shuffles the copied array in place, then truncates it to the same number of rows.
np.random.shuffle(language_model_sequences)
language_model_sequences = language_model_sequences[:num_subset_rows]

# "Validation" set: the captions that DO mention "car" or "cars" (excluded from captioning training above).
valid_img_ids, valid_sequences = list(zip(*[(id, caption) for id, caption in zip(train_img_ids, capped_sequences) 
                                                 if tokenizer.word_index["car"] in set(caption) 
                                                 or tokenizer.word_index["cars"] in set(caption)]))
valid_img_ids, valid_sequences = list(valid_img_ids), list(valid_sequences)
# Reference captions as token strings with padding removed.
valid_captions = [[tokenizer.index_word[index] for index in caption if index != tokenizer.word_index["<pad>"]] for caption in valid_sequences]

In [3]:
# One dataset element carries the inputs for all three models; the five source lists
# were truncated to the same length (num_subset_rows) in the previous cell.
three_way_dataset = tf.data.Dataset.from_tensor_slices((classifier_img_ids, 
                                                        classifier_labels,
                                                        caption_img_ids, 
                                                        caption_sequences,
                                                        language_model_sequences))
# dataset_mapping_function runs as plain Python via tf.numpy_function; the dtype list
# matches its seven return values in order.
three_way_dataset = three_way_dataset.map(lambda item1, item2, item3, item4, item5:
                                                        tf.numpy_function(dataset_mapping_function,
                                                                          [item1, item2, item3, item4, item5],
                                                                          [tf.float32, tf.float32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32]),
                                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
# dataset API tutorial provided by https://www.tensorflow.org/tutorials/text/image_captioning
three_way_dataset = three_way_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
three_way_dataset = three_way_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [4]:
checkpoint_dir = './training_checkpoints'
# NOTE(review): checkpoint_prefix is not used anywhere in the visible cells.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track all three models in a single checkpoint.
checkpoint = tf.train.Checkpoint(img_classifier = img_classifier, 
                                 language_model = language_model, 
                                 image_captioning_model = image_captioning_model)

#checkpointing tutorial provided by https://www.tensorflow.org/tutorials/generative/dcgan
# `manager` is the module-level name that simultaneous_train calls save() on; only the
# newest checkpoint is kept.
manager = tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir, max_to_keep=1)
# Resume from the most recent checkpoint in checkpoint_dir.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc893f197f0>

In [None]:
# Joint training for 10 epochs; requires `manager` from the checkpoint cell to exist.
simultaneous_train(10, img_classifier, language_model, image_captioning_model, three_way_dataset, glove_matrix, num_tokens, max_sequence_length, valid_img_ids, valid_captions)

In [8]:
def generate_captions(spiciness, image_captioning_model, valid_img_ids, valid_captions, max_sequence_length):
    """Sample a caption for every image and save example figures to the current directory.

    Uses the module-level tokenizer to map between tokens and ids. Generated captions
    that mention neither "car" nor "cars" are saved as "1.jpg" .. "25.jpg"; those that
    do are saved as "100.jpg" .. "125.jpg". Each figure shows the image with the
    reference caption and the generated caption as its title.

    Args:
        spiciness: divisor applied to the model output before sampling.
        image_captioning_model: model called with [image features, token ids].
        valid_img_ids: image paths; features are loaded from "<path>.npy".
        valid_captions: reference captions, each a sequence of token strings.
        max_sequence_length: generated sequences have this many tokens including <start>.
    """
    valid_imgs = np.array([np.array(np.load(img_id + '.npy')) for img_id in valid_img_ids])
    indices = np.ones((len(valid_imgs), 1), dtype=np.int32) * tokenizer.word_index["<start>"]
    for _ in range(1, max_sequence_length):
        predicted = image_captioning_model([valid_imgs, indices]) / spiciness
        # text generation partially adapted from https://www.tensorflow.org/tutorials/text/text_generation
        predicted_ids = []
        for prediction in predicted:
            predicted_id = tf.random.categorical(prediction, num_samples=1)[-1, 0].numpy()
            predicted_ids.append([predicted_id])
        indices = np.hstack((indices, np.array(predicted_ids)))

    pad_id = tokenizer.word_index["<pad>"]
    tokens = [[tokenizer.index_word[token_id] for token_id in row if token_id != pad_id]
              for row in indices]

    def mentions_car(generated):
        return "car" in generated or "cars" in generated

    examples = list(zip(valid_img_ids, valid_captions, tokens))
    non_car_captions = [example for example in examples if not mentions_car(example[2])]
    car_captions = [example for example in examples if mentions_car(example[2])]

    def save_figures(entries, first_number, last_number):
        # Saves at most last_number - first_number + 1 figures, numbered consecutively.
        for number, (img_path, reference, generated) in zip(range(first_number, last_number + 1), entries):
            fig = plt.figure(figsize=(8, 8))
            img = plt.imread(img_path)
            plt.imshow(img)
            plt.title(" ".join(reference) + "\n" + " ".join(generated))
            plt.axis('off')
            plt.savefig(str(number) + ".jpg")
            plt.close(fig)

    # The first loop body had a stray fragment of the LSTM model builder pasted into it
    # (a syntax error); it is restored to match the intact second loop.
    save_figures(non_car_captions, 1, 25)
    save_figures(car_captions, 100, 125)
        
# Generate example figures from the first 1024 validation pairs.
generate_captions(0.001, image_captioning_model, valid_img_ids[:1024], valid_captions[:1024], max_sequence_length)

In [1]:
import tensorflow as tf
import nltk
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input
from sklearn.utils import shuffle
import numpy as np
import os
import json
import time
from tqdm import tqdm

# Fail fast unless the exact TensorFlow release this notebook was written against is installed.
assert tf.__version__ == "2.0.0"

# Dataset root. The functions below append "annotations/", "train2017/" and "glove_embeddings"
# directly to this string, so it must end with a path separator.
PATH = "/media/jintoboy/Main Storage/Image Captioning/"


def load_image(image_path):
    """Read a JPEG, centre-crop it to a square, resize to 224x224 and apply VGG16 preprocessing.

    Args:
        image_path: scalar string tensor (or str) naming the JPEG file.

    Returns:
        (preprocessed_image, image_path); the path is passed through so callers can
        pair each image with the file it came from.
    """
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    # Use the run-time shape: when this function is traced by Dataset.map the static
    # img.shape is (None, None, 3), so comparing img.shape[0] with img.shape[1]
    # compared None with None and the crop was silently skipped.
    dims = tf.shape(img)
    height, width = dims[0], dims[1]
    side = tf.minimum(height, width)
    # Offsets are zero along the shorter axis, so a square image passes through unchanged.
    img = tf.image.crop_to_bounding_box(img,
                                        (height - side) // 2,
                                        (width - side) // 2,
                                        side, side)
    img = tf.image.resize(img, (224, 224))
    # NOTE(review): feature files cached while the crop was being skipped came from
    # stretched images; presumably they should be regenerated - confirm.
    return preprocess_input(img), image_path


def img_ids_and_captions_from_json(PATH):
    """Return parallel lists of image file paths and captions from the training annotations.

    Args:
        PATH: dataset root (string ending in a separator); the annotation file is read
            from PATH + "annotations/captions_train2017.json".

    Returns:
        (img_locations, captions): one entry per annotation, in file order. Image paths
        are built from the zero-padded 12-digit image id under PATH + "train2017/".
    """
    # partially adapted from https://www.tensorflow.org/tutorials/text/image_captioning
    annotation_file = PATH + "annotations/captions_train2017.json"
    with open(annotation_file, "r") as handle:
        records = json.load(handle)["annotations"]

    img_locations = ["{}train2017/{:012d}.jpg".format(PATH, record["image_id"])
                     for record in records]
    captions = [record["caption"] for record in records]
    return img_locations, captions


def cache_vgg_features(img_ids):
    """Run every distinct image through VGG16 (no top) and save its features next to the image.

    Each feature map is reshaped to (positions, channels) and written with np.save using
    the image path as the file name, so the result lands at "<image path>.npy".

    Args:
        img_ids: iterable of image file paths; duplicates are processed once.
    """
    # adapted from https://www.tensorflow.org/tutorials/text/image_captioning
    distinct_paths = sorted(set(img_ids))
    dataset = (tf.data.Dataset.from_tensor_slices(distinct_paths)
               .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
               .batch(16))
    vgg = VGG16(include_top=False)

    for image_batch, path_batch in tqdm(dataset):
        features = vgg(image_batch)
        batch_size, channels = features.shape[0], features.shape[3]
        # Collapse the two spatial axes into one.
        features = tf.reshape(features, (batch_size, -1, channels))
        for feature_map, path_tensor in zip(features, path_batch):
            np.save(path_tensor.numpy().decode("utf-8"), feature_map.numpy())


def get_glove_embeddings(dimensions):
    """Load GloVe vectors and extend them with three extra dimensions for special tokens.

    Every GloVe vector gets three trailing zeros. "<unk>" and "<no_glove>" both map to the
    mean of all loaded vectors; "<start>", "<end>" and "<pad>" are one-hot in the first,
    second and third extra dimension respectively.

    Args:
        dimensions: GloVe vector size; selects "glove.6B.<dimensions>d.txt" under
            PATH + "glove_embeddings".

    Returns:
        dict mapping token -> numpy vector of length dimensions + 3.
    """
    # 400000 unique tokens in vocabulary
    # partially adapted from https://keras.io/examples/pretrained_word_embeddings/
    glove_file = os.path.join("{}glove_embeddings".format(PATH),
                              "glove.6B.{}d.txt".format(dimensions))
    embeddings_index = {}
    with open(glove_file) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            # add 3 dimensions for <start>, <end>, and <pad>
            embeddings_index[word] = np.fromstring(coefs + " 0 0 0", 'f', sep=' ')

    padded_dims = dimensions + 3

    # Mean of all GloVe vectors, shared by both unknown-word tokens.
    unk_vector = np.zeros((padded_dims,))
    for vector in embeddings_index.values():
        unk_vector += vector
    unk_vector /= len(embeddings_index)
    embeddings_index["<unk>"] = unk_vector
    embeddings_index["<no_glove>"] = unk_vector

    # Each special token owns one of the three extra dimensions.
    for extra_dim, token in enumerate(("<start>", "<end>", "<pad>")):
        one_hot = np.zeros((padded_dims,))
        one_hot[dimensions + extra_dim] = 1
        embeddings_index[token] = one_hot

    return embeddings_index


def get_glove_embedding_matrix(processed_glove_indices, tokenizer, num_words, raw_embedding_dims):
    """Build the (num_words, raw_embedding_dims + 3) embedding matrix for the tokenizer vocabulary.

    Args:
        processed_glove_indices: dict token -> vector of length raw_embedding_dims + 3.
        tokenizer: object exposing word_index (token -> integer index).
        num_words: number of rows; tokens whose index is >= num_words are ignored.
        raw_embedding_dims: GloVe vector size before the three extra dimensions.

    Returns:
        numpy array whose row i is the vector of the token with index i. Tokens missing
        from processed_glove_indices use the "<unk>" vector when one exists, otherwise the
        row stays zero.
    """
    # partially adapted from https://keras.io/examples/pretrained_word_embeddings/
    embedding_matrix = np.zeros((num_words, raw_embedding_dims + 3))
    fallback = processed_glove_indices.get("<unk>")
    for word, i in tokenizer.word_index.items():
        if i >= num_words:
            continue
        # A plain .get(word) yields None for an unknown word, and assigning None into the
        # float matrix corrupts the row (or raises), so fall back explicitly.
        vector = processed_glove_indices.get(word, fallback)
        if vector is not None:
            embedding_matrix[i] = vector

    return embedding_matrix


def save_preprocessed_captions(captions, glove_embedding_index):
    """Tokenize captions, mask non-GloVe words, and write them one per line to disk.

    Each caption is tokenized with nltk, lower-cased, and any token absent from
    glove_embedding_index is replaced by "<no_glove>". The result is wrapped in
    "<start> ... <end>" and written to PATH + "annotations/tokenized_captions.txt".

    Args:
        captions: list of caption strings; it is overwritten in place with the
            processed captions.
        glove_embedding_index: mapping whose keys are the known GloVe tokens.
    """
    glove_words = set(glove_embedding_index.keys())
    for i in range(len(captions)):
        lowered = (token.lower() for token in nltk.tokenize.word_tokenize(captions[i]))
        tokens = [token if token in glove_words else "<no_glove>" for token in lowered]
        captions[i] = "<start> {} <end>".format(" ".join(tokens))
    # The with-block closes the file even if a write fails part-way through.
    with open("{}annotations/tokenized_captions.txt".format(PATH), "w") as captions_file:
        for caption in captions:
            print(caption, file=captions_file)


def retrieve_tokenized_captions_from_file():
    """Read the processed captions from ``{PATH}annotations/tokenized_captions.txt``.

    Returns:
        A list with one string per line of the file, with trailing newline
        characters removed.
    """
    # The context manager closes the file even if reading raises.
    with open("{}annotations/tokenized_captions.txt".format(PATH), "r") as captions_file:
        return [line.rstrip('\n') for line in captions_file]


def save_class_labels_from_capped_sequences(capped_seqs, stop_word_tokenizer_indices):
    """Write, per sequence, its distinct non-stop-word token indices to disk.

    Args:
        capped_seqs: iterable of token-index sequences.
        stop_word_tokenizer_indices: container of token indices to exclude
            (tested with ``in``).

    Side effects:
        Writes one line per sequence to ``{PATH}annotations/class_labels.txt``
        (``PATH`` is module-level). Each line holds the sequence's distinct
        remaining indices separated by spaces; their order within a line is the
        iteration order of a ``set`` and is therefore not guaranteed.
    """
    train_classes = [list(set(token for token in sequence if token not in stop_word_tokenizer_indices))
                     for sequence in capped_seqs]
    # The context manager closes the file even if a write raises.
    with open("{}annotations/class_labels.txt".format(PATH), "w") as labels_file:
        for labels in train_classes:
            print(" ".join(map(str, labels)), file=labels_file)


def retrieve_class_labels_from_capped_sequences():
    """Read the per-sequence class labels from ``{PATH}annotations/class_labels.txt``.

    Returns:
        A list with one integer numpy array per line of the file; each line is
        parsed as space-separated integers.
    """
    # The context manager closes the file even if parsing raises.
    with open("{}annotations/class_labels.txt".format(PATH), "r") as labels_file:
        return [np.fromstring(line.rstrip('\n'), int, sep=' ') for line in labels_file]


def labels_to_one_hot(labels, num_labels):
    """Merge label indices into a single multi-hot encoding of depth ``num_labels``.

    Each entry of ``labels`` is one-hot encoded and the encodings are summed
    along the first axis.
    """
    per_label_encoding = tf.one_hot(labels, depth=num_labels)
    return tf.reduce_sum(per_label_encoding, axis=0)


def one_hot_to_labels(one_hot):
    """Recover, for every row, the column indices of its non-zero entries.

    Args:
        one_hot: 2-D tensor; the code indexes the non-zero coordinates as
            (row, column) pairs.

    Returns:
        A list with one integer numpy array per row of ``one_hot`` holding the
        columns of that row's non-zero entries (an empty array for a row with
        none).
    """
    indices = tf.where(tf.not_equal(one_hot, 0))
    value_rowids = indices[:, 0]
    values = indices[:, 1]
    # Without an explicit nrows the row count is inferred from the last row id,
    # which silently drops trailing rows that contain no non-zero entry.
    nrows = tf.shape(one_hot, out_type=tf.int64)[0]
    ragged_tensor = tf.RaggedTensor.from_value_rowids(values, value_rowids, nrows=nrows)
    return [tensor.numpy() for tensor in list(ragged_tensor)]


def img_classifier_func(img_name, labels):
    """Load the cached features for one image and encode its labels.

    ``img_name`` is a bytes path without extension; ``.npy`` is appended after
    UTF-8 decoding. The labels are encoded with ``labels_to_one_hot`` using the
    module-level ``num_tokens``.

    Returns:
        ``(img_tensor, one_hot)``.
    """
    feature_path = img_name.decode('utf-8') + '.npy'
    return np.load(feature_path), labels_to_one_hot(labels, num_tokens)


def create_partial_image_classifier(num_tokens):
    """Build the classifier head applied to cached VGG features.

    The input has shape (49, 512): VGG output is 7x7x512, but the cached files
    are 49 x 512. It is flattened, passed through dropout (rate 0.5) and a dense
    sigmoid layer with ``num_tokens`` units.

    Returns:
        An uncompiled ``tf.keras`` model from the feature input to the sigmoid
        output.
    """
    features_in = tf.keras.layers.Input(shape=(49, 512))
    hidden = tf.keras.layers.Flatten()(features_in)
    hidden = tf.keras.layers.Dropout(0.5)(hidden)
    token_scores = tf.keras.layers.Dense(num_tokens, activation="sigmoid")(hidden)
    return tf.keras.models.Model([features_in], token_scores)


def compile_classifier_only_model(classifier_model):
    """Compile ``classifier_model`` with binary cross-entropy and Adam (lr 0.00001).

    The model is compiled in place and the same object is returned.
    """
    classifier_model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    )
    return classifier_model


def train_image_classifier(image_classifier_model, dataset, epochs):
    """Run ``epochs`` passes over ``dataset``, training on every batch.

    ``dataset`` must yield ``(imgs, labels)`` pairs; each pair is handed to
    ``image_classifier_model.train_on_batch`` with the images wrapped in a list.
    """
    for _ in range(epochs):
        for batch in dataset:
            batch_imgs, batch_labels = batch
            image_classifier_model.train_on_batch([batch_imgs], batch_labels)


def create_partial_lstm_language_model(num_tokens, max_sequence_length, embedding_dims, embedding_matrix):
    """Build the LSTM language model whose output layer reuses the embedding matrix.

    Inputs of the returned model:
        1. a sequence of token ids, embedded by a frozen ``Embedding`` layer
           initialised from ``embedding_matrix``;
        2. a tensor of width ``embedding_dims + 3`` that is added element-wise
           to the embedded tokens.

    The sum goes through an LSTM (128 units, sequences returned), dropout, a
    dense layer of width ``embedding_dims + 3`` with ReLU, a multiplication by
    the transposed embedding matrix, and a softmax.

    ``max_sequence_length`` is accepted but not used by this function.

    Returns:
        An uncompiled ``tf.keras`` model.
    """
    width = embedding_dims + 3
    tied_weights = tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)

    token_ids = tf.keras.layers.Input(shape=(None,))
    frozen_embedding = tf.keras.layers.Embedding(
        input_dim=num_tokens,
        output_dim=width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
    token_vectors = frozen_embedding(token_ids)

    label_vectors = tf.keras.layers.Input(shape=(None, width))

    hidden = tf.keras.layers.Add()([token_vectors, label_vectors])
    hidden = tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.5)(hidden)
    hidden = tf.keras.layers.Dropout(0.5, noise_shape=(None, 1, 128))(hidden)
    hidden = tf.keras.layers.Dense(width)(hidden)
    hidden = tf.keras.layers.ReLU()(hidden)
    # Project back to vocabulary size with the transposed embedding matrix.
    vocab_scores = tf.keras.layers.Lambda(
        lambda x: tf.linalg.matmul(x, tf.transpose(tied_weights)), num_tokens)(hidden)
    token_probabilities = tf.keras.layers.Softmax()(vocab_scores)

    return tf.keras.models.Model([token_ids, label_vectors], token_probabilities)


def compile_language_only_model(language_model):
    """Compile ``language_model`` with sparse categorical cross-entropy and Adam (lr 0.00001).

    The model is compiled in place and the same object is returned.
    """
    def loss(labels, probabilities):
        # Thin wrapper around the Keras sparse categorical cross-entropy.
        return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)

    language_model.compile(
        loss=loss,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    )
    return language_model


def train_language_model(language_model, dataset, embedding_matrix, num_tokens, max_sequence_length, epochs):
    """Train the language model alone for ``epochs`` passes over ``dataset``.

    ``dataset`` must yield ``(before, after)`` pairs. The model's second input is
    an all-zero tensor of shape ``(1, max_sequence_length - 1, num_tokens)``
    multiplied by the embedding matrix; it is built once and reused for every
    batch.
    """
    blank_one_hot = tf.zeros((1, max_sequence_length - 1, num_tokens))
    embedding_tensor = tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)
    embed_input = tf.matmul(blank_one_hot, embedding_tensor)
    for _ in range(epochs):
        for pair in dataset:
            before, after = pair
            language_model.train_on_batch([before, embed_input], after)


def img_caption_func(img_name, caption):
    """Load the cached features for one image and split its caption for next-token training.

    ``img_name`` is a bytes path without extension; ``.npy`` is appended after
    UTF-8 decoding.

    Returns:
        ``(img_tensor, caption[:-1], caption[1:])``.
    """
    feature_path = img_name.decode('utf-8') + '.npy'
    features = np.load(feature_path)
    return features, caption[:-1], caption[1:]


def create_image_captioning_model(img_classifier, language_model, embedding_matrix, embedding_dimensions):
    """Chain the image classifier into the language model and compile the result.

    The classifier output is multiplied by ``embedding_matrix``, given an extra
    axis at position 1, and passed as the second input of ``language_model``
    alongside the language model's own first (sequence) input.

    Returns:
        A ``tf.keras`` model taking ``[image, sequence]`` and compiled with sparse
        categorical cross-entropy and Adam (lr 0.00001).
    """
    def loss(labels, probabilities):
        return tf.keras.losses.sparse_categorical_crossentropy(labels, probabilities)

    image = img_classifier.input
    label_scores = img_classifier.output
    embed_labels = tf.keras.layers.Lambda(
        lambda x: tf.matmul(x, tf.convert_to_tensor(embedding_matrix, dtype=tf.float32)),
        embedding_dimensions + 3)
    # Extra axis at position 1 so the label embedding can be combined with a sequence.
    label_embedding = tf.expand_dims(embed_labels(label_scores), axis=1)
    sequence_input = language_model.input[0]
    caption_output = language_model([sequence_input, label_embedding])

    model = tf.keras.models.Model([image, sequence_input], caption_output)
    model.compile(
        loss=loss,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    )
    return model


def train_image_captioning_model(image_captioning_model, dataset, epochs):
    """Run ``epochs`` passes over ``dataset``, training the captioning model on every batch.

    ``dataset`` must yield ``(image, front, back)`` triples; the model is trained
    on inputs ``[image, front]`` with target ``back``.
    """
    for _ in range(epochs):
        for batch in dataset:
            image, front, back = batch
            image_captioning_model.train_on_batch([image, front], back)
   

def dataset_mapping_function(classifier_img_id, classifier_label, caption_img_id, caption_sequence, language_sequence):
    """Turn one record of ids/labels/sequences into the tensors used by the three training tasks.

    Both image ids are bytes paths without extension; ``.npy`` is appended after
    UTF-8 decoding. The classifier label is encoded with ``labels_to_one_hot``
    using the module-level ``num_tokens``. Each sequence is split into
    ``seq[:-1]`` and ``seq[1:]``.

    Returns:
        ``(classifier_img, label_one_hot, caption_img, caption_front,
        caption_back, language_front, language_back)``.
    """
    def load_features(img_id):
        return np.load(img_id.decode('utf-8') + '.npy')

    classifier_img = load_features(classifier_img_id)
    caption_img = load_features(caption_img_id)
    label_one_hot = labels_to_one_hot(classifier_label, num_tokens)
    return (
        classifier_img,
        label_one_hot,
        caption_img,
        caption_sequence[:-1],
        caption_sequence[1:],
        language_sequence[:-1],
        language_sequence[1:],
    )


def _save_line_figure(series, title, ylabel, filename):
    """Plot every ``(x, y, label)`` of ``series`` on a fresh 8x8 figure and save it to ``filename``."""
    fig = plt.figure(figsize=(8, 8))
    for x_values, y_values, label in series:
        plt.plot(x_values, y_values, label=label)
    plt.title(title)
    plt.xlabel("Iteration")
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(filename)
    # Close explicitly: this is called repeatedly and open figures accumulate.
    plt.close(fig)


def graph_bleu_curves(valid_bleu, car_mentions, classifier_loss, caption_loss, language_loss, iterations):
    """Save the validation and training curves as three PNG files.

    Args:
        valid_bleu: validation BLEU values, one per entry of ``iterations``.
        car_mentions: car-mention values, one per entry of ``iterations``.
        classifier_loss: classifier losses, plotted against ``iterations[1:]``.
        caption_loss: caption-model losses, plotted against ``iterations[1:]``.
        language_loss: language-model losses, plotted against ``iterations[1:]``.
        iterations: x-axis values; the loss series skip its first entry.

    Side effects:
        Writes ``BLEU Curves.png``, ``Car Curves.png`` and
        ``Cross-Entropy Curves.png`` to the current working directory.
    """
    _save_line_figure([(iterations, valid_bleu, "Validation BLEU")],
                      "Validation BLEU Curves", "BLEU Score", "BLEU Curves.png")

    _save_line_figure([(iterations, car_mentions, "Car(s) mentions in Captions")],
                      "Percentage of captions that correctly mention a car/cars",
                      "Percentage", "Car Curves.png")

    # Each label now gets its own series; previously classifier_loss was drawn
    # three times and the language/caption losses were never plotted.
    loss_iterations = iterations[1:]
    _save_line_figure([(loss_iterations, classifier_loss, "Classifier Cross-Entropy"),
                       (loss_iterations, language_loss, "Language Model Cross-Entropy"),
                       (loss_iterations, caption_loss, "Caption Model Cross-Entropy")],
                      "Cross-Entropy Curves", "Cross-Entropy", "Cross-Entropy Curves.png")


def evaluate_caption_model(spiciness, image_captioning_model, valid_img_ids, valid_captions, max_sequence_length):
    """Sample a caption for every validation image and score the samples.

    Args:
        spiciness: divisor applied to the model output before sampling.
        image_captioning_model: callable taking ``[images, token_ids]``.
        valid_img_ids: paths without extension; ``<id>.npy`` is loaded for each.
        valid_captions: per-image value passed as the first argument of
            ``sentence_bleu``.
        max_sequence_length: captions are grown to this many tokens, including
            the leading ``<start>``.

    Returns:
        ``(mean BLEU with weights (1, 0, 0, 0), fraction of sampled captions
        containing "car" or "cars")``.

    Relies on the module-level ``tokenizer``. Sampling is random, so results
    differ between calls.
    """
    valid_imgs = []
    for img_id in valid_img_ids:
        valid_imgs.append(np.array(np.load(img_id+'.npy')))
    valid_imgs = np.array(valid_imgs)
    # Every caption starts as a single column holding the <start> index.
    indices = np.ones((len(valid_imgs), 1), dtype=np.int32) * tokenizer.word_index["<start>"]    
    for i in range(1, max_sequence_length):
        # NOTE(review): only the most recent token is fed to the model, so each
        # step is predicted without the earlier tokens -- confirm this is intended.
        next_indices = indices[:,-1:]
        # NOTE(review): tf.random.categorical below interprets its input as logits;
        # if the model output is already softmax probabilities, this samples from
        # softmax(probabilities / spiciness) -- confirm this is intended.
        predicted = image_captioning_model([valid_imgs, next_indices])/spiciness
        # text generation partially adapted from https://www.tensorflow.org/tutorials/text/text_generation
        predicted_ids = []
        for prediction in predicted:
            # One sample per image, taken from the last row of this image's prediction.
            predicted_id = tf.random.categorical(prediction, num_samples=1)[-1,0].numpy()
            predicted_ids.append([predicted_id])
        # Append the newly sampled token as a new column.
        indices = np.hstack((indices, np.array(predicted_ids)))
    # Map indices back to words, dropping <pad>; <start> and any other token are kept.
    tokens = [[tokenizer.index_word[id] for id in row 
               if id != tokenizer.word_index["<pad>"]] 
              for row in indices]
    num_car_in_caption = [1 if "car" in set(sentence) or "cars" in set(sentence) else 0 for sentence in tokens]
    average_score = 0
    for i in range(len(tokens)):
        # NOTE(review): sentence_bleu expects a list of tokenized references as its
        # first argument -- verify that valid_captions[i] has that form.
        average_score += nltk.translate.bleu_score.sentence_bleu(valid_captions[i], tokens[i], weights=(1, 0, 0, 0))
    # Raises ZeroDivisionError when there are no validation images.
    return average_score/len(tokens), sum(num_car_in_caption)/len(tokens)


def simultaneous_train(epochs, image_classifier_model, language_model, image_captioning_model, dataset, embedding_matrix, num_tokens, max_sequence_length, valid_img_ids, valid_captions):
    """Train the classifier, language model and captioning model together, batch by batch.

    For every batch of ``dataset`` the three models are each trained on their
    part of the batch, a checkpoint is saved through the module-level
    ``manager``, the captioning model is evaluated on 128 randomly drawn
    validation examples (drawn with replacement), and the curve figures are
    regenerated.

    Args:
        epochs: number of passes over ``dataset``.
        image_classifier_model: trained on ``(classifier_img, label_one_hot)``.
        language_model: trained on ``[language_front, embed_input]`` ->
            ``language_back``.
        image_captioning_model: trained on ``[caption_img, caption_front]`` ->
            ``caption_back``.
        dataset: iterable of 7-tuples ``(classifier_img, label_one_hot,
            caption_img, caption_front, caption_back, language_front,
            language_back)``.
        embedding_matrix: embedding weights used to build the language model's
            second input.
        num_tokens: vocabulary size used for that input.
        max_sequence_length: padded caption length.
        valid_img_ids: validation image ids (converted to a numpy array).
        valid_captions: validation captions (converted to a numpy array).

    Returns:
        None. The loss and metric histories are only written to the figures.
    """
    i = 0
    validation_bleu = [0]
    car_mentions = [0]
    classifier_losses = []
    language_model_losses = []
    caption_model_losses = []
    iterations = [0]
    valid_img_ids = np.array(valid_img_ids)
    valid_captions = np.array(valid_captions)
    # Loop-invariant second input of the language model: built once instead of
    # being recomputed for every batch.
    embed_input = tf.matmul(tf.zeros((1, max_sequence_length - 1, num_tokens)),
                            tf.convert_to_tensor(embedding_matrix, dtype=tf.float32))
    for epoch in range(epochs):
        for classifier_img, label_one_hot, caption_img, caption_front, caption_back, language_front, language_back in dataset:
            i += 1
            classifier_loss = image_classifier_model.train_on_batch(classifier_img, label_one_hot)
            language_model_loss = language_model.train_on_batch([language_front, embed_input], language_back)
            caption_loss = image_captioning_model.train_on_batch([caption_img, caption_front], caption_back)

            # Checkpoint after every batch via the module-level manager.
            manager.save()

            classifier_losses.append(classifier_loss)
            language_model_losses.append(language_model_loss)
            caption_model_losses.append(caption_loss)

            # Evaluate on a random subset (indices may repeat).
            random_valid_indices = np.random.randint(0, len(valid_img_ids), 128)
            bleu_score, percent_car_mentions = evaluate_caption_model(0.001, image_captioning_model, valid_img_ids[random_valid_indices], valid_captions[random_valid_indices], max_sequence_length)
            validation_bleu.append(bleu_score)
            car_mentions.append(percent_car_mentions)
            iterations.append(i)
            graph_bleu_curves(validation_bleu, car_mentions, classifier_losses, caption_model_losses, language_model_losses, iterations)



# Load every training image path and its raw caption from the annotation JSON.
train_img_ids, train_captions = img_ids_and_captions_from_json(PATH)
# 27548 unique tokens in captions, 22128 present in glove embedding vocabulary

# cache_vgg_features(train_img_ids)  # takes around 26 minutes with a GTX 1080 Ti to cache all images!!!
raw_embedding_dims = 50
glove_index = get_glove_embeddings(raw_embedding_dims)
# save_preprocessed_captions(train_captions, glove_index)  # takes a couple minutes to process
# Replaces the raw captions with the processed ones stored on disk; the file must
# already exist (it is written by the commented-out call above).
train_captions = retrieve_tokenized_captions_from_file()

# Vocabulary cap passed to the tokenizer as num_words.
top_n_words = 2500
# filters='\t\n' strips only tabs and newlines, so markers such as <start> stay intact.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_n_words, lower=True, oov_token="<unk>",
                                                  filters='\t\n')
tokenizer.fit_on_texts(train_captions)
tokenizer.word_index["<pad>"] = 0  # index 1 is for oov tokens, 0 is for the <pad> token
tokenizer.index_word[0] = "<pad>"

In [9]:
# Index the tokenizer assigned to the "<no_glove>" placeholder token.
tokenizer.word_index["<no_glove>"]

82

In [7]:
# Convert the captions to integer sequences and pad them (at the end) to the
# length of the longest caption.
train_sequences = tokenizer.texts_to_sequences(train_captions)
max_sequence_length = max(len(sequence) for sequence in train_sequences)
capped_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                                 maxlen=max_sequence_length,
                                                                 padding="post")

In [16]:
# Positions of the captions whose padded sequence contains the "<no_glove>" index.
no_gloves = [i for i, sequence in enumerate(capped_sequences) if tokenizer.word_index["<no_glove>"] in set(sequence)]

In [19]:
# Reload the raw captions from the annotation JSON. NOTE(review): this rebinds
# train_captions, which previously held the tokenized captions read from file, so
# cells that expect the tokenized form must not be re-run after this one.
train_img_ids, train_captions = img_ids_and_captions_from_json(PATH)

In [24]:
# Number of captions that contain at least one "<no_glove>" token.
len([train_captions[i] for i in no_gloves])

10536

In [23]:
# Number of distinct image paths among the caption annotations.
len(set(train_img_ids))

118287