In [None]:
import os
import json
import time
import random
import pickle
import collections
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob

from image_captioning.constants import DATA_DIR
from image_captioning.data_pipeline import input_dataset
from image_captioning.model import text_vectorization, encoder, decoder

# Download data

This can take some time as it's ~13GB of data.

In [None]:
# Download caption annotation files.
annotation_file = os.path.join(DATA_DIR, 'annotations/captions_train2014.json')

# Test for the annotation file itself rather than for DATA_DIR: the directory
# may already exist (e.g. created by another download or an interrupted run)
# without the annotations being present.
if not os.path.exists(annotation_file):
    # NOTE(review): assumes get_file returns the path of the downloaded archive
    # and extracts it into `cache_subdir` — confirm for the installed Keras version.
    annotation_zip = tf.keras.utils.get_file(
        fname='captions.zip',
        origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
        cache_subdir=DATA_DIR,
        extract=True
    )
    os.remove(annotation_zip)  # the archive is not needed once it has been extracted

In [None]:
# Download image files.
image_folder = os.path.join(DATA_DIR, 'train2014')

# Test for the image folder itself: DATA_DIR already exists once the
# annotations have been downloaded, so testing DATA_DIR would always skip
# the image download.
if not os.path.exists(image_folder):
    image_zip = tf.keras.utils.get_file(
        fname='train2014.zip',
        origin='http://images.cocodataset.org/zips/train2014.zip',
        # Extract next to the annotations. NOTE(review): this assumes the archive
        # contains a top-level 'train2014/' directory (the same way the annotation
        # archive yields 'annotations/'), so the images end up in `image_folder`.
        cache_subdir=DATA_DIR,
        extract=True
    )
    os.remove(image_zip)  # the archive is not needed once it has been extracted

# Load data

In [None]:
# Read the caption annotations. The encoding is given explicitly so that the
# result does not depend on the platform's default text encoding.
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

In [None]:
# Map every image path to the list of its captions; each caption is wrapped
# in <start> / <end> markers.
imgpath_to_caption = collections.defaultdict(list)
for record in annotations['annotations']:
    image_filename = f"COCO_train2014_{record['image_id']:012d}.jpg"
    imgpath_to_caption[os.path.join(image_folder, image_filename)].append(
        f"<start> {record['caption']} <end>"
    )

# One entry per distinct image, in first-seen order.
image_paths = list(imgpath_to_caption)

In [None]:
# Show one randomly chosen image, titled with all of its captions.
random_image_path = np.random.choice(image_paths)
captions_title = "\n".join(imgpath_to_caption[random_image_path])

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(plt.imread(random_image_path))
ax.set_title(captions_title, fontsize=14);

# Preprocess image data

In [None]:
# Run one randomly chosen image through the preprocessing function and show it.
random_image_path = np.random.choice(image_paths)
preprocessed_img = input_dataset.load_and_preprocess_image(random_image_path)[0]

print(preprocessed_img.shape)

# Shift and scale the pixel values for display.
# NOTE(review): presumably the preprocessing yields values in [-1, 1] — confirm.
display_img = (preprocessed_img + 1) / 2
plt.imshow(display_img);

# Encode images

In [None]:
# Flatten the path -> captions mapping into two parallel lists: the image path
# is repeated once per caption so that every caption has its own image path.
caption_path_pairs = [
    (path, text)
    for path in image_paths
    for text in imgpath_to_caption[path]
]
all_imgpaths = [path for path, _ in caption_path_pairs]
all_captions = [text for _, text in caption_path_pairs]

In [None]:
# InceptionV3 with ImageNet weights and without its top layers (include_top=False);
# it is used below to turn images into feature maps.
inceptionV3 = tf.keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
)

In [None]:
def imgpath_to_featurepath(path, features_dir=None):
    """Return the path of the ``.npy`` feature file that belongs to an image.

    Args:
        path: Path of the source image. Only its base name is used.
        features_dir: Directory that holds the feature files. Defaults to
            ``<DATA_DIR>/train2014_features``.

    Returns:
        ``features_dir`` joined with the image's base name, with the file
        extension replaced by ``.npy``.
    """
    if features_dir is None:
        # store features in a directory separate from the images
        features_dir = os.path.join(DATA_DIR, 'train2014_features')
    # splitext swaps only the real extension: a '.jpg' occurring elsewhere in
    # the name is left alone, and any other extension still maps to '.npy'.
    stem, _ = os.path.splitext(os.path.basename(path))
    return os.path.join(features_dir, stem + '.npy')

In [None]:
features_dir = os.path.join(DATA_DIR, 'train2014_features')
os.makedirs(features_dir, exist_ok=True)  # directory where the processed features are stored

# Find the unique images that have no feature file yet. The comparison is done
# on feature-file names produced by imgpath_to_featurepath, so the mapping is
# defined in one place only and does not depend on what the directory part of
# the paths looks like.
processed_names = {os.path.basename(p) for p in glob(os.path.join(features_dir, '*.npy'))}
encode_images_list = sorted(
    p for p in set(all_imgpaths)
    if os.path.basename(imgpath_to_featurepath(p)) not in processed_names
)
print(f"Number of images left to process: {len(encode_images_list)}")

if len(encode_images_list) > 0:
    # create dataset that returns images and their corresponding filepaths
    image_dataset = tf.data.Dataset.from_tensor_slices(encode_images_list)
    image_dataset = image_dataset.map(input_dataset.load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    image_dataset = image_dataset.batch(32)

    for batch_imgs, batch_paths in tqdm(image_dataset):

        batch_features = inceptionV3(batch_imgs)  # output shape (bs, 8, 8, 2048)
        # flatten the spatial dimensions: (bs, 8, 8, 2048) -> (bs, 64, 2048)
        batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))

        # write one .npy file per image
        for bf, p in zip(batch_features, batch_paths):
            img_path = p.numpy().decode("utf-8")
            feature_path = imgpath_to_featurepath(img_path)
            np.save(feature_path, bf.numpy())

# Preprocess and tokenize captions

In [None]:
# Both values are passed to text_vectorization.fit_text_vectorizer when the tokenizer is fitted.
max_length = 50  # max word count for a caption
vocabulary_size = 5000  # use the top 5000 words for the vocabulary

In [None]:
%%time

tokenizer_path = os.path.join(DATA_DIR, 'experiment/tokenizer.pkl')

if os.path.exists(tokenizer_path):
    tokenizer = text_vectorization.load_text_vectorizer(tokenizer_path)
else:
    tokenizer = text_vectorization.fit_text_vectorizer(all_captions, text_vectorization.standardize_text, max_length, vocabulary_size)
    text_vectorization.save_text_vectorizer(tokenizer, tokenizer_path)

# Split data into train/validation

In [None]:
# Display the sizes of the two parallel lists (one image path per caption).
len(all_captions), len(all_imgpaths)

In [None]:
SPLIT_SEED = 42  # fixed seed so that the split is identical on every run
TRAIN_FRACTION = 0.8  # share of the images that goes into the training set


def _features_and_captions(img_paths):
    """Build parallel lists of feature-file paths and captions for the given images.

    The feature path of an image is repeated once per caption of that image,
    so both returned lists have the same length.
    """
    feature_paths, captions = [], []
    for img_path in img_paths:
        img_captions = imgpath_to_caption[img_path]
        feature_paths.extend([imgpath_to_featurepath(img_path)] * len(img_captions))
        captions.extend(img_captions)
    return feature_paths, captions


# Split on image level so that all captions of one image end up on the same side.
# Sorting gives a stable starting order (the iteration order of a set of strings
# can differ between interpreter runs) and the seeded shuffle makes the split
# reproducible. Otherwise, training resumed from a checkpoint after a kernel
# restart would see images that previously belonged to the validation set.
unique_imgpaths = sorted(set(all_imgpaths))
random.Random(SPLIT_SEED).shuffle(unique_imgpaths)

slice_index = int(len(unique_imgpaths) * TRAIN_FRACTION)

train_featurepaths, train_captions = _features_and_captions(unique_imgpaths[:slice_index])
val_featurepaths, val_captions = _features_and_captions(unique_imgpaths[slice_index:])

In [None]:
# Display the sizes of the training and validation lists.
len(train_featurepaths), len(train_captions), len(val_featurepaths), len(val_captions)

# TF dataset for training

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000  # size of the shuffle buffer of the training dataset
embedding_dim = 256
units = 512
# Number of batches per epoch, rounded up so that a final partial batch is
# counted as well (the training dataset is batched without dropping the remainder).
num_steps = (len(train_featurepaths) + BATCH_SIZE - 1) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [None]:
# Load the numpy files
def map_func(img_name, cap):
    """Load the feature array stored at ``img_name`` and pass ``cap`` through.

    Args:
        img_name: Path of a ``.npy`` file as UTF-8 encoded bytes.
        cap: The caption belonging to the file; returned unchanged.

    Returns:
        Tuple ``(img_tensor, cap)`` where ``img_tensor`` is the loaded array.

    Raises:
        ValueError: If the file contains pickled objects. The feature files
            are plain numeric arrays, so unpickling is deliberately disabled.
    """
    img_tensor = np.load(img_name.decode('utf-8'), allow_pickle=False)
    return img_tensor, cap

In [None]:
# Pair every feature-file path with its caption.
dataset = tf.data.Dataset.from_tensor_slices((train_featurepaths, train_captions))

# Run each caption through the tokenizer.
dataset = dataset.map(lambda path, text: (path, tokenizer(text)))


def _load_features(path, tokens):
    """Wrap map_func so that it can be used inside the tf.data pipeline."""
    return tf.numpy_function(map_func, [path, tokens], [tf.float32, tf.int64])


# Use map to load the numpy files in parallel
dataset = dataset.map(_load_features, num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

# Model

In [None]:
# Build the encoder and the decoder; the decoder's vocabulary size is taken from the tokenizer.
vocab_size = tokenizer.vocabulary_size()

cnn_encoder = encoder.CNN_Encoder(embedding_dim)
rnn_decoder = decoder.RNN_Decoder(embedding_dim, units, vocab_size)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that padding can be masked out below
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)


def loss_function(real, pred):
    """Cross-entropy between ``real`` token ids and ``pred`` logits, ignoring id 0.

    Positions where ``real`` equals 0 contribute a loss of zero. The result is
    the mean over all positions, the zeroed ones included.
    """
    per_example_loss = loss_object(real, pred)
    # 1.0 where the target is a real token, 0.0 where it is 0
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example_loss.dtype)
    return tf.reduce_mean(per_example_loss * keep)

# Checkpoint

In [None]:
# Track encoder, decoder and optimizer in one checkpoint; the manager keeps
# the five most recent checkpoints in `checkpoint_path`.
checkpoint_path = os.path.join(DATA_DIR, 'experiment/checkpoints/train')
ckpt = tf.train.Checkpoint(
    encoder=cnn_encoder,
    decoder=rnn_decoder,
    optimizer=optimizer,
)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

In [None]:
# Resume from the latest checkpoint in checkpoint_path, if there is one.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    # The number after the last '-' in the checkpoint name is used as the epoch to resume from.
    # NOTE(review): that suffix is the checkpoint number; it equals the number of
    # completed epochs only if checkpoints are saved with that number — confirm.
    start_epoch = int(latest_ckpt.rsplit('-', 1)[-1])
    ckpt.restore(latest_ckpt)
else:
    start_epoch = 0

# Training

In [None]:
# Create mappings for words to indices and indices to words.
vocabulary = tokenizer.get_vocabulary()

word_to_index = tf.keras.layers.StringLookup(mask_token="", vocabulary=vocabulary)
index_to_word = tf.keras.layers.StringLookup(mask_token="", vocabulary=vocabulary, invert=True)

In [None]:
# Kept in its own cell on purpose: if this lived in the training cell, the
# recorded losses would be reset every time that cell is run again.
loss_plot = []

In [None]:
@tf.function
def train_step(img_tensor, target):
    """Run one optimisation step on a batch, using teacher forcing.

    Args:
        img_tensor: Batch of image features, fed to ``cnn_encoder``.
        target: Batch of token-id sequences; axis 0 is the batch, axis 1 the
            position in the caption.

    Returns:
        Tuple ``(loss, total_loss)``: the loss summed over all decoding steps,
        and that sum divided by the sequence length.
    """
    batch_size, seq_len = target.shape[0], target.shape[1]
    loss = 0

    # The hidden state starts fresh for every batch because the captions of
    # different images are unrelated.
    hidden = rnn_decoder.reset_state(batch_size=batch_size)

    # The first decoder input is the <start> token for every example.
    start_id = word_to_index('<start>')
    dec_input = tf.expand_dims([start_id] * batch_size, 1)

    with tf.GradientTape() as tape:
        features = cnn_encoder(img_tensor)

        for step in range(1, seq_len):
            # one decoder step on the encoded image features
            predictions, hidden, _ = rnn_decoder(dec_input, features, hidden)
            loss += loss_function(target[:, step], predictions)

            # teacher forcing: the ground-truth token is the next input
            dec_input = tf.expand_dims(target[:, step], 1)

    total_loss = loss / int(seq_len)

    # Gradients are taken of the summed loss, over encoder and decoder weights.
    trainable_variables = cnn_encoder.trainable_variables + rnn_decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [None]:
EPOCHS = 20
CHECKPOINT_EVERY = 5  # save a checkpoint after every this many completed epochs

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0
    num_batches = 0  # batches actually seen; includes a final partial batch

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        num_batches += 1

        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')

    # Mean loss per batch for this epoch, as a plain Python float; max() guards
    # against a division by zero if the dataset is empty.
    epoch_loss = float(total_loss) / max(num_batches, 1)
    # storing the epoch end loss value to plot later
    loss_plot.append(epoch_loss)

    # Number the checkpoint with the count of completed epochs, so that the
    # numeric suffix of the checkpoint name is the epoch to resume from.
    # The last epoch is always saved so that the final weights are not lost.
    epochs_done = epoch + 1
    if epochs_done % CHECKPOINT_EVERY == 0 or epochs_done == EPOCHS:
        ckpt_manager.save(checkpoint_number=epochs_done)

    print(f'Epoch {epoch+1} Loss {epoch_loss:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

In [None]:
# Plot the loss value recorded at the end of every epoch.
fig, ax = plt.subplots()
ax.plot(loss_plot)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Plot')