## 1) Imports

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import re
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from tqdm import tqdm

## 2) Get Images and captions together

In [3]:
# 13 GB of data, get captions and image names in vectors
# The dataset root defaults to the original local location; set the
# DLNLP_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell. Trailing slashes on the override are dropped so
# the joined paths keep exactly one separator.
DATA_ROOT = os.environ.get("DLNLP_DATA_DIR", "D:/DLNLP_Project_Data").rstrip("/\\")
annotations_file = DATA_ROOT + "/annotations/captions_train2014.json"
# imgs_path keeps its trailing slash: later cells append the file name directly.
imgs_path = DATA_ROOT + "/train2014/"

In [4]:
# Read the caption annotations. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform default (which is not UTF-8
# on every system).
with open(annotations_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

# Parallel lists: all_captions[i] is a caption for the image stored at
# all_img_name_vector[i]. One entry is appended per annotation record.
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    # Wrap every caption in explicit start/end markers.
    caption = '<start> ' + annot['caption'] + ' <end>'
    image_id = annot['image_id']
    # Image file name: fixed prefix + image id zero-padded to 12 digits.
    full_coco_image_path = imgs_path + 'COCO_train2014_' + '%012d.jpg' % image_id
    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

In [5]:
# all_img_name_vector receives one path per caption annotation, so the figure
# printed here is the number of caption/image pairs rather than distinct files.
pair_count = len(all_img_name_vector)
print("Total number of images: ", pair_count)

Total number of images:  414113


In [6]:
# Shuffle captions and image paths in unison so each caption stays aligned
# with its image; the fixed random_state makes the order reproducible.
shuffled_pair = shuffle(all_captions, all_img_name_vector, random_state=1)
train_captions, img_name_vectors = shuffled_pair

## 3) Preprocess images with Inception V3
InceptionV3 (pretrained on ImageNet) is used as the encoder. Its classification head is dropped (`include_top=False`) and we extract features from the last convolutional layer. Each image is resized to the 299x299 input format and its pixels are normalized to the range -1 to 1.

In [7]:
def load_image(img_path):
    """Read a JPEG from disk and prepare it as InceptionV3 input.

    The file is decoded to 3 channels, resized to 299x299 and passed through
    the InceptionV3 ``preprocess_input`` function.

    Args:
        img_path: path of the image file to read.

    Returns:
        A ``(image, img_path)`` tuple; the path is passed through unchanged so
        the caller can tell which file each image came from.
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), img_path

In [9]:
# Build the feature extractor: InceptionV3 without its top (classification)
# layers, initialised with the weights from ImageNet training.
# This will download the pretrained weights.
image_model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False)

# The extractor maps the network's own input to the output of its final layer.
new_input, hidden_layer = image_model.input, image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [11]:
# Now cache features extracted from the CNN
# This takes A LOT of time, so images whose features are already on disk are
# skipped: an interrupted run can be re-executed and continues where it stopped.
# Get unique names
encode_train = sorted(set(img_name_vectors))

# np.save appends ".npy", so the features of "<image>.jpg" live in "<image>.jpg.npy".
pending_paths = [img_file for img_file in encode_train
                 if not os.path.exists(img_file + '.npy')]

feature_batch_size = 16
# An explicit string tensor keeps the dataset well-typed even when nothing is pending.
image_dataset = tf.data.Dataset.from_tensor_slices(tf.constant(pending_paths, dtype=tf.string))
image_dataset = image_dataset.map(
    load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(feature_batch_size)

# Ceiling division: number of batches, so tqdm can show progress and an ETA.
n_batches = -(-len(pending_paths) // feature_batch_size)

for img, path in tqdm(image_dataset, total=n_batches):
    batch_features = image_features_extract_model(img)
    # Flatten the two spatial axes into one: (batch, locations, channels).
    batch_features = tf.reshape(batch_features,
                                (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8") + '.npy'
        # Write to a temporary file and rename it afterwards, so an interrupted
        # run never leaves a truncated .npy that the skip check would trust.
        partial_path = path_of_feature + '.tmp'
        with open(partial_path, 'wb') as feature_file:
            np.save(feature_file, bf.numpy())
        os.replace(partial_path, path_of_feature)

104it [11:56,  4.82s/it]

KeyboardInterrupt: 

## 4) Preprocessing captions

In [12]:
# Length of the longest caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest sequence in ``tensor``.

    Args:
        tensor: an iterable of sized items (e.g. a list of token-id lists).

    Returns:
        The largest ``len(item)`` over all items.

    Raises:
        ValueError: if ``tensor`` is empty.
    """
    return max(map(len, tensor))
# Choose the top 5000 words from the vocabulary
top_k = 5000
# The filter list leaves out '<' and '>' so the '<start>' / '<end>' markers
# added to every caption survive tokenization. The backslash is escaped
# explicitly; the resulting filter string is unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)

# Set word index and index word for 'pad' to 0 and vice versa
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors (done once; the whole caption corpus is converted here)
train_seqs = tokenizer.texts_to_sequences(train_captions)

# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

# Calculates the max_length, which is used to store the attention weights
max_length = calc_max_length(train_seqs)

In [14]:
# 80/20 train/validation split of the (image path, padded caption) pairs,
# with a fixed random_state for reproducibility.
# NOTE(review): the split is made per pair, not per image. If an image has
# several captions, its pairs can land on both sides of the split - confirm
# whether that overlap is acceptable for validation.
split_parts = train_test_split(img_name_vectors, cap_vector, test_size=0.2, random_state=0)
img_name_train, img_name_val, cap_train, cap_val = split_parts

In [15]:
# Sizes of the four split parts: (train paths, train captions, val paths, val captions)
tuple(len(part) for part in (img_name_train, cap_train, img_name_val, cap_val))

(331290, 331290, 82823, 82823)

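So we train on 331290 image-caption pairs and validate on 82823. These counts are pairs, not distinct images: each entry in the split is one caption together with the path of its image.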

## 5) Create dataset for training