In [1]:
# import modules

import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Embedding, add

In [2]:
# setting up directories

BASE_DIR = '/kaggle/input/flickr8k'
WORKING_DIR = '/kaggle/working'

In [3]:
# Launching the model
model = VGG16()
# modifying the model
model = Model(inputs = model.inputs, outputs = model.layers[-2].output) #removing FCN layers of the architecture
# summarizing the model
model.summary()

2022-07-17 12:43:23.781497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 12:43:23.885729: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 12:43:23.886469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-17 12:43:23.887555: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     14758

In [4]:
features = {}

In [5]:
# image feature extraction
directory = os.path.join(BASE_DIR, 'Images') 
for img_name in tqdm(os.listdir(directory)):
    # loading a image
    img_path = directory + '/' + img_name
    image = load_img(img_path, target_size = (224, 224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose = 0)
    img_id = img_name.split('.')[0]
    features[img_id] = feature

  0%|          | 0/8091 [00:00<?, ?it/s]

2022-07-17 12:43:30.987079: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-07-17 12:43:31.953670: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


In [6]:
# Storing extracted features in pickle
pickle.dump(features, open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb'))

In [7]:
# loading data from pickle
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

In [8]:
#loading captions file

with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_file = f.read()

In [9]:
#image captions mapping
captions_mapping = {}

for L in tqdm(captions_file.split('\n')):
    #splitting the image name and captions.
    idcap = L.split(',')
    if(len(L) < 2):
        continue
    image_id, caption = idcap[0], idcap[1]
    #removing image id 
    image_id = image_id.split('.')[0]
    
    caption = ''.join(caption)
    
    if image_id not in captions_mapping:
        captions_mapping[image_id] = []
    captions_mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [10]:
print(len(captions_mapping), len(features))

8091 8091


In [11]:
# Preprocessing the captions data
def clean(captions_mapping):
    for key, captions in captions_mapping.items():
        for i in range(len(captions)):
            # Taking one caption at a time
            caption = captions[i]
            # Convert to lower case
            caption = caption.lower()
            # Delete digits, special characters etc...
            caption = caption.replace('[^A-Za-z]', '')
            # Delete additional spaces
            caption = caption.replace('\s+', ' ')
            # Adding start and end tags to the caption
            caption = '<start> ' + ' '.join([word for word in caption.split() if len(word) > 1]) + ' <end>'
            captions[i] = caption

In [12]:
# Before preprocessing the captions
captions_mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [13]:
# After preprocessing the captions
clean(captions_mapping)
captions_mapping['1000268201_693b08cb0e']

['<start> child in pink dress is climbing up set of stairs in an entry way <end>',
 '<start> girl going into wooden building <end>',
 '<start> little girl climbing into wooden playhouse <end>',
 '<start> little girl climbing the stairs to her playhouse <end>',
 '<start> little girl in pink dress going into wooden cabin <end>']

In [14]:
# Gathering all the captions into a single list
all_captions = []
for key in captions_mapping:
    for caption in captions_mapping[key]:
        all_captions.append(caption)

In [15]:
len(all_captions)

40455

In [16]:
# Counts total no.of words in a array of strings excluding some special characters like punctuation marks
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

In [17]:
vocab_size

8311

In [18]:
# Get the maximum length of the caption
max_cap_len = max([len(caption.split()) for caption in all_captions])
max_cap_len

31

In [19]:
# Splitting the tarin and test data
img_ids = list(captions_mapping.keys())
split = int(len(img_ids)*0.9)
train_data = img_ids[:split]
test_data = img_ids[split:]

In [20]:
print(len(train_data), len(test_data))

7281 810


In [21]:
#data in batches
def data_batch(data_keys, captions_mapping, features, tokenizer, max_cap_len, vocab_size, batch_size):
    X1, X2, y = list(),  list(), list()
    count = 0
    while 1:
        for key in data_keys:
            count += 1
            captions = captions_mapping[key]
            for caption in captions:
                seq = tokenizer.texts_to_sequences([caption])[0]
                for i in range(1, len(seq)):
                    in_seq = seq[:i]
                    out_seq = seq[i]
                    
                    in_seq = pad_sequences([in_seq], maxlen = max_cap_len)[0]
                    
                    out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
                    
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if(count == batch_size):
                X1 = np.array(X1)
                X2 = np.array(X2)
                y = np.array(y)
                yield [X1,X2], y
                X1, X2, y = list(),  list(), list()
                count = 0

In [22]:
#Model creation

inputs1 = Input(shape = (4096,))
feature1 = Dropout(0.4)(inputs1)
feature2 = Dense(256, activation = 'relu')(feature1)

inputs2 = Input(shape = (max_cap_len,))
sequence1 = Embedding(vocab_size, 256, mask_zero = True)(inputs2)
sequence2 = Dropout(0.4)(sequence1)
sequence3 = LSTM(256)(sequence2)

decoder1 = add([feature2, sequence3])
decoder2 = Dense(256, activation = 'relu')(decoder1)
outputs = Dense(vocab_size, activation = 'softmax')(decoder2)

model = Model(inputs = [inputs1,inputs2], outputs = outputs)
model.compile(loss = 'categorical_crossentropy', optimizer='adam')

plot_model(model, show_shapes = True);

In [23]:
epochs = 15
batch_size = 64
steps = len(train_data)// batch_size

for i in range(epochs):
    generator = data_batch(train_data, captions_mapping, features, tokenizer, max_cap_len, vocab_size, batch_size)
    model.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1)
    



In [24]:
model.save(WORKING_DIR+'/Model.h5')

