## Importing Necessary Libraries

In [7]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load

from keras.applications.xception import Xception, preprocess_input #Xception - CNN model
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm #Library to see progress of loops
tqdm.pandas()

## Data Preprocessing

In [8]:
# Loading text file into memory
def load_doc(filename):
  #Opening file as read only
  file = open(filename, 'r')
  text = file.read()
  return text

# Get all imgs + captions in a dictionary
def img_captions_dict(filename):
  file = load_doc(filename)
  captions = file.split('\n')
  descriptions = {} # key = img filenames, values = 5 captions
  for caption in captions[:-1]: # excluding last element which may be empty string if file ends with newline
    img, caption = caption.split('\t')
    if img[:-2] not in descriptions: # excluding last 2 chars which are commented caption nos. (#0, #1, ...)
      descriptions[img[:-2]] = [ caption ] # if img filename is not already in dict
    else:
      descriptions[img[:-2]].append(caption) # if img filename is already in dict
    return descriptions

# Data Cleaning: lowercasing, removing punctuation, removing words containing numbers
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation) # creates a translation table mapping where each string.punctuation char is mapped to None (to later remove it during translation)
    # str.maketrans(x=chars to map from, y=chars to map to, z=chars to delete)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ") # not inplace replacement
            desc = img_caption.split() # split currect caption into individual words
            desc = [word.lower() for word in desc] #converts to lowercase
            desc = [word.translate(table) for word in desc] #remove punctuation from each token
            desc = [word for word in desc if(len(word)>1)] #remove hanging 's and a (single char words)
            desc = [word for word in desc if(word.isalpha())] #remove tokens with numbers in them (keeps only words that consist entirely of alphabetic characters)

            img_caption = ' '.join(desc) #join cleaned words back together into a string
            captions[img][i]= img_caption
    return captions

# Build vocabulary of all unique words
def text_vocabulary(descriptions):
    vocab = set() #set to store unique words (not duplicates)
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

# Save all descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

# Set path according to project folder
dataset_text = 'Flickr8k_text'
dataset_images = 'Flicker8k_Dataset'

#Prepare text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = img_captions_dict(filename)
print("Length of descriptions =" ,len(descriptions))

#cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

#building vocabulary
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

#saving each description to file
save_descriptions(clean_descriptions, "descriptions.txt")

Length of descriptions = 1
Length of vocabulary =  13


## Extracting Feature Vectors from Images

In [9]:
# Extract Features from Exception Model
def extract_features(directory):
        model = Xception( include_top=False, pooling='avg' )
        #include_top=False excludes final classification layer
        #pooling='avg' means global average pooling will be applied to the output,
        #resulting in a 2048 dimension output vector for each image
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename) #opens image with PIL - Python Imaging Library
            image = image.resize((299,299)) #Reqd input size for Xception model = 299x299
            image = np.expand_dims(image, axis=0) #Adds an extra dimension at axis 0 (batch size dimension) reqd by Keras model
            #image = preprocess_input(image)
            #pixel normalization:
            image = image/127.5 
            image = image - 1.0
            feature = model.predict(image, verbose = 0) #o/p = feature vector of 2048 dimensions
            features[img] = feature #store feature vector in dict with img filename as key
        return features

# #2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))

100%|██████████| 8091/8091 [11:21<00:00, 11.87it/s]


In [10]:
features = load(open("features.p", "rb")) #loading previously extracted features from file

## Loading Dataset

In [11]:
#load the text file in a string and return list of image names
def load_photos(filename): #filename = path to text file containing list of image names
    file = load_doc(filename) #read file contents
    photos = file.split("\n")[:-1] #exclude last empty line
    return photos


def load_clean_descriptions(filename, photos):
    file = load_doc(filename) #read text file with image captions
    descriptions = {}
    for line in file.split("\n"):

        words = line.split()
        if len(words)<1 :
            continue

        image, image_caption = words[0], words[1:]

        if image in photos: #filter captions to only include images in the given photos list
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            #add stard and end tokens to help model learn where captions begin and end
            descriptions[image].append(desc)

    return descriptions #dictionary with img filename as key and list of captions as values


def load_features(photos):
    all_features = load(open("features.p","rb"))
    #filtering only needed features
    features = {k:all_features[k] for k in photos}
    return features


filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"

#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

## Tokenizing Vocabulary

In [12]:
#converting dictionary to clean list of captions 
#i.e. flatten the dict into single list of all captions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

#creating tokenizer class
from tensorflow.keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer
#assignes a unique integer to each word in the vocabulary

# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1 #total no. of unique words + 1 for padding (index 0)
vocab_size

#CHECKPOINT
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

14

## Input Output Generator (generate data in batches)

In [13]:
#create input-output sequence pairs from the image description.

#data generator, used by model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ((input_image, input_sequence), output_word)

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

#You can check the shape of the input and output for your model
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))

((15, 2048), (15, 14), (15, 16))

## Creating the CNN-RNN Model

In [14]:
from keras.utils import plot_model

# define the captioning model
def define_model(vocab_size, max_length):

    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)

    return model

## Training the Model

In [16]:
# train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

# making a directory models to save our models
#os.mkdir("models")
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".keras")

Dataset:  6000
Descriptions: train= 1
Photos: train= 6000
Vocabulary Size: 16
Description Length:  14


None
You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 2.6971
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 2.6970
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 2.7452
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 2.6196
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 2.6861
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 2.6809
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 2.6627
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 2.6346
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.5699
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 2.5766
