Side note: all the cells until the `Pipelining Preprocessing Steps` are similar to the cells in `...phase_1.ipynb` file, as we've worked on preprocessing steps to be used in phase 2 when writing the code for phase 1

# Cells for Google Colab

In [22]:
import os
runningFromColab = False
if 'CGROUP_MEMORY_EVENTS' in os.environ and 'colab' in os.environ['CGROUP_MEMORY_EVENTS']:
  runningFromColab = True

In [23]:
if runningFromColab:
  from google.colab import drive
  drive.mount('/content/drive')

In [24]:
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects

In [25]:
if runningFromColab:
  !git clone https://github.com/OdyAsh/nlp-image-captioning.git

In [26]:
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects/nlp-image-captioning

In [27]:
if runningFromColab:
  !git pull
  # if it DOES NOT say "Already up to date.", then you need to close this notebook file (i.e., the browser tab) and open it again for it to change 

In [28]:
# if runningFromColab:
#   try:
#     import condacolab
#     condacolab.install()
#   except:
#     !pip install -q condacolab
#     import condacolab
#     condacolab.install()
#     # now restart the kernel

In [29]:
# if runningFromColab:
#   !conda env create -f environment.yml
#   # !conda update conda -y -q
#   # !source /usr/local/etc/profile.d/conda.sh
#   # !conda init 
#   # !conda install -n root _license -y -q
#   # !source activate myenv

In [30]:
# if runningFromColab:
#   import sys
#   sys.path.insert(0, '/usr/local/bin/conda')

# Imports & Global Functions/Variables

In [31]:
from pprint import pprint
from glob import glob
from time import time
import os
import pickle
import regex as re
import string
import nltk
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
#                          Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
# from tensorflow.keras.layers import Bidirectional
# from tensorflow.keras.layers import Add # merge.add
from tensorflow.keras.applications import inception_v3 # inception_v3.preprocess_input
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import preprocessing # preprocessing.image, preprocessing.sequence, preprocessing.text.Tokenizer, preprocessing.sequence.pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
def pklSave(contentToBeSaved, fullPath):
    with open(fullPath, 'wb') as f:
        pickle.dump(contentToBeSaved, f)

def pklLoad(fullPath):
    with open(fullPath, 'rb') as f:
        content = pickle.load(f)
    return content

def pklForceLoad(path, dtype = 'dict'):
    try:
        content = pklLoad(path)
        return content
    except Exception as e:
        if dtype == 'list':
            pklSave([], path)
            return []
        else:
            pklSave({}, path)
            return {}

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    return os.path.normpath(os.path.join(baseDirectory, relativePath))

In [34]:
datasetImgsBasePath = 'dataset/Flicker8k_Dataset/'
fullImgsPath = glob(datasetImgsBasePath + '*.jpg')
fullImgsPaths = [os.path.normpath(path) for path in fullImgsPath]
len(fullImgsPaths)

8091

# Data Collection
The dataset is obtained from [here](https://forms.illinois.edu/sec/1713398)

In [35]:
# checking the 5 captions per image
filename = "dataset/Flicker8k_TextFiles/Flickr8k.token.txt"
with open(filename, 'r') as f:
    doc = f.read()
lines = doc.split('\n')
print('first image\'s captions:')
pprint(lines[:5])
print('\nsecond image\'s captions:')
pprint(lines[5:10])
print('\nand so forth...')

first image's captions:
['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of '
 'stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse '
 '.',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her '
 'playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a '
 'wooden cabin .']

second image's captions:
['1001773457_577c3a7d70.jpg#0\tA black dog and a spotted dog are fighting',
 '1001773457_577c3a7d70.jpg#1\tA black dog and a tri-colored dog playing with '
 'each other on the road .',
 '1001773457_577c3a7d70.jpg#2\tA black dog and a white dog with brown spots '
 'are staring at each other in the street .',
 '1001773457_577c3a7d70.jpg#3\tTwo dogs of different breeds looking at each '
 'other on the road .',
 '1001773457_577c3a7d70.jpg#4\tTwo dogs on pavement moving toward each other .']

and s

The captions above are for these two images:

<img src="project_media/1000268201_693b08cb0e.jpg" width="100" />

<img src="project_media/1001773457_577c3a7d70.jpg" width="150" />

# Data Cleaning
Includes:
* `imgToCaptions` dictionary
* `cleanCaptions()` to remove stopwords/punctuations
* `createVocab()` to limit vocab size based on word frequency
* `train`, `val`, and `test` `ImgToCaptions` which prepends `startseq` and appends `endseq`

In [36]:
# getting these captions in a dictionary; 
# where the key is the image's name (without .jpg) and the value is a list of 5 captions

imgToCaptions = dict()
for line in lines:
    idAndCaption = re.split("\..+\t", line)
    if len(idAndCaption) < 2:
        continue
    imgId, caption = idAndCaption
    if imgId not in imgToCaptions:
        imgToCaptions[imgId] = list()
    imgToCaptions[imgId].append(caption)
    
imgToCaptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [37]:
# removing punctuation using maketrans (i.e., translation table)
# more about maketrans method: https://www.w3schools.com/python/ref_string_maketrans.asp#:~:text=The%20third%20parameter%20in%20the%20mapping%20table%20describes%20characters%20that%20you%20want%20to%20remove%20from%20the%20string%3A

# to do: ASK Dr: We've removed few stopwords in order for generated caption to make sense, is this logical?
#                We've also removed numbers

def cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1):
    table = str.maketrans('', '', string.punctuation) # third argument: removes any character in this list: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for key, descList in imgToCaptions.items():
        # when this for loop is done, all 5 captions of an image will be cleaned
        for i in range(len(descList)):
            desc = descList[i]
            desc = desc.split(' ')
            desc = [word.lower() for word in desc]
            desc = [word.translate(table) for word in desc] # remove punctuation from each token
            stopswordsToRemove = []
            if levelOfStopwordsPresence == 1:
                stopswordsToRemove = ['a', 'an', 'the']
            elif levelOfStopwordsPresence >= 2:
                stopswordsToRemove = set(stopwords.words('english'))
            desc = [word for word in desc if word not in stopswordsToRemove]
            desc = [word for word in desc if word.isalpha()] # remove tokens with numbers in them
            descList[i] =  ' '.join(desc) # store as string

# cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1)
# pklSave(imgToCaptions, 'dataset/pickles/imgToCaptionsSWKept.pickle')
imgToCaptions = pklLoad('dataset/pickles/imgToCaptionsSWKept.pickle')
imgToCaptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

example with stopwords removed:
<br><br>
'little girl climbing stairs playhouse',

<br>
example with only ['a', 'an', 'the'] removed:
<br><br>
'little girl climbing stairs to her playhouse',
<br><br>
from the lack of context seen above, we've decided to keep the rest of the stopwords

In [38]:
# creating vocab of unique words (where each word occured at least freqThreshold number of times)
def createVocab(imgToCaptions, freqThreshold = 10):
    vocab = set()
    for key in imgToCaptions.keys():
        [vocab.update(desc.split()) for desc in imgToCaptions[key]]
    print(f'Original vocabulary (i.e., unique words) size: {len(vocab)}')

    # keeping words that appear at least freqThrehold number of times
    vocabWordFreq = {key: 0 for key in vocab}
    for key, descs in imgToCaptions.items():
        for desc in descs:
            descList = desc.split(' ')
            for word in descList:
                if word != '':
                    vocabWordFreq[word] += 1
    
    vocab = set()
    i = 0
    for word, freq in vocabWordFreq.items():
        if freq >= freqThreshold:
            i += 1
            vocab.add(word)
            vocabWordFreq[word] = freq
    print(f'Vocabulary size after removing less frequent words (< {freqThreshold} words): {len(vocab)}')

    vocabWordFreqRemoved = {word: freq for word, freq in vocabWordFreq.items() if word not in vocabWordFreq}

    return vocab, vocabWordFreq, vocabWordFreqRemoved

def createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix="vocabFreqThreshold"):
    with open(f'dataset/{filePrefix}.txt', 'w') as f:
        f.write(str(dict(sorted(vocabWordFreq.items(), key=lambda x: x[1], reverse=True))))
    with open(f'dataset/{filePrefix}Removed.txt', 'w') as f:
        f.write(str(dict(sorted(vocabWordFreqRemoved.items(), key=lambda x: x[1], reverse=True))))

freqThreshold = 5
vocab, vocabWordFreq, vocabWordFreqRemoved = createVocab(imgToCaptions, freqThreshold)
createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix=f"vocabFreqThreshold{freqThreshold}")

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


In [39]:
# function to get filenames of images from a text file (without the extension)
def getImgsIdsList(txtPath):
    with open(txtPath, 'r') as f:
        doc = f.read()
    ImgsIds = []
    for line in doc.split('\n'):
        imgId = line.split('.')[0]
        ImgsIds.append(imgId)
    ImgsIds = [id for id in ImgsIds if id != '']
    return ImgsIds

trainImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.trainImages.txt')
valImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.devImages.txt')
testImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.testImages.txt')
print(f'Train Dataset: {len(trainImgsIds)}')
print(f'Validation Dataset: {len(valImgsIds)}')
print(f'Test Dataset: {len(testImgsIds)}')

Train Dataset: 6000
Validation Dataset: 1000
Test Dataset: 1000


In the code below, we use `startseq` and `endseq` for the following reasons:
* startseq : Will indicate the start of the caption generation process
* endseq : to stop predicting words as soon as it appears

In [40]:
trainImgToCaptions = dict()
valImgToCaptions = dict()
testImgToCaptions = dict()
for imgId in trainImgsIds:
    trainImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
for imgId in valImgsIds:
    valImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
for imgId in testImgsIds:
    testImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
print(f'images in training set: {len(trainImgToCaptions)}\n')
print(f'images in validation set: {len(valImgToCaptions)}\n')
print(f'images in testing set: {len(testImgToCaptions)}\n')
print('example from training set:')
pprint(trainImgToCaptions['2513260012_03d33305cf'])

images in training set: 6000

images in validation set: 1000

images in testing set: 1000

example from training set:
['startseq black dog is running after white dog in snow endseq',
 'startseq black dog chasing brown dog through snow endseq',
 'startseq two dogs chase each other across snowy ground endseq',
 'startseq two dogs play together in snow endseq',
 'startseq two dogs running through low lying body of water endseq']


# Preparing File Paths

In [41]:
trainImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in trainImgToCaptions.keys()]
valImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in valImgToCaptions.keys()]
testImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in testImgToCaptions.keys()]
len(trainImgsPaths), len(valImgsPaths), len(testImgsPaths)

(6000, 1000, 1000)

# Data Pre-Processing
Includes:
* Pre-Processing Images
* Pre-Processing Captions

## Pre-Processing Images
Includes:
* Loading Google's `InceptionV3` model
* Preprocessing the image
* Encoding the image by inputting it to `InceptionV3` to get a `2048` feature vector of the image

In [42]:
# getting the feature vector of each image using the InceptionV3 CNN model created by Google Research
model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
model.layers[-1].output

<KerasTensor: shape=(None, 1000) dtype=float32 (created by layer 'predictions')>

In [43]:
modelForFeatureExtraction = Model(model.input, model.layers[-2].output) # removing the last layer (output softmax layer)
modelForFeatureExtraction.layers[-1].output

<KerasTensor: shape=(None, 2048) dtype=float32 (created by layer 'avg_pool')>

In [44]:
# function to preprocess the input image
def preprocess(imgPath):
    pilImg = preprocessing.image.load_img(imgPath, target_size=(299, 299)) # Convert all the images to size 299x299 as expected by the inception v3 model
    x = preprocessing.image.img_to_array(pilImg) # Convert PIL image to numpy array of 3-dimensions
    x = np.expand_dims(x, axis=0) # Add one more dimension; from (299, 299, 3) to (1, 299, 299, 3)
    x = inception_v3.preprocess_input(x) # takes in (batch_size, height, width, channels), returns same dimensions, but does some preprocessing operations, like scaling values to be from -1 to 1
    return x

# function to encode a given image (from its path) into a vector of size (2048, )
def encode(imgPath, modelForFeatureExtraction):
    imgPath = preprocess(imgPath) # preprocess the image
    featureVec = modelForFeatureExtraction.predict(imgPath) # Get the encoding vector for the image
    featureVec = np.reshape(featureVec, featureVec.shape[1]) # reshape from (1, 2048) to (2048, )
    return featureVec

In [45]:
# Call the funtion to encode all the train images (dictionary where an image id --> feature vector of length 2048)
# This will take a while on CPU - Execute this only once (took around 13 minutes on my high-end laptop)
def encodeImgToFeatures(imgsPaths, modelForFeatureExtraction):
    imgToFeatures = dict()
    for imgPath in imgsPaths:
        imgToFeatures[imgPath[len(datasetImgsBasePath):]] = encode(imgPath, modelForFeatureExtraction)
    return imgToFeatures

# trainImgToFeatures = encodeImgToFeatures(trainImgsPaths, modelForFeatureExtraction)
# valImgToFeatures = encodeImgToFeatures(valImgsPaths, modelForFeatureExtraction)
# testImgToFeatures = encodeImgToFeatures(testImgsPaths, modelForFeatureExtraction)
# pklSave(trainImgToFeatures, 'dataset/pickles/trainImgToFeatures.pickle')
# pklSave(valImgToFeatures, 'dataset/pickles/valImgToFeatures.pickle')
# pklSave(testImgToFeatures, 'dataset/pickles/testImgToFeatures.pickle')
trainImgToFeatures = pklLoad('dataset/pickles/trainImgToFeatures.pickle')
valImgToFeatures = pklLoad('dataset/pickles/valImgToFeatures.pickle')
testImgToFeatures = pklLoad('dataset/pickles/testImgToFeatures.pickle')
len(trainImgToFeatures), len(valImgToFeatures), len(testImgToFeatures), trainImgToFeatures['2513260012_03d33305cf.jpg'].shape

(6000, 1000, 1000, (2048,))

## Pre-Processing Captions
Includes:
* `mapIdxAndWord()` to map indices to words and vice versa, where the words are obtained from `createVocab()`
* `maxCaptionLength()` to get the caption with the most amount of words, to be used later to pad input sequences <br> (explained in `Preparing Model Generator` section)

In [46]:
# creating two dictionaries: word to index, and index to word

def mapIdxAndWord(vocab):
    idxToWord = {}
    wordToIdx = {}
    idx = 1
    for word in vocab:
        wordToIdx[word] = idx
        idxToWord[idx] = word
        idx += 1
    return idxToWord, wordToIdx

vocab, _, _ = createVocab(imgToCaptions, freqThreshold=5)
idxToWord, wordToIdx = mapIdxAndWord(vocab)
vocabSize = len(idxToWord) + 1 # one for appended 0's; represents "startseq" (explained in "Preparing Model Generator" section)
vocabSize

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


2946

In [47]:
# getting the length of the longest caption; as we will later need to encode each word into a fixed sized vector

# convert a dictionary of clean captions to a list of captions
def toCaptionsList(ImgToCaptions):
	captionsList = list()
	for imgId in ImgToCaptions.keys():
		[captionsList.append(caption) for caption in ImgToCaptions[imgId]]
	return captionsList

# calculate the length of the description with the most words
def maxCaptionLength(ImgToCaptions):
    captions = toCaptionsList(ImgToCaptions)
    captionsLengths = [len(caption.split()) for caption in captions]
    return max(captionsLengths)

# determine the maximum sequence length
maxCapLen = maxCaptionLength(trainImgToCaptions)
print(f'Description Length: {maxCapLen}')

Description Length: 32


In [48]:
maxCaptionLength(testImgToCaptions)

29

Phase 2 to do list:
1. create function that pipelines all above pre-processing steps with parameters that allow for hyperparameter tuning
2. use Glove embedding
3. create function below `createVocabTxtFiles()` cell which visualizes top key-value pairs in `vocabFreqThreshold` text files
4. uncomment and use model training code cell at the end of the notebook

# Preparing Model Generator

In [49]:
# data generator, intended to be used in a call to model.fit_generator()
def dataGenerator(imgToCaptions, imgToFeatures, wordToIdx, vocabSize, maxCaptionLength, imgsBatchSize):
    X1, X2, y = list(), list(), list()
    n = 0
    # loop forever over images
    while True:
        for imgId, captions in imgToCaptions.items():
            n += 1
            imgFeatures = imgToFeatures[imgId+'.jpg'] # retrieve the image's feature vector
            for caption in captions:
                seq = [wordToIdx[word] for word in caption.split(' ') if word in wordToIdx] # encode the caption into a sequence of numbers instead of words
                for i in range(1, len(seq)): # split one sequence into multiple X, y pairs
                    inSeq, outSeq = seq[:i], seq[i] # split into input and output pair
                    inSeq = preprocessing.sequence.pad_sequences([inSeq], maxlen=maxCaptionLength, padding="pre")[0] # pad input sequence
                    outSeq = to_categorical([outSeq], num_classes=vocabSize)[0] # (one-hot) encodes the output sequence (note: to_categorical() is a keras-related function)
                    X1.append(imgFeatures) # store the values
                    X2.append(inSeq)
                    y.append(outSeq)
            # yield the batch data
            if n == imgsBatchSize:
                yield [[np.array(X1), np.array(X2)], np.array(y)] # "yield" saves the function's state, returns [[..]], then continues function from that statement when function is called again
                X1, X2, y = list(), list(), list()
                n = 0

Explanation of inner-most for loop above:

<img src="project_media/xi_as_words.png" width="600" />

However, since we're using `wordToIdx` mapping, the table above will be:

<img src="project_media/xi_as_idxss.png" width="600" />

Note 1: the table above is for the case of `post` padding. However, we'll assume `pre` padding, as it is [generally advised](https://stackoverflow.com/questions/46298793/how-does-choosing-between-pre-and-post-zero-padding-of-sequences-impact-results#:~:text=I%20always%20recommend%20using%20pre%2Dpadding%20over%20post%2Dpadding%2C%20even%20for%20CNNs%2C%20unless%20the%20problem%20specifically%20requires%20post%2Dpadding.)

Note 2: under the hood, `target word` is one-hot encoded representation of the numerical values displayed in the table above; this is because one-hot encoding the target word allows us to represent this probability distribution as a vector of probabilities over the entire vocabulary, which can be directly compared to the predicted probability distribution output by the neural network  


# Pipelining Preprocessing Steps

In [52]:
def preprocessing_pipeline(freq_threshold, model_for_feature_extraction, load_features=False):

    filename = "dataset/Flicker8k_TextFiles/Flickr8k.token.txt"
    with open(filename, 'r') as f:
        doc = f.read()
    lines = doc.split('\n')

    # getting these captions in a dictionary; 
    # where the key is the image's name (without .jpg) and the value is a list of 5 captions
    imgToCaptions = dict()
    for line in lines:
        idAndCaption = re.split("\..+\t", line)
        if len(idAndCaption) < 2:
            continue
        imgId, caption = idAndCaption
        if imgId not in imgToCaptions:
            imgToCaptions[imgId] = list()
        imgToCaptions[imgId].append(caption)

    freqThreshold = freq_threshold
    vocab, vocabWordFreq, vocabWordFreqRemoved = createVocab(imgToCaptions, freqThreshold)
    # createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix=f"vocabFreqThreshold{freqThreshold}")
    idxToWord, wordToIdx = mapIdxAndWord(vocab)
    vocabSize = len(idxToWord) + 1 # one for appended 0's; represents "startseq" (explained in "Preparing Model Generator" section)

    trainImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.trainImages.txt')
    valImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.devImages.txt')
    testImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.testImages.txt')

    trainImgToCaptions = dict()
    valImgToCaptions = dict()
    testImgToCaptions = dict()
    for imgId in trainImgsIds:
        trainImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
    for imgId in valImgsIds:
        valImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
    for imgId in testImgsIds:
        testImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]

    datasetImgsBasePath = 'dataset/Flicker8k_Dataset/'
    fullImgsPath = glob(datasetImgsBasePath + '*.jpg')
    trainImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in trainImgToCaptions.keys()]
    valImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in valImgToCaptions.keys()]
    testImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in testImgToCaptions.keys()]

    if load_features:
        trainImgToFeatures = pklLoad('dataset/pickles/trainImgToFeatures.pickle')
        valImgToFeatures = pklLoad('dataset/pickles/valImgToFeatures.pickle')
        testImgToFeatures = pklLoad('dataset/pickles/testImgToFeatures.pickle')   
    else: 
        trainImgToFeatures = encodeImgToFeatures(trainImgsPaths, model_for_feature_extraction) # shape of each encoded image: (2048,); returned by Google's Inception Model
        valImgToFeatures = encodeImgToFeatures(valImgsPaths, model_for_feature_extraction)
        testImgToFeatures = encodeImgToFeatures(testImgsPaths, model_for_feature_extraction)
        # pklSave(trainImgToFeatures, 'dataset/pickles/trainImgToFeatures.pickle')
    # pklSave(valImgToFeatures, 'dataset/pickles/valImgToFeatures.pickle')
    # pklSave(testImgToFeatures, 'dataset/pickles/testImgToFeatures.pickle')

    # determine the maximum sequence length
    maxCapLen = maxCaptionLength(trainImgToCaptions)

    return (imgToCaptions, vocab, vocabWordFreq, vocabWordFreqRemoved, 
            idxToWord, wordToIdx, vocabSize, maxCapLen,
            trainImgsIds, valImgsIds, testImgsIds, 
            trainImgToFeatures, valImgToFeatures, testImgToFeatures, 
            trainImgToCaptions, valImgToCaptions, testImgToCaptions)

# Word Embedding

## Using Glove Embedding

# Model Training

In [54]:
# hyperparameters to tune
freq_threshold = 5
model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
model_for_feature_extraction = Model(model.input, model.layers[-2].output) # removing the last layer (output softmax layer)
imgs_batch_size = 32
epochs = 10

(imgToCaptions, vocab, vocabWordFreq, vocabWordFreqRemoved, 
idxToWord, wordToIdx, vocabSize, maxCapLen,
trainImgsIds, valImgsIds, testImgsIds, 
trainImgToFeatures, valImgToFeatures, testImgToFeatures, 
trainImgToCaptions, valImgToCaptions, testImgToCaptions) = preprocessing_pipeline(freq_threshold, model_for_feature_extraction, load_features=False) # set load_features=True if you won't change "weights" argument of "model" variable to avoid bottleneck of using Google's Inception Model

train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
val_datagen = dataGenerator(valImgToCaptions, valImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
test_datagen = dataGenerator(testImgToCaptions, testImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)

Original vocabulary (i.e., unique words) size: 9630
Vocabulary size after removing less frequent words (< 5 words): 3107


In [None]:
steps = len(trainImgToCaptions) // imgs_batch_size
for i in range(1, epochs+1):
    train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
    model.fit(train_datagen, epochs=epochs, steps_per_epoch=steps, verbose=2)
    model.save(f'models/changingEpochs/modelWith{str(i)}Epochs.h5') # change file name to include other hyperparameters as well

In [19]:
# to do (phase 2):
# code for embedding
# code for model architecture
# code for model training; provided here: vvv

# epochs = 10
# numImgsPerBatch = 3
# steps = len(trainImgToCaptions)//numImgsPerBatch
# for i in range(1, epochs+1):
#     generator = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, maxCapLen, numImgsPerBatch)
#     model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
#     model.save(f'models/changingEpochs/modelWith{str(i)}Epochs.h5')

# code for validating/testing (inference)