# Cells for Google Colab

In [1]:
import os

# The session is treated as Google Colab when the CGROUP_MEMORY_EVENTS environment
# variable exists and its value contains the substring 'colab'.
runningFromColab = 'colab' in os.environ.get('CGROUP_MEMORY_EVENTS', '')

In [2]:
# Colab only: mount Google Drive at /content/drive so the project folder becomes reachable.
if runningFromColab:
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Colab only: make the Drive folder that holds the Colab projects the working directory.
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects

/content/drive/MyDrive/ColabProjects


In [13]:
if runningFromColab:
  !git clone https://github.com/OdyAsh/nlp-image-captioning.git

fatal: destination path 'nlp-image-captioning' already exists and is not an empty directory.


In [4]:
# Colab only: move into the cloned repository so the relative paths used below resolve.
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects/nlp-image-captioning

/content/drive/MyDrive/ColabProjects/nlp-image-captioning


In [14]:
# Colab only: fetch the latest version of the repository.
if runningFromColab:
  !git pull
  # if the output is not "Already up to date.", close this notebook (i.e., the browser tab) and reopen it so the pulled changes are shown

Already up to date.


In [10]:
if runningFromColab:
  try:
    import condacolab
    condacolab.install()
  except:
    !pip install -q condacolab
    import condacolab
    condacolab.install()
    # now restart the kernel

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:26
🔁 Restarting kernel...


In [13]:
# Colab only: create the conda environment described by environment.yml.
# NOTE(review): the recorded output of this cell is
# "CondaValueError: could not parse 'name: .conda' in: environment.yml",
# so this step did not succeed as recorded — environment.yml presumably needs fixing first.
if runningFromColab:
  !conda env create -f environment.yml
  # earlier attempts, kept for reference:
  # !conda update conda -y -q
  # !source /usr/local/etc/profile.d/conda.sh
  # !conda init 
  # !conda install -n root _license -y -q
  # !source activate myenv


CondaValueError: could not parse 'name: .conda' in: environment.yml



In [None]:
# Colab only: put '/usr/local/bin/conda' at the front of Python's module search path.
# NOTE(review): sys.path entries are normally directories containing importable modules;
# presumably the intent is to expose packages installed through conda — confirm that this path achieves it.
if runningFromColab:
  import sys
  sys.path.insert(0, '/usr/local/bin/conda')

# Coursework 1 Requirements

<img src="project_media/cw1_requirements.png" width="500" />

# Imports & Global Functions/Variables

In [12]:
from pprint import pprint
from glob import glob
from time import time
import os
import pickle
import regex as re
import string
import nltk
from nltk.corpus import stopwords
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Add # merge.add
from tensorflow.keras.applications import inception_v3 # inception_v3.preprocess_input
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import preprocessing # preprocessing.image, preprocessing.sequence, preprocessing.text.Tokenizer, preprocessing.sequence.pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical


In [None]:
# Download NLTK's 'stopwords' resource; stopwords.words('english') in cleanCaptions relies on it.
nltk.download('stopwords')

In [2]:
def pklSave(contentToBeSaved, fullPath):
    """Pickle `contentToBeSaved` and write it to `fullPath`, replacing any existing file.

    Returns None.
    """
    with open(fullPath, mode='wb') as outFile:
        pickle.dump(contentToBeSaved, outFile)

def pklLoad(fullPath):
    """Read the pickle file at `fullPath` and return the object stored in it.

    Only use this on pickle files you created yourself: unpickling can execute arbitrary code.
    """
    with open(fullPath, mode='rb') as inFile:
        return pickle.load(inFile)

def pklForceLoad(path, dtype = 'dict'):
    """Load the pickle at `path`; on any failure create it with an empty container instead.

    When loading raises (for whatever reason), an empty list (dtype == 'list') or an
    empty dict (any other dtype) is written to `path` and returned.
    Note that this also overwrites an existing file that merely failed to load.
    """
    try:
        return pklLoad(path)
    except Exception:
        emptyContent = [] if dtype == 'list' else {}
        pklSave(emptyContent, path)
        return emptyContent

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    """Join `relativePath` onto `baseDirectory` and return the normalized result."""
    combined = os.path.join(baseDirectory, relativePath)
    return os.path.normpath(combined)

In [13]:
# Collect the path of every .jpg file directly inside the images folder
datasetImgsBasePath = 'dataset/Flicker8k_Dataset/'
fullImgsPath = glob(datasetImgsBasePath + '*.jpg')
# same paths, normalized for the current operating system
fullImgsPaths = list(map(os.path.normpath, fullImgsPath))
len(fullImgsPaths)

8091

# Data Collection
The dataset is obtained from [here](https://forms.illinois.edu/sec/1713398)

In [17]:
# Read the raw captions file and peek at the captions of the first two images (5 lines each)
filename = "dataset/Flicker8k_TextFiles/Flickr8k.token.txt"
with open(filename, 'r') as f:
    doc = f.read()
lines = doc.split('\n')

print("first image's captions:")
pprint(lines[0:5])
print("\nsecond image's captions:")
pprint(lines[5:10])
print("\nand so forth...")

first image's captions:
['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of '
 'stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse '
 '.',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her '
 'playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a '
 'wooden cabin .']

second image's captions:
['1001773457_577c3a7d70.jpg#0\tA black dog and a spotted dog are fighting',
 '1001773457_577c3a7d70.jpg#1\tA black dog and a tri-colored dog playing with '
 'each other on the road .',
 '1001773457_577c3a7d70.jpg#2\tA black dog and a white dog with brown spots '
 'are staring at each other in the street .',
 '1001773457_577c3a7d70.jpg#3\tTwo dogs of different breeds looking at each '
 'other on the road .',
 '1001773457_577c3a7d70.jpg#4\tTwo dogs on pavement moving toward each other .']

and s

The captions above are for these two images:

<img src="project_media/1000268201_693b08cb0e.jpg" width="100" />

<img src="project_media/1001773457_577c3a7d70.jpg" width="150" />

# Data Cleaning

In [18]:
# getting these captions in a dictionary; where the key is the image's name (without .jpg) and the value is a list of 5 captions
imgToCaptions = dict()
for line in lines:
    # Split at the span that runs from the first '.' up to the tab separator
    # (e.g. '.jpg#0\t'), which leaves [image id, caption].
    # The pattern is a raw string so that '\.' reaches the regex engine untouched
    # instead of being an invalid string escape.
    idAndCaption = re.split(r"\..+\t", line)
    if len(idAndCaption) < 2:
        continue  # line without the '<name>.<ext>#<n>\t' prefix (e.g. an empty trailing line)
    imgId, caption = idAndCaption
    imgToCaptions.setdefault(imgId, []).append(caption)

imgToCaptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [19]:
# removing punctuation using maketrans (i.e., translation table)
# more about maketrans method: https://www.w3schools.com/python/ref_string_maketrans.asp#:~:text=The%20third%20parameter%20in%20the%20mapping%20table%20describes%20characters%20that%20you%20want%20to%20remove%20from%20the%20string%3A

# to do: ASK Dr: should I remove numbers/stopwords for image captioning task?

def cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1):
    """Clean every caption in `imgToCaptions` in place.

    Each caption is split on single spaces, lower-cased, stripped of punctuation,
    filtered against a stopword collection, and finally reduced to purely
    alphabetic tokens before being joined back into one space-separated string.

    Parameters:
        imgToCaptions: dict whose values are lists of caption strings; the lists
            are modified in place.
        levelOfStopwordsPresence: 1 removes only 'a', 'an' and 'the'; 2 or more
            removes NLTK's English stopwords; any other value removes no stopwords.

    Returns None.
    """
    # third argument: removes any character in this list: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    table = str.maketrans('', '', string.punctuation)

    # The stopword collection does not depend on the caption, so it is built once
    # here instead of once per caption.
    if levelOfStopwordsPresence == 1:
        stopwordsToRemove = {'a', 'an', 'the'}
    elif levelOfStopwordsPresence >= 2:
        stopwordsToRemove = set(stopwords.words('english'))
    else:
        stopwordsToRemove = set()

    for descList in imgToCaptions.values():
        # when this inner loop is done, all captions of one image are cleaned
        for i, desc in enumerate(descList):
            words = (word.lower().translate(table) for word in desc.split(' '))
            # isalpha() drops empty tokens and tokens containing digits
            kept = [word for word in words if word not in stopwordsToRemove and word.isalpha()]
            descList[i] = ' '.join(kept)  # store as string

# Load the cleaned captions from the cache when it exists; otherwise clean them now
# (only 'a', 'an', 'the' removed) and store the result for the next run.
cleanedCaptionsPath = 'dataset/pickles/imgToCaptionsSWKept.pickle'
if os.path.exists(cleanedCaptionsPath):
    imgToCaptions = pklLoad(cleanedCaptionsPath)
else:
    cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1)
    os.makedirs(os.path.dirname(cleanedCaptionsPath), exist_ok=True)
    pklSave(imgToCaptions, cleanedCaptionsPath)
imgToCaptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

example with stopwords removed:
<br><br>
'little girl climbing stairs playhouse',

<br>
example with only ['a', 'an', 'the'] removed:
<br><br>
'little girl climbing stairs to her playhouse',
<br><br>
Because removing all stopwords strips away context (compare the two examples above), we decided to remove only 'a', 'an' and 'the' and keep the rest of the stopwords.

In [66]:
# creating vocab of unique words (where each word occurred at least freqThreshold times)
def createVocab(freqThreshold = 10, captionsByImg = None):
    """Build the vocabulary of words that occur at least `freqThreshold` times.

    Parameters:
        freqThreshold: minimum number of occurrences a word needs to be kept.
        captionsByImg: dict mapping an image id to its list of caption strings.
            When omitted, the notebook-level `imgToCaptions` dictionary is used.

    Returns a tuple `(vocab, vocabWordFreqFinal, vocabWordFreqRemoved)`:
        vocab: set of the kept words.
        vocabWordFreqFinal: dict word -> frequency for the kept words.
        vocabWordFreqRemoved: dict word -> frequency for the discarded words.

    Prints the vocabulary size before and after the filtering.
    """
    from collections import Counter

    if captionsByImg is None:
        captionsByImg = imgToCaptions

    # One pass with the same whitespace tokenisation for both the unique-word set
    # and the counts, so every counted word is guaranteed to be a known word.
    wordFreq = Counter(
        word
        for descs in captionsByImg.values()
        for desc in descs
        for word in desc.split()
    )
    print(f'Original vocabulary (i.e., unique words) size: {len(wordFreq)}')

    # keeping words that appear at least freqThreshold number of times
    # ASK DR: should i do that? or retain all words? should this be considered a hyperparameter?
    vocabWordFreqFinal = {word: freq for word, freq in wordFreq.items() if freq >= freqThreshold}
    vocabWordFreqRemoved = {word: freq for word, freq in wordFreq.items() if freq < freqThreshold}
    vocab = set(vocabWordFreqFinal)
    print(f'Vocabulary size after removing less frequent words (< {freqThreshold} words): {len(vocab)}')

    return vocab, vocabWordFreqFinal, vocabWordFreqRemoved

def createVocabTxtFiles(vocabWordFreqFinal, vocabWordFreqRemoved, filePrefix="vocabFreqThreshold"):
    """Write both word-frequency dictionaries to text files inside `dataset/`.

    The kept words go to `dataset/<filePrefix>Final.txt` and the removed words to
    `dataset/<filePrefix>Removed.txt`; each file holds the `str()` of the dictionary
    sorted by descending frequency.
    """
    targets = {
        f'dataset/{filePrefix}Final.txt': vocabWordFreqFinal,
        f'dataset/{filePrefix}Removed.txt': vocabWordFreqRemoved,
    }
    for txtPath, wordFreq in targets.items():
        with open(txtPath, 'w') as outFile:
            mostFrequentFirst = sorted(wordFreq.items(), key=lambda item: item[1], reverse=True)
            outFile.write(str(dict(mostFrequentFirst)))

# The threshold is stated once and reused in the output file names, so the file
# names cannot drift from the threshold that was actually applied.
vocabFreqThreshold = 5
vocab, vocabWordFreqFinal, vocabWordFreqRemoved = createVocab(freqThreshold=vocabFreqThreshold)
createVocabTxtFiles(vocabWordFreqFinal, vocabWordFreqRemoved, filePrefix=f"vocabFreqThreshold{vocabFreqThreshold}")

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


In [37]:
#  function to get filenames of images from a text file (without the extension)
def getImgsIdsList(txtPath):
    """Return the image ids listed in the text file at `txtPath`, one per line.

    An id is the part of a line before its first '.', i.e. the file name without
    its extension. Lines that yield an empty id are skipped.
    """
    with open(txtPath, 'r') as txtFile:
        rows = txtFile.read().split('\n')
    stems = (row.split('.')[0] for row in rows)
    return [stem for stem in stems if stem != '']

# Read the image ids of the three splits from their listing files:
# trainImages -> training, devImages -> validation, testImages -> testing.
trainImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.trainImages.txt')
valImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.devImages.txt')
testImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.testImages.txt')
# report how many image ids each split contains
print(f'Train Dataset: {len(trainImgsIds)}')
print(f'Validation Dataset: {len(valImgsIds)}')
print(f'Test Dataset: {len(testImgsIds)}')

Train Dataset: 6000
Validation Dataset: 1000
Test Dataset: 1000


In [48]:
def addSeqTokens(imgsIds, captionsByImg):
    """Return {image id: captions}, each caption wrapped in 'startseq ' ... ' endseq'.

    Parameters:
        imgsIds: iterable of image ids to include.
        captionsByImg: dict mapping an image id to its list of caption strings.

    Raises KeyError when an id in `imgsIds` has no entry in `captionsByImg`.
    """
    return {
        imgId: ['startseq ' + desc + ' endseq' for desc in captionsByImg[imgId]]
        for imgId in imgsIds
    }

# one captions dictionary per split, built by the same helper instead of three copied loops
trainImgToCaptions = addSeqTokens(trainImgsIds, imgToCaptions)
valImgToCaptions = addSeqTokens(valImgsIds, imgToCaptions)
testImgToCaptions = addSeqTokens(testImgsIds, imgToCaptions)
print(f'images in training set: {len(trainImgToCaptions)}\n')
print(f'images in validation set: {len(valImgToCaptions)}\n')
print(f'images in testing set: {len(testImgToCaptions)}\n')
print('example from training set:')
pprint(trainImgToCaptions['2513260012_03d33305cf'])

images in training set: 6000

images in validation set: 1000

images in testing set: 1000

example from training set:
['startseq black dog is running after white dog in snow endseq',
 'startseq black dog chasing brown dog through snow endseq',
 'startseq two dogs chase each other across snowy ground endseq',
 'startseq two dogs play together in snow endseq',
 'startseq two dogs running through low lying body of water endseq']


# Preparing File Paths

In [50]:
def _imgsPathsFor(imgToCaps):
    """Return the .jpg path inside the images folder for every image id (key) of `imgToCaps`."""
    return [joinPaths(datasetImgsBasePath, imgId) + '.jpg' for imgId in imgToCaps]

trainImgsPaths = _imgsPathsFor(trainImgToCaptions)
valImgsPaths = _imgsPathsFor(valImgToCaptions)
testImgsPaths = _imgsPathsFor(testImgToCaptions)
len(trainImgsPaths), len(valImgsPaths), len(testImgsPaths)

(6000, 1000, 1000)

# Data Pre-Processing

## Pre-Processing Images

In [4]:
# getting the feature vector of each image using the InceptionV3 CNN model created by Google Research
model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
# display the output tensor of the last layer (the recorded output shows the 'predictions' layer with shape (None, 1000))
model.layers[-1].output

<KerasTensor: shape=(None, 1000) dtype=float32 (created by layer 'predictions')>

In [38]:
# Build a second model that shares the InceptionV3 input but stops at the second-to-last layer
modelForFeatureExtraction = Model(model.input, model.layers[-2].output) # removing the last layer (output softmax layer)
# display its output tensor (the recorded output shows the 'avg_pool' layer with shape (None, 2048))
modelForFeatureExtraction.layers[-1].output

<KerasTensor: shape=(None, 2048) dtype=float32 (created by layer 'avg_pool')>

In [39]:
# function to preprocess the input image
def preprocess(imgPath):
    """Load the image at `imgPath` and return it as an InceptionV3-ready batch of one.

    The image is loaded at 299x299, converted to an array, given a leading batch
    axis and passed through `inception_v3.preprocess_input`.
    """
    loadedImg = preprocessing.image.load_img(imgPath, target_size=(299, 299)) # InceptionV3 expects 299x299 inputs
    pixels = preprocessing.image.img_to_array(loadedImg) # PIL image -> 3-dimensional numpy array
    batch = np.expand_dims(pixels, axis=0) # (299, 299, 3) -> (1, 299, 299, 3)
    # takes (batch_size, height, width, channels) and returns the same dimensions after its preprocessing operations
    return inception_v3.preprocess_input(batch)

# function to encode a given image (from its path) into a vector of size (2048, )
def encode(imgPath):
    """Return the feature vector that `modelForFeatureExtraction` produces for one image.

    Parameters:
        imgPath: path of the image file to encode.

    Returns a 1-D array obtained by flattening the model's single-row prediction,
    i.e. (1, n) becomes (n, ).
    """
    # keep the path and the preprocessed batch under separate names
    imgBatch = preprocess(imgPath)
    # verbose=0: this runs once per image, so per-call progress output would flood the cell
    featureVec = modelForFeatureExtraction.predict(imgBatch, verbose=0)
    return np.reshape(featureVec, featureVec.shape[1]) # reshape from (1, 2048) to (2048, )

In [53]:
# Call the function to encode all the train images (dictionary where an image id --> feature vector of length 2048)
# This will take a while on CPU - Execute this only once (took around 13 minutes on my high-end laptop)
def encodeImgToFeatures(imgsPaths):
    """Encode every image in `imgsPaths` and return {image file name: feature vector}.

    The key is the file name including its extension (e.g. 'xyz.jpg'). It is taken
    with os.path.basename rather than by slicing off len(datasetImgsBasePath)
    characters, so it stays correct even when a path was normalized to a different
    length than the base path string.
    """
    imgToFeatures = dict()
    for imgPath in imgsPaths:
        imgToFeatures[os.path.basename(imgPath)] = encode(imgPath)
    return imgToFeatures

def loadOrEncodeFeatures(picklePath, imgsPaths):
    """Return the cached features stored at `picklePath`, computing and caching them when missing.

    When the pickle file does not exist yet, every image in `imgsPaths` is encoded
    with encodeImgToFeatures and the result is saved to `picklePath` for later runs.
    """
    if os.path.exists(picklePath):
        return pklLoad(picklePath)
    imgToFeatures = encodeImgToFeatures(imgsPaths)
    os.makedirs(os.path.dirname(picklePath), exist_ok=True)
    pklSave(imgToFeatures, picklePath)
    return imgToFeatures

# encoding is slow, so each split is computed only when its pickle is missing
trainImgToFeatures = loadOrEncodeFeatures('dataset/pickles/trainImgToFeatures.pickle', trainImgsPaths)
valImgToFeatures = loadOrEncodeFeatures('dataset/pickles/valImgToFeatures.pickle', valImgsPaths)
testImgToFeatures = loadOrEncodeFeatures('dataset/pickles/testImgToFeatures.pickle', testImgsPaths)
len(trainImgToFeatures), len(valImgToFeatures), len(testImgToFeatures), trainImgToFeatures['2513260012_03d33305cf.jpg'].shape

(6000, 1000, 1000, (2048,))

## Pre-Processing Captions

In [70]:
# creating two dictionaries: word to index, and index to word

def mapIdxAndWord(vocab):
    """Create the two lookup dictionaries between words and integer indices.

    Parameters:
        vocab: iterable of words (the notebook passes a set).

    Returns `(idxToWord, wordToIdx)`. Indices start at 1 and follow the sorted order
    of the words. Sorting matters: a set of strings has no stable iteration order
    across Python sessions, so without it the same word could receive a different
    index after every kernel restart.
    """
    idxToWord = dict(enumerate(sorted(vocab), start=1))
    wordToIdx = {word: idx for idx, word in idxToWord.items()}
    return idxToWord, wordToIdx

# Only the vocabulary itself is needed here. Indexing the returned tuple avoids
# assigning to `_`, which IPython otherwise uses for the last cell output.
# NOTE(review): createVocab reads imgToCaptions, whose captions do not carry the
# 'startseq'/'endseq' markers added to the train/val/test dictionaries, so these two
# tokens get no index here — confirm that later cells account for them.
vocab = createVocab(freqThreshold=5)[0]
idxToWord, wordToIdx = mapIdxAndWord(vocab)
vocabSize = len(idxToWord) + 1 # one for appended 0's; explained later
vocabSize

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


2946

In [71]:
# getting the length of the longest caption; as we will later need to encode each word into a fixed sized vector

# convert a dictionary of clean captions to a list of captions
def toCaptionsList(ImgToCaptions):
    """Flatten the caption lists of all images into one list, in dictionary order."""
    return [caption for captions in ImgToCaptions.values() for caption in captions]

# calculate the length of the description with the most words
def maxCaptionLength(ImgToCaptions):
    """Return the word count of the longest caption in `ImgToCaptions`.

    Words are counted by splitting each caption on whitespace.
    Raises ValueError when there are no captions at all.
    """
    wordCounts = [len(caption.split()) for caption in toCaptionsList(ImgToCaptions)]
    return max(wordCounts)

# determine the maximum sequence length over the training captions only;
# these captions already include the 'startseq' and 'endseq' markers, so both are counted
maxCapLen = maxCaptionLength(trainImgToCaptions)
print(f'Description Length: {maxCapLen}')

Description Length: 32
