# Import Libraries

In [1]:
import os
import glob
import shutil
import string

import numpy as np
from tqdm import tqdm

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input

2024-12-23 15:34:18.975018: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734942858.988416  188881 cuda_dnn.cc:8498] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734942858.991979  188881 cuda_blas.cc:1410] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 15:34:19.005926: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Configure

In [2]:
# Raw inputs: the caption annotations file and the directory holding the source images.
input_captions_file = "../data/raw/Flickr8k.token.txt"
input_images_dir = "../data/raw/Flicker8k_Dataset"

# Text files naming the images that belong to the train / validation / test splits.
input_train_images_file = '../data/raw/Flickr_8k.trainImages.txt'
input_val_images_file = '../data/raw/Flickr_8k.devImages.txt'
input_test_images_file = '../data/raw/Flickr_8k.testImages.txt'

# Outputs written by this notebook: cleaned captions, vocabulary, and processed images.
# These paths are deleted by clear_output_dirs() before being regenerated.
output_captions_file = "../data/processed/captions.txt"
output_vocab_file = "../data/processed/vocab.txt"
output_images_dir = "../data/processed/images"

# Utils

In [3]:
# Clear output directories
def clear_output_dirs():
    """Delete the processed-images directory and the generated captions/vocab files.

    The paths come from the module-level ``output_images_dir``,
    ``output_captions_file`` and ``output_vocab_file`` settings. Any path that
    does not exist is skipped.
    """
    # The image output is a directory tree, so it needs a recursive delete.
    if os.path.exists(output_images_dir):
        shutil.rmtree(output_images_dir)
    # The captions and vocabulary outputs are single files.
    for stale_file in (output_captions_file, output_vocab_file):
        if os.path.exists(stale_file):
            os.remove(stale_file)

In [4]:
# Read captions file
def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, decoded with the platform default encoding.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [5]:
# Save caption descriptions to a dictionary
# image_id : ['caption 1', 'caption 2', 'caption 3', 'caption 4', 'caption 5']
def load_descriptions(doc):
    """Group raw caption lines by image id.

    Each line is split on whitespace: the first token is the image identifier
    and the remaining tokens form the caption. Everything from the first '.'
    onwards is dropped from the identifier, so ``name.jpg#0`` becomes ``name``.

    Args:
        doc: Full text of the captions file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
        Lines that are blank, whitespace-only, or carry an id without any
        caption words are skipped.
    """
    mapping = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # Test the tokens, not the raw line length: a whitespace-only line has
        # no tokens at all, and an id with no caption has nothing to store.
        if len(tokens) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # extract filename from image id
        image_id = image_id.split('.')[0]
        # create the list on first sight of this id, then store the caption
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

In [6]:
# Preprocessing text
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    For each caption: split on whitespace, lower-case each token, strip all
    ``string.punctuation`` characters, drop tokens of one character or less
    (hanging 's' and 'a'), drop tokens that are not purely alphabetic, then
    wrap the remainder in 'startseq' ... 'endseq'.

    Args:
        descriptions: dict mapping image id -> list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            # Lower-case and de-punctuate lazily, token by token.
            normalised = (raw.lower().translate(strip_punct) for raw in caption.split())
            # Keep only multi-character, purely alphabetic tokens.
            kept = [token for token in normalised if len(token) > 1 and token.isalpha()]
            # Wrap in the sequence markers and store back as a string.
            captions[idx] = ' '.join(['startseq', *kept, 'endseq'])

In [7]:
# Get vocabulary of descriptions, keeping only words that occur at least 10 times
def get_vocab(descriptions, min_count=10):
    """Collect the words that occur often enough across all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        min_count: Minimum number of occurrences (inclusive) a word needs in
            order to be kept. Defaults to 10.

    Returns:
        set of words whose total count over every caption is >= ``min_count``.
    """
    count_vocab = dict()
    # Count all words
    for desc_list in descriptions.values():
        for desc in desc_list:
            for word in desc.split():
                count_vocab[word] = count_vocab.get(word, 0) + 1
    # Keep words that appear at least `min_count` times (threshold is inclusive)
    return {word for word, count in count_vocab.items() if count >= min_count}

In [8]:
# Save vocab to file
def save_vocab(vocab, filename):
    """Write the vocabulary to ``filename``, one word per line, in sorted order.

    Args:
        vocab: Iterable of word strings (typically the set from get_vocab).
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Set iteration order for strings can differ between interpreter runs;
    # sorting keeps the output file identical from run to run.
    data = '\n'.join(sorted(vocab))
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [9]:
# Save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write every caption to ``filename`` as ``<image id> <caption>`` lines.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the file to create or overwrite.

    A tqdm progress bar advances once per image id.
    """
    with open(filename, 'w') as out:
        for image_id, captions in tqdm(descriptions.items()):
            # One line per caption, each prefixed with its image id.
            out.writelines(f"{image_id} {caption}\n" for caption in captions)

In [10]:
# Preprocessing images for inception v3 model
def preprocess_image(img):
    """Load an image, apply the InceptionV3 ``preprocess_input``, return a PIL image.

    Args:
        img: Path of the image file to load.

    Returns:
        The image produced by ``image.array_to_img`` after resizing to 299x299
        and running ``preprocess_input``.

    NOTE(review): array_to_img rescales values to the 0-255 range by default,
    so the scaling applied by preprocess_input is presumably not preserved in
    the returned image - confirm that downstream code re-applies
    preprocess_input when it loads the saved files.
    """
    # Resize to 299x299, the size expected by the inception v3 model.
    pil_img = image.load_img(img, target_size=(299, 299))
    # PIL image -> 3-D array, then add a leading dimension for preprocess_input.
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    batch = preprocess_input(batch)
    # Drop the leading dimension again and convert back to a PIL image.
    return image.array_to_img(batch[0])

In [11]:
# Save the processed images
def save_images(images, folder_name):
    """Save every image into ``folder_name`` under its original file name.

    Args:
        images: dict mapping source image path -> object with a
            ``save(path)`` method (the PIL images from preprocess_image).
        folder_name: Destination directory. A trailing separator is optional.

    A tqdm progress bar advances once per image.
    """
    for source_path, processed in tqdm(images.items()):
        # Take the file name from the path itself rather than slicing by the
        # length of the input directory, and let os.path.join supply the
        # separator so folder_name works with or without a trailing slash.
        filename = os.path.join(folder_name, os.path.basename(source_path))
        processed.save(filename)

# Main

In [12]:
# Clean output directories
clear_output_dirs()

# Make one output sub-directory per dataset split.
# exist_ok=True keeps the cell safe to re-run without a separate existence check.
for split in ('train', 'val', 'test'):
    os.makedirs(output_images_dir + '/' + split, exist_ok=True)

In [13]:
# Read the raw captions file into one string and preview its first 300 characters
doc = load_doc(input_captions_file)
print(doc[:300])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the s


In [14]:
# Group the captions by image id and report how many distinct image ids were found
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [15]:
# Spot-check the raw captions stored for one image
descriptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [16]:
# Clean the captions. clean_descriptions rewrites the lists inside `descriptions`
# in place and returns nothing, so the same sample image is displayed again to
# show the effect.
clean_descriptions(descriptions)
descriptions['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

In [17]:
# Build the vocabulary from the cleaned captions and write it to output_vocab_file
vocab = get_vocab(descriptions)
save_vocab(vocab, output_vocab_file)

In [18]:
# Write the cleaned captions to output_captions_file, one "<image id> <caption>" line each
save_descriptions(descriptions, output_captions_file)

100%|██████████| 8092/8092 [00:00<00:00, 584130.32it/s]


In [19]:
# Collect the paths of all .jpg files directly inside the raw images directory
img = glob.glob(input_images_dir + '/*.jpg')

In [20]:
# Read the train, validation, test image names in a set
def _read_image_names(path):
    """Return the set of image file names listed one per line in ``path``."""
    # The context manager closes the file as soon as it has been read.
    with open(path, 'r') as names_file:
        return set(names_file.read().strip().split('\n'))

train_images = _read_image_names(input_train_images_file)
val_images = _read_image_names(input_val_images_file)
test_images = _read_image_names(input_test_images_file)

print('Train images: %d' % len(train_images))
print('Validation images: %d' % len(val_images))
print('Test images: %d' % len(test_images))

Train images: 6000
Validation images: 1000
Test images: 1000


In [21]:
# Sort every image path into its dataset split, based on its file name.
train_img = []
val_img = []
test_img = []
for path in img:  # img is the list of full path names of all images
    # Compute the bare file name once; the split files list names, not paths.
    name = os.path.basename(path)
    if name in test_images:  # Check if the image belongs to the test set
        test_img.append(path)
    elif name in val_images:
        val_img.append(path)
    elif name in train_images:
        train_img.append(path)

print('Train images: %d' % len(train_img))
print('Validation images: %d' % len(val_img))
print('Test images: %d' % len(test_img))

Train images: 6000
Validation images: 1000
Test images: 1000


In [22]:
# Preprocess the images. Each list of paths is replaced by a dict mapping
# source path -> processed image, which is the shape save_images() iterates over.
# NOTE(review): every processed image is held in memory until the next cell
# saves it - with several thousand images this may be large; confirm it fits.
train_img = {k: preprocess_image(k) for k in train_img}
val_img = {k: preprocess_image(k) for k in val_img}
test_img = {k: preprocess_image(k) for k in test_img}

In [23]:
# Save the images, one output sub-directory per split. The trailing '/' matters:
# save_images() builds each file name by appending directly to this string.
save_images(train_img, output_images_dir + '/train/')
save_images(val_img, output_images_dir + '/val/')
save_images(test_img, output_images_dir + '/test/')

100%|██████████| 6000/6000 [00:02<00:00, 2545.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2549.15it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2537.48it/s]
