In [1]:
# enable auto reload of modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Model Training

This notebook will be used to load and preprocess the data, as well as train our own captioning model from scratch. If you want to use the model directly, please refer to the `main.py` file.

This notebook is modular: you can run only the sections you need. However, some sections depend on the output of earlier sections, so run those first.

## 1 - Loading captions

In [29]:
from src.utils.captions_utils import *

In [15]:
# read the raw caption annotations from the CSV file
captions_csv_path = "./data/captions.csv"
df = load_raw_captions_data(captions_csv_path)

In [16]:
# generate the caption dictionary from the loaded annotation data
# (later cells use len() of this dictionary as the image count and iterate over
# each value as a collection of captions — presumably one entry per image;
# TODO confirm against generate_captions_dic)
captions_dic = generate_captions_dic(df)

In [17]:
# print info about the caption dictionary
n_images = len(captions_dic)
# the per-image count is read from the first entry only; the default value keeps
# this cell from raising StopIteration when the dictionary is empty
n_captions_per_image = len(next(iter(captions_dic.values()), ()))
# count the captions directly instead of multiplying, so the total stays
# correct even if some image has a different number of captions
n_captions = sum(len(captions) for captions in captions_dic.values())

print(f"Number of images: {n_images}")
print(f"Number of captions per image: {n_captions_per_image}")
print(f"Total number of captions: {n_captions}")

Number of images: 31783
Number of captions per image: 5
Total number of captions: 158915


In [22]:
# clean the captions by removing any special characters and converting to lower case
# NOTE: this rebinds `captions_dic`, so the uncleaned captions are no longer
# reachable under that name; re-run the cell that generates the dictionary to
# get them back. Whether running this cell twice is harmless depends on
# clean_captions — TODO confirm.
captions_dic = clean_captions(captions_dic)

100%|██████████| 31783/31783 [00:00<00:00, 62322.14it/s]


In [23]:
# show every caption stored for the first image in the dictionary
first_image_captions = next(iter(captions_dic.values()))
print("Captions for the first image:")
for caption in first_image_captions:
    print(caption)

Captions for the first image:
two young guys with shaggy hair look at their hands while hanging out in the yard
two young white males are outside near many bushes
two men in green shirts are standing in a yard
a man in a blue shirt standing in a garden
two friends enjoy time spent together


In [27]:
# build the vocabulary from the caption dictionary; it is needed later on when training the LSTM model
vocab = build_vocab(captions_dic)

# report how many entries the vocabulary contains
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size} words")

100%|██████████| 31783/31783 [00:00<00:00, 168730.30it/s]

Vocabulary size: 18288 words





In [32]:
# save the captions dictionary
# (the second argument is "./data/", presumably the output directory; no file
# name is given here — TODO confirm where save_captions_dic writes)
save_captions_dic(captions_dic, "./data/")

## 2 - Image feature extraction

For this section, we will be using multiple pre-trained models to extract features from the images. The models that we will be using are:
<ul>
    <li>ResNet50</li>
    <li>VGG16</li>
    <li>InceptionV3</li>
    <li>Xception</li>
</ul>

In [1]:
from src.encoders.xception import *
from src.utils.image_utils import *
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

In [2]:
# report whether TensorFlow can see at least one GPU device
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

# open a TF1-compat interactive session with GPU memory growth enabled
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

GPU is available


In [3]:
# target image size (height, width) and channel count used when resizing the images
height, width, n_channels = 128, 128, 3

In [5]:
# load the images for the Xception model, applying the Xception preprocessor
xception_images = load_images_from_folder(
    "./data/images/",
    image_size=(height, width),
    preprocess_input=tf.keras.applications.xception.preprocess_input,
)

100%|██████████| 31783/31783 [03:12<00:00, 165.25it/s]


In [6]:
# sanity check: report how many images were loaded
n_loaded_images = len(xception_images)
print(f"Number of images loaded: {n_loaded_images}")

Number of images loaded: 31783


In [7]:
# create the Xception feature extractor (only one extractor is built here),
# configured for inputs of shape (height, width, n_channels)
feature_extractor = XceptionFeatureExtractor(input_shape=(height, width, n_channels))

In [10]:
# extract the features from the images
# Do it in batches to avoid memory issues
batch_size = 256
n_total_images = len(xception_images)
xception_features = []
for i in range(0, n_total_images, batch_size):
    # clamp the upper bound so the progress message for the last (possibly
    # shorter) batch does not report an index past the end of the data;
    # the slice itself is unaffected because slicing already truncates
    batch_end = min(i + batch_size, n_total_images)
    print(f"Extracting features from images ({i}, {batch_end}) out of {n_total_images}...")
    batch = xception_images[i:batch_end]
    features = feature_extractor.extract_features(batch)
    # collect the per-batch results; they are stacked into one array in a later cell
    xception_features.append(features)

Extracting features from images (0, 256) out of 31783...
Extracting features from images (256, 512) out of 31783...
Extracting features from images (512, 768) out of 31783...
Extracting features from images (768, 1024) out of 31783...
Extracting features from images (1024, 1280) out of 31783...
Extracting features from images (1280, 1536) out of 31783...
Extracting features from images (1536, 1792) out of 31783...
Extracting features from images (1792, 2048) out of 31783...
Extracting features from images (2048, 2304) out of 31783...
Extracting features from images (2304, 2560) out of 31783...
Extracting features from images (2560, 2816) out of 31783...
Extracting features from images (2816, 3072) out of 31783...
Extracting features from images (3072, 3328) out of 31783...
Extracting features from images (3328, 3584) out of 31783...
Extracting features from images (3584, 3840) out of 31783...
Extracting features from images (3840, 4096) out of 31783...
Extracting features from images (

In [12]:
# stack the per-batch features into a single numpy array
# Only stack while `xception_features` is still the list of batches: this cell
# rebinds the name to the stacked array, and calling np.vstack on that array
# again would iterate over its first axis and silently produce a wrongly
# shaped result. The guard makes re-running this cell a no-op.
if isinstance(xception_features, list):
    xception_features = np.vstack(xception_features)
print(f"Features shape: {xception_features.shape}")

Features shape: (31783, 4, 4, 2048)


In [14]:
# save the extracted Xception features to disk
# (the ".pkl" extension suggests a pickle file — TODO confirm in save_features)
save_features(xception_features, "./data/features/xception_features.pkl")