In [None]:
# enable auto reload of modules, so edits to the imported project modules
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Model Training

This notebook will be used to load and preprocess the data, as well as train our own captioning model from scratch. If you want to use the model directly, please refer to the `main.py` file.

This notebook was made to be modular, in such a way that you can run only the section that you want. However, some sections may require the output of the previous section to work properly.

## 1 - Loading captions

In [None]:
from src.utils.captions_utils import *

In [None]:
# load the raw caption annotations from the CSV file
df = load_raw_captions_data("./data/captions/captions.csv")

In [None]:
# generate the caption dictionary from the raw annotations
# (later cells treat each value as the collection of captions of one image)
# NOTE(review): presumably the keys are image names — confirm in generate_captions_dic
captions_dic = generate_captions_dic(df)

In [None]:
# print info about the caption dictionary
n_images = len(captions_dic)

# count the captions that are actually present instead of assuming that every
# image has as many captions as the first one
n_captions = sum(len(image_captions) for image_captions in captions_dic.values())

# number of captions attached to the first image (0 if the dictionary is empty,
# instead of failing with StopIteration)
n_captions_per_image = len(next(iter(captions_dic.values()), []))

print(f"Number of images: {n_images}")
print(f"Number of captions per image: {n_captions_per_image}")
print(f"Total number of captions: {n_captions}")

In [None]:
# clean the captions by removing any special characters and converting to lower case
# (see clean_captions for the exact rules applied)
# NOTE: the cleaned dictionary replaces `captions_dic`, so the uncleaned captions
# are no longer reachable under that name after this cell has run
captions_dic = clean_captions(captions_dic)

In [None]:
# print the captions for the first image
# (plain string literal: the message has no placeholders, so no f-prefix is needed)
print("Captions for the first image:")
for cap in next(iter(captions_dic.values())):
    print(cap)

In [None]:
# save the captions dictionary into the captions directory
# NOTE(review): later sections read "./data/captions/processed_captions.json";
# presumably that is the file written here — confirm in save_captions_dic
save_captions_dic(captions_dic, "./data/captions/")

## 2 - Image feature extraction

For this section, we will be using multiple pre-trained models to extract features from the images. The models that we will be using are:
<ul>
    <li>ResNet50</li>
    <li>VGG16</li>
    <li>InceptionV3</li>
    <li>Xception</li>
</ul>

In [None]:
from src.architectures.xception import *
from src.utils.image_utils import *
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

In [None]:
# make the TensorFlow dependency explicit: without this import the cell only
# works if `tf` happens to leak into the namespace through a wildcard import
import tensorflow as tf

# check if GPU is available and enable memory growth
gpus = tf.config.list_physical_devices("GPU")
print("GPU is", "available" if gpus else "NOT AVAILABLE")

# allow_growth asks TensorFlow to allocate GPU memory on demand instead of
# reserving it all up front
config = ConfigProto()
config.gpu_options.allow_growth = True
# keep a reference so the session configured above stays alive
session = InteractiveSession(config=config)

In [None]:
# height, width and number of channels to which the images will be resized
# NOTE: the same three values are repeated in sections 4 and 5 — keep them in sync
height = 192
width = 192
n_channels = 3

In [None]:
# load the images for the Xception model, resized to (height, width) and passed
# through the Xception preprocessing function
xception_images = load_images_from_folder_parallel("./data/images/", image_size=(height, width), preprocess_input=tf.keras.applications.xception.preprocess_input)

In [None]:
# sanity check: how many images were loaded from the folder
print(f"Number of images loaded: {len(xception_images)}")

In [None]:
# create the Xception feature extractor for the chosen input size
feature_extractor = XceptionFeatureExtractor(input_shape=(height, width, n_channels))

In [None]:
# extract the features from the images
# Do it in batches to avoid memory issues
batch_size = 256
n_total = len(xception_images)
xception_features = []
for i in range(0, n_total, batch_size):
    # clamp the upper bound so the progress message of the last (possibly
    # smaller) batch does not report an index past the end of the data
    end = min(i + batch_size, n_total)
    print(f"Extracting features from images ({i}, {end}) out of {n_total}...")
    batch = xception_images[i:end]
    features = feature_extractor.extract_features(batch)
    # NOTE(review): this collects one entry per batch, not per image — confirm
    # that save_features / load_features_as_dic expect that layout
    xception_features.append(features)

In [None]:
# save the extracted features to disk at the .npy path below
save_features(xception_features, "./data/features/xception_features.npy")

In [None]:
# free memory: the preprocessed images are not used by any later cell
# (re-running this cell without reloading the images raises NameError)
del xception_images

## 3 - Caption preprocessing

In [None]:
from src.utils.captions_utils import *

In [None]:
# read the processed captions dictionary back from disk, so this section can
# run without re-executing section 1 in the same kernel session
captions_dic = load_captions_dic("./data/captions/processed_captions.json")

In [None]:
# create the tokenizer from the captions dictionary
tokenizer = create_tokenizer(captions_dic)

In [None]:
# save the tokenizer so the training and evaluation sections can reload it
save_tokenizer(tokenizer, "./data/tokenizer/tokenizer.pkl")

## 4 - Model training

In [None]:
from src.utils.data_utils import *
from src.utils.captions_utils import *
from src.utils.image_utils import *
from src.architectures.lstm import *
from src.architectures.xception import *

In [None]:
# get the list of image names from the images folder
# (used below to key the extracted features)
image_names = load_image_names("./data/images/")

In [None]:
# load the required data
captions_dic = load_captions_dic("./data/captions/processed_captions.json")
tokenizer = load_tokenizer("./data/tokenizer/tokenizer.pkl")
# NOTE(review): the features are keyed by `image_names`; presumably this relies on
# the names being in the same order in which the features were extracted — confirm
features_dic = load_features_as_dic("./data/features/xception_features.npy", filenames=image_names)
max_length = get_max_length(captions_dic)

# height and width and channels to which the images will be resized
# (must match the values used for feature extraction in section 2)
height = 192
width = 192
n_channels = 3

# in this section only the extractor's `output_shape` is read (when creating the decoder)
feature_extractor = XceptionFeatureExtractor(input_shape=(height, width, n_channels))

In [None]:
# create the data splitter; the actual train/validation split happens in the next cell
data_splitter = DataSplitter(
    captions_dic=captions_dic,
    features_dic=features_dic,
    tokenizer=tokenizer,
    batch_size=64
)

In [None]:
# split the data into train and validation sets
# (val_split=0.1 presumably holds out 10% of the data for validation — confirm in DataSplitter)
train_data, val_data = data_splitter.split_data(val_split=0.1)

In [None]:
# create the model
# the decoder input is sized from the feature extractor's output shape
# NOTE(review): the +1 on the vocabulary size presumably reserves index 0 for padding — confirm
model = Decoder(
    input_shape=feature_extractor.output_shape,
    vocab_size=len(tokenizer.word_index) + 1,
    max_length=max_length
)

In [None]:
# train the model on the training split, validating on the validation split
# NOTE(review): `train_generator` / `val_generator` are keyword names of the project's
# Decoder.fit — presumably a wrapper around the Keras training loop; confirm
model.fit(
    train_generator=train_data,
    val_generator=val_data,
    epochs=10,
    verbose=1
)

In [None]:
# save the trained model; section 5 reloads it from this same path
model.save("./models/lstm_model1.h5")

## 5 - Model evaluation

In [1]:
from src.utils.data_utils import *
from src.utils.captions_utils import *
from src.utils.image_utils import *
from src.architectures.lstm import *
from src.architectures.xception import *
import tensorflow as tf

In [2]:
# create the xception feature extractor
# (same input size as the one used for feature extraction and training)
height = 192
width = 192
n_channels = 3
feature_extractor = XceptionFeatureExtractor(input_shape=(height, width, n_channels))

In [3]:
# load the LSTM model saved at the end of section 4
model = tf.keras.models.load_model("./models/lstm_model1.h5")

In [4]:
# combine the feature extractor and the LSTM model
# Use the functional API to create the model
# NOTE: this cell rebinds `model` at the end, so it is not idempotent — re-run the
# "load the LSTM model" cell before running this one again
image_input = feature_extractor.model.input
# second input of the loaded model (the sequence input)
sequence_input = model.input[1]

# feed the raw images through the feature extractor
image_features = feature_extractor.model(image_input)

# the loaded model takes [image features, sequence] as its inputs
caption_output = model([image_features, sequence_input])

# create the model
# end-to-end model: (image, sequence) -> output of the loaded LSTM model
model = tf.keras.models.Model(inputs=[image_input, sequence_input], outputs=caption_output)

In [5]:
# read the test images, resized and preprocessed the same way as the training images
test_images = load_images_from_folder("./data/test/", image_size=(height, width), preprocess_input=tf.keras.applications.xception.preprocess_input)

100%|██████████| 3/3 [00:00<00:00, 122.03it/s]


In [7]:
# numpy is imported explicitly: without this the cell only works if `np`
# happens to leak into the namespace through a wildcard import
import numpy as np

# run the model on the test images
# read the sequence length from the model's second input instead of hard-coding it,
# so it follows the max_length the decoder was trained with; fall back to the
# previously hard-coded value if the model does not fix that dimension
DEFAULT_SEQ_LENGTH = 78
seq_length = model.input[1].shape[1] or DEFAULT_SEQ_LENGTH

# the sequence input is fed as all zeros, one row per test image
captions = model.predict([test_images, np.zeros((len(test_images), seq_length))])



In [8]:
# convert the captions to text from scratch
# load the tokenizer saved in section 3 and use it to turn the model output into text
# NOTE: `captions` is rebound from the raw predictions to the converted result, so
# re-run the prediction cell before running this one again
tokenizer = load_tokenizer("./data/tokenizer/tokenizer.pkl")
captions = convert_captions_to_text(captions, tokenizer)

Loading tokenizer...





In [9]:
# print the captions (the converted result from the previous cell)
print(captions)


