# Import Libraries

In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm import tqdm
from os import listdir
from keras import Input
from os.path import isfile, join
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.layers import Add, LSTM, Embedding, Dense,Dropout
from tensorflow.keras.preprocessing.image import load_img, img_to_array

2024-12-25 19:59:51.395456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735131591.408914   23552 cuda_dnn.cc:8498] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735131591.412598   23552 cuda_blas.cc:1410] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-25 19:59:51.426116: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Configure

In [2]:
input_train_images = '../data/processed/images/train'
input_val_images = '../data/processed/images/val'
input_test_images = '../data/processed/images/test'
input_captions_file = '../data/processed/captions.txt'
input_vocab_file = '../data/processed/vocab.txt'
input_glove_file= '../data/glove.6B.200d.txt'

embedding_dim = 200

# Utils

In [3]:
# Load caption to dictionary
def load_captions(caption_path):
    captions = {}
    with open(caption_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.split(' ')
            image_name = parts[0]
            caption = ' '.join(parts[1:]
            if image_name in captions:
                captions[image_name].append(caption)
            else:
                captions[image_name] = [caption]
    return captions

In [4]:
# Load all images from a directory
def load_images_and_descriptions(image_dir, captions):
    images = {}
    descriptions = {}
    for f in tqdm(listdir(image_dir)):
        if isfile(join(image_dir, f)):
            img = load_img(join(image_dir, f), target_size=(299, 299))
            img = img_to_array(img)
            img = np.expand_dims(img, axis=0)
            img_name = f.split('.')[0]
            images[img_name] = [img]
            descriptions[img_name] = captions[img_name]
    return images, descriptions

In [5]:
# Load vocabulary
def load_vocab(vocab_path):
    vocab = []
    with open(vocab_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            vocab.append(line.strip())
    return vocab

In [6]:
# Image embedding to vector (2048, )
def encode(image, model):
    fea_vec = model.predict(image, verbose=False) # Get the encoding vector for the image
    fea_vec = np.reshape(fea_vec, fea_vec.shape[1]) # reshape from (1, 2048) to (2048, )
    return fea_vec

In [7]:
def encode_images(images, model, desc):
    batch_size=100
    encoding = {}
    image_names = list(images.keys())
    image_batches = [
        image_names[i:i + batch_size] for i in range(0, len(image_names), batch_size)
    ]
    
    for batch in tqdm(image_batches, desc=desc):
        # Stack the batch of images and ensure proper shape
        batch_imgs = np.array([np.squeeze(images[img_name]) for img_name in batch])  # Remove extra dimensions
        
        # Predict feature vectors for the batch
        batch_features = model.predict(batch_imgs, verbose=0)
        
        # Store results in the encoding dictionary
        for img_name, fea_vec in zip(batch, batch_features):
            encoding[img_name] = fea_vec
    
    return encoding

In [8]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [9]:
# Calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [10]:
def data_generator(descriptions, photos, wordtoidx, vocab_size, max_length, num_photos_per_batch):
    X1, X2, y = list(), list(), list()
    n = 0
    while True:
        for key, desc_list in descriptions.items():
            photo = photos[key]
            for desc in desc_list:
                seq = [wordtoidx[word] for word in desc.split(' ') if word in wordtoidx]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            n += 1
            if n == num_photos_per_batch:
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = list(), list(), list()
                n = 0

In [11]:
def create_dataset(generator, descriptions, photos, wordtoidx, vocab_size, max_length, num_photos_per_batch):
    def wrapped_generator():
        return generator(descriptions, photos, wordtoidx, vocab_size, max_length, num_photos_per_batch)
    
    # Define the signature of the output
    output_signature = (
        (
            tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),  # Photo features
            tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),  # Input sequences
        ),
        tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32),  # One-hot encoded output
    )
    
    # Create the dataset
    return tf.data.Dataset.from_generator(
        wrapped_generator,
        output_signature=output_signature
    )

In [12]:
def greedySearch(photo, model, wordtoidx, idxtoword, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idxtoword[yhat]
        in_text += ' ' + word
        if word == 'endseq':
            break
    final = in_text.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final

# Main

In [13]:
# Load input data
captions = load_captions(input_captions_file)
vocab = load_vocab(input_vocab_file)
train_img, train_descriptions = load_images_and_descriptions(input_train_images, captions)
val_img, val_descriptions = load_images_and_descriptions(input_val_images, captions)
test_img, test_descriptions = load_images_and_descriptions(input_test_images, captions)

  0%|          | 0/6000 [00:00<?, ?it/s]

100%|██████████| 6000/6000 [00:05<00:00, 1081.23it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1153.45it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1160.56it/s]


In [14]:
# Load the inception v3 model
model = InceptionV3(weights='imagenet')

# Create a new model, by removing the last layer (output layer) from the inception v3
model_new = Model(model.input, model.layers[-2].output)

I0000 00:00:1735131601.061790   23552 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2179 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0                                                   , compute capability: 8.6


In [15]:
# Encode all images
encoded_train = encode_images(train_img, model_new, "Encoding train images")
encoded_val = encode_images(val_img, model_new, "Encoding val images")
encoded_test = encode_images(test_img, model_new, "Encoding test images")

I0000 00:00:1735131604.286631   23775 service.cc:152] XLA service 0x7f79a0255e40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735131604.286654   23775 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-12-25 20:00:04.350658: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1735131604.824272   23775 cuda_dnn.cc:529] Loaded cuDNN version 90600
I0000 00:00:1735131619.803477   23775 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
Encoding train images: 100%|██████████| 60/60 [01:03<00:00,  1.05s/it]
Encoding val images: 100%|██████████| 10/10 [00:06<00:00,  1.54it/s]
Encoding test images: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]


In [16]:
idxtoword = {}
wordtoidx = {}
for i, word in enumerate(vocab):
    idxtoword[i] = word
    wordtoidx[word] = i

In [17]:
vocab_size = len(vocab) + 1 # one for padding

In [18]:
# Load GloVe model
embeddings_index = {}
with open(input_glove_file, encoding="utf-8") as f:
    embeddings_index = {
        line.split(maxsplit=1)[0]: np.fromstring(line.split(maxsplit=1)[1], sep=" ")
        for line in tqdm(f, desc="Loading GloVe", unit=" lines", total=400000)
    }

Loading GloVe: 100%|██████████| 400000/400000 [00:08<00:00, 45641.29 lines/s]


In [19]:
embeddings_index['the']

array([-7.1549e-02,  9.3459e-02,  2.3738e-02, -9.0339e-02,  5.6123e-02,
        3.2547e-01, -3.9796e-01, -9.2139e-02,  6.1181e-02, -1.8950e-01,
        1.3061e-01,  1.4349e-01,  1.1479e-02,  3.8158e-01,  5.4030e-01,
       -1.4088e-01,  2.4315e-01,  2.3036e-01, -5.5339e-01,  4.8154e-02,
        4.5662e-01,  3.2338e+00,  2.0199e-02,  4.9019e-02, -1.4132e-02,
        7.6017e-02, -1.1527e-01,  2.0060e-01, -7.7657e-02,  2.4328e-01,
        1.6368e-01, -3.4118e-01, -6.6070e-02,  1.0152e-01,  3.8232e-02,
       -1.7668e-01, -8.8153e-01, -3.3895e-01, -3.5481e-02, -5.5095e-01,
       -1.6899e-02, -4.3982e-01,  3.9004e-02,  4.0447e-01, -2.5880e-01,
        6.4594e-01,  2.6641e-01,  2.8009e-01, -2.4625e-02,  6.3302e-01,
       -3.1700e-01,  1.0271e-01,  3.0886e-01,  9.7792e-02, -3.8227e-01,
        8.6552e-02,  4.7075e-02,  2.3511e-01, -3.2127e-01, -2.8538e-01,
        1.6670e-01, -4.9707e-03, -6.2714e-01, -2.4904e-01,  2.9713e-01,
        1.4379e-01, -1.2325e-01, -5.8178e-02, -1.0290e-03, -8.21

In [20]:
# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoidx.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [21]:
embedding_matrix.shape

(1950, 200)

In [22]:
# Create model
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
inputs2 = Input(shape=(max_length(train_descriptions),))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [23]:
model.summary()

In [24]:
# The second layer of the model is the embedding layer, which is the first layer of the decoder. We will set the weights of this layer to the embedding matrix.
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
model.optimizer.lr = 0.0001
epochs = 10
number_pics_per_bath = 3
steps = len(train_descriptions)//number_pics_per_bath

In [28]:
# Tạo dataset
train_dataset = create_dataset(
    data_generator,
    train_descriptions,
    encoded_train,
    wordtoidx,
    vocab_size,
    max_length(train_descriptions),
    number_pics_per_bath,
)

# Huấn luyện mô hình
model.fit(train_dataset, epochs=epochs, steps_per_epoch=steps, verbose=1)


Epoch 1/10

























[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:24:34[0m 9s/step - loss: 27.0747
























[1m   2/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:02:52[0m 7s/step - loss: 24.2426

























[1m   3/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:02:54[0m 7s/step - loss: 22.0617























[1m   4/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:05:01[0m 8s/step - loss: 20.5434























[1m   5/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:05:10[0m 8s/step - loss: 19.3087























[1m   6/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:03:31[0m 7s/step - loss: 18.2889























[1m   7/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:03:07[0m 7s/step - loss: 17.4388























[1m   8/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:02:56[0m 7s/step - loss: 16.7200






















[1m   9/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:01:43[0m 7s/step - loss: 16.1099
























[1m  10/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:01:00[0m 7s/step - loss: 15.5777






















[1m  11/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:00:22[0m 7s/step - loss: 15.1102



























[1m  12/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:01:39[0m 7s/step - loss: 14.6911
























[1m  14/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:52:06[0m 7s/step - loss: 13.9773






















[1m  15/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:52:26[0m 7s/step - loss: 13.6701


















[1m  16/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:51:56[0m 7s/step - loss: 13.3900






















[1m  18/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:45:22[0m 6s/step - loss: 12.8961





















[1m  19/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:47:22[0m 7s/step - loss: 12.6762























[1m  20/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:48:12[0m 7s/step - loss: 12.4716





















[1m  21/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:48:15[0m 7s/step - loss: 12.2810























KeyboardInterrupt: 

In [None]:
z=5
pic = list(encoded_test.keys())[z]
image = encoded_test[pic].reshape((1,2048))
x=plt.imread(images+pic)
plt.imshow(x)
plt.show()
print(greedySearch(image))