In [1]:
import tensorflow as tf
import os
import json
import pandas as pd
import re
import numpy as np
import time
import matplotlib.pyplot as plt
import collections
import random
import requests
from math import sqrt
from PIL import Image
from tqdm.auto import tqdm
import pickle

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import load_model




  from .autonotebook import tqdm as notebook_tqdm


### Image Feature

In [20]:
# load vgg16 model
model = VGG16()
# restructure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

In [10]:
# extract features from image
features = {}
directory = 'F:\project1\\flickr\\flickr30k_images'

for img_name in tqdm(os.listdir(directory)):
    # load the image from file
    img_path = directory + '/' + img_name
    image = load_img(img_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID
    image_id = img_name.split('.')[0]
    # store feature
    features[image_id] = feature

  0%|          | 0/31783 [00:00<?, ?it/s]

100%|██████████| 31783/31783 [1:06:32<00:00,  7.96it/s]


In [14]:
# save features
with open('features.pkl', 'wb') as f:
    pickle.dump(features, f)

In [15]:
# load features
# with open('features.pkl', 'rb') as f:
#     features = pickle.load(f)

### Preprocessing

In [4]:
with open('F:/project1/flickr/captions3.txt', 'r', encoding="utf-8") as f:
    next(f)
    captions_doc = f.read()

In [5]:
# create mapping of image to captions
mapping = {}
# process lines
for line in tqdm(captions_doc.split('\n')):
    # split the line by comma(,)
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # convert caption list to string
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    # store the caption
    mapping[image_id].append(caption)


#len(mapping) == 31783

100%|██████████| 158916/158916 [00:00<00:00, 586688.95it/s] 


In [6]:
mapping['1000092795']

['Two young guys with shaggy hair look at their hands while hanging out in the yard .',
 'Two young  White males are outside near many bushes .',
 'Two men in green shirts are standing in a yard .',
 'A man in a blue shirt standing in a garden .',
 'Two friends enjoy time spent together .']

In [7]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # preprocessing steps
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc., 
            caption = caption.replace('[^A-Za-z]', '')
            # delete additional spaces
            caption = caption.replace('\s+', ' ')
            # add start and end tags to the caption
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [8]:
clean(mapping)

In [9]:
mapping['1000092795']

['startseq two young guys with shaggy hair look at their hands while hanging out in the yard endseq',
 'startseq two young white males are outside near many bushes endseq',
 'startseq two men in green shirts are standing in yard endseq',
 'startseq man in blue shirt standing in garden endseq',
 'startseq two friends enjoy time spent together endseq']

In [10]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

# len(all_captions) == 158915

In [11]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)

# max_length = 74
#vocab_size == 18319

### Data Generator

In [78]:
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.85)
train = image_ids[:split]
test = image_ids[split:]

In [79]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield {"image": X1, "text": X2}, y
                X1, X2, y = list(), list(), list()
                n = 0

### Generate caption Model

In [13]:
### encoder model
## image feature layers

# image size input (224x224)
# VGG16 model to create feature
inputs1 = Input(shape=(4096,), name="image")  # output for vgg16 is 4096 features
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

## text feature layers
inputs2 = Input(shape=(max_length,), name="text")          # max_length = 74 - max length of a comment in dataset
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)  # vocab_size == 18319
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

### decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')





In [82]:
# train the model
epochs = 10
batch_size = 32
steps = len(train) // batch_size

for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)




In [83]:
# Save model
#model.save('vgg_lstm_model.h5')

  saving_api.save_model(


### Test Model

In [25]:
# Load model
loaded_model = load_model("vgg_lstm_model.h5")

# VGG16 model
vgg_model = VGG16()
# restructure the model
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

In [19]:
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
      
    return in_text

In [27]:
test_image_path = 'test3.jpg'
#plt.imshow(Image.open(test_image_path))


def predict_caption_mass(test_image_path):
    # load image
    test_image = load_img(test_image_path, target_size=(224, 224))
    # convert image pixels to numpy array
    test_image = img_to_array(test_image)
    # reshape data for model
    test_image = test_image.reshape((1, test_image.shape[0], test_image.shape[1], test_image.shape[2]))
    # preprocess image for vgg
    test_image = preprocess_input(test_image)
    # extract features
    test_feature = vgg_model.predict(test_image, verbose=0)

    # predict from the trained model
    return predict_caption(loaded_model, test_feature, tokenizer, max_length)
    

print(predict_caption_mass(test_image_path))

startseq bird flies through the air endseq


In [19]:
import os

def generate_training_captions(image_folder, caption_folder):
    # Lấy danh sách tất cả các tệp trong thư mục chứa ảnh
    image_files = os.listdir(image_folder)
    # Loop qua từng ảnh
    for image_file in image_files:
        # Xác định đường dẫn đầy đủ của ảnh và tệp caption tương ứng
        image_path = os.path.join(image_folder, image_file)
        caption_path = os.path.join(caption_folder, image_file.split('.')[0] + '.txt')

        # Kiểm tra nếu tệp caption đã tồn tại, nếu có thì bỏ qua
        if os.path.exists(caption_path):
            print(f"Caption for {image_file} already exists. Skipping...")
            continue

        # Thực hiện dự đoán caption cho ảnh
        caption = predict_caption_mass(image_path)

        # Ghi kết quả dự đoán vào tệp caption tương ứng
        with open(caption_path, 'w') as f:
            f.write(caption)
        
        print(f"Caption for {image_file} generated and saved.")

# Sử dụng hàm generate_captions để thực hiện công việc
image_folder = 'F:\project1\subj01\\training_split\\training_images'
caption_folder = 'F:\project1\subj01\\training_split\\training_caption'
generate_training_captions(image_folder, caption_folder)


Caption for train-0001_nsd-00013.png generated and saved.
Caption for train-0002_nsd-00027.png generated and saved.
Caption for train-0003_nsd-00071.png generated and saved.
Caption for train-0004_nsd-00085.png generated and saved.
Caption for train-0005_nsd-00088.png generated and saved.
Caption for train-0006_nsd-00093.png generated and saved.
Caption for train-0007_nsd-00095.png generated and saved.
Caption for train-0008_nsd-00099.png generated and saved.
Caption for train-0009_nsd-00103.png generated and saved.
Caption for train-0010_nsd-00110.png generated and saved.
Caption for train-0011_nsd-00113.png generated and saved.
Caption for train-0012_nsd-00119.png generated and saved.
Caption for train-0013_nsd-00140.png generated and saved.
Caption for train-0014_nsd-00144.png generated and saved.
Caption for train-0015_nsd-00147.png generated and saved.
Caption for train-0016_nsd-00154.png generated and saved.
Caption for train-0017_nsd-00155.png generated and saved.
Caption for tr

In [2]:
# Load model
model = load_model("vgg_lstm_model.h5")
# model là mô hình bạn muốn kiểm tra
input_shape = model.layers[0].input_shape
print("Input shape:", input_shape)
input_dtype = model.layers[0].dtype
print("Input data type:", input_dtype)


Input shape: [(None, 74)]
Input data type: float32
