In [1]:
# Cell 1: Import thư viện
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import matplotlib.pyplot as plt

In [2]:
import pickle

with open('features.pkl', 'rb') as f:
    all_features = pickle.load(f)

# Loại bỏ đuôi .jpg để key là ID
features = {k.split('.')[0]: v for k, v in all_features.items()}

print(f"Số lượng features: {len(features)}")
print("5 key đầu tiên trong features:", list(features.keys())[:5])
print("Shape của feature đầu tiên:", list(features.values())[0].shape)

Số lượng features: 8091
5 key đầu tiên trong features: ['378453580_21d688748e', '379006645_b9a2886b51', '380034515_4fbdfa6b26', '380041023_0dfd712ef1', '380515798_c2abbf46b0']
Shape của feature đầu tiên: (1, 2048)


In [3]:
def load_descriptions(doc):
    descriptions = {}
    lines = doc.strip().split('\n')
    for line in lines:
        line = line.strip()
        if not line or line.lower().startswith("image"):
            continue
        try:
            image_id, image_desc = line.split(maxsplit=1)
            image_id = image_id.split('.')[0]
            caption = f"startseq {image_desc} endseq"
            descriptions.setdefault(image_id, []).append(caption)
        except ValueError:
            continue
    return descriptions

with open('captions.txt', 'r') as f:
    doc = f.read()
    descriptions = load_descriptions(doc)

print(f"Số lượng descriptions: {len(descriptions)}")
print("5 key đầu tiên trong descriptions:", list(descriptions.keys())[:5])
print("5 caption đầu tiên của ảnh đầu tiên:", list(descriptions.values())[0][:5])

Số lượng descriptions: 8091
5 key đầu tiên trong descriptions: ['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615']
5 caption đầu tiên của ảnh đầu tiên: ['startseq child in a pink dress is climbing up a set of stairs in an entry way . endseq', 'startseq girl going into a wooden building . endseq', 'startseq little girl climbing into a wooden playhouse . endseq', 'startseq little girl climbing the stairs to her playhouse . endseq', 'startseq little girl in a pink dress going into a wooden cabin . endseq']


In [4]:
features = {k: features[k] for k in descriptions.keys() if k in features}
print(f"Số lượng ảnh có cả features và descriptions: {len(features)}")
print("5 key đầu tiên sau khi lọc:", list(features.keys())[:5])

Số lượng ảnh có cả features và descriptions: 8091
5 key đầu tiên sau khi lọc: ['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615']


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

all_captions = [caption for captions_list in descriptions.values() for caption in captions_list]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

print(f"Vocabulary size: {len(tokenizer.word_index) + 1}")
print("5 từ đầu tiên trong tokenizer:", list(tokenizer.word_index.items())[:5])

Vocabulary size: 8415
5 từ đầu tiên trong tokenizer: [('startseq', 1), ('endseq', 2), ('a', 3), ('in', 4), ('the', 5)]


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_len, descriptions, features):
    X1, X2, y = [], [], []
    for img_id, caption_list in descriptions.items():
        if img_id not in features:
            continue
        feature = features[img_id]
        for desc in caption_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            if len(seq) < 2:
                continue
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features)
print(f"X1 shape: {X1.shape}")
print(f"X2 shape: {X2.shape}")
print(f"y shape: {y.shape}")

print("5 feature đầu tiên (X1):", X1[:5])
print("5 sequence đầu tiên (X2):", X2[:5])
print("5 nhãn đầu tiên (y):", y[:5])

X1 shape: (437606, 1, 2048)
X2 shape: (437606, 39)
y shape: (437606,)
5 feature đầu tiên (X1): [[[0.1227762  0.33293068 0.75272447 ... 0.21941456 0.30208504 0.40279704]]

 [[0.1227762  0.33293068 0.75272447 ... 0.21941456 0.30208504 0.40279704]]

 [[0.1227762  0.33293068 0.75272447 ... 0.21941456 0.30208504 0.40279704]]

 [[0.1227762  0.33293068 0.75272447 ... 0.21941456 0.30208504 0.40279704]]

 [[0.1227762  0.33293068 0.75272447 ... 0.21941456 0.30208504 0.40279704]]]
5 sequence đầu tiên (X2): [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  1 42]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  1 42  4]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  1 42  4  3]
 [ 0  0  0 

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Tham số đầu vào
vocab_size = 8415   # tổng số từ trong tokenizer + 1
max_length = 39     # chiều dài chuỗi đầu vào (caption) đã padding
embedding_dim = 256 # chiều không gian embedding
feature_dim = 2048  # đầu ra của ResNet50, Xception, etc.

# Đầu vào 1: đặc trưng ảnh (shape: 1 x 2048)
inputs1 = Input(shape=(1, feature_dim))
x1 = Dense(256, activation='relu')(inputs1)
x1 = Dropout(0.5)(x1)
x1 = tf.keras.layers.Reshape((256,))(x1)  # bỏ chiều (1,) để ghép được

# Đầu vào 2: dãy từ (shape: max_length,)
inputs2 = Input(shape=(max_length,))
x2 = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(inputs2)
x2 = Dropout(0.5)(x2)
x2 = LSTM(256)(x2)

# Kết hợp 2 nhánh
decoder = add([x1, x2])
decoder = Dense(256, activation='relu')(decoder)
outputs = Dense(vocab_size, activation='softmax')(decoder)

# Tạo model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()


In [14]:
def data_generator(descriptions, features, tokenizer, max_length, batch_size):
    while True:
        X1, X2, y = list(), list(), list()
        n = 0
        for key, desc_list in descriptions.items():
            for desc in desc_list:
                # encode ảnh
                seq = tokenizer.texts_to_sequences([desc])[0]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = tf.keras.utils.to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(features[key])
                    X2.append(in_seq)
                    y.append(out_seq)
                    n += 1
                    if n == batch_size:
                        yield [np.array(X1), np.array(X2)], np.array(y)
                        X1, X2, y = list(), list(), list()
                        n = 0


In [17]:
import tensorflow as tf

# Định nghĩa output signature đúng cách
output_signature = (
    (
        tf.TensorSpec(shape=(None, 1, 2048), dtype=tf.float32),  # image feature
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)  # input sequence
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)  # output word (one-hot)
)

# Tạo dataset
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(descriptions, features, tokenizer, max_length, batch_size),
    output_signature=output_signature
)

In [18]:
dataset = dataset.prefetch(tf.data.AUTOTUNE)
dataset = dataset.cache()

batch_size = 64  # Đặt batch size tại đây
steps = len(y) // batch_size

# Train model
model.fit(
    dataset,
    steps_per_epoch=steps,
    epochs=10,
    verbose=1
)

Epoch 1/10


ValueError: Creating variables on a non-first call to a function decorated with tf.function.

In [None]:
# Cell 1: Tạo thư mục Result
import os

# Tạo đường dẫn đầy đủ
result_dir = r"D:\CaptionImage\Result"
os.makedirs(result_dir, exist_ok=True)
print(f"Created directory: {result_dir}")

In [None]:
# Cell 2: Lưu model và tokenizer
print("Saving model...")
model_path = os.path.join(result_dir, 'image_captioning_model')
model.save(model_path)
print(f"Model saved to: {model_path}")

print("Saving tokenizer...")
tokenizer_path = os.path.join(result_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved to: {tokenizer_path}")

# Lưu thêm max_length để sử dụng sau này
max_length_path = os.path.join(result_dir, 'max_length.txt')
with open(max_length_path, 'w') as f:
    f.write(str(max_length))
print(f"Max length saved to: {max_length_path}")

In [None]:
!pip install sacrebleu

In [None]:
# Cell 1: Import thêm thư viện cần thiết
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import sacrebleu

# Cell 2: Hàm đánh giá BLEU score
def evaluate_caption(generated, reference):
    # Chuẩn cho nltk
    reference_tokens = [reference.strip().lower().split()]
    candidate_tokens = generated.strip().lower().split()

    bleu_nltk = sentence_bleu(reference_tokens, candidate_tokens,
                             smoothing_function=SmoothingFunction().method1)

    # Chuẩn cho sacrebleu
    bleu_sacre = sacrebleu.sentence_bleu(generated, [reference.strip().lower()])

    return bleu_nltk, bleu_sacre.score

# Cell 3: Load model và các thành phần cần thiết
def load_model_and_components():
    result_dir = r"D:\CaptionImage\Result"
    
    # Load model
    model_path = os.path.join(result_dir, 'image_captioning_model')
    print(f"Loading model from: {model_path}")
    model = tf.keras.models.load_model(model_path)
    
    # Load tokenizer
    tokenizer_path = os.path.join(result_dir, 'tokenizer.pkl')
    print(f"Loading tokenizer from: {tokenizer_path}")
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    
    # Load max_length
    max_length_path = os.path.join(result_dir, 'max_length.txt')
    print(f"Loading max_length from: {max_length_path}")
    with open(max_length_path, 'r') as f:
        max_length = int(f.read())
    
    return model, tokenizer, max_length

# Cell 4: Hàm trích xuất đặc trưng ảnh
def extract_features(image_path):
    # Load InceptionV3 model
    model = InceptionV3(weights='imagenet')
    model = tf.keras.Model(model.input, model.layers[-2].output)
    
    # Load and preprocess image
    image = load_img(image_path, target_size=(299, 299))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    
    # Extract features
    features = model.predict(image, verbose=0)
    return features

# Cell 5: Hàm tạo caption
def generate_caption(image_path, model, tokenizer, max_length):
    # Extract features
    features = extract_features(image_path)
    
    # Initialize caption
    caption = 'startseq'
    
    # Generate caption
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([caption])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, '')
        if word == 'endseq':
            break
        caption += ' ' + word
    
    return caption.replace('startseq ', '')

# Cell 6: Chạy pipeline đánh giá
def run_evaluation_pipeline():
    # Load model và components
    model, tokenizer, max_length = load_model_and_components()
    
    # Đường dẫn ảnh và caption mẫu
    img_path = 'Test_image.png'
    reference_caption = "some children are riding on a mini orange train"
    
    # Tạo caption
    caption = generate_caption(img_path, model, tokenizer, max_length)
    
    # In kết quả
    print("📷 Ảnh test:", img_path)
    print("✍️  Caption mẫu (đề bài):", reference_caption)
    print("🤖 Caption từ model:", caption)
    
    # Đánh giá BLEU score
    bleu_nltk, bleu_sacre = evaluate_caption(caption, reference_caption)
    print("\n🔵 BLEU (nltk):", round(bleu_nltk, 4))
    print("🟢 BLEU (sacrebleu):", round(bleu_sacre, 4))
    
    # Hiển thị ảnh với caption
    img = load_img(img_path)
    plt.figure(figsize=(10, 8))
    plt.imshow(img)
    plt.axis('off')
    plt.title(f"Generated: {caption}\nReference: {reference_caption}")
    plt.show()


In [None]:
# Cell 7: Chạy đánh giá
if __name__ == "__main__":
    run_evaluation_pipeline()
