In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import VGG16, InceptionV3
from tensorflow.keras.optimizers import Adam
import os
import string
import nltk
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image


In [7]:
# Set paths for images and captions
image_dir = 'D:/archive/Images'  # Replace with your image directory path
caption_file = 'D:/archive/captions.txt'  # Replace with your captions file path

# Load captions
def load_captions(filename):
    with open(filename, 'r') as f:
        text = f.read()
    captions = {}
    for line in text.split('\n'):
        if len(line) < 1:
            continue
        image_id, caption = line.split("\t")
        image_id = image_id.split("#")[0]
        caption = caption.translate(str.maketrans('', '', string.punctuation)).lower()
        captions.setdefault(image_id, []).append(caption)
    return captions

captions = load_captions(caption_file)


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Load the pre-trained model for feature extraction (e.g., InceptionV3 or VGG16)
def load_feature_extractor():
    model = InceptionV3(weights='imagenet')
    model = Model(inputs=model.input, outputs=model.layers[-2].output)  # Remove the last layer
    return model

# Extract and save image features
feature_extractor = load_feature_extractor()

def extract_features(directory):
    features = {}
    for img_name in tqdm(os.listdir(directory)):
        if img_name.endswith('.jpg'):
            img_path = os.path.join(directory, img_name)
            image = Image.open(img_path).resize((299, 299))
            image = np.array(image) / 255.0
            image = np.expand_dims(image, axis=0)
            feature = feature_extractor.predict(image)
            features[img_name.split('.')[0]] = feature
    return features

features = extract_features(image_dir)


In [None]:
# Tokenize captions
all_captions = []
for key in captions.keys():
    [all_captions.append(caption) for caption in captions[key]]

# Create a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Convert captions to sequences
def create_sequences(tokenizer, max_length, captions, features):
    X1, X2, y = [], [], []
    for key, cap_list in captions.items():
        feature = features[key][0]
        for caption in cap_list:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = tf.keras.utils.to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

max_length = max(len(caption.split()) for caption in all_captions)
X1, X2, y = create_sequences(tokenizer, max_length, captions, features)


In [None]:
# Define the model
def define_model(vocab_size, max_length):
    # Image feature extractor
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # Sequence processor
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    # Decoder (Combining features and sequence)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

model = define_model(vocab_size, max_length)
model.summary()


In [None]:
# Train the model
model.fit([X1, X2], y, epochs=3, batch_size=64, verbose=1)


In [None]:
# Generate a caption for an image
def generate_caption(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Test with a new image
img_path = 'path_to_test_image.jpg'
image = Image.open(img_path).resize((299, 299))
image = np.expand_dims(np.array(image) / 255.0, axis=0)
feature = feature_extractor.predict(image)
caption = generate_caption(model, tokenizer, feature, max_length)
print("Generated Caption:", caption)


In [None]:
def evaluate_model(model, captions, features, tokenizer, max_length):
    actual, predicted = [], []
    for key, cap_list in captions.items():
        y_pred = generate_caption(model, tokenizer, features[key], max_length)
        references = [caption.split() for caption in cap_list]
        y_pred = y_pred.split()
        actual.append(references)
        predicted.append(y_pred)
    
    # Calculate BLEU scores
    bleu_1 = np.mean([sentence_bleu(ref, pred, weights=(1.0, 0, 0, 0)) for ref, pred in zip(actual, predicted)])
    bleu_2 = np.mean([sentence_bleu(ref, pred, weights=(0.5, 0.5, 0, 0)) for ref, pred in zip(actual, predicted)])
    bleu_3 = np.mean([sentence_bleu(ref, pred, weights=(0.33, 0.33, 0.33, 0)) for ref, pred in zip(actual, predicted)])
    bleu_4 = np.mean([sentence_bleu(ref, pred, weights=(0.25, 0.25, 0.25, 0.25)) for ref, pred in zip(actual, predicted)])
    return bleu_1, bleu_2, bleu_3, bleu_4

# Evaluate BLEU scores
bleu_1, bleu_2, bleu_3, bleu_4 = evaluate_model(model, captions, features, tokenizer, max_length)
print(f"BLEU-1: {bleu_1}")
print(f"BLEU-2: {bleu_2}")
print(f"BLEU-3: {bleu_3}")
print(f"BLEU-4: {bleu_4}")
