In [2]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, RepeatVector
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
import cv2
import matplotlib.pyplot as pt



In [30]:
# Define the function to load images and captions
def load_data(image_folder, caption_file):
    image_paths = []
    captions = []
    imgs = []

    # Read captions from the text file
    with open(caption_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                img_name, caption = parts
                img_path = os.path.join(image_folder, img_name)
                if os.path.exists(img_path):
                    image_paths.append(img_path)
                    captions.append(caption)
                    
                    # Load and convert the image to numpy array
                    img = Image.open(img_path).convert("RGB")
                    imgs.append(np.array(img))
                    
    return imgs, image_paths, captions

In [31]:
image_folder = r'D:\archive\Images'  # Folder containing images
caption_file = r"D:\archive\captions.txt" # Text file with image-caption pairs
imgs, image_paths, captions = load_data(image_folder, caption_file)

In [32]:
# Display a few images with captions
num_images = 5  # Adjust this to show more or fewer images
num_images_to_display = min(num_images, len(imgs))  # Make sure we don't exceed the available images

# Set up the plot
pt.figure(figsize=(10, 10))

# Loop through the images and display them
for i in range(num_images_to_display):
    pt.subplot(1, num_images_to_display, i + 1)
    pt.imshow(imgs[i])
    pt.title(captions[i])
    pt.axis('off')

# Show the plot after all images have been added
pt.show()

<Figure size 1000x1000 with 0 Axes>

In [33]:
image_paths = image_paths[1:12]

In [34]:
captions = captions[1:12]

In [35]:
cnn_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
cnn_model = Model(inputs=cnn_model.inputs, outputs=cnn_model.layers[-1].output)

In [36]:
import matplotlib.pyplot as plt

def extract_features(image):
    image = np.expand_dims(image, axis=0)  # Expand dims to simulate batch
    return cnn_model.predict(image)

In [37]:
def build_captioning_model(vocab_size, max_caption_length):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=256, input_length=max_caption_length))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [None]:
vocab_size = 1000           # Vocab size for captions
max_caption_length = 10      # Max length of captions

# Build and summarize the model
captioning_model = build_captioning_model(vocab_size, max_caption_length)
captioning_model.summary()

# Data for demonstration
image = np.random.rand(224, 224, 3)  # Dummy image data
caption = np.random.randint(1, vocab_size, (1, max_caption_length))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 256)           256000    
                                                                 
 lstm_1 (LSTM)               (None, 10, 256)           525312    
                                                                 
 dense_1 (Dense)             (None, 10, 1000)          257000    
                                                                 
Total params: 1038312 (3.96 MB)
Trainable params: 1038312 (3.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
image_features = extract_features(image)
print("Extracted Image Features:", image_features.shape)

Extracted Image Features: (1, 7, 7, 512)


In [40]:
captioning_model.fit(caption, np.random.rand(1, max_caption_length, vocab_size), epochs=5,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2b7082adc60>

In [41]:
reference_captions = [
    ["a", "sample", "caption", "of", "an", "image"],
    ["another", "description", "of", "the", "image", "content"]
]

In [None]:
!pip install nltk

In [42]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(reference, candidate):
    reference = [reference]  # NLTK BLEU expects list of references
    return sentence_bleu(reference, candidate)

In [43]:
dummy_generated_caption = ["this", "is", "a", "generated", "caption"]
bleu_score = evaluate_bleu(reference_captions[0], dummy_generated_caption)
print("BLEU score for the generated caption:", bleu_score)

BLEU score for the generated caption: 0.6511126026643229
