In [1]:
import tensorflow 
import os
import pickle
import numpy as np
%pip install tqdm
from tqdm import tqdm

ModuleNotFoundError: No module named 'tensorflow.python'

In [None]:
from tensorflow import keras
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array


In [None]:

from keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.utils import to_categorical,plot_model


In [None]:
import kagglehub

# Download the latest version of the Flickr8k dataset through kagglehub.
# `path` is the local directory that contains the downloaded files.
path = kagglehub.dataset_download("adityajn105/flickr8k")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\sande\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1


In [None]:
print(os.listdir(path))

['captions.txt', 'features.pkl', 'Images']


Make Model using VGG16

In [None]:
# Instantiate the stock VGG16 network.
base_model = VGG16()
# Re-wire it so the output is taken from the second-to-last layer,
# i.e. the final layer of the original network is dropped.
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
# Show the layer-by-layer summary of the truncated network.
model.summary()

Feature Extraction Pipeline 

In [None]:
# Maps image id (file name up to its first '.') -> feature array returned by model.predict.
features = {}

# Locations inside the downloaded dataset directory.
images_dir = os.path.join(path, "Images")
captions_file = os.path.join(path, "captions.txt")  # used later when loading the captions

for img in tqdm(os.listdir(images_dir)):
    # Only process image files
    if not img.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        continue
    # Build the path with os.path.join instead of manual '/' concatenation.
    img_path = os.path.join(images_dir, img)
    # Load the image resized to 224x224
    image = load_img(img_path, target_size=(224, 224))
    # Convert the image pixels to an array
    image = img_to_array(image)
    # Add a leading batch axis so the model receives a batch of one image
    image = np.expand_dims(image, axis=0)
    # Prepare the image for the VGG model
    image = preprocess_input(image)
    # Get the features (the result keeps the leading batch axis)
    feature = model.predict(image, verbose=0)
    # Image id = file name up to its first '.'; the caption mapping derives
    # its keys the same way, so the two dictionaries share keys.
    image_id = img.split('.')[0]
    # Store the features in the dictionary
    features[image_id] = feature

100%|██████████| 8093/8093 [18:50<00:00,  7.16it/s]


In [None]:
images_dir

'C:\\Users\\sande\\.cache\\kagglehub\\datasets\\adityajn105\\flickr8k\\versions\\1\\Images'

In [None]:
# Preview a handful of image ids instead of dumping every key into the output.
list(features)[:5]

In [None]:
# Preview a single feature array instead of dumping every value into the output.
list(features.values())[:1]

In [None]:
print("Base dir:", os.getcwd())  
# Original note: "the file's actual location (relative to the root)".
# NOTE(review): this prints os.getcwd(), exactly like the line below, so both
# outputs are always identical.

print("Working dir:", os.getcwd())  
# The directory the program was run from


Base dir: d:\Programming\Image-Caption-Generator-Using-Deep-Learning\Model\With-Transfer-learning
Working dir: d:\Programming\Image-Caption-Generator-Using-Deep-Learning\Model\With-Transfer-learning


In [None]:
#save features to file
# A context manager guarantees the handle is flushed and closed once the dump
# finishes (the bare open(...) used before was never closed).
with open(os.path.join(os.getcwd(), 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# Read the feature dictionary back from features.pkl in the current working directory.
features_path = os.path.join(os.getcwd(), 'features.pkl')
with open(features_path, 'rb') as pickle_file:
    features = pickle.load(pickle_file)


Load Caption Data

In [None]:
os.listdir(path)

In [None]:
captions_file

In [None]:
# Read the whole captions file as one string, without its header row.
# The encoding is passed explicitly so the decoded text does not depend on the
# platform's default locale encoding.
with open(captions_file, 'r', encoding='utf-8') as f:
    next(f)  # Skip the header line
    captions_doc = f.read()

In [None]:
# Preview only the beginning of the captions text instead of printing the whole file.
print(captions_doc[:500])

In [None]:
# Worked example: split one raw "<image file>,<caption>" line into its parts.
# The sample is bound to `sample_line` rather than `str`, so the built-in `str`
# type is not shadowed for the rest of the kernel session.
sample_line = "1003163366_44323f5815.jpg,a man sleeping on a bench outside with a white and black dog sitting next to him ."

# Split on the first comma only, so commas inside a caption stay in the caption.
tokens = sample_line.split(',', 1)
print(tokens)
image_id = tokens[0].split('.')[0]
print(image_id) #image id without extension
caption = tokens[1]

In [None]:
# Group the captions by image id (file name up to its first '.').
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    tokens = line.split(',')
    # Ignore empty / one-character lines.
    if len(line) < 2:
        continue
    image_id = tokens[0].split('.')[0]
    # Everything after the first comma is caption text; the comma-separated
    # pieces are glued back together with single spaces.
    caption = ' '.join(tokens[1:])
    # Start a new list the first time an image id is seen, then append.
    mapping.setdefault(image_id, []).append(caption)

In [None]:
# mapping = {}
# for line in tqdm(captions_doc.split('\n')):
#     #split the line by comma(,)
#     tokens = line.split(',')
#     if len(line) < 2:
#         continue
#     image_id, caption = tokens[0], tokens[1:]
#     image_id = image_id.split('.')[0]
#     caption = ' '.join(caption)
#     if image_id not in mapping:
#         mapping[image_id] = []
#     mapping[image_id].append(caption)

In [None]:
# Shows whatever `image_id` was last assigned by an earlier cell
# (e.g. the last id parsed while building `mapping`).
image_id

In [None]:
# Preview a handful of image ids instead of dumping every key into the output.
list(mapping)[:5]

In [None]:
# Preview the caption lists of two images instead of dumping every value into the output.
list(mapping.values())[:2]

In [None]:
len(mapping)

In [None]:
def clean(mapping):
    """Normalise every caption stored in ``mapping``, in place.

    Each caption is lower-cased, every character that is not an ASCII letter
    is turned into a space, runs of whitespace are collapsed, and the result
    is wrapped as ``'startseq <caption> endseq'``.

    Args:
        mapping: dict whose values are lists of caption strings. The lists are
            modified in place.

    Returns:
        None.

    Note:
        The function is not idempotent: calling it again on already cleaned
        captions wraps them in a second pair of start/end tokens.
    """
    import re  # local import keeps this definition self-contained

    # str.replace() treats its first argument literally, so the character
    # class has to go through the regex engine to actually strip anything.
    non_letters = re.compile(r'[^a-zA-Z]')

    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # convert to lowercase
            caption = caption.lower()
            # replace punctuation, digits and other non-letters with spaces
            caption = non_letters.sub(' ', caption)
            # collapse multiple spaces
            caption = ' '.join(caption.split())
            # add start and end tokens to the caption
            captions[i] = 'startseq ' + caption + ' endseq'

In [None]:
# Captions stored for the image id left over from an earlier cell.
mapping[image_id]

In [None]:
# Rewrites the captions inside `mapping` in place. Run this cell only once:
# every call wraps each caption in another 'startseq ... endseq' pair.
clean(mapping)

In [None]:
# Flatten the per-image caption lists into a single list, in dictionary order.
all_captions = [caption for captions in mapping.values() for caption in captions]

In [None]:
len(all_captions)

In [None]:
all_captions[:10]

In [None]:
# Build the word -> integer index from every collected caption.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
# Number of distinct words plus one. Presumably the extra slot is index 0,
# which the tokenizer leaves unused so it can act as the padding value
# — TODO confirm against the Keras Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
vocab_size

In [None]:
#get the maximum length of the captions
# Length is counted in whitespace-separated words of the strings in all_captions.
max_length = max(len(caption.split()) for caption in all_captions)
max_length

### Train Test Split

In [None]:
## train test split
# The first 80% of the image ids (in dictionary order) go to training, the rest to testing.
image_ids = list(mapping)
split = int(0.80 * len(image_ids))
train_image_ids, test_image_ids = image_ids[:split], image_ids[split:]

In [None]:
# Create a data generator that yields the training data in batches, which avoids session time-outs

from keras.utils import Sequence
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def DataGenerator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches of ``((image_features, input_seqs), next_word)``.

    Every caption is expanded into one sample per prefix: the padded prefix of
    token ids is the input and the one-hot encoded next token is the target.
    A batch is emitted once ``batch_size`` images have been processed, so the
    number of rows in a batch depends on how many captions and words those
    images have.

    Args:
        data_keys: iterable of image ids to cycle through.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; element ``[0]`` of the
            array is used as the image feature vector.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length every input sequence is padded to.
        vocab_size: number of classes of the one-hot target.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        ``((X1, X2), y)`` where ``X1`` holds float32 image features, ``X2``
        int32 padded token ids and ``y`` float32 one-hot targets. The inputs
        are yielded as a tuple and the token ids as int32 so the element
        matches the tuple/int32 ``output_signature`` used with
        ``tf.data.Dataset.from_generator`` in the training cell.
    """
    X1, X2, y = [], [], []
    n = 0  # images accumulated in the current batch
    while True:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the caption as a list of token ids
                seq = tokenizer.texts_to_sequences([caption])[0]
                # one (prefix -> next word) pair per position
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad the prefix to the fixed input length
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # one-hot encode the next word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(features[key][0])   # image features
                    X2.append(in_seq)             # input sequence
                    y.append(out_seq)             # output word
            if n == batch_size:
                # convert to numpy arrays before yielding
                image_batch = np.array(X1, dtype=np.float32)
                sequence_batch = np.array(X2, dtype=np.int32)  # token ids are integers
                target_batch = np.array(y, dtype=np.float32)
                yield (image_batch, sequence_batch), target_batch
                # reset the accumulators for the next batch
                X1, X2, y = [], [], []
                n = 0


Model Creation 

In [None]:
# NOTE: this cell rebinds `model`; from here on the name refers to the caption
# model, no longer to the VGG16 feature extractor built earlier.

# --- Image branch: dropout on the 4096-d feature vector, then a 256-unit projection ---
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# --- Text branch: embed the padded token ids (zeros are masked), dropout, then an LSTM ---
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# --- Decoder: add both 256-d branches and predict a softmax over the vocabulary ---
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')


# plot_model(model,show_shapes=True)
model.summary()

In [None]:
#train the model
epochs = 15
batch_size = 64

# Number of generator batches per epoch (each batch covers `batch_size` images).
steps = len(train_image_ids) // batch_size


def generator():
    """Return a fresh DataGenerator over the training image ids."""
    return DataGenerator(
        train_image_ids, mapping, features, tokenizer, max_length, vocab_size, batch_size
    )


# The dataset definition is identical for every epoch, so it is built once
# instead of inside the loop. Each model.fit call below iterates the dataset
# anew, which invokes `generator` again and restarts from the first image.
dataset = tensorflow.data.Dataset.from_generator(
    generator,
    output_signature=(
        (tensorflow.TensorSpec(shape=(None, 4096), dtype=tensorflow.float32),     # X1: VGG16 features
        tensorflow.TensorSpec(shape=(None, max_length), dtype=tensorflow.int32)), # X2: sequence tokens
        tensorflow.TensorSpec(shape=(None, vocab_size), dtype=tensorflow.float32)  # y: one-hot labels
    )
)

for _ in range(epochs):
    model.fit(dataset, epochs=1, steps_per_epoch=steps, verbose=1)

