In [3]:
import os
import pickle
import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

In [5]:
BASE_DIR = '/kaggle/input/flickr8k'
WORKING_DIR = '/kaggle/working'

## Extract Image Feature

In [54]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model

# Path to the manually downloaded weights file
weights_path = 'C:\\Users\\hp\\.keras\\models'

# Load VGG16 model with the specified weights file
model = VGG16(weights=None)

# Restructure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Summary
print(model.summary())


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
features={}
directory = os.path.join(BASE_DIR,'Images')

for img_name in tqdm(os.listdir(directory)):
    img_path = directory + '/' + img_name
    image = load_img(img_path , target_size = (224,224))
    
    image = img_to_array(image)
    
    image=image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose =0)
    image_id = img_name.split('.')[0]
    features[image_id] = feature


  0%|          | 0/8091 [00:00<?, ?it/s]

In [None]:
pickle.dump(features,open(os.path.join(WORKING_DIR, 'features.pkl'),'wb'))

In [None]:
with open(os.path.join(WORKING_DIR,'features.pkl'),'rb') as f:
    features=pickle.load(f)

## Load the Captions Data

In [29]:
with open(os.path.join(BASE_DIR,'captions.txt'),'r') as f:
    next(f)
    captions_doc=f.read()

In [30]:
mapping = {}
#process
for line  in tqdm(captions_doc.split('\n')):
    tokens=line.split(',')
    if len(line)<2:
        continue
    image_id,caption = tokens[0],tokens[1:]
    image_id = image_id.split('.')[0]
    caption = " ".join(caption)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)    
        
    

  0%|          | 0/40456 [00:00<?, ?it/s]

In [16]:
len(mapping)

8091

## Preprocessing

In [39]:
def clean (mapping):
    for key,captions in mapping.items():
        for i in range(len(captions)):
            caption=captions[i]
            #convert to lowercase
            caption=caption.lower()
            #delete digits , special characters etc
            caption=caption.replace('[^A-Za-z]','')
            #delete additional spaces
            caption=caption.replace('\s+', ' ')
            #add start and end tags
            caption = "start" + " "+ " ".join([word for word in caption.split() if len(word)>1]) + " "+ "end"
            captions[i]=caption
            
            
            
            

In [40]:
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [41]:
clean(mapping)

In [21]:
id = mapping['1000268201_693b08cb0e']

In [None]:
img = mpimg.imread(id)
plt.imshow(img)
plt.axis('off')  # Turn off axis labels
plt.show()

In [33]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

In [None]:
len(all_captions)

In [42]:
all_captions[:10]

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .',
 'A black dog and a spotted dog are fighting',
 'A black dog and a tri-colored dog playing with each other on the road .',
 'A black dog and a white dog with brown spots are staring at each other in the street .',
 'Two dogs of different breeds looking at each other on the road .',
 'Two dogs on pavement moving toward each other .']

In [42]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
vocab_size

In [43]:
max_length = max(len(caption.split()) for caption in all_captions)
max_length

33

## Train Test Split

In [44]:
image_ids=list(mapping.keys())
split = int(len(image_ids)*0.90)
train = image_ids[:split]
test=image_ids[split:]

In [39]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    n = 0
    X1, X2, y = None, None, []
    
    while True:
        for key in data_keys:
            n += 1
            captions = mapping[key]

            for caption in captions:
                seq = tokenizer.texts_to_sequences([caption])[0]

                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    if X1 is None:
                        X1 = np.array([features[key][0]])
                    else:
                        X1 = np.concatenate([X1, np.array([features[key][0]])], axis=0)

                    in_seq = np.expand_dims(in_seq, axis=0)

                    if X2 is not None and X2.shape[1] == in_seq.shape[1]:
                        X2 = np.vstack([X2, in_seq])
                    else:
                        X2 = in_seq

                    y.append(out_seq)

            if n == batch_size:
                yield [X1, X2], np.array(y)
                X1, X2, y = None, None, []
                n = 0


## Model Creation

In [55]:
#save
model.save(WORKING_DIR+'/best_model.h5')

  saving_api.save_model(


## Generate Captions for the Image

In [37]:
def idx_to_word(integer,tokenizer):
    for word,index in tokenizer.word_index.items():
        if index == integer:
            return word
    return none    

In [38]:
def index_to_word(index, tokenizer):
    for word, idx in tokenizer.word_index.items():
        if idx == index:
            return word

In [35]:
def predict_caption(model, image, tokenizer, max_length):
    in_text='start'
    for i in range(max_length):
        sequence=tokenizer.texts_to_sequences([in_text])[0]
        sequence=pad_sequences([sequence],max_length)[0]
        yhat=model.predict([image,sequence], verbose=0)
        yhat=np.argmax(yhat)
        word=idx_to_word(yhat,tokenizer)
        if word is None:
            break
        in_text += " "+ word
        if word == 'end':
            break
    return in_text       
        
    
       
        
    

In [152]:
from nltk.translate.bleu_score import corpus_bleu
actual,predicted = list(),list()

for key in tqdm(test):
    captions=mapping[key]
    y_pred=predict_caption(model,features[key],tokenizer,max_length)
    actual_captions=[caption.split() for caption in captions]
    y_pred=y_pred.split()
    actual.append(actual_captions)
    predicted.append(y_pred)
    
print("BLEU-1: %f" % corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
print("BLEU-2: %f" % corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))

  0%|          | 0/810 [00:00<?, ?it/s]

BLEU-1: 0.191293
BLEU-2: 0.099497


## Visualization

In [51]:
from PIL import Image
import matplotlib.pyplot as plt


def get_captions_for_image(image_name):
    # Your code to retrieve captions for the image
    # Replace this with your actual implementation
    return ["Caption 1", "Caption 2", "Caption 3"]

def generate_captions(image_name):
    img_path = '/kaggle/input/flickr8k/Images/' + image_name
    image = Image.open(img_path)
    
    # Get captions for the image
    captions = get_captions_for_image(image_name)
    
    print("---------------Actual----------------------")
    for caption in captions:
        print(caption)
    
    # Assuming you have a function to predict captions
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print(y_pred)
    plt.imshow(image)

    
    
  
    
    
    


In [56]:
generate_captions("1007320043_627395c3d8.jpg")  




---------------Actual----------------------
Caption 1
Caption 2
Caption 3


NameError: name 'features' is not defined