In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import re

import string

from math import ceil
from collections import defaultdict
from tqdm.notebook import tqdm 

In [5]:
# Setting the input and output directory
INPUT_DIR = r'C:\Users\shrut\Image Captioning\archive'
OUTPUT_DIR = r'C:\Users\shrut\Image Captioning'

In [3]:
data = pd.read_csv(r'archive\captions.csv')
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [4]:
data.shape

(40563, 2)

# Data Cleaning

In [5]:
data.isnull().sum()

image      27
caption    21
dtype: int64

In [6]:
data=data.dropna()
data.shape

(40515, 2)

In [7]:
data=data.drop_duplicates()

In [8]:
data['image'].isnull().sum()

0

In [9]:
data['caption'].isnull().sum()

0

In [10]:
#converting text to lower case
data['caption']=data['caption'].str.lower()
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...


In [11]:
#removing punchuations
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data['caption']=data['caption'].apply(remove_punctuation)

In [12]:
#removing single letter words eg. 'a' 
def remove_s(text):
    return re.sub(r'\b[a-zA-Z]\b', '', text)

data['caption']=data['caption'].apply(lambda x: ' '.join(remove_s(word) for word in x.split()))

In [13]:
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,child in pink dress is climbing up set of s...
1,1000268201_693b08cb0e.jpg,girl going into wooden building
2,1000268201_693b08cb0e.jpg,little girl climbing into wooden playhouse
3,1000268201_693b08cb0e.jpg,little girl climbing the stairs to her playhouse
4,1000268201_693b08cb0e.jpg,little girl in pink dress going into wooden...


In [14]:
#removing all the words with no. in them
def remove_numbers(text):
    return re.sub(r'\b\w*\d\w*\b', '', text)

data['caption']=data['caption'].apply(remove_numbers)

In [32]:
#Checking for spelling mistakes

import nltk
nltk.download('words')
from nltk.corpus import words

english_words = set(words.words())

def filter_spelling_mistakes(text):
    return ' '.join(word for word in text.split() if word.lower() in english_words)

data['caption']=data['caption'].apply(filter_spelling_mistakes)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\shrut\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


NameError: name 'data' is not defined

In [16]:
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,child in pink dress is climbing up set of in a...
1,1000268201_693b08cb0e.jpg,girl going into wooden building
2,1000268201_693b08cb0e.jpg,little girl climbing into wooden playhouse
3,1000268201_693b08cb0e.jpg,little girl climbing the to her playhouse
4,1000268201_693b08cb0e.jpg,little girl in pink dress going into wooden cabin


In [17]:
def final(text):
    return 'startseq ' + ' '.join([word for word in text.split() if len(word) > 1]) + ' endseq'

data['caption']=data['caption'].apply(final)

In [18]:
data.to_csv("captions__.txt", sep=',', index=False)

In [19]:
data['caption'][1]

'startseq girl going into wooden building endseq'

# Loading VGG Model


In [31]:

import tensorflow as tf


from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate, Bidirectional, Dot, Activation, RepeatVector, Multiply, Lambda


In [21]:
model = VGG16()

In [22]:
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

print(model.summary())


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

# Image Processing

In [30]:
import os
from tqdm import tqdm

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input

In [24]:
image_features = {}

img_dir = os.path.join(INPUT_DIR, 'Images')

In [25]:
img_dir

'C:\\Users\\shrut\\Image Captioning\\archive\\Images'

In [49]:
# Iterate over the files in the directory
with tqdm(os.listdir(img_dir)) as pbar:
    for img_name in pbar:
        # Updating tqdm with the current filename
        pbar.set_description(f'Processing {img_name}')
        
        # Loading the image from file
        img_path = os.path.join(img_dir, img_name)
        image = load_img(img_path, target_size=(224, 224))
        
        # Converting image pixels to a numpy array
        image = img_to_array(image)
        
        # Reshapeing the data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        
        image = preprocess_input(image)
        # Extracting features using the pre-trained VGG16 model
        image_feature = model.predict(image, verbose=0)
        
        image_id = img_name.split('.')[0]
        
        image_features[image_id] = image_feature

Processing 997722733_0cb5439472.jpg: 100%|██████████| 8091/8091 [59:25<00:00,  2.27it/s]   


In [3]:
import pickle

In [63]:
# Storing the image features in pickle
pickle.dump(image_features, open(os.path.join(OUTPUT_DIR, 'img_features.pkl'), 'wb'))

In [22]:
pickle_file_path = os.path.join(OUTPUT_DIR, 'img_features.pkl')
with open(pickle_file_path, 'rb') as file:
    loaded_features = pickle.load(file)

### Loading caption data

In [29]:
from collections import defaultdict

In [6]:
path=r'C:\Users\shrut\Image Captioning'

with open(os.path.join(path, 'captions__.txt'), 'r') as file:
    next(file)
    captions_doc = file.read()

### Image to Caption Maping

In [7]:
image_to_captions_mapping = defaultdict(list)

# Processing lines from captions_doc
for line in tqdm(captions_doc.split('\n')):
    # Spliting the line by comma(,)
    tokens = line.split(',')
    if len(tokens) < 2:
        continue
    image_id, *captions = tokens
    # Removing extension from image ID
    image_id = image_id.split('.')[0]
    # Converting captions list to string
    caption = " ".join(captions)
    # Storing the caption using defaultdict
    image_to_captions_mapping[image_id].append(caption)


total_captions = sum(len(captions) for captions in image_to_captions_mapping.values())
print("Total number of captions:", total_captions)

100%|██████████| 40446/40446 [00:00<00:00, 491113.32it/s]

Total number of captions: 40445





In [8]:
image_to_captions_mapping['1026685415_0431cbf574']

['startseq black dog green toy in his mouth as he through the grass endseq',
 'startseq black dog carrying something through the grass endseq',
 'startseq black dog blue toy in its mouth endseq',
 'startseq dog in grass with blue item in his mouth endseq',
 'startseq wet black dog is carrying green toy through the grass endseq']

In [9]:
# Creating a List of All Captions
all_captions = [caption for captions in image_to_captions_mapping.values() for caption in captions]
all_captions[:10]

['startseq child in pink dress is climbing up set of in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tricolored dog with each other on the road endseq',
 'startseq black dog and white dog with brown are staring at each other in the street endseq',
 'startseq two dogs of different looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

# Tokenizing the Text

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

In [12]:
# Save the tokenizer
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Load the tokenizer
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [13]:
# Calculate maximum caption length
max_caption_length = max(len(tokenizer.texts_to_sequences([caption])[0]) for caption in all_captions)
vocab_size = len(tokenizer.word_index) + 1


print("Vocabulary Size:", vocab_size)
print("Maximum Caption Length:", max_caption_length)

Vocabulary Size: 4862
Maximum Caption Length: 29


# Spliting data for Training and Testing

In [14]:
# Creating a List of Image IDs
image_ids = list(image_to_captions_mapping.keys())
# Splitting into Training and Test Sets
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

In [15]:
# Data generator function
def data_generator(data_keys, image_to_captions_mapping, features, tokenizer, max_caption_length, vocab_size, batch_size):
    # Lists to store batch data
    X1_batch, X2_batch, y_batch = [], [], []
    # Counter for the current batch size
    batch_count = 0

    while True:
        # Loop through each image in the current batch
        for image_id in data_keys: 
            # Get the captions associated with the current image
            captions = image_to_captions_mapping[image_id]

            # Loop through each caption for the current image
            for caption in captions:
                # Convert the caption to a sequence of token IDs
                caption_seq = tokenizer.texts_to_sequences([caption])[0]

                # Loop through the tokens in the caption sequence
                for i in range(1, len(caption_seq)):
                    # Split the sequence into input and output pairs
                    in_seq, out_seq = caption_seq[:i], caption_seq[i]

                    # Pad the input sequence to the specified maximum caption length
                    in_seq = pad_sequences([in_seq], maxlen=max_caption_length)[0]

                    # Convert the output sequence to one-hot encoded format
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    # Append data to batch lists
                    X1_batch.append(features[image_id][0])  # Image features
                    X2_batch.append(in_seq)  # Input sequence
                    y_batch.append(out_seq)  # Output sequence

                    # Increase the batch counter
                    batch_count += 1

                    # If the batch is complete, yield the batch and reset lists and counter
                    if batch_count == batch_size:
                        X1_batch, X2_batch, y_batch = np.array(X1_batch), np.array(X2_batch), np.array(y_batch)
                        yield [X1_batch, X2_batch], y_batch
                        X1_batch, X2_batch, y_batch = [], [], []
                        batch_count = 0

# LSTM model training

In [16]:
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical, plot_model

In [20]:
# Encoder model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
fe2_projected = RepeatVector(max_caption_length)(fe2)
fe2_projected = Bidirectional(LSTM(256, return_sequences=True))(fe2_projected)

# Sequence feature layers
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)

# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3])  # Calculate attention scores

# Softmax attention scores
attention_scores = Activation('softmax')(attention)

# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])

# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)

# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)
decoder1 = Dense(256, activation='relu')(decoder_input)
outputs = Dense(vocab_size, activation='softmax')(decoder1)

# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

# Visualize the model
#plot_model(model, show_shapes=True)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 29)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 256)          1048832     ['dropout[0][0]']                
                                                                                              

In [23]:
# Set the number of epochs, batch size
epochs = 5
batch_size = 32

# Calculate the steps_per_epoch based on the number of batches in one epoch
steps_per_epoch = ceil(len(train) / batch_size)
validation_steps = ceil(len(test) / batch_size)  # Calculate the steps for validation data

# model.compile(optimizer, loss)
# Loop through the epochs for training
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Set up data generators
    train_generator = data_generator(train, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    test_generator = data_generator(test, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    
    model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch, validation_data=test_generator, validation_steps=validation_steps, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
# Save the model
model.save(OUTPUT_DIR+'/model.h5')

# Captions Generation

In [25]:
from nltk.translate.bleu_score import corpus_bleu
from PIL import Image

In [26]:
def get_word_from_index(index, tokenizer):
    return next((word for word, idx in tokenizer.word_index.items() if idx == index), None)

In [27]:
def predict_caption(model, image_features, tokenizer, max_caption_length):
    # Initialize the caption sequence
    caption = 'startseq'
    
    # Generate the caption
    for _ in range(max_caption_length):
        # Convert the current caption to a sequence of token indices
        sequence = tokenizer.texts_to_sequences([caption])[0]
        # Pad the sequence to match the maximum caption length
        sequence = pad_sequences([sequence], maxlen=max_caption_length)
        # Predict the next word's probability distribution
        yhat = model.predict([image_features, sequence], verbose=0)
        # Get the index with the highest probability
        predicted_index = np.argmax(yhat)
        # Convert the index to a word
        predicted_word = get_word_from_index(predicted_index, tokenizer)
        
        # Append the predicted word to the caption
        caption += " " + predicted_word
        
        # Stop if the word is None or if the end sequence tag is encountered
        if predicted_word is None or predicted_word == 'endseq':
            break
    
    return caption

In [35]:
pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Downloading comm-0.2.1-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.4 kB ? eta -:--:--
   -------- ------------------------------- 30.7/139.4 kB 1.4 MB/s eta 0:00:01
   ----------------------- ---------------- 81.9/139.4 kB 1.2 MB/s eta 0:00:01
   -------------------------------------- - 133.1/139.4 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- 139.4/139.4 kB 922.6 kB/s eta 0:00:00
Downloading comm-0.2.1-py3-none-any.whl (7.2 kB)
Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl (

In [37]:
pip install --upgrade notebook


Collecting notebook
  Downloading notebook-7.1.1-py3-none-any.whl.metadata (10 kB)
Collecting jupyterlab<4.2,>=4.1.1 (from notebook)
  Downloading jupyterlab-4.1.4-py3-none-any.whl.metadata (15 kB)
Collecting httpx>=0.25.0 (from jupyterlab<4.2,>=4.1.1->notebook)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx>=0.25.0->jupyterlab<4.2,>=4.1.1->notebook)
  Downloading httpcore-1.0.4-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.25.0->jupyterlab<4.2,>=4.1.1->notebook)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collect

In [38]:
pip install --upgrade ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [40]:
from tqdm import tqdm

In [None]:
# Initialize lists to store actual and predicted captions
actual_captions_list = []
predicted_captions_list = []

# Loop through the test data
for key in tqdm(test):
    # Get actual captions for the current image
    actual_captions = image_to_captions_mapping[key]
    # Predict the caption for the image using the model
    predicted_caption = predict_caption(model, loaded_features[key], tokenizer, max_caption_length)
    
    # Split actual captions into words
    actual_captions_words = [caption.split() for caption in actual_captions]
    # Split predicted caption into words
    predicted_caption_words = predicted_caption.split()
    
    # Append to the lists
    actual_captions_list.append(actual_captions_words)
    predicted_captions_list.append(predicted_caption_words)


In [44]:

# Calculate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual_captions_list, predicted_captions_list, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual_captions_list, predicted_captions_list, weights=(0.5, 0.5, 0, 0)))

BLEU-1: 0.033663
BLEU-2: 0.002394


In [48]:
# Function for generating caption
def generate_caption(image_name):
    # load the image
    image_id = image_name.split('.')[0]
    img_path = os.path.join(INPUT_DIR, "Images", image_name)
    image = Image.open(img_path)
    captions = image_to_captions_mapping[image_id]
    print('---------------------Actual---------------------')
    for caption in captions:
        print(caption)
    # predict the caption
    y_pred = predict_caption(model, loaded_features[image_id], tokenizer, max_caption_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)