In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

import random
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split ## to split data set into train, test, valid
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import Tokenizer ## to create mapped number list for each sentence
from tensorflow.keras.preprocessing.sequence import pad_sequences ## to pad the sentences in order to keep all the sentence in same length.4
import os ## to have functions related to os directory

from tensorflow.keras.preprocessing import image  ## to preprocess and collect featureset for each image using the inceptionv3 model

In [None]:
# Disable NumPy's print truncation so arrays are printed in full.
# NOTE(review): with this setting every later print of a feature vector or batch dumps all of its elements — consider removing it.
np.set_printoptions(threshold=np.inf)


In [None]:
# Maps the first comma-separated field of each captions.txt line to the list of cleaned captions found for it.
captions_dict = defaultdict(list)

In [None]:
def clean_caption(caption):
    """Normalise a raw caption and wrap it in start/end markers.

    The caption is lower-cased, everything that is neither a word character
    nor whitespace is removed, and surrounding whitespace is stripped. The
    result is wrapped as ``"<start> ... <end>"`` with a space on both sides
    of the text so the markers are always separate tokens.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned caption string including the start and end markers.
    """
    caption = re.sub(r'[^\w\s]', '', caption.lower())
    # Strip AFTER removing punctuation: "a dog ." would otherwise keep a trailing space.
    caption = caption.strip()
    return f"<start> {caption} <end>"

In [None]:

## Reading from the text file: every non-empty line is expected to be "<name>,<caption>".
captions_dict.clear()  # make the cell idempotent: re-running must not duplicate captions
with open("captions.txt","r") as f:
    for line_no, line in enumerate(f, start=1):  # stream the file instead of loading all lines first
        line = line.strip()  ## remove leading and trailing whitespace
        if not line:
            continue
        # Split only at the first comma; the caption itself may contain commas.
        name, sep, caption = line.partition(",")
        if not sep:
            # A line without any comma cannot be split into name and caption.
            print(f"Skipping malformed line {line_no}: {line!r}")
            continue
        captions_dict[name].append(clean_caption(caption))
    

## Tasks of the model
--> Identify the distinct objects in the image and name them
--> Form a sentence by learning language from the training sentences
--> Map the sentence to the image

1.) Feature Extraction from Images
2.) Tokenising 

###  Image Feature Extraction using Transfer Learning

In [None]:
## Image feature extractor: ImageNet-pretrained InceptionV3 without its classification head
imageFeatureExt = InceptionV3(weights='imagenet', include_top=False)

# Global average pooling turns the final feature map into one flat vector per image
x = GlobalAveragePooling2D()(imageFeatureExt.output)

# Feature-extraction model: image batch in, pooled feature vectors out
model = Model(inputs=imageFeatureExt.input, outputs=x)

In [None]:
### Freeze the base model so that its weights are excluded from training
imageFeatureExt.trainable = False

In [None]:
## Load one image and convert it into an InceptionV3-ready batch of size one
def preprocess_image(img_path):
    """Return the image stored at ``img_path`` as a preprocessed batch.

    The image is loaded at 299x299 (the size used for InceptionV3 here),
    converted to an array, given a leading batch axis and passed through
    InceptionV3's ``preprocess_input``.
    """
    pil_img = image.load_img(img_path, target_size=(299, 299))
    # Models consume batches, so add a leading batch axis of length 1.
    batch = image.img_to_array(pil_img)[np.newaxis, ...]
    return preprocess_input(batch)

In [None]:
## Replace each image name by the feature vector extracted with the InceptionV3 model,
## producing one (feature vector, caption) pair per caption.
imgFScaption = []
missing_images = []  # names listed in the captions file with no file under ./Images/
n_images = 0

for img, captions in captions_dict.items():
    if img == "image":  # presumably the header row of captions.txt — skip it
        continue
    path = os.path.join("./Images/", img)
    if not os.path.exists(path):
        missing_images.append(img)
        continue
    # predict() returns a batch; keep the single feature vector of this image.
    features = model.predict(preprocess_image(path), verbose=0)[0]
    imgFScaption.extend((features, cap) for cap in captions)
    n_images += 1

# One summary instead of one printed line per image.
print(f"Extracted features for {n_images} images ({len(imgFScaption)} caption pairs)")
if missing_images:
    print(f"Skipped {len(missing_images)} images that were not found, e.g. {missing_images[:5]}")

In [None]:
# Number of (feature vector, caption) pairs collected
len(imgFScaption)

Why split before tokenization?
To avoid data leakage.

You want the tokenizer to learn only from the training captions.

If you fit it on all captions (train + val), the model might "peek" into unseen data indirectly — which defeats the purpose of validation.

In [None]:
## Shuffle the (features, caption) pairs in place.
## A seeded generator keeps the order identical on every run; an unseeded shuffle here
## would make the random_state used by the later split meaningless.
random.Random(42).shuffle(imgFScaption)

In [None]:
## Split 80/10/10 by IMAGE, not by (image, caption) pair.
## Every image has several captions; splitting the pairs directly would place the same
## image in train and in validation/test, which leaks information between the sets.
pairs_by_image = defaultdict(list)
for feats, cap in imgFScaption:
    # All captions of one image carry the same feature vector, so its bytes identify the image.
    pairs_by_image[feats.tobytes()].append((feats, cap))

image_keys = list(pairs_by_image)
train_keys, temp_keys = train_test_split(image_keys, test_size=0.2, random_state=42)
valid_keys, test_keys = train_test_split(temp_keys, test_size=0.5, random_state=42)

train = [pair for key in train_keys for pair in pairs_by_image[key]]
valid = [pair for key in valid_keys for pair in pairs_by_image[key]]
test = [pair for key in test_keys for pair in pairs_by_image[key]]

# Regrouping put the captions of an image next to each other; mix the training pairs again.
random.Random(42).shuffle(train)

In [None]:
# Length of the feature vector stored in the first training pair
len(train[0][0])

### Tokenizing

In [None]:
## Limit the vocabulary to the most frequent tokens (num_words=5000); other words are replaced by the '<unk>' token.
## NOTE(review): Keras' default `filters` appear to strip '<' and '>', so the "<start>"/"<end>" markers would be
## indexed as "start"/"end" — confirm, because generate_caption compares predicted words against '<end>'.
tokenizer = Tokenizer(num_words=5000, oov_token = '<unk>')

In [None]:
def padding(imgFScaption, train=0, max_length=None):
    """Tokenize and pad the captions of a list of (features, caption) pairs.

    Uses the module-level ``tokenizer``. When ``train == 1`` the tokenizer is
    first fitted on these captions, so only the training set should be passed
    with that flag.

    Args:
        imgFScaption: List of ``(feature_vector, caption_text)`` tuples.
        train: ``1`` to fit the tokenizer on these captions, anything else to
            reuse the tokenizer as it is.
        max_length: Optional fixed length to pad/truncate to (e.g. the
            training length when preparing validation data). When ``None``
            the longest sequence in this set is used, as before.

    Returns:
        ``(max_length, FS_caption)`` where ``FS_caption`` is a list of
        ``(feature_vector, padded_token_ids)`` tuples in the input order.
    """
    if not imgFScaption:
        # max() on an empty sequence would raise; nothing to tokenize.
        return (max_length or 0), []

    ## collect only the caption part of every (features, caption) tuple
    all_captions = [caption for _, caption in imgFScaption]
    if train == 1:
        tokenizer.fit_on_texts(all_captions)  ## builds the vocabulary from these captions
    sequences = tokenizer.texts_to_sequences(all_captions)  ## text -> list of token ids

    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    print(max_length)

    # truncating='post' only matters when a caller-supplied max_length is shorter than a
    # sequence: it keeps the beginning of the caption (including the start marker).
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

    # Re-attach every padded sequence to the features of its image.
    FS_caption = [(fs, padseq) for (fs, _), padseq in zip(imgFScaption, padded_sequences)]
    return max_length, FS_caption

In [None]:
# Inspect the tokenizer vocabulary (word -> index mapping).
# The mapping is only filled once the tokenizer has been fitted, i.e. after padding(train, 1) has run.
vocab = tokenizer.word_index
print(f"Vocabulary size: {len(vocab)}")
if not vocab:
    print("Tokenizer has not been fitted yet - run padding(train, 1) first, then re-run this cell.")

# Show the top 20 most frequent words (the original slice [:] printed the entire vocabulary)
for word, idx in sorted(vocab.items(), key=lambda item: item[1])[:20]:
    print(f"{idx}: {word}")


In [None]:
# Fit the tokenizer on the training captions (flag 1) and get the padded training pairs plus their sequence length
max_len, FS_caption = padding(train, 1)


In [None]:
# Number of distinct token ids the model has to handle (embedding rows / softmax classes).
# Read from the tokenizer itself rather than the `vocab` snapshot, which may have been taken before fitting.
# +1 because id 0 is used for padding and word ids start at 1; capped at num_words because
# texts_to_sequences does not emit ids >= num_words.
_n_token_ids = len(tokenizer.word_index) + 1
vz = min(_n_token_ids, tokenizer.num_words) if tokenizer.num_words else _n_token_ids

In [None]:
# Inspect the first training pair: (image feature vector, padded token-id sequence)
print(FS_caption[0])

### Tokenizing and feature extraction are completed 
######
####
##
### Now Build the image captioning Model 

           Image                             Partial Caption
       -------------                         -----------------
      | CNN (InceptionV3) |                 | Embedding Layer |
       -------------                         -----------------
              |                                      |
     Dense Layer (feature vector)              LSTM Layer
              |                                      |
              |-------- Concatenate -----------------|
                            |
                        Dense Layer
                            |
                    Softmax (Vocab Size)
                            ↓
                Predict Next Word in Caption


In [None]:
## Model inputs and the dense layer applied to the image features
# Image branch input: one 2048-element feature vector per sample
image_input = Input(shape=(2048,),name = 'image_input')
# Caption branch input: a token-id sequence of length max_len
caption_input = Input(shape = (max_len,), name = 'caption_input')
# Project the image features to 256 units
img_dense = Dense(256, activation= 'relu')(image_input)

### Embedding Layer and LSTM Layer

In [None]:

# Embed the token ids into 256-d vectors; mask_zero=True marks id 0 (the padding value) as masked
embedding = Embedding(input_dim=vz, output_dim=256, mask_zero=True)(caption_input)
# Encode the partial caption with an LSTM of 256 units
lstm_out = LSTM(256)(embedding)

## Combining these models into a sequence

In [None]:
# Join the image branch and the caption branch into a single vector
concatenate = Concatenate()([img_dense, lstm_out])

In [None]:
# Decoder head: dense layer, dropout, then a softmax over the vz token ids (the next-word distribution)
final_dense = Dense(256, activation ='relu')(concatenate)
final_dense = Dropout(0.5)(final_dense)
output= Dense(vz, activation ='softmax')(final_dense)

### Now combining altogether and training model

In [None]:
# Stack the training feature vectors and padded captions into arrays.
# NOTE(review): neither array is referenced again in the visible notebook — training is fed by data_generator.
img_inp= np.array([pair[0] for pair in FS_caption])
cap_inp = np.array([pair[1] for pair in FS_caption])

In [None]:
# Captioning model: (image features, partial caption) -> next-word distribution.
# NOTE(review): this rebinds `model`, which until here referred to the InceptionV3 feature extractor;
# re-running the feature-extraction cell after this one would call the captioning model instead.
model = Model(inputs= [image_input, caption_input], outputs = output)

In [None]:
# Track accuracy as well: without a metric, model.evaluate() returns only the loss and
# the later `loss, acc = model.evaluate(...)` cannot unpack its result.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
def data_generator(FS_caption, max_len, batch_size,vz):
    """Endlessly yield next-word training batches.

    Every caption is expanded into (prefix -> next token) samples. Trailing
    padding (id 0) is removed first so the model is never trained to predict
    padding from a padded prefix.

    Args:
        FS_caption: Sequence of ``(feature_vector, token_ids)`` pairs;
            ``token_ids`` may be right-padded with zeros.
        max_len: Length every input prefix is right-padded to. Longer
            prefixes keep their last ``max_len`` tokens.
        batch_size: Number of samples per yielded batch.
        vz: Number of classes of the one-hot encoded target.

    Yields:
        ``((image_features, padded_prefixes), one_hot_next_word)`` with shapes
        ``(batch_size, feature_dim)``, ``(batch_size, max_len)`` (int32) and
        ``(batch_size, vz)`` (float32). Samples left over at the end of a pass
        that do not fill a batch are dropped, as before.

    Raises:
        ValueError: If a complete pass over the data cannot fill one batch
            (the original looped forever in that case).
    """
    while True:
        X1, X2, y = [], [], []
        yielded = False
        for img_feat, caption in FS_caption:
            # Drop the right padding; real token ids start at 1, so 0 only marks padding.
            tokens = np.trim_zeros(np.asarray(caption), trim='b')
            for i in range(1, len(tokens)):
                prefix = tokens[max(0, i - max_len):i]
                in_seq_padded = np.zeros(max_len, dtype=np.int32)
                in_seq_padded[:len(prefix)] = prefix

                X1.append(img_feat)
                X2.append(in_seq_padded)
                y.append(int(tokens[i]))

                if len(X1) == batch_size:
                    # One-hot encode the next-word ids of this batch.
                    labels = np.zeros((batch_size, vz), dtype=np.float32)
                    labels[np.arange(batch_size), y] = 1.0
                    yield (np.array(X1), np.array(X2)), labels
                    yielded = True
                    X1, X2, y = [], [], []
        if not yielded:
            raise ValueError(
                "data_generator: the data does not contain enough samples to fill a single batch"
            )


In [None]:
# Batch size passed to data_generator
bs = 50

In [None]:
# Structure of one generator batch: ((image features, padded caption prefix), one-hot next word).
# The image features are declared as float32: a float16 spec would silently down-cast
# the extracted features and lose precision.
output_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),     # image features
        tf.TensorSpec(shape=(None, max_len), dtype=tf.int32)     # padded caption input
    ),
    tf.TensorSpec(shape=(None, vz), dtype=tf.float32)    # one-hot encoded next word
)


In [None]:
# Sanity check: pull a single batch from the generator and print the SHAPES of its parts
# (the caption line used to print the whole array instead of its shape).
data = next(data_generator(FS_caption, max_len, bs, vz))
print("Image shape:", data[0][0].shape)
print("Caption input shape:", data[0][1].shape)
print("Label shape:", data[1].shape)


In [None]:
# Wrap the Python generator in a tf.data.Dataset; the lambda is the zero-argument callable
# from_generator uses to create the generator, and output_signature describes each batch.
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(FS_caption, max_len, bs,vz),
    output_signature=output_signature
)


In [None]:
# Batches per epoch: (stored caption length - 1) samples per caption, summed and integer-divided by the batch size
steps = sum(len(caption)-1 for _, caption in FS_caption) // bs

In [None]:
# Train for 20 epochs; steps_per_epoch bounds each epoch because data_generator loops forever
model.fit(dataset, epochs=20, steps_per_epoch=steps)


In [None]:
### Saving the model

In [None]:
# Write the trained captioning model to disk
model.save("img_captioning_model.keras")

## Model Evaluation

In [None]:
# Tokenize and pad the validation captions with the existing tokenizer (flag 0 = do not refit).
# NOTE(review): this overwrites `valid` with its padded form, so the cell is not safe to run twice.
max_len_valid, valid = padding(valid,0)

In [None]:
# Batch structure of the validation generator: ((image features, padded caption prefix), one-hot next word).
# Image features are declared as float32 so the extracted features are not down-cast to half precision.
valid_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),     # image features
        tf.TensorSpec(shape=(None, max_len), dtype=tf.int32)     # padded caption input
    ),
    tf.TensorSpec(shape=(None, vz), dtype=tf.float32)    # one-hot encoded next word
)


In [None]:
# Sanity check on one validation batch.
# data_generator requires the number of classes (vz) as its fourth argument; it was missing here.
data = next(data_generator(valid, max_len, bs, vz))
print("Image shape:", data[0][0].shape)
print("Caption input shape:", data[0][1].shape)
print("Label shape:", data[1].shape)


In [None]:
# Validation dataset. data_generator needs vz (number of classes) as its fourth argument;
# without it the generator raises a TypeError as soon as the dataset is iterated.
dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(valid, max_len, bs, vz),
    output_signature=valid_signature
)


In [None]:
# Number of validation batches: (stored caption length - 1) samples per caption, integer-divided by the batch size.
# NOTE(review): this rebinds `steps`, previously the number of training steps per epoch.
steps = sum(len(caption)-1 for _, caption in valid) // bs

In [None]:
# Evaluate on the validation batches.
# return_dict=True avoids the tuple unpacking that fails when the model was compiled
# without metrics (evaluate() then returns a single loss value).
eval_results = model.evaluate(dataset, steps=steps, return_dict=True)
loss = eval_results["loss"]
acc = eval_results.get("accuracy")  # None when no accuracy metric was compiled into the model

In [None]:

# Rebuild the image feature extractor (InceptionV3 input -> pooled output `x`) under its own name,
# since `model` now refers to the captioning network.
model1 = Model(inputs=imageFeatureExt.input, outputs=x)

In [None]:
def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for one image.

    Starting from the start marker, the model is asked for the most likely
    next token until the end marker is produced, an id without a word is
    predicted, or ``max_length`` steps have been taken.

    Args:
        model: Captioning model; ``model.predict([photo, sequence])`` must
            return next-word scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        photo: Image features including the batch dimension.
        max_length: Length the token sequence is right-padded to and the
            maximum number of decoding steps.

    Returns:
        The generated caption without the start and end markers.
    """
    # The end marker may be stored with or without angle brackets, depending on whether
    # the tokenizer filters '<' and '>'. Accept both, otherwise decoding never stops early.
    # NOTE(review): if brackets are stripped, a genuine word "end" is indistinguishable
    # from the marker and also terminates the caption.
    end_tokens = ('end', '<end>')

    token_ids = list(tokenizer.texts_to_sequences(['<start>'])[0])
    words = []
    for _ in range(max_length):
        window = token_ids[-max_length:]
        sequence = np.zeros((1, max_length), dtype=np.int32)  # right-padded with 0
        sequence[0, :len(window)] = window

        yhat = int(np.argmax(model.predict([photo, sequence], verbose=0)))
        word = tokenizer.index_word.get(yhat)
        if word is None or word in end_tokens:
            break
        words.append(word)
        token_ids.append(yhat)

    # Joining the collected words (instead of str.replace('end', '')) leaves words that
    # merely contain "end", such as "friend", intact.
    return ' '.join(words)


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

def display_image_and_caption(image_path, model, tokenizer, max_length, feature_extractor):
    """Show an image with its generated caption as the plot title.

    Args:
        image_path: Path of the image file to caption.
        model: Captioning model handed to ``generate_caption``.
        tokenizer: Tokenizer handed to ``generate_caption``.
        max_length: Maximum caption length handed to ``generate_caption``.
        feature_extractor: Callable that turns ``image_path`` into the input
            batch of the image feature model (``preprocess_image`` in this
            notebook).

    Note:
        The image features are computed with the module-level ``model1``.
    """
    # Context manager so the image file handle is closed once the pixels are drawn.
    with Image.open(image_path) as img:
        plt.imshow(img)
    plt.axis("off")

    batch = feature_extractor(image_path)
    # Keep the first feature vector together with its batch dimension
    # (same result as taking [0] and re-adding the axis).
    photo = model1.predict(batch, verbose=0)[:1]

    # Generate caption
    caption = generate_caption(model, tokenizer, photo, max_length)

    plt.title(caption)
    plt.show()


In [None]:
# Caption a sample image; preprocess_image is passed in as the per-image preprocessing step
display_image_and_caption("test.jpg", model, tokenizer, max_len, preprocess_image)
