Importing libraries

In [2]:
import json as js
from collections import defaultdict
import nltk
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from keras import layers, optimizers
from keras.models import Model #type: ignore
from keras.layers import Input, LSTM, Dense, TimeDistributed, Embedding #type: ignore

loading paths

In [3]:
with open('Config_RNN.json','r') as file:
    paths = js.load(file)

Loading initial train data

In [3]:
with open(paths["Caption_Train"],'r') as file:
    train_data = js.load(file)

Extracting imp data

In [4]:
newdict = {}
newdict["img_data"] = train_data["images"]
newdict["annotations_data"] = train_data["annotations"]

Creating Corpus

In [5]:
img_data = newdict["img_data"]
annotations_data = newdict["annotations_data"]
captions_dict = defaultdict(list)

for ann in annotations_data:
    captions_dict[ann["image_id"]].append(ann["caption"])

image_caption_data = {}
for img in img_data:
    file_id = img["file_name"]
    img_id = img["id"]
    image_caption_data[file_id] = captions_dict[img_id]

with open(paths["Corpus"], "w") as f:
    js.dump(image_caption_data, f, indent=4)


loading corpus

In [6]:
with open(paths["Corpus"], "r") as f:
    Corpus_data = js.load(f)

Loading nltk dependency

In [7]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rakshit2001/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/rakshit2001/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

Text Tokenization

In [8]:
pos_tagged_captions = {}
noun_captions = {}

for item in tqdm(Corpus_data):
    tagged = []
    for caption in Corpus_data[item]:
        tokens = nltk.word_tokenize(caption)
        pos_tags = nltk.pos_tag(tokens)
        tagged.append(pos_tags)
    pos_tagged_captions[item] = tagged

for image_id, tagged_captions in tqdm(pos_tagged_captions.items()):
    noun_lists = []
    for tagged in tagged_captions:
        nouns = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS')]
        noun_lists.append(nouns)
    noun_captions[image_id] = noun_lists

100%|██████████| 118287/118287 [04:30<00:00, 437.29it/s]
100%|██████████| 118287/118287 [00:02<00:00, 54733.55it/s]


storing noune tags

In [9]:
with open(paths['Nouned_Corpus'],'w') as f:
    js.dump(noun_captions,f)

In [10]:
with open(paths["Corpus"], 'r') as f1:
    captions_data = js.load(f1)

with open(paths['Nouned_Corpus'], 'r') as f2:
    features_data = js.load(f2)

dataset = []

for image_id,nouns in tqdm(features_data.items()): 
    noun_caption = zip(nouns, captions_data[image_id])
    for noun, caption in noun_caption:
        dataset.append({
            "input": " ".join(noun),
            "output": caption
        })

with open(paths["Preprocessed_data"], "w") as out_file:
    js.dump(dataset, out_file, indent=2)

100%|██████████| 118287/118287 [00:01<00:00, 89170.10it/s]


Padding

In [11]:
with open(paths["Preprocessed_data"], 'r') as f:
    data = js.load(f)

max_length = max(len(item['input']) for item in data)

for item in data:
    item['input'] = item['input'].ljust(max_length)

with open(paths["Padded_preprocessed_data"], 'w') as f:
    js.dump(data, f, indent=4)

print("All captions padded to length:", max_length)

All captions padded to length: 158


Loading padded captions

In [4]:
with open(paths["Padded_preprocessed_data"],'r') as f:
    padded_captions = js.load(f)

Build vocabulary and one hot encoding

In [5]:
vocab = set(word for cap in padded_captions for word in cap)
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)
one_hot_captions = []

for cap in padded_captions:
    encoded = []

    for word in cap:
        one_hot = [0] * vocab_size
        one_hot[word2idx[word]] = 1
        encoded.append(one_hot)

    one_hot_captions.append(encoded)
    
one_hot_captions = np.array(one_hot_captions)

Embeddings

In [6]:
model = Word2Vec(sentences=padded_captions, vector_size=100, window=5, min_count=1, workers=4)
w2v_captions = []

for cap in padded_captions:
    encoded = [model.wv[word] for word in cap]
    w2v_captions.append(encoded)

Model

In [8]:
input_seq_len = 158
output_seq_len = 30
vector_dim = 158
hidden_units = 256 
encoder_inputs = Input(shape=(input_seq_len, vector_dim))
encoder = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
decoder_inputs = tf.keras.layers.RepeatVector(output_seq_len)(state_h)
decoder_lstm = LSTM(hidden_units, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=[state_h, state_c])
decoder_dense = TimeDistributed(Dense(vector_dim))
final_outputs = decoder_dense(decoder_outputs)
model = Model(inputs=encoder_inputs, outputs=final_outputs)
model.compile(optimizer='adam', loss='mse') 

Model Training

In [9]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(self.units,
                                return_sequences=True,
                                return_state=True,
                                recurrent_initializer='glorot_uniform')
        self.fc = Dense(vocab_size)

    def call(self, x, features, hidden):
        x = self.embedding(x)  
        features = tf.expand_dims(features, 1)  
        features = tf.tile(features, [1, tf.shape(x)[1], 1])  
        x = tf.concat([features, x], axis=-1) 
        output, state_h, state_c = self.lstm(x, initial_state=hidden)
        x = self.fc(output)
        return x, state_h, state_c

    def reset_state(self, batch_size):
        return [tf.zeros((batch_size, self.units)), tf.zeros((batch_size, self.units))]

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

checkpoint_path = paths["Trained_model_RNN"]

def train_model(dataset, decoder, optimizer, epochs=10):
    best_loss = float('inf')
    loss_metric = tf.keras.metrics.Mean()

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        loss_metric.reset_state()

        for step, (img_tensor, target) in enumerate(dataset):
            batch_loss = train_step(img_tensor, target, decoder, optimizer)
            loss_metric.update_state(batch_loss)

            if step % 10 == 0:
                print(f"Step {step}/{len(dataset)} - loss: {batch_loss:.4f}")

        epoch_loss = loss_metric.result().numpy()
        print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            decoder.save_weights(checkpoint_path)
            print(f"Best model saved with loss: {best_loss:.4f}")


In [10]:
vocab_size = 10000
seq_length = 30

vectorizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    pad_to_max_tokens=True
)

all_captions = ["<start> " + item['output'] + " <end>" for item in padded_captions]
vectorizer.adapt(all_captions)
input_tensor = np.array([np.mean(cap, axis=0) for cap in w2v_captions])
target_tensor = vectorizer(tf.constant(all_captions)).numpy()
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
train_dataset = train_dataset.shuffle(1000).batch(BATCH_SIZE)
embedding_dim = 100
units = 512
vocab_size = vectorizer.vocabulary_size()
decoder = RNN_Decoder(embedding_dim, units, vocab_size)
optimizer = tf.keras.optimizers.Adam()
start_token = vectorizer('<start>').numpy()[0]

def loss_function(real, pred):
    loss_ = loss_object(real, pred)
    mask = tf.math.not_equal(real, 0)
    mask = tf.cast(mask, loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


@tf.function
def train_step(img_tensor, target, decoder, optimizer):
    loss = 0
    with tf.GradientTape() as tape:
        dec_input = target[:, :-1]   
        real_output = target[:, 1:]  
        predictions, _, _ = decoder(dec_input, img_tensor, decoder.reset_state(tf.shape(img_tensor)[0]))
        loss = loss_function(real_output, predictions)
    gradients = tape.gradient(loss, decoder.trainable_variables)
    optimizer.apply_gradients(zip(gradients, decoder.trainable_variables))
    return loss

def train_model(dataset, decoder, optimizer, epochs=10):
    for epoch in range(epochs):
        total_loss = 0
        for (batch, (img_tensor, target)) in enumerate(tqdm(dataset)):
            batch_loss = train_step(img_tensor, target, decoder, optimizer)
            total_loss += batch_loss

        print(f'Epoch {epoch+1}, Loss: {total_loss / (batch + 1):.4f}')


train_model(train_dataset, decoder, optimizer, epochs=10)

  0%|          | 0/9247 [00:00<?, ?it/s]2025-04-23 16:55:37.483498: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90101
100%|█████████▉| 9245/9247 [05:21<00:00, 28.90it/s]2025-04-23 17:00:58.087644: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
100%|██████████| 9247/9247 [05:21<00:00, 28.74it/s]


Epoch 1, Loss: 1.3176


100%|█████████▉| 9245/9247 [05:21<00:00, 28.89it/s]2025-04-23 17:06:19.523000: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
100%|██████████| 9247/9247 [05:21<00:00, 28.77it/s]


Epoch 2, Loss: 1.1203


100%|██████████| 9247/9247 [05:21<00:00, 28.74it/s]


Epoch 3, Loss: 1.0724


100%|██████████| 9247/9247 [05:21<00:00, 28.69it/s]2025-04-23 17:17:02.892447: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
100%|██████████| 9247/9247 [05:21<00:00, 28.75it/s]


Epoch 4, Loss: 1.0441


100%|██████████| 9247/9247 [05:20<00:00, 28.89it/s]


Epoch 5, Loss: 1.0236


 84%|████████▎ | 7743/9247 [04:28<00:52, 28.83it/s]


KeyboardInterrupt: 