Importing libraries

In [1]:
import json as js
from collections import defaultdict
import nltk
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from keras import layers, optimizers
from keras.models import Model #type: ignore
from keras.layers import Input, LSTM, Dense, TimeDistributed, Embedding #type: ignore

2025-04-23 21:57:39.918122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-23 21:57:39.929205: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-23 21:57:39.932470: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-23 21:57:39.940812: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading file paths from the config file

In [2]:
# Read the JSON config into `paths`; later cells look file locations up by key.
# The encoding is given explicitly so the read does not depend on the OS locale.
with open('Config_RNN.json', 'r', encoding='utf-8') as file:
    paths = js.load(file)

Loading padded captions

In [3]:
# Load the padded captions from the path stored under "Padded_preprocessed_data".
# The encoding is given explicitly so the read does not depend on the OS locale.
with open(paths["Padded_preprocessed_data"], 'r', encoding='utf-8') as f:
    padded_captions = js.load(f)

Build vocabulary and one-hot encoding

In [4]:
# Vocabulary: every distinct token that appears in any caption.
vocab = {word for cap in padded_captions for word in cap}
# Sorting before enumerating makes the token -> id mapping deterministic across runs.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)

# Integer ids first, one row per caption. This stays rectangular only because the
# captions are padded to a common length.
caption_ids = np.array(
    [[word2idx[word] for word in cap] for cap in padded_captions],
    dtype=np.intp,
)

# One-hot encode directly in NumPy instead of materialising nested Python lists:
# allocate zeros with a trailing vocab axis, then write a 1 at each token's id.
# dtype=int is the same default integer type the list-based version produced.
one_hot_captions = np.zeros(caption_ids.shape + (vocab_size,), dtype=int)
np.put_along_axis(one_hot_captions, caption_ids[..., None], 1, axis=-1)

Embeddings

In [5]:
# Train Word2Vec on the captions themselves. min_count=1 keeps every token, so each
# word looked up below is guaranteed to have a vector.
w2v_model = Word2Vec(sentences=padded_captions, vector_size=100, window=5, min_count=1, workers=4)
# Backward-compatible alias: `model` is rebound to the Keras network in the next
# cell, so use `w2v_model` when the embeddings are needed later.
model = w2v_model

# Replace every token with its embedding vector, one list of vectors per caption.
w2v_captions = [[w2v_model.wv[word] for word in cap] for cap in padded_captions]

Model

In [6]:
# Encoder-decoder LSTM: encodes an input sequence and emits a fixed-length output sequence.
input_seq_len = 158   # time steps fed to the encoder
output_seq_len = 30   # time steps produced by the decoder
# NOTE(review): vector_dim equals input_seq_len (158), while the Word2Vec vectors built
# earlier use vector_size=100 — confirm which feature width this model should consume.
vector_dim = 158
hidden_units = 256 
encoder_inputs = Input(shape=(input_seq_len, vector_dim))
encoder = LSTM(hidden_units, return_sequences=True, return_state=True)
# encoder_outputs (the per-step outputs) is not used below; only the final states are.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Repeat the encoder's final hidden state once per output step to form the decoder input.
decoder_inputs = tf.keras.layers.RepeatVector(output_seq_len)(state_h)
decoder_lstm = LSTM(hidden_units, return_sequences=True)
# The decoder starts from the encoder's final hidden and cell states.
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=[state_h, state_c])
# Project each decoder step to a vector_dim-wide vector (Dense with no activation).
decoder_dense = TimeDistributed(Dense(vector_dim))
final_outputs = decoder_dense(decoder_outputs)
# NOTE: this rebinds `model`, which held the Word2Vec model in the previous cell.
model = Model(inputs=encoder_inputs, outputs=final_outputs)
# Trained as a regression: mean squared error between predicted and target vectors.
model.compile(optimizer='adam', loss='mse') 

I0000 00:00:1745425974.514704   23644 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745425974.561864   23644 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745425974.571823   23644 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745425974.578587   23644 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Model Training

In [None]:
import os
from keras.layers import TextVectorization
# ========== SETUP ==========
# NOTE: this rebinds vocab_size (set to len(word2idx) in the one-hot cell) to the
# maximum number of tokens the TextVectorization layer will keep.
vocab_size = 10000
seq_length = 30
embedding_dim = 100
units = 512
BATCH_SIZE = 64
EPOCHS = 10
# Sample input: padded_captions = [{'output': 'a man riding a bike'}, ...]
# Sample input: w2v_captions = [np.array([...]), np.array([...]), ...]


def _caption_text(item):
    """Return one caption as a single whitespace-separated string.

    Earlier cells iterate `padded_captions` as sequences of tokens, while the
    sample above shows dicts with an 'output' key. Indexing a token list with
    ['output'] raises TypeError, so all three shapes are accepted here.
    NOTE(review): confirm which shape the JSON file really has; for token lists
    any padding tokens are joined into the text as ordinary words.
    """
    if isinstance(item, dict):
        return item['output']
    if isinstance(item, str):
        return item
    return " ".join(item)


# ========== TEXT VECTORIZATION ==========
vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    pad_to_max_tokens=True
)
# NOTE(review): 'lower_and_strip_punctuation' presumably strips the angle brackets
# too, so the markers would be learned as 'start' / 'end' — confirm this is intended.
all_captions = ["<start> " + _caption_text(item) + " <end>" for item in padded_captions]
vectorizer.adapt(all_captions)

# One feature vector per caption: the mean of that caption's word vectors.
input_tensor = np.array([np.mean(cap, axis=0) for cap in w2v_captions])
# Integer token ids, padded or truncated to seq_length by the vectorizer.
target_tensor = vectorizer(tf.constant(all_captions)).numpy()

# ========== DATA SPLIT ==========
# First 80% of the examples train, the rest validate; the split keeps the original
# order (only the training batches are shuffled afterwards).
split_index = int(0.8 * len(input_tensor))
train_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor[:split_index], target_tensor[:split_index])
).shuffle(1000).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor[split_index:], target_tensor[split_index:])
).batch(BATCH_SIZE)

# ========== MODEL SETUP ==========
import tensorflow as tf
from keras import layers
import numpy as np
from tqdm import tqdm

# Define the Decoder
class RNN_Decoder(tf.keras.Model):
    """GRU decoder that predicts token logits conditioned on one feature vector per example."""

    def __init__(self, embedding_dim, units, vocab_size):
        """Create the embedding, the GRU and the two dense layers.

        Args:
            embedding_dim: width of the token embedding.
            units: GRU state size, also the width of the first dense layer.
            vocab_size: number of token ids; sets the embedding table size and
                the width of the output logits.
        """
        super().__init__()
        self.units = units
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = layers.Dense(units)
        self.fc2 = layers.Dense(vocab_size)

    def call(self, x, features, hidden):
        """Decode a whole token sequence in one pass.

        Args:
            x: integer token ids; axis 1 is treated as the time axis.
            features: one feature vector per example; it is copied to every step.
            hidden: initial GRU state.

        Returns:
            (logits, final GRU state, None). The third slot is always None.
        """
        token_vectors = self.embedding(x)
        steps = tf.shape(token_vectors)[1]
        # Add a time axis to the features and repeat them for every step.
        per_step_features = tf.tile(tf.expand_dims(features, 1), [1, steps, 1])
        gru_input = tf.concat([per_step_features, token_vectors], axis=-1)
        output, state = self.gru(gru_input, initial_state=hidden)
        logits = self.fc2(self.fc1(output))
        return logits, state, None

    def reset_state(self, batch_size):
        """Return an all-zero GRU state of shape (batch_size, units)."""
        return tf.zeros(shape=(batch_size, self.units))

# Loss (the optimizer is supplied by the caller of train_step)
# Per-token cross-entropy on raw logits. reduction='none' keeps one value per
# position so that loss_function can exclude padding before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Mean cross-entropy over the non-padding positions of a batch.

    Args:
        real: integer target token ids; positions equal to 0 are masked out.
        pred: logits for the same positions, with a trailing vocabulary axis.

    Returns:
        Scalar tensor: the summed per-token loss divided by the number of
        unmasked tokens (0 when every position is masked).
    """
    mask = tf.math.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # Divide by the number of real tokens rather than by every position:
    # tf.reduce_mean would also count the zeroed padding slots, so the loss would
    # shrink as padding grows. divide_no_nan guards an all-padding batch.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

@tf.function
def train_step(img_tensor, target, decoder, optimizer):
    """Run one teacher-forced optimisation step and return the batch loss.

    Args:
        img_tensor: per-example feature vectors passed to the decoder.
        target: token ids; all but the last column are fed in, all but the first
            column are the expected predictions.
        decoder: model called as decoder(tokens, features, hidden), also
            providing reset_state(batch_size) and trainable_variables.
        optimizer: optimizer used to apply the gradients.

    Returns:
        The scalar loss computed by loss_function for this batch.
    """
    # Teacher forcing: the inputs are the targets shifted right by one step.
    decoder_in, expected = target[:, :-1], target[:, 1:]
    initial_hidden = decoder.reset_state(tf.shape(img_tensor)[0])

    with tf.GradientTape() as tape:
        logits, _, _ = decoder(decoder_in, img_tensor, initial_hidden)
        loss = loss_function(expected, logits)

    weights = decoder.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return loss

@tf.function
def validation_step(img_tensor, target, decoder):
    """Compute the teacher-forced loss for one batch without updating any weights.

    Args:
        img_tensor: per-example feature vectors passed to the decoder.
        target: token ids; all but the last column are fed in, all but the first
            column are the expected predictions.
        decoder: model called as decoder(tokens, features, hidden), also
            providing reset_state(batch_size).

    Returns:
        The scalar loss computed by loss_function for this batch.
    """
    # Same shift-by-one split of the targets as in training.
    decoder_in, expected = target[:, :-1], target[:, 1:]
    initial_hidden = decoder.reset_state(tf.shape(img_tensor)[0])

    logits, _, _ = decoder(decoder_in, img_tensor, initial_hidden)
    return loss_function(expected, logits)

def train_model(train_dataset, val_dataset, decoder, optimizer, epochs, save_path):
    """Train the decoder and keep the weights of the best validation epoch.

    Args:
        train_dataset: iterable of (features, target) batches used for training.
        val_dataset: iterable of (features, target) batches used for validation.
        decoder: model passed to train_step / validation_step; its weights are
            written with decoder.save_weights(save_path).
        optimizer: optimizer passed to train_step.
        epochs: number of passes over train_dataset.
        save_path: destination for the best weights.
            NOTE(review): recent Keras versions require weight files to end in
            '.weights.h5' — confirm save_path matches the installed version.

    Returns:
        The lowest mean validation loss seen (float('inf') if epochs is 0).

    Raises:
        ValueError: if either dataset yields no batches in an epoch.
    """
    best_val_loss = float('inf')

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        # Count batches while iterating instead of calling len(dataset): len()
        # fails for datasets whose cardinality is unknown.
        total_loss = 0
        train_batches = 0
        for img_tensor, target in tqdm(train_dataset, desc="Training"):
            total_loss += train_step(img_tensor, target, decoder, optimizer)
            train_batches += 1

        if train_batches == 0:
            raise ValueError("train_dataset yielded no batches")
        print(f"Training Loss: {float(total_loss / train_batches):.4f}")

        total_val_loss = 0
        val_batches = 0
        for img_tensor, target in tqdm(val_dataset, desc="Validating"):
            total_val_loss += validation_step(img_tensor, target, decoder)
            val_batches += 1

        if val_batches == 0:
            raise ValueError("val_dataset yielded no batches")
        # Convert to a plain float so the comparison and the returned value do not
        # depend on tensor semantics.
        val_loss = float(total_val_loss / val_batches)
        print(f"Validation Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            print("Validation loss improved. Saving model.")
            decoder.save_weights(save_path)
            best_val_loss = val_loss
        else:
            print("Validation loss did not improve.")

    return best_val_loss