importing libraries

In [1]:
import json as js
from collections import defaultdict
import nltk
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from keras import layers, optimizers
from keras.models import Model #type: ignore
from keras.layers import Input, LSTM, Dense, TimeDistributed, Embedding #type: ignore
from keras.models import Sequential

loading paths

In [2]:
with open('Config_RNN.json','r') as file:
    paths = js.load(file)

loading padded captions

In [3]:
with open(paths["Padded_preprocessed_data"],'r') as f:
    padded_captions = js.load(f)

Build vocabulary and one hot encoding

In [11]:
# Build vocabulary from padded_captions
vocab = set(word for cap in padded_captions for word in cap)
word2idx = {word: idx + 1 for idx, word in enumerate(sorted(vocab))}  # +1 to reserve 0 for padding
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx) + 1  # +1 to include padding index

# Convert words to integer IDs (instead of one-hot directly)
integer_encoded_captions = []
for cap in padded_captions:
    encoded = [word2idx[word] for word in cap]
    integer_encoded_captions.append(encoded)

# Convert to numpy array
integer_encoded_captions = np.array(integer_encoded_captions)


Split into input/output sequence

In [12]:
input_seq_len = 158
output_seq_len = 30

input_tensor = integer_encoded_captions[:, :input_seq_len]
target_tensor = integer_encoded_captions[:, input_seq_len:input_seq_len + output_seq_len]


Embeddings

In [13]:
model = Word2Vec(sentences=padded_captions, vector_size=100, window=5, min_count=1, workers=4)
w2v_captions = []

for cap in padded_captions:
    encoder_inputs = Input(shape=(input_seq_len,))
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=158, mask_zero=True)(encoder_inputs)

    w2v_captions.append(encoded)

KeyboardInterrupt: 

Model

In [6]:
input_seq_len = 158
output_seq_len = 30
vector_dim = 158
hidden_units = 256 
encoder_inputs = Input(shape=(input_seq_len, vector_dim))
encoder = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
decoder_inputs = tf.keras.layers.RepeatVector(output_seq_len)(state_h)
decoder_lstm = LSTM(hidden_units, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=[state_h, state_c])
decoder_dense = TimeDistributed(Dense(vector_dim))
final_outputs = decoder_dense(decoder_outputs)
model = Model(inputs=encoder_inputs, outputs=final_outputs)
model.compile(optimizer='adam', loss='mse') 

Model Training

In [7]:
# Parameters
vocab_size = 10000
seq_length = 30
units = 512
BATCH_SIZE = 64
EPOCHS = 10
input_seq_len = 158
output_seq_len = 30
vector_dim = 158
hidden_units = 256  # You can change as per your model size

# ========== TEXT VECTORIZATION ==========
vectorizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    pad_to_max_tokens=True
)
all_captions = ["<start> " + " ".join(cap) + " <end>" for cap in padded_captions]

vectorizer.adapt(all_captions)
# input_tensor = np.array([np.mean(cap, axis=0) for cap in w2v_captions])
w2v_captions = np.array(w2v_captions)
input_tensor = np.array(w2v_captions[:, :input_seq_len])  # shape: (N, 158, 158)
target_tensor = np.array(w2v_captions[:, input_seq_len:input_seq_len + output_seq_len])  # shape: (N, 30, 158)

# ========== DATA SPLIT ==========
split_index = int(0.8 * len(input_tensor))
train_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor[:split_index], target_tensor[:split_index])
).shuffle(1000).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor[split_index:], target_tensor[split_index:])
).batch(BATCH_SIZE)

print("input_tensor shape:", input_tensor.shape)  # should be (N, 158, 158)
print("target_tensor shape:", target_tensor.shape)  # should be (N, 30)

# Input layer: sequence of 158 vectors of size 158


input_tensor shape: (591753, 2, 100)
target_tensor shape: (591753, 0, 100)


In [8]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=158, input_length=input_seq_len),
    LSTM(hidden_units, return_sequences=False),
    layers.RepeatVector(output_seq_len),
    LSTM(hidden_units, return_sequences=True),
    TimeDistributed(Dense(vector_dim))
])


# encoder_inputs = Input(shape=(input_seq_len, vector_dim))

# # Encoder LSTM
# encoder = LSTM(hidden_units, return_sequences=True, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# # We only need the final states to initialize the decoder
# decoder_inputs = layers.RepeatVector(output_seq_len)(state_h)

# # Decoder LSTM
# decoder_lstm = LSTM(hidden_units, return_sequences=True)
# decoder_outputs = decoder_lstm(decoder_inputs, initial_state=[state_h, state_c])

# # Output projection: make sure each time step outputs a 158-dim vector
# decoder_dense = TimeDistributed(Dense(vector_dim))
# final_outputs = decoder_dense(decoder_outputs)

# Build model
# model = Model(inputs=encoder_inputs, outputs=final_outputs)
model.compile(optimizer='adam', loss='mse')  # or 'categorical_crossentropy' if doing classification
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 158, 158)          1580000   
                                                                 
 lstm_2 (LSTM)               (None, 256)               424960    
                                                                 
 repeat_vector_1 (RepeatVect  (None, 30, 256)          0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 30, 256)           525312    
                                                                 
 time_distributed_1 (TimeDis  (None, 30, 158)          40606     
 tributed)                                                       
                                                                 
Total params: 2,570,878
Trainable params: 2,570,878
Non-

In [9]:
train_dataset

<BatchDataset element_spec=(TensorSpec(shape=(None, 2, 100), dtype=tf.float32, name=None), TensorSpec(shape=(None, 0, 100), dtype=tf.float32, name=None))>

In [10]:
model.fit(train_dataset, validation_data=val_dataset, epochs=10)

Epoch 1/10


ValueError: in user code:

    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\YADAV_hmzx8cu\anaconda3\envs\deep_learning\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 158), found shape=(None, 2, 100)


In [None]:
# import os
# from keras import layers
# # ========== SETUP ==========
# vocab_size = 10000
# seq_length = 30
# embedding_dim = 100
# units = 512
# BATCH_SIZE = 64
# EPOCHS = 10
# # Sample input: padded_captions = [{'output': 'a man riding a bike'}, ...]
# # Sample input: w2v_captions = [np.array([...]), np.array([...]), ...]
# # ========== TEXT VECTORIZATION ==========
# vectorizer = layers.TextVectorization(
#     max_tokens=vocab_size,
#     output_mode='int',
#     output_sequence_length=seq_length,
#     standardize='lower_and_strip_punctuation',
#     split='whitespace',
#     pad_to_max_tokens=True
# )
# all_captions = ["<start> " + item['output'] + " <end>" for item in padded_captions]
# vectorizer.adapt(all_captions)

# input_tensor = np.array([np.mean(cap, axis=0) for cap in w2v_captions])
# target_tensor = vectorizer(tf.constant(all_captions)).numpy()

# # ========== DATA SPLIT ==========
# split_index = int(0.8 * len(input_tensor))
# train_dataset = tf.data.Dataset.from_tensor_slices(
#     (input_tensor[:split_index], target_tensor[:split_index])
# ).shuffle(1000).batch(BATCH_SIZE)

# val_dataset = tf.data.Dataset.from_tensor_slices(
#     (input_tensor[split_index:], target_tensor[split_index:])
# ).batch(BATCH_SIZE)

# # ========== MODEL SETUP ==========
# import tensorflow as tf
# from keras import layers, optimizers
# import numpy as np
# from tqdm import tqdm
# # Define the Decoder
# class RNN_Decoder(tf.keras.Model):
#     def __init__(self, embedding_dim, units, vocab_size):
#         super(RNN_Decoder, self).__init__()
#         self.units = units
#         self.embedding = layers.Embedding(vocab_size, embedding_dim)
#         self.gru = layers.GRU(self.units,
#                               return_sequences=True,
#                               return_state=True,
#                               recurrent_initializer='glorot_uniform')
#         self.fc1 = layers.Dense(self.units)
#         self.fc2 = layers.Dense(vocab_size)

#     def call(self, x, features, hidden):
#         x = self.embedding(x)
#         features = tf.expand_dims(features, 1)
#         features = tf.tile(features, [1, tf.shape(x)[1], 1])  # Match sequence length
#         x = tf.concat([features, x], axis=-1)
#         output, state = self.gru(x, initial_state=hidden)
#         x = self.fc1(output)
#         x = self.fc2(x)
#         return x, state, None

#     def reset_state(self, batch_size):
#         return tf.zeros((batch_size, self.units))

# # Loss and optimizer
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# def loss_function(real, pred):
#     mask = tf.math.not_equal(real, 0)
#     loss_ = loss_object(real, pred)
#     mask = tf.cast(mask, dtype=loss_.dtype)
#     loss_ *= mask
#     return tf.reduce_mean(loss_)

# @tf.function
# def train_step(img_tensor, target, decoder, optimizer):
#     loss = 0
#     batch_size = tf.shape(img_tensor)[0]
#     hidden = decoder.reset_state(batch_size)
#     dec_input = target[:, :-1]
#     real = target[:, 1:]

#     with tf.GradientTape() as tape:
#         predictions, _, _ = decoder(dec_input, img_tensor, hidden)
#         loss = loss_function(real, predictions)

#     trainable_variables = decoder.trainable_variables
#     gradients = tape.gradient(loss, trainable_variables)
#     optimizer.apply_gradients(zip(gradients, trainable_variables))

#     return loss

# @tf.function
# def validation_step(img_tensor, target, decoder):
#     loss = 0
#     batch_size = tf.shape(img_tensor)[0]
#     hidden = decoder.reset_state(batch_size)
#     dec_input = target[:, :-1]
#     real = target[:, 1:]

#     predictions, _, _ = decoder(dec_input, img_tensor, hidden)
#     loss = loss_function(real, predictions)

#     return loss

# def train_model(train_dataset, val_dataset, decoder, optimizer, epochs, save_path):
#     best_val_loss = float('inf')

#     for epoch in range(epochs):
#         print(f"\nEpoch {epoch+1}/{epochs}")
#         total_loss = 0
#         for img_tensor, target in tqdm(train_dataset, desc="Training"):
#             batch_loss = train_step(img_tensor, target, decoder, optimizer)
#             total_loss += batch_loss

#         print(f"Training Loss: {total_loss/len(train_dataset):.4f}")

#         total_val_loss = 0
#         for img_tensor, target in tqdm(val_dataset, desc="Validating"):
#             batch_val_loss = validation_step(img_tensor, target, decoder)
#             total_val_loss += batch_val_loss

#         val_loss = total_val_loss / len(val_dataset)
#         print(f"Validation Loss: {val_loss:.4f}")

#         if val_loss < best_val_loss:
#             print("Validation loss improved. Saving model.")
#             decoder.save_weights(save_path)
#             best_val_loss = val_loss
#         else:
#             print("Validation loss did not improve.")

In [None]:
# rnn = RNN_Decoder(embedding_dim, units, vocab_size)
# train_model(train_dataset, val_dataset, rnn, optimizers.Adam(), 10, paths["Trained_model_RNN"])