In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Add, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from tqdm import tqdm
import os
import time 


In [3]:
# Load preprocessed data
with open("E:/ICS/word_mappings.pkl", "rb") as f:
    word_mappings = pickle.load(f)
    word2idx = word_mappings['word2idx']
    idx2word = word_mappings['idx2word']
    max_length = word_mappings['max_length']

with open("E:/ICS/features/train_features.pkl", "rb") as f:
    train_features = pickle.load(f)

with open("E:/ICS/features/val_features.pkl", "rb") as f:
    val_features = pickle.load(f)

with open("E:/ICS/cleaned_captions.pkl", "rb") as f:
    cleaned_captions = pickle.load(f)

# Utility: Convert words to sequence
def caption_to_seq(caption):
    return [word2idx.get(w, word2idx['<unk>']) for w in caption.split()]

In [5]:
# Prepare training and validation data
train_data = []
val_data = []

for img, features in train_features.items():
    for cap in cleaned_captions.get(img, []):
        seq = caption_to_seq(cap)
        train_data.append((features, seq))

for img, features in val_features.items():
    for cap in cleaned_captions.get(img, []):
        seq = caption_to_seq(cap)
        val_data.append((features, seq))

# Pad sequences
def create_dataset(data, batch_size):
    img_feats = []
    cap_seqs = []
    for feature, seq in data:
        img_feats.append(feature)
        cap_seqs.append(seq)
    cap_seqs = pad_sequences(cap_seqs, maxlen=max_length, padding='post')
    return tf.data.Dataset.from_tensor_slices((np.array(img_feats), cap_seqs)).shuffle(1000).batch(batch_size)

batch_size = 64
train_dataset = create_dataset(train_data, batch_size)
val_dataset = create_dataset(val_data, batch_size)


In [6]:
# Define the Encoder
def build_encoder(embedding_dim):
    inputs = Input(shape=(2048,), name='image_input')
    fc = Dense(embedding_dim, activation='relu', name='encoder_fc')(inputs)
    return Model(inputs, fc, name='CNN_Encoder')

# Define the Decoder with Attention
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, features, hidden):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context = attention_weights * features
        context = tf.reduce_sum(context, axis=1)
        return context, attention_weights

class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super().__init__()
        self.units = units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(units, return_sequences=True, return_state=True)
        self.fc1 = Dense(units)
        self.fc2 = Dense(vocab_size)
        self.attention = BahdanauAttention(units)

    def call(self, x, features, hidden):
        context_vector, _ = self.attention(tf.expand_dims(features, 1), hidden)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, _ = self.lstm(x)
        x = self.fc1(output)
        x = self.fc2(x)
        return x, state_h

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [7]:
# Initialize model
embedding_dim = 256
units = 512
vocab_size = len(word2idx)

encoder = build_encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

optimizer = Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss.dtype)
    return tf.reduce_mean(loss * mask)

In [8]:
# Training Step
def train_step(img_tensor, target):
    loss = 0
    hidden = decoder.reset_state(batch_size=target.shape[0])

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)
        for i in range(1, target.shape[1]):
            dec_input = tf.expand_dims(target[:, i - 1], 1)
            predictions, hidden = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions[:, 0, :])
            train_accuracy.update_state(target[:, i], predictions[:, 0, :])

    total_loss = loss / int(target.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return total_loss

In [9]:
# Training Loop
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

epochs = 20
best_val_loss = float('inf')
patience = 5
wait = 0

for epoch in range(epochs):
    start = time.time()
    total_loss = 0
    total_val_loss = 0
    print(f"\nEpoch {epoch+1}/{epochs}")

    train_loop = tqdm(train_dataset, desc="Training", ncols=100)
    for (batch, (img_tensor, target)) in enumerate(train_loop):
        batch_loss = train_step(img_tensor, target)
        total_loss += batch_loss
        train_loop.set_postfix(loss=batch_loss.numpy())

    for img_tensor, target in val_dataset:
        hidden = decoder.reset_state(batch_size=target.shape[0])
        features = encoder(img_tensor)
        loss = 0
        for i in range(1, target.shape[1]):
            dec_input = tf.expand_dims(target[:, i - 1], 1)
            predictions, hidden = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions[:, 0, :])
            val_accuracy.update_state(target[:, i], predictions[:, 0, :])
        total_val_loss += loss / int(target.shape[1])

    avg_train_loss = total_loss / len(train_dataset)
    avg_val_loss = total_val_loss / len(val_dataset)
    train_acc = train_accuracy.result()
    val_acc = val_accuracy.result()

    print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_acc:.4f}, Time: {time.time()-start:.2f}s")

    train_accuracy.reset_state()
    val_accuracy.reset_state()

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        wait = 0
        encoder.save("E:/ICS/saved_models/encoder_best.keras")
        decoder.save("E:/ICS/saved_models/decoder_best.keras")
        print(" Best model saved!")
    else:
        wait += 1
        if wait >= patience:
            print("\nEarly stopping triggered.")
            break

print("\n Training complete. Final model saved.")



Epoch 1/20


Training: 100%|██████████████████████████████████████| 455/455 [1:20:50<00:00, 10.66s/it, loss=1.15]


Epoch 1, Loss: 1.3718, Accuracy: 0.0779, Val Loss: 1.2121, Val Accuracy: 0.0989, Time: 4994.78s
 Best model saved!

Epoch 2/20


Training: 100%|██████████████████████████████████████| 455/455 [1:21:39<00:00, 10.77s/it, loss=1.01]


Epoch 2, Loss: 1.0923, Accuracy: 0.1046, Val Loss: 1.1248, Val Accuracy: 0.1048, Time: 5045.57s
 Best model saved!

Epoch 3/20


Training: 100%|██████████████████████████████████████| 455/455 [1:19:28<00:00, 10.48s/it, loss=1.04]


Epoch 3, Loss: 1.0013, Accuracy: 0.1110, Val Loss: 1.1000, Val Accuracy: 0.1066, Time: 4912.32s
 Best model saved!

Epoch 4/20


Training: 100%|█████████████████████████████████████| 455/455 [1:20:41<00:00, 10.64s/it, loss=0.938]


Epoch 4, Loss: 0.9446, Accuracy: 0.1151, Val Loss: 1.1008, Val Accuracy: 0.1059, Time: 4985.17s

Epoch 5/20


Training: 100%|█████████████████████████████████████| 455/455 [1:20:03<00:00, 10.56s/it, loss=0.817]


Epoch 5, Loss: 0.8997, Accuracy: 0.1188, Val Loss: 1.1064, Val Accuracy: 0.1073, Time: 4946.77s

Epoch 6/20


Training: 100%|█████████████████████████████████████| 455/455 [1:21:38<00:00, 10.77s/it, loss=0.817]


Epoch 6, Loss: 0.8614, Accuracy: 0.1223, Val Loss: 1.1154, Val Accuracy: 0.1068, Time: 5042.15s

Epoch 7/20


Training: 100%|█████████████████████████████████████| 455/455 [1:18:39<00:00, 10.37s/it, loss=0.873]


Epoch 7, Loss: 0.8254, Accuracy: 0.1258, Val Loss: 1.1283, Val Accuracy: 0.1067, Time: 4859.65s

Epoch 8/20


Training: 100%|█████████████████████████████████████| 455/455 [1:23:29<00:00, 11.01s/it, loss=0.835]


Epoch 8, Loss: 0.7919, Accuracy: 0.1294, Val Loss: 1.1408, Val Accuracy: 0.1055, Time: 5149.90s

Early stopping triggered.

 Training complete. Final model saved.
