In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, AdditiveAttention
from tensorflow.keras.models import Model
import pickle
import numpy as np

# Load precomputed image features from a pickle file in the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load a features file that was produced locally or comes from a trusted
# source.
# The structure of `image_features` is not visible here -- presumably a mapping
# from image id to a feature array; TODO confirm against the cell that wrote it.
with open("features_inception.pkl", "rb") as f:
    image_features = pickle.load(f)

# Define model parameters (read by the model-instantiation cell further down).
embedding_dim = 256  # passed as embed_dim to both CNN_Encoder and RNN_Decoder
units = 512  # passed as the LSTM unit count to RNN_Decoder
vocab_size = 5000  # This should match your tokenizer vocabulary size

In [2]:
# CNN Encoder
class CNN_Encoder(Model):
    """Maps precomputed image features through one ReLU-activated dense layer.

    Args:
        embed_dim: Output width of the dense projection.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # Single trainable projection; no convolutional layers live in this model.
        self.fc = Dense(embed_dim, activation="relu")

    def call(self, x):
        """Return ``relu(fc(x))``; only the last axis of ``x`` is transformed."""
        return self.fc(x)

In [3]:
# RNN Decoder with Attention
class RNN_Decoder(Model):
    """Single-step caption decoder that attends over encoded image features.

    Each call embeds the input token ids, computes an additive-attention
    context vector from the previous hidden state and the image features,
    concatenates context and embedding, and feeds the result through an LSTM
    followed by a vocabulary-sized dense projection.

    Args:
        vocab_size: Number of tokens; sizes the embedding table and the
            output logits.
        embed_dim: Width of the token embeddings.
        units: Width of the LSTM state and of the attention projections.
    """

    def __init__(self, vocab_size, embed_dim, units):
        super(RNN_Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.lstm = LSTM(units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = AdditiveAttention()
        # AdditiveAttention sums query and key element-wise, so both must share
        # the same last dimension. Project the hidden state and the image
        # features into a common `units`-wide space so the layer works even
        # when the feature width differs from the LSTM width.
        self.query_proj = Dense(units)
        self.feature_proj = Dense(units)

    def call(self, x, hidden, features):
        """Run one decoding step.

        Args:
            x: Token ids for the current step; assumed to be (batch, 1) --
                TODO confirm against the training loop.
            hidden: Previous hidden state; assumed to be (batch, units).
            features: Encoded image features; assumed to be
                (batch, num_locations, feature_dim).

        Returns:
            Tuple ``(logits, state_h, state_c)``: the dense projection of the
            LSTM output sequence plus the LSTM's final hidden and cell states.
        """
        # Add a length-1 time axis so the hidden state acts as a single query.
        query = self.query_proj(tf.expand_dims(hidden, 1))
        keys = self.feature_proj(features)

        # The attention output keeps the query's time axis, i.e. it is already
        # (batch, 1, units). Expanding it again would produce a rank-4 tensor
        # that cannot be concatenated with the rank-3 embeddings.
        context_vector = self.attention([query, keys])

        x = self.embedding(x)
        x = tf.concat([context_vector, x], axis=-1)

        # NOTE(review): the LSTM is called without `initial_state`, so it starts
        # from its default state on every call and `hidden` only influences the
        # attention query. Feeding the state back would require the caller to
        # pass the cell state as well -- confirm the intended design.
        output, state_h, state_c = self.lstm(x)
        x = self.fc(output)
        return x, state_h, state_c

# Build the encoder/decoder pair from the hyperparameters defined earlier.
# Keyword arguments make it explicit which setting feeds which constructor slot.
encoder = CNN_Encoder(embed_dim=embedding_dim)
decoder = RNN_Decoder(
    vocab_size=vocab_size,
    embed_dim=embedding_dim,
    units=units,
)

# Only confirms that the objects were constructed; no forward pass has run yet.
print("Model structure defined successfully.")


Model structure defined successfully.
