### Lab 30: Attention Model for Image Captioning

Objective: Implement an image captioning system with Bahdanau attention



In [None]:

!pip install tensorflow pillow tqdm --quiet

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os
import pickle
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

# -----------------
# 1. Dataset Setup
# -----------------
# NOTE: In real usage, download the Flickr8k dataset and extract it to ./Flickr8k/.
# Here we simulate it with a tiny hand-written sample because of time/memory limits.

# Toy stand-in for a real dataset: each image file name maps to its
# reference captions.
captions = {
    "sample1.jpg": [
        "a dog running in the grass",
        "a brown dog is playing outdoors",
    ],
    "sample2.jpg": [
        "a man riding a bicycle",
        "a cyclist on a road",
    ],
}
# Image list in the same order as the caption mapping.
image_paths = list(captions)  # Replace with actual paths

# -----------------
# 2. Image Preprocessing
# -----------------
def load_image(path):
    """Load an image file as a normalized RGB array sized for InceptionV3.

    Args:
        path: Anything accepted by ``PIL.Image.open`` (typically a file path).

    Returns:
        A ``numpy.ndarray`` of shape ``(299, 299, 3)`` with pixel values
        divided by 255.0.
    """
    # The context manager releases the file handle that Image.open keeps
    # open; the pixels are copied into a NumPy array before it closes.
    with Image.open(path) as img:
        # Force three channels: grayscale, palette or RGBA files would
        # otherwise produce arrays that are not (299, 299, 3).
        resized = img.convert("RGB").resize((299, 299))  # InceptionV3 size
        pixels = np.array(resized)
    return pixels / 255.0

# -----------------
# 3. Tokenizer Setup
# -----------------
# Flatten the per-image caption lists into a single corpus.
all_captions = [text for cap_list in captions.values() for text in cap_list]

# Fit a word-level tokenizer; words not seen during fitting map to "<unk>".
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(all_captions)
# One more than the number of indexed words (presumably to leave room for
# index 0, which the tokenizer does not assign to a word — confirm).
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption measured in whitespace-separated words.
max_length = max(len(words) for words in map(str.split, all_captions))

# -----------------
# 4. CNN Encoder (Pre-trained)
# -----------------
# InceptionV3 without its classification head, initialised with ImageNet
# weights; the output of its last layer is exposed as the feature map.
image_model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False)
hidden_layer = image_model.layers[-1].output
new_input = image_model.input
encoder_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

class CNN_Encoder(tf.keras.Model):
    """Project image features into the embedding space.

    Applies a single dense layer followed by a ReLU; the convolutional
    feature extraction itself happens outside this class.
    """

    def __init__(self, embedding_dim):
        """Create the projection layer with ``embedding_dim`` output units."""
        super().__init__()
        self.fc = layers.Dense(embedding_dim)

    def call(self, x):
        """Return ``relu(fc(x))``; the last axis becomes ``embedding_dim``."""
        return tf.nn.relu(self.fc(x))

# -----------------
# 5. Bahdanau Attention
# -----------------
class BahdanauAttention(tf.keras.Model):
    """Additive attention over encoded image features."""

    def __init__(self, units):
        """Build the two ``units``-wide projections and the scalar scorer."""
        super().__init__()
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)

    def call(self, features, hidden):
        """Compute a context vector and the attention weights.

        Args:
            features: Encoded image features. Axis 1 is the axis that is
                normalised and summed over (assumed to be
                ``(batch, locations, dim)`` — confirm with the caller).
            hidden: Decoder state; an axis is inserted at position 1 so it
                broadcasts against ``features``.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``features`` over axis 1, and the softmax weights over axis 1.
        """
        # Insert an axis at position 1 so the state broadcasts over axis 1.
        query = tf.expand_dims(hidden, 1)
        projected = self.W1(features) + self.W2(query)
        logits = self.V(tf.nn.tanh(projected))
        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(logits, axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# -----------------
# 6. RNN Decoder with Attention
# -----------------
class RNN_Decoder(tf.keras.Model):
    """GRU decoder that attends over image features at every step."""

    def __init__(self, embedding_dim, units, vocab_size):
        """Build the decoder layers.

        Args:
            embedding_dim: Size of the token embedding vectors.
            units: Width of the GRU state, of ``fc1`` and of the attention
                projections.
            vocab_size: Number of token ids; sets the embedding table size
                and the width of the output logits.
        """
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(self.units,
                              return_sequences=True,
                              return_state=True,
                              recurrent_initializer='glorot_uniform')
        self.fc1 = layers.Dense(self.units)
        self.fc2 = layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Run one decoding pass.

        Args:
            x: Integer token ids fed to the embedding layer (assumed to be
                ``(batch, steps)`` — confirm with the caller).
            features: Encoded image features passed to the attention module.
            hidden: Previous decoder state, e.g. the zeros returned by
                ``reset_state`` or the ``state`` from the previous call.

        Returns:
            ``(logits, state, attention_weights)``: vocabulary logits with
            the batch and step axes merged, the new GRU state, and the
            attention weights.
        """
        context_vector, attention_weights = self.attention(features, hidden)
        x = self.embedding(x)
        # Prepend the attended image context to each token embedding.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # Seed the GRU with the previous state; without initial_state the
        # GRU restarts from zeros on every call and `hidden` would only
        # ever influence the attention scores.
        output, state = self.gru(x, initial_state=hidden)
        x = self.fc1(output)
        # Merge the batch and step axes before the vocabulary projection.
        x = tf.reshape(x, (-1, x.shape[2]))
        x = self.fc2(x)
        return x, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero state of shape ``(batch_size, units)``."""
        return tf.zeros((batch_size, self.units))

# -----------------
# 7. Instantiate Encoder-Decoder
# -----------------
# Hyper-parameters shared by the encoder and the decoder.
embedding_dim, units = 256, 512

encoder = CNN_Encoder(embedding_dim=embedding_dim)
decoder = RNN_Decoder(embedding_dim=embedding_dim, units=units, vocab_size=vocab_size)

# -----------------
# 8. Dummy Feature Extraction
# -----------------
# Normally, extract features for each image with the CNN; here a random
# tensor stands in for one image's feature map.
dummy_img = np.random.rand(1, 8, 8, 2048).astype(np.float32)  # Simulated InceptionV3 output
# Flatten the 8x8 grid into 64 locations. The attention module normalises
# and sums over axis 1 only, so a 4-D input would leave a rank-3 context
# that cannot be concatenated with the rank-3 token embedding in the decoder.
flat_features = dummy_img.reshape(dummy_img.shape[0], -1, dummy_img.shape[-1])
features = encoder(flat_features)

# -----------------
# 9. Dummy Caption Generation
# -----------------
hidden = decoder.reset_state(batch_size=1)
dec_input = tf.expand_dims([tokenizer.word_index['<unk>']], 0)  # Start token
result = []
for i in range(max_length):
    predictions, hidden, attention_weights = decoder(dec_input, features, hidden)
    predicted_id = tf.argmax(predictions[0]).numpy()
    result.append(predicted_id)
    if predicted_id == 0:  # Suppose 0 is <end>
        break
    dec_input = tf.expand_dims([predicted_id], 0)

print("Generated Token IDs:", result)

# -----------------
# 10. Visualization of Attention (Dummy)
# -----------------
# Skipped actual visualization since we used dummy data
print("Attention model setup complete. Ready for real dataset training.")
