# **Step 1: Data Preparation**

In [None]:
# Create folders to keep things organized
!mkdir -p coco/images
!mkdir -p coco/annotations

# Download training images
!wget http://images.cocodataset.org/zips/train2014.zip -P coco/images/

# Download validation images
!wget http://images.cocodataset.org/zips/val2014.zip -P coco/images/

# Download captions (annotations)
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip -P coco/annotations/

--2025-08-12 16:47:50--  http://images.cocodataset.org/zips/train2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.29.158, 16.15.176.85, 3.5.25.143, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.29.158|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13510573713 (13G) [application/zip]
Saving to: ‘coco/images/train2014.zip’


2025-08-12 16:50:52 (70.8 MB/s) - ‘coco/images/train2014.zip’ saved [13510573713/13510573713]

--2025-08-12 16:50:52--  http://images.cocodataset.org/zips/val2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.134.33, 3.5.27.184, 3.5.30.204, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.134.33|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6645013297 (6.2G) [application/zip]
Saving to: ‘coco/images/val2014.zip’


2025-08-12 16:52:36 (61.1 MB/s) - ‘coco/images/val2014.zip’ saved [6645013297/6645013297]

--2025-08-12 1

In [None]:
# Unzip training images
!unzip -q coco/images/train2014.zip -d coco/images/

# Unzip validation images
!unzip -q coco/images/val2014.zip -d coco/images/

# Unzip annotation file
!unzip -q coco/annotations/annotations_trainval2014.zip -d coco/annotations/


replace coco/images/train2014/COCO_train2014_000000270070.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
A
A
A
replace coco/images/val2014/COCO_val2014_000000324670.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: replace coco/annotations/annotations/instances_train2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import os

train_dir = 'coco/images/train2014'
val_dir = 'coco/images/val2014'

print("Training images:", len(os.listdir(train_dir)))
print("Validation images:", len(os.listdir(val_dir)))


Training images: 82783
Validation images: 40504


In [None]:
#Load captions
import json
import os

# Path to the JSON file
caption_path = 'coco/annotations/annotations/captions_train2014.json'

# Open and load the file
with open(caption_path, 'r') as f:
    annotations = json.load(f)

# Create a dictionary: {image_id: [list_of_captions]}
captions_dict = {}

for item in annotations['annotations']:
    img_id = item['image_id']
    caption = item['caption']
    full_img_id = 'COCO_train2014_' + str(img_id).zfill(12) + '.jpg'

    if full_img_id not in captions_dict:
        captions_dict[full_img_id] = []

    captions_dict[full_img_id].append(caption)

# Show sample image ID and captions
for k, v in list(captions_dict.items())[:3]:
    print("Image:", k)
    for i, cap in enumerate(v):
        print(f"  Caption {i+1}:", cap)
    print()


Image: COCO_train2014_000000318556.jpg
  Caption 1: A very clean and well decorated empty bathroom
  Caption 2: A blue and white bathroom with butterfly themed wall tiles.
  Caption 3: A bathroom with a border of butterflies and blue paint on the walls above it.
  Caption 4: An angled view of a beautifully decorated bathroom.
  Caption 5: A clock that blends in with the wall hangs in a bathroom. 

Image: COCO_train2014_000000116100.jpg
  Caption 1: A panoramic view of a kitchen and all of its appliances.
  Caption 2: A panoramic photo of a kitchen and dining room
  Caption 3: A wide angle view of the kitchen work area
  Caption 4: multiple photos of a brown and white kitchen. 
  Caption 5: A kitchen that has a checkered patterned floor and white cabinets.

Image: COCO_train2014_000000379340.jpg
  Caption 1: A graffiti-ed stop sign across the street from a red car 
  Caption 2: A vandalized stop sign and a red beetle on the road
  Caption 3: A red stop sign with a Bush bumper sticker un

In [None]:
import string

def clean_caption(caption):
    caption = caption.lower()  # make lowercase
    caption = caption.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    caption = caption.split()  # split into words
    caption = [word for word in caption if word.isalpha()]  # remove non-alphabetic
    caption = ' '.join(caption)  # join back into sentence
    caption = 'startseq ' + caption + ' endseq'  # add start and end tokens
    return caption

for img_id, captions in captions_dict.items():
    cleaned_captions = []
    for caption in captions:
        cleaned = clean_caption(caption)
        cleaned_captions.append(cleaned)
    captions_dict[img_id] = cleaned_captions




In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Flatten all captions into a single list
all_captions = []
for caption_list in captions_dict.values():
    all_captions.extend(caption_list)

# Initialize tokenizer
tokenizer = Tokenizer(oov_token="<unk>")  # <unk> is for unknown words
tokenizer.fit_on_texts(all_captions)  # learn the word-index mapping




In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert all captions to sequences of integers
sequences = tokenizer.texts_to_sequences(all_captions)

# Find the maximum length of any caption
max_length = max(len(seq) for seq in sequences)
print(max_length)

# Pad the sequences with 0s at the end
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')




51


In [None]:
sample_caption = "startseq a dog playing in park endseq"
sequence = tokenizer.texts_to_sequences([sample_caption])
print("Sequence:", sequence)
padded = pad_sequences(sequence, maxlen=max_length, padding='post')
print("Padded Sequence:", padded)

Sequence: [[3, 2, 47, 57, 8, 139, 4]]
Padded Sequence: [[  3   2  47  57   8 139   4   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


In [None]:
import os
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
import numpy as np
from tqdm import tqdm  # for progress bar

def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(299, 299))       # Resize
    img_array = image.img_to_array(img)                           # Convert to array
    img_array = np.expand_dims(img_array, axis=0)                 # Add batch dimension
    img_array = preprocess_input(img_array)                       # Normalize
    return img_array

# Folder containing all images
images_folder = "/content/coco/images/train2014"

# List of all image filenames (you can limit for testing)
image_files = os.listdir(images_folder)

# Dictionary to store preprocessed images
preprocessed_images = {}

# Loop through all image files
for img_name in tqdm(image_files[:1000]):  # Limit to 1000 for now (optional)
    img_path = os.path.join(images_folder, img_name)
    try:
        preprocessed_images[img_name] = preprocess_image(img_path)
    except Exception as e:
        print(f"Error processing {img_name}: {e}")


100%|██████████| 1000/1000 [00:10<00:00, 97.37it/s]


In [None]:
img_path = '/content/coco/images/train2014/COCO_train2014_000000000009.jpg'
preprocessed_img = preprocess_image(img_path)
print("Shape:", preprocessed_img.shape)


Shape: (1, 299, 299, 3)


#**Step-2: Feature italicized text Extraction**

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
import numpy as np
import os
from tqdm import tqdm
from PIL import Image

# 1️⃣ Load the InceptionV3 model pre-trained on ImageNet
base_model = InceptionV3(weights='imagenet')

# 2️⃣ Remove the last classification layer
model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
[1m96112376/96112376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [None]:
def preprocess_image(img_path):
    # Load image with target size
    img = Image.open(img_path).resize((299, 299))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    # Preprocess for InceptionV3
    img_array = preprocess_input(img_array)
    return img_array


In [None]:
def extract_features(img_dir, limit=None):
    features = {}
    img_names = os.listdir(img_dir)

    if limit:
        img_names = img_names[:limit]  # only take the first `limit` images

    for img_name in tqdm(img_names):
        img_path = os.path.join(img_dir, img_name)
        try:
            img_array = preprocess_image(img_path)
            feature_vector = model.predict(img_array, verbose=0)
            features[img_name] = feature_vector.flatten()
        except Exception as e:
            print(f"Error processing {img_name}: {e}")

    return features

# Example usage:
image_dir = "/content/coco/images/train2014"
image_features = extract_features(image_dir, limit=500)  # process only first 500 images


 36%|███▋      | 182/500 [01:15<01:30,  3.53it/s]

Error processing COCO_train2014_000000578250.jpg: Graph execution error:

Detected at node convolution defined at (most recent call last):
<stack traces unavailable>
Depth of input must be a multiple of depth of filter: 1 vs 3

Stack trace for op definition: 
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
File "/usr/local/lib/python3.11/dist-

 40%|████      | 201/500 [01:23<01:28,  3.37it/s]

Error processing COCO_train2014_000000066642.jpg: Graph execution error:

Detected at node convolution defined at (most recent call last):
<stack traces unavailable>
Depth of input must be a multiple of depth of filter: 1 vs 3

Stack trace for op definition: 
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
File "/usr/local/lib/python3.11/dist-

100%|██████████| 500/500 [03:27<00:00,  2.41it/s]


In [None]:
import pickle
with open("image_features.pkl", "wb") as f:
    pickle.dump(image_features, f)
print(f"Saved features for {len(image_features)} images.")


Saved features for 498 images.


# **Step-3: Preparing Captions for Training**

In [None]:
import json
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# ===== 1️⃣ Load captions =====
captions_path = "/content/coco/annotations/annotations/captions_train2014.json"
with open(captions_path, 'r') as f:
    captions_data = json.load(f)

# ===== 2️⃣ Clean caption =====
def clean_caption(caption):
    caption = caption.lower()
    caption = caption.translate(str.maketrans('', '', string.punctuation))
    caption = ' '.join([word for word in caption.split() if word.isalpha()])
    return caption

# ===== 3️⃣ Build captions_dict with cleaning + special tokens =====
captions_dict = {}
for ann in captions_data['annotations']:
    img_id = ann['image_id']
    cleaned = clean_caption(ann['caption'])
    caption = f"<start> {cleaned} <end>"
    img_filename = f"COCO_train2014_{img_id:012d}.jpg"
    captions_dict.setdefault(img_filename, []).append(caption)

print(f"Loaded captions for {len(captions_dict)} images.")

# ===== 4️⃣ Tokenize =====
all_captions = [cap for caps in captions_dict.values() for cap in caps]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

# Convert captions to integer sequences
for img in captions_dict:
    captions_dict[img] = tokenizer.texts_to_sequences(captions_dict[img])

# ===== 5️⃣ Max caption length =====
max_length = max(len(seq) for caps in captions_dict.values() for seq in caps)
print("Max caption length:", max_length)

# ===== 6️⃣ Prepare training data =====
def create_sequences(tokenizer, max_length, captions_dict, image_features):
    X1, X2, y = [], [], []
    for img, caps in captions_dict.items():
        if img not in image_features:  # skip images without features
            continue
        feature = image_features[img]
        for seq in caps:
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Example usage (make sure image_features is ready from Step 2)
X1, X2, y = create_sequences(tokenizer, max_length, captions_dict, image_features)


Loaded captions for 82783 images.
Vocabulary size: 24383
Max caption length: 51


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Step 4: Building the Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Add
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Attention
from tensorflow.keras.layers import Lambda

# ===== Encoder =====
def build_model(vocab_size, max_length, embedding_dim=256, units=256):
    # Image feature input (from InceptionV3 output in Step 2)
    inputs1 = Input(shape=(2048,))
    fe1 = Dense(units, activation='relu')(inputs1)  # transform feature vector
    fe2 = Dropout(0.5)(fe1)

    # Sequence input (caption tokens)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(units, return_sequences=True)(se2)

    # ===== Attention mechanism =====
    # Expand image features to sequence length for attention
    fe2_expanded = Lambda(lambda x: tf.expand_dims(x, 1))(fe2)

    attention_out = Attention()([se3, fe2_expanded])  # context vector from image features

    # Combine attention output with sequence features
    decoder_combined = Add()([se3, attention_out])
    decoder_lstm = LSTM(units)(decoder_combined)

    # ===== Output layer =====
    outputs = Dense(vocab_size, activation='softmax')(decoder_lstm)

    # ===== Build & compile model =====
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

# Example usage:
model = build_model(vocab_size, max_length)
# model.summary()


# **Step 5: Training the Model**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming you already have:
# X1 → image features (shape: num_samples × 2048)
# X2 → caption input sequences (shape: num_samples × max_length)
# y  → next word indices (shape: num_samples × 1)

# 1️⃣ Split into train and validation
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# 2️⃣ Build model (from Step 4)
model = build_model(vocab_size, max_length, embedding_dim=256, units=256)

# 3️⃣ Compile with Adam optimizer
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 4️⃣ Train model
history = model.fit(
    [X1_train, X2_train], y_train,
    epochs=20,
    batch_size=64,
    validation_data=([X1_val, X2_val], y_val),
    verbose=1
)

# Optional: Save trained model
model.save("image_caption_model.h5")



Epoch 1/20




[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 1s/step - accuracy: 0.1400 - loss: 6.5909 - val_accuracy: 0.1921 - val_loss: 5.4685
Epoch 2/20
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 1s/step - accuracy: 0.2127 - loss: 5.0368 - val_accuracy: 0.2676 - val_loss: 4.7875
Epoch 3/20
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 1s/step - accuracy: 0.2744 - loss: 4.3977 - val_accuracy: 0.2891 - val_loss: 4.5450
Epoch 4/20
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m430s[0m 1s/step - accuracy: 0.2968 - loss: 4.0691 - val_accuracy: 0.3090 - val_loss: 4.4546
Epoch 5/20
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 1s/step - accuracy: 0.3187 - loss: 3.8285 - val_accuracy: 0.3228 - val_loss: 4.3882
Epoch 6/20
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m408s[0m 1s/step - accuracy: 0.3285 - loss: 3.6381 - val_accuracy: 0.3317 - val_loss: 4.3102
Epoch 7/20
[1m358/358[0m [32m━



# **Step 6: Model Evaluation**

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Example reference captions for 2 images
y_true = [
    [["a", "cat", "on", "a", "mat"], ["there", "is", "a", "cat", "on", "the", "mat"]],
    [["a", "man", "riding", "a", "bike"], ["a", "person", "on", "a", "bicycle"]]
]

# Example generated captions for 2 images
y_pred = [
    ["a", "cat", "on", "a", "mat"],
    ["a", "man", "riding", "a", "bicycle"]
]

# Calculate BLEU scores
bleu1 = corpus_bleu(y_true, y_pred, weights=(1.0, 0, 0, 0))
bleu2 = corpus_bleu(y_true, y_pred, weights=(0.5, 0.5, 0, 0))
bleu3 = corpus_bleu(y_true, y_pred, weights=(0.33, 0.33, 0.33, 0))
bleu4 = corpus_bleu(y_true, y_pred, weights=(0.25, 0.25, 0.25, 0.25))

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")
print(f"BLEU-3: {bleu3:.4f}")
print(f"BLEU-4: {bleu4:.4f}")


BLEU-1: 1.0000
BLEU-2: 1.0000
BLEU-3: 0.9416
BLEU-4: 0.8891


# Step 7: Fine-tuning

In [None]:
# Step 7: Fine-tuning (instructions + code snippets)
# If you decide to fine-tune CNN: re-create InceptionV3 with include_top=False and unfreeze top layers.
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model

# Example snippet (do NOT run unless you want to re-extract or fine-tune)
base = InceptionV3(weights='imagenet', include_top=False)  # conv base only
# unfreeze last N layers:
for layer in base.layers[:-50]:
    layer.trainable = False
for layer in base.layers[-50:]:
    layer.trainable = True

# attach a small head if you want to fine-tune end-to-end; typically you re-run feature extraction after small-finetune
print('Prepared InceptionV3 for fine-tuning (last 50 layers trainable).')


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Prepared InceptionV3 for fine-tuning (last 50 layers trainable).
