In [9]:
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import pickle
import os

# Function to clean captions
def clean_text(text):
    """Normalize a raw caption before tokenization.

    The caption is lowercased, every character listed in
    ``string.punctuation`` is removed, runs of digits are removed, and
    leading/trailing whitespace is trimmed. Interior whitespace is left
    untouched, so deleting a standalone token such as "," or "3" can leave
    two consecutive spaces behind.

    Args:
        text (str): Raw caption text.

    Returns:
        str: The cleaned caption.
    """
    text = text.lower()
    # string.punctuation contains "\", "]", "^" and "-", all of which are
    # special inside a regex character class. Without re.escape, "\]" is read
    # as an escaped "]" (so the backslash itself is never removed) and "-" is
    # only matched because ",-." happens to form a valid range.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)  # drop digit runs
    return text.strip()

# Read the raw captions from disk. The code below treats the unpickled object
# as a mapping of image name -> iterable of caption strings.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# captions.pkl file that comes from a trusted source.
with open("captions.pkl", "rb") as pkl_file:
    captions_dict = pickle.load(pkl_file)

# Clean every caption and wrap it in the <start>/<end> sequence markers.
cleaned_captions_dict = {
    image_name: [
        "<start> " + clean_text(raw_caption) + " <end>"
        for raw_caption in raw_captions
    ]
    for image_name, raw_captions in captions_dict.items()
}

# Show the cleaned captions of the first three images as a sanity check.
preview_items = list(cleaned_captions_dict.items())[:3]
for img, caps in preview_items:
    print(f"Image: {img}")
    for cap in caps:
        print(f"- {cap}")
    print("\n")  # blank lines between images


Image: 1000268201_693b08cb0e.jpg
- <start> a child in a pink dress is climbing up a set of stairs in an entry way <end>
- <start> a girl going into a wooden building <end>
- <start> a little girl climbing into a wooden playhouse <end>
- <start> a little girl climbing the stairs to her playhouse <end>
- <start> a little girl in a pink dress going into a wooden cabin <end>


Image: 1001773457_577c3a7d70.jpg
- <start> a black dog and a spotted dog are fighting <end>
- <start> a black dog and a tricolored dog playing with each other on the road <end>
- <start> a black dog and a white dog with brown spots are staring at each other in the street <end>
- <start> two dogs of different breeds looking at each other on the road <end>
- <start> two dogs on pavement moving toward each other <end>


Image: 1002674143_1b742ab4b8.jpg
- <start> a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl <end>
- <start> a little girl is sitting in front of a large painted 

# Building the Vocabulary

In [10]:
from collections import Counter

# Flatten every cleaned caption into one list of whitespace-separated tokens
all_words = [
    token
    for caption_group in cleaned_captions_dict.values()
    for sentence in caption_group
    for token in sentence.split()
]

# Tally how often each token occurs across all captions
word_counts = Counter(all_words)

# Vocabulary layout: index 0 is the padding token, index 1 is the unknown
# token, followed by every token seen at least twice (in first-seen order)
frequent_tokens = [token for token, freq in word_counts.items() if freq >= 2]
vocab = ["<pad>", "<unk>"] + frequent_tokens

# Forward (word -> index) and reverse (index -> word) lookup tables
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# Total number of entries, including the two special tokens
vocab_size = len(vocab)

# Report the vocabulary size and the first few mappings
print(f"Vocabulary Size: {vocab_size}")
print("Sample word2idx mappings:", list(word2idx.items())[:10])


Vocabulary Size: 5202
Sample word2idx mappings: [('<pad>', 0), ('<unk>', 1), ('<start>', 2), ('a', 3), ('child', 4), ('in', 5), ('pink', 6), ('dress', 7), ('is', 8), ('climbing', 9)]


# Convert Captions to Sequences
**Tokenization**

In [11]:
# Encode each caption as a list of vocabulary indices, grouped per image.
# Any word that is not a key of word2idx is encoded with the <unk> index.
captions_sequences = {
    image_name: [
        [word2idx.get(token, word2idx["<unk>"]) for token in sentence.split()]
        for sentence in sentences
    ]
    for image_name, sentences in cleaned_captions_dict.items()
}

# Preview the encoded captions of the first three images
sample_entries = list(captions_sequences.items())[:3]
for img, encoded_captions in sample_entries:
    print(f"Image: {img}")
    for encoded in encoded_captions:
        print(encoded)
    print("\n")

Image: 1000268201_693b08cb0e.jpg
[2, 3, 4, 5, 3, 6, 7, 8, 9, 10, 3, 11, 12, 13, 5, 14, 1, 15, 16]
[2, 3, 17, 18, 19, 3, 20, 21, 16]
[2, 3, 22, 17, 9, 19, 3, 20, 23, 16]
[2, 3, 22, 17, 9, 24, 13, 25, 26, 23, 16]
[2, 3, 22, 17, 5, 3, 6, 7, 18, 19, 3, 20, 27, 16]


Image: 1001773457_577c3a7d70.jpg
[2, 3, 28, 29, 30, 3, 31, 29, 32, 33, 16]
[2, 3, 28, 29, 30, 3, 34, 29, 35, 36, 37, 38, 39, 24, 40, 16]
[2, 3, 28, 29, 30, 3, 41, 29, 36, 42, 43, 32, 44, 45, 37, 38, 5, 24, 46, 16]
[2, 47, 48, 12, 49, 50, 51, 45, 37, 38, 39, 24, 40, 16]
[2, 47, 48, 39, 52, 53, 54, 37, 38, 16]


Image: 1002674143_1b742ab4b8.jpg
[2, 3, 22, 17, 55, 5, 56, 57, 5, 58, 12, 3, 59, 60, 36, 26, 61, 5, 3, 62, 16]
[2, 3, 22, 17, 8, 63, 5, 58, 12, 3, 64, 59, 60, 16]
[2, 3, 65, 17, 5, 24, 66, 67, 36, 68, 5, 58, 12, 3, 41, 69, 36, 3, 60, 39, 70, 16]
[2, 71, 8, 3, 17, 36, 72, 63, 5, 58, 12, 3, 60, 73, 16]
[2, 74, 17, 36, 72, 73, 75, 5, 24, 66, 16]




# Padding Sequences

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The longest encoded caption across all images sets the padded width
caption_lengths = [
    len(encoded)
    for encoded_captions in captions_sequences.values()
    for encoded in encoded_captions
]
max_caption_length = max(caption_lengths)
print(f"Max caption length: {max_caption_length}")

# Right-pad every image's captions to that common length
# (padding='post' appends the fill values after the caption tokens)
padded_sequences = {
    image_name: pad_sequences(encoded_captions, maxlen=max_caption_length, padding='post')
    for image_name, encoded_captions in captions_sequences.items()
}

# Preview the padded captions of the first three images
sample_padded = list(padded_sequences.items())[:3]
for img, padded_rows in sample_padded:
    print(f"Image: {img}")
    for row in padded_rows:
        print(row)
    print("\n")


Max caption length: 37
Image: 1000268201_693b08cb0e.jpg
[ 2  3  4  5  3  6  7  8  9 10  3 11 12 13  5 14  1 15 16  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 17 18 19  3 20 21 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 22 17  9 19  3 20 23 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 22 17  9 24 13 25 26 23 16  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 22 17  5  3  6  7 18 19  3 20 27 16  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]


Image: 1001773457_577c3a7d70.jpg
[ 2  3 28 29 30  3 31 29 32 33 16  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 28 29 30  3 34 29 35 36 37 38 39 24 40 16  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 2  3 28 29 30  3 41 29 36 42 43 32 44 45 37 38  5 24 46 16  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  