In [None]:
import json
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Step 1: Load the processed image-caption data.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding (which is not UTF-8 everywhere,
# e.g. cp1252 on many Windows setups) and mis-reading non-ASCII captions.
with open("image_captions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Step 2: Clean the captions
# Deletion table for ASCII punctuation, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_caption(caption: str) -> str:
    """Normalize a raw caption string.

    Lowercases the text, deletes every character listed in
    ``string.punctuation`` (ASCII punctuation only), and strips leading and
    trailing whitespace. Interior whitespace is left untouched, so removing
    a free-standing punctuation mark can leave two adjacent spaces.

    Args:
        caption: The raw caption text.

    Returns:
        The cleaned caption.
    """
    return caption.lower().translate(_PUNCTUATION_TABLE).strip()

# Map each image file name to the list of its cleaned captions
caption_dict = {
    entry['file_name']: [clean_caption(raw) for raw in entry['captions']]
    for entry in data
}

# Step 3: Flatten every cleaned caption into one list for the Tokenizer,
# wrapping each in the startseq/endseq marker tokens
all_captions = [
    f'startseq {text} endseq'
    for cleaned_list in caption_dict.values()
    for text in cleaned_list
]

# Step 4: Tokenizer
# Fail early with an actionable message: without this guard an empty caption
# list only surfaces further down as "max() arg is an empty sequence".
if not all_captions:
    raise ValueError("No captions available to fit the tokenizer.")

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# +1 because, per the Keras Tokenizer docs, word indices start at 1 and
# index 0 is reserved (never assigned to a word).
vocab_size = len(tokenizer.word_index) + 1

# Step 5: Max length for padding
# Measured in whitespace-separated tokens of the full caption string.
max_length = max(len(c.split()) for c in all_captions)

# Optional: Save tokenizer for later use.
# The encoding is pinned so the written file does not depend on the
# platform's locale encoding.
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())

# Report the vocabulary size and the longest caption length (in tokens).
print("Vocabulary Size:", vocab_size)
print("Max Caption Length:", max_length)
