## Image Captioning And Data Preparation

In [None]:
# Week 3-4 Data Preparation for Image Captioning (Flickr8k)
# Notebook-Friendly Version (no CLI args)

import os
import pickle
import json
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
nltk.download('punkt')

from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set Your Paths Here
# Inputs: folder of images and the tab-separated captions file.
images_dir = "./images"
captions_file = "./captions/Flickr8k.token.txt"
# Outputs: per-image feature arrays and the processed artifacts.
features_dir = "./features"
output_dir = "./processed"
# Positive value limits the run to that many images; 0 disables sampling.
sample = 0

# Create both output folders up front so later writes cannot fail on a
# missing directory.
for _out_dir in (features_dir, output_dir):
    os.makedirs(_out_dir, exist_ok=True)

# Load Raw Captions

def load_raw_captions(captions_file):
    """Read a captions file and group the captions by image name.

    Every non-blank line is expected to hold an image reference, a TAB
    character, and the caption text. The image reference is cut at the
    first ``#`` so that ``name.jpg#0`` ... ``name.jpg#4`` all land under
    ``name.jpg``.

    Args:
        captions_file: Path to the UTF-8 encoded captions file.

    Returns:
        A ``defaultdict(list)`` mapping image name to its captions, in the
        order they appear in the file.

    Raises:
        ValueError: If a non-blank line contains no TAB separator. The
            message names the file and the offending line number.
    """
    descriptions = defaultdict(list)
    with open(captions_file, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            # Split on the FIRST tab only: a caption that itself contains a
            # tab must stay in one piece instead of breaking the unpacking.
            img_cap, sep, caption = line.partition('\t')
            if not sep:
                raise ValueError(
                    f"{captions_file}:{line_no}: expected "
                    f"'<image>#<n><TAB><caption>', got {line!r}"
                )
            img_id = img_cap.split('#')[0]
            descriptions[img_id].append(caption)
    return descriptions

print("Loading captions...")
descriptions = load_raw_captions(captions_file)
print(f"Loaded captions for {len(descriptions)} images")

# With sampling enabled, keep only the first `sample` images in sorted
# name order so that repeated runs pick the same subset.
if sample > 0:
    keys = sorted(descriptions)[:sample]
    descriptions = {name: descriptions[name] for name in keys}
    print(f"Using sample of {len(descriptions)} images")

# Clean Captions 

def clean_caption_text(caption):
    """Normalize one caption and wrap it in sequence markers.

    The text is lower-cased, every character other than ``a-z``, ``0-9``
    and whitespace is removed, whitespace runs are collapsed to a single
    space, and the result is surrounded by ``<start>`` / ``<end>``.

    Args:
        caption: Raw caption string.

    Returns:
        The cleaned caption as ``'<start> ... <end>'``.
    """
    lowered = caption.lower()
    # Drop punctuation and any other symbol outside the ASCII letter/digit set.
    alnum_only = re.sub(r"[^a-z0-9\s]", '', lowered)
    # Collapse tabs, newlines and repeated spaces, then trim both ends.
    body = re.sub(r"\s+", ' ', alnum_only).strip()
    return f"<start> {body} <end>"

# Run the cleaning step over every caption, keeping the image -> captions grouping.
cleaned = {
    img: list(map(clean_caption_text, caps))
    for img, caps in descriptions.items()
}

# Save cleaned captions to CSV
# One (image, caption) row per caption, in dictionary order.
rows = [(img, caption) for img, caps in cleaned.items() for caption in caps]
df = pd.DataFrame(rows, columns=['image', 'caption'])
cleaned_csv_path = os.path.join(output_dir, 'cleaned_captions.csv')
df.to_csv(cleaned_csv_path, index=False)
print(f"Cleaned captions saved to {cleaned_csv_path}")

# Tokenizer
# Same captions as in `rows`, without the image column.
all_captions = [caption for _, caption in rows]
# filters='' disables the tokenizer's own character filtering; the captions
# were already cleaned above and still carry the <start>/<end> markers.
tokenizer = Tokenizer(oov_token='<unk>', filters='')
tokenizer.fit_on_texts(all_captions)

# One more than the number of words in the tokenizer's word index.
vocab_size = 1 + len(tokenizer.word_index)
tokenizer_path = os.path.join(output_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved. Vocab size: {vocab_size}")

# Max Caption Length
# Longest caption measured in whitespace-separated tokens, markers included.
max_length = max(map(len, map(str.split, all_captions)))
print(f"Max caption length: {max_length}")

# Extract Image Features
# ImageNet-pretrained InceptionV3 without its classification head, with
# global average pooling applied to the final feature maps.
model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')

def extract_and_save_features(images_dir, features_dir, image_list=None):
    """Run images through the module-level ``model`` and save one array each.

    Each image is loaded at 299x299, given a leading batch axis, passed
    through ``preprocess_input`` and ``model.predict``, and the result is
    written to ``<features_dir>/<file name>.npy``. The batch axis added
    before prediction is not removed before saving.

    This function also reads the module-level ``sample`` setting: when it
    is positive, only the first ``sample`` file names (sorted) are used.

    Args:
        images_dir: Folder containing the image files.
        features_dir: Folder that receives the ``.npy`` feature files.
        image_list: Optional iterable of file names (relative to
            ``images_dir``). When ``None``, every ``.jpg`` / ``.jpeg`` /
            ``.png`` file found in ``images_dir`` is used.

    Returns:
        The list of file names whose features were saved successfully, so
        the caller can reconcile it against the captions. Images that are
        missing on disk or that fail during processing are reported on
        stdout and left out of this list.
    """
    if image_list is None:
        # Match real extensions only: the dot keeps names such as
        # "photo_png" from being treated as images.
        files = [
            f for f in os.listdir(images_dir)
            if f.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]
    else:
        files, missing = [], []
        for f in image_list:
            target = files if os.path.exists(os.path.join(images_dir, f)) else missing
            target.append(f)
        if missing:
            # Say so instead of dropping them silently; otherwise the image
            # count and the caption count diverge with no explanation.
            preview = ', '.join(missing[:5])
            suffix = ', ...' if len(missing) > 5 else ''
            print(
                f"Skipping {len(missing)} listed image(s) not found in "
                f"{images_dir}: {preview}{suffix}"
            )
    files.sort()
    if sample > 0:
        files = files[:sample]

    saved = []
    for fname in tqdm(files, desc='Extracting features'):
        img_path = os.path.join(images_dir, fname)
        try:
            img = keras_image.load_img(img_path, target_size=(299, 299))
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            feature = model.predict(x, verbose=0)
            feature_path = os.path.join(features_dir, fname + '.npy')
            np.save(feature_path, feature)
        except Exception as e:
            # Deliberately broad: one unreadable or corrupt image must not
            # abort a run over thousands of files. Report it and move on.
            print(f"Error processing {fname}: {e}")
        else:
            saved.append(fname)
    return saved

extract_and_save_features(images_dir, features_dir, list(cleaned.keys()))
print(f"Image features saved to {features_dir}")

# An image only counts as processed when its feature file is on disk.
# Captioned images whose file was missing or failed to load have no
# features, and listing them would break any later step that loads
# `<features_dir>/<image>.npy` for every listed image.
processed_images = [
    img for img in cleaned
    if os.path.exists(os.path.join(features_dir, img + '.npy'))
]
skipped_count = len(cleaned) - len(processed_images)
if skipped_count:
    print(
        f"Warning: {skipped_count} captioned image(s) have no feature file "
        f"and are excluded from the processed image list"
    )

# Save Metadata
metadata = {
    'vocab_size': vocab_size,
    'max_length': max_length,
    # Count only images that actually have features.
    'num_images': len(processed_images)
}
meta_path = os.path.join(output_dir, 'metadata.json')
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)
print(f"Metadata saved to {meta_path}")

pkl_path = os.path.join(output_dir, 'processed_images.pkl')
with open(pkl_path, 'wb') as f:
    pickle.dump(processed_images, f)

print(f"Processed image list saved to {pkl_path}")

# Announce completion only after every artifact has been written.
print("\nData preparation completed. Ready for Week 5-6 model training.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bollejayanthsriteja\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


Loading captions...
Loaded captions for 8092 images
Cleaned captions saved to ./processed\cleaned_captions.csv
Tokenizer saved. Vocab size: 8832
Max caption length: 38


Extracting features: 100%|██████████| 8091/8091 [35:58<00:00,  3.75it/s]  

Image features saved to ./features
Metadata saved to ./processed\metadata.json

Data preparation completed. Ready for Week 5-6 model training.
Processed image list saved to ./processed\processed_images.pkl



