# Name - Parikshit Sahu
## Codsoft Task 3

In [38]:
import os
import json
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Add, Dropout, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Step 1: Load and preprocess the dataset
dataset_image_dir = r'C:\Users\sahup\dataset_images'  # Replace with your image folder path
captions_file = 'captions.json'  # Assuming you have captions.json file in the current directory

# Load image features (dummy example with random features for now)
def load_image_features(image_dir):
    features = {}
    for img_name in os.listdir(image_dir):
        if img_name.endswith('.jpg') or img_name.endswith('.png'):
            image_id = os.path.splitext(img_name)[0]  # Extract file name without extension
            features[image_id] = np.random.rand(256)  # Dummy feature vector (replace with actual feature extraction)
    return features

# Load the captions data
def load_captions(captions_path):
    with open(captions_path, 'r') as file:
        captions_data = json.load(file)
    
    # Debug: Check the structure of the loaded captions data
    print("Captions data loaded. Previewing first few entries:")
    for image_id, captions in list(captions_data.items())[:5]:  # Preview first 5 entries
        print(f"Image ID: {image_id}, Captions: {captions}")
        
    return captions_data

# Load the features and captions
image_features = load_image_features(dataset_image_dir)
captions_data = load_captions(captions_file)

# Debug: Check the keys in image_features and captions_data
print(f"Loaded {len(image_features)} image features")
print(f"Loaded {len(captions_data)} captions data entries")

# Debug: Show the image features and captions keys to check for matching IDs
print("Image IDs in image_features:", list(image_features.keys())[:5])
print("Image IDs in captions_data:", list(captions_data.keys())[:5])

# Step 2: Tokenize the captions
def tokenize_captions(captions_data):
    all_captions = []
    for img, captions in captions_data.items():
        for caption in captions:
            if isinstance(caption, str):  # Ensure the caption is a string
                all_captions.append(caption)
            else:
                print(f"Skipping non-string caption: {caption}")
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_captions)
    return tokenizer

# Tokenize captions and define max caption length
tokenizer = tokenize_captions(captions_data)

# Step 2.1: Filter out non-string captions before calculating max_caption_length
def get_max_caption_length(captions_data):
    # Filter only string captions
    all_captions = []
    for captions in captions_data.values():
        for caption in captions:
            if isinstance(caption, str):
                all_captions.append(caption)
    
    # Get the length of the longest caption
    return max([len(caption.split()) for caption in all_captions])

# Calculate max_caption_length using filtered valid string captions
max_caption_length = get_max_caption_length(captions_data)
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

print(f"Max caption length: {max_caption_length}")
print(f"Vocabulary size: {vocab_size}")

# Step 3: Prepare the data
def prepare_training_data(image_features, captions_data, tokenizer, max_caption_length):
    X_image = []
    X_caption = []
    y = []

    for img, captions in captions_data.items():
        # Remove the file extension if it exists
        image_id = os.path.splitext(img)[0]  # Extract image name without extension
        if image_id in image_features:  # Ensure that the image id exists in image_features
            for caption in captions:
                # Ensure caption is a string before processing
                if isinstance(caption, str):
                    # Prepare image features (X_image)
                    X_image.append(image_features[image_id])

                    # Tokenize the caption (X_caption)
                    caption_seq = tokenizer.texts_to_sequences([caption])[0]
                    X_caption.append(caption_seq)

                    # The target output is the next word in the sequence (y)
                    # We exclude the first token for the target (i.e., predicting the next word)
                    y.append(caption_seq[1:])
                else:
                    print(f"Skipping non-string caption: {caption}")

    # Pad input sequences (X_caption) and target sequences (y) to max_caption_length
    if len(X_caption) == 0:
        raise ValueError("No valid captions found. Check the format of your captions.json file.")

    X_caption = pad_sequences(X_caption, maxlen=max_caption_length, padding='post')

    # Pad y sequences to max_caption_length - 1, because the last word is missing in the output
    y = pad_sequences(y, maxlen=max_caption_length - 1, padding='post')

    return np.array(X_image), np.array(X_caption), np.array(y)

X_image, X_caption, y = prepare_training_data(image_features, captions_data, tokenizer, max_caption_length)

# Check that the data is not empty
if X_image.size == 0 or X_caption.size == 0 or y.size == 0:
    raise ValueError("One or more of the input data arrays are empty.")

print(f"Prepared {len(X_image)} image features, {len(X_caption)} captions, and {len(y)} target sequences.")

# Step 4: Create the image captioning model
def create_captioning_model(vocab_size, max_caption_length, embedding_dim=256, lstm_units=512):
    # Image feature input (shape: image_feature_size,)
    image_input = Input(shape=(256,), name="image_input")
    image_embedding = Dense(256, activation='relu')(image_input)  # Ensure same shape as LSTM output
    
    # Caption input
    caption_input = Input(shape=(max_caption_length,), name="caption_input")
    caption_embedding = Embedding(vocab_size, embedding_dim, input_length=max_caption_length)(caption_input)
    
    # LSTM to process the embedded caption
    caption_lstm = LSTM(lstm_units, return_sequences=True)(caption_embedding)
    
    # Adjust the dimension of the caption LSTM output to match image embedding
    caption_lstm_projected = Dense(256, activation='relu')(caption_lstm)  # Matching dimensions
    
    # Combine image features and caption embeddings (both are now 256)
    combined = Add()([image_embedding, caption_lstm_projected])  # Shapes are now compatible
    
    # Further processing
    combined = Dense(lstm_units, activation='relu')(combined)
    combined = Dropout(0.5)(combined)
    
    # Change the output layer to predict vocabulary words at each time step
    combined = Dense(vocab_size, activation='softmax')(combined)  # Output shape [batch_size, max_caption_length - 1, vocab_size]
    
    # Create and return the model
    model = Model(inputs=[image_input, caption_input], outputs=combined)
    
    return model

# Compile the model
def compile_model(model):
    optimizer = Adam()
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Updated for TensorFlow 2.x
    metrics = ['accuracy']
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model

# Create the model
model = create_captioning_model(vocab_size, max_caption_length)

# Compile the model
model = compile_model(model)

# Step 5: Train the model
def train_captioning_model(model, X_image, X_caption, y, epochs=10, batch_size=32):
    """
    Train the model on the given image features and captions.
    """
    model.fit([X_image, X_caption], y, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# Train the model
train_captioning_model(model, X_image, X_caption, y, epochs=10)

Captions data loaded. Previewing first few entries:
Image ID: img_1.jpg, Captions: ['A group of people standing on a beach.', 'A person holding a surfboard.', 'Waves crashing on the shore.']
Image ID: img_2.jpg, Captions: ['A dog running in a park.', 'A brown dog chasing a ball.', 'Sunset in the background.']
Image ID: img_3.jpg, Captions: ['A family having a picnic.', 'People sitting on a blanket.', 'Eating sandwiches and smiling.']
Loaded 1 image features
Loaded 3 captions data entries
Image IDs in image_features: ['Photo']
Image IDs in captions_data: ['img_1.jpg', 'img_2.jpg', 'img_3.jpg']
Max caption length: 8
Vocabulary size: 33


ValueError: No valid captions found. Check the format of your captions.json file.