In [None]:
pip install tensorflow

# Created by: Shaffon Wazny

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
import numpy as np
from PIL import Image
import requests
from io import BytesIO

In [None]:
# Load the full VGG16 network with ImageNet weights, then build a second model
# that stops at the 'fc2' layer so it outputs features instead of class scores.
base_model = VGG16(weights='imagenet')
fc2_activations = base_model.get_layer('fc2').output
image_model = Model(inputs=base_model.input, outputs=fc2_activations)

In [None]:
# Function to preprocess and encode an image
def preprocess_and_encode_image(img_path, timeout=30):
    """Download an image and encode it with ``image_model``.

    Args:
        img_path: HTTP(S) URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever ``image_model.predict`` returns for a batch that contains
        this single image.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(img_path, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
    response.raise_for_status()
    # `content` is already content-decoded (gzip/deflate), unlike the raw
    # socket stream, so PIL always receives the real image bytes.
    img = Image.open(BytesIO(response.content))
    # Force three channels: grayscale, palette or RGBA images would not fit
    # the (224, 224, 3) input otherwise.
    img = img.convert('RGB').resize((224, 224))
    # np.array with a float dtype yields a fresh writable copy for preprocess_input.
    img_array = np.array(img, dtype=np.float32)
    img_array = preprocess_input(np.expand_dims(img_array, axis=0))
    return image_model.predict(img_array)

In [None]:
# URL of the demo picture used by the test cells of this notebook
sample_image_url = (
    'https://res.cloudinary.com/cloudinary-marketing/images/'
    'w_2000,h_1100/f_auto,q_auto/v1686254465/'
    'Blog-Ai-image-captioning/Blog-Ai-image-captioning-jpg?_i=AA'
)

In [None]:
# Smoke-test the image encoding function on the sample URL (needs network access).
# The features returned by image_model.predict are kept in `encoded_image`.
encoded_image = preprocess_and_encode_image(sample_image_url)


In [None]:
# Build the caption vocabulary.
# NOTE: nothing is downloaded here; the list below is placeholder data.
tokenizer = Tokenizer()
# Replace with your captions (e.g. loaded from the MS COCO annotations).
# Every entry must be a string: a literal `...` inside the list is the
# Ellipsis object and makes fit_on_texts fail.
captions = ["a sample caption 1", "another example caption 2"]
tokenizer.fit_on_texts(captions)
# +1 because word indices start at 1, leaving index 0 free for padding.
vocab_size = len(tokenizer.word_index) + 1


In [None]:
# Encode every caption as a list of integer word ids
sequences = tokenizer.texts_to_sequences(captions)
# Length of the longest encoded caption
max_sequence_length = max(map(len, sequences))

In [None]:
# Bring all sequences to the same length, padding at the end ('post')
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_sequence_length,
)


In [None]:
# Hyperparameters of the captioning model
embedding_dim, units = 256, 512

In [None]:
# Encoder: a dense ReLU projection of the 4096-wide image feature vector
encoder_inputs = layers.Input(shape=(4096,))
image_projection = layers.Dense(embedding_dim, activation='relu')
encoder = image_projection(encoder_inputs)

In [None]:
# Decoder: embeds the caption tokens and runs them through an LSTM whose
# starting state is derived from the encoded image.
decoder_inputs = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
# An LSTM needs TWO initial states (hidden state h and cell state c), each of
# width `units`. The image encoding has width `embedding_dim`, so it is
# projected to the right size once per state.
decoder_state_h = layers.Dense(units, activation='tanh')(encoder)
decoder_state_c = layers.Dense(units, activation='tanh')(encoder)
# With return_state=True the call returns [sequence_output, h, c].
decoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)(
    decoder_embedding, initial_state=[decoder_state_h, decoder_state_c]
)
# Per-timestep probability distribution over the vocabulary.
decoder_outputs = layers.Dense(vocab_size, activation='softmax')(decoder_lstm[0])


In [None]:
# Assemble the end-to-end model: (image features, caption tokens) -> word probabilities
model = models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)


In [None]:
# Compile the model.
# The training targets are integer token ids (slices of the padded sequences),
# not one-hot vectors, so the sparse variant of cross-entropy is required.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model (replace with your actual training data)
# model.fit([encoded_images, padded_sequences[:, :-1]], padded_sequences[:, 1:], epochs=num_epochs, batch_size=batch_size)

# Generate captions for a new image
def generate_caption(image_url, max_length=None, start_token='startseq', end_token='endseq'):
    """Generate a caption for the image at ``image_url`` by greedy decoding.

    At every step the current token prefix is fed to ``model`` together with
    the image features, and the most probable next word is appended.

    Args:
        image_url: HTTP(S) URL of the image to caption.
        max_length: Maximum number of words to generate. Defaults to
            ``max_sequence_length``.
        start_token: Word used to seed decoding. If it is not in the
            tokenizer vocabulary, the padding id 0 is used as the seed.
        end_token: Word that stops decoding when predicted. Ignored if it is
            not in the tokenizer vocabulary.

    Returns:
        The generated caption as a space-separated string (possibly empty).
        The output is only meaningful once ``model`` has been trained.
    """
    encoded_image = preprocess_and_encode_image(image_url).reshape(1, -1)
    if max_length is None:
        max_length = max_sequence_length

    start_id = tokenizer.word_index.get(start_token, 0)
    end_id = tokenizer.word_index.get(end_token)

    token_ids = [start_id]
    words = []
    for _ in range(max_length):
        # Pad to a fixed length so every predict call has the same input shape.
        # Padding sits after the prefix, so it cannot influence the
        # prediction read at the last real position.
        decoder_input = pad_sequences([token_ids], maxlen=max_length, padding='post')
        probabilities = model.predict([encoded_image, decoder_input], verbose=0)
        next_id = int(np.argmax(probabilities[0, len(token_ids) - 1]))
        # Id 0 is padding; treat it like the end token.
        if next_id == 0 or next_id == end_id:
            break
        word = tokenizer.index_word.get(next_id)
        if word:
            words.append(word)
        token_ids.append(next_id)

    predicted_caption = ' '.join(words)
    return predicted_caption


In [None]:
# Run the caption generation function on the sample image and print the result.
generated_caption = generate_caption(sample_image_url)
print("Generated Caption:", generated_caption)