**IMAGE CAPTIONING**

STEP 1: IMPORT LIBRARIES

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, add


STEP 2: LOAD PRE-TRAINED MODEL

In [2]:
# Using VGG16 for feature extraction
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)


STEP 3: PREPROCESS IMAGE

In [3]:
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

img_path = '/content/AI IMAGE.jpg'
img = preprocess_image(img_path)


STEP 4: EXTRACT FEATURES

In [4]:
features = model.predict(img)




STEP 5: PREPARE TEXT DATA

In [5]:
# Example captions
captions = ["a man riding a horse", "a person on a horse in a field"]

# Tokenize the captions
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1

# Convert captions to sequences
sequences = tokenizer.texts_to_sequences(captions)


STEP 6: CREATE EMBEDDING LAYER

In [6]:
embedding_dim = 256
max_length = max(len(seq) for seq in sequences)

# Embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)


STEP 7: BUILD THE MODEL

In [7]:
# Image feature input
image_input = Input(shape=(4096,))
image_model = Dropout(0.5)(image_input)
image_model = Dense(256, activation='relu')(image_model)

# Caption input
caption_input = Input(shape=(max_length,))
caption_model = embedding_layer(caption_input)
caption_model = Dropout(0.5)(caption_model)
caption_model = LSTM(256)(caption_model)

# Combine image and caption models
decoder = add([image_model, caption_model])
decoder = Dense(256, activation='relu')(decoder)
outputs = Dense(vocab_size, activation='softmax')(decoder)

# Define the model
model = Model(inputs=[image_input, caption_input], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')


STEP 8: TRAIN THE MODEL

In [8]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Load ResNet50 model pre-trained on ImageNet, exclude the top layer
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Extract image features
def extract_image_features(image_path):
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(224, 224))
    img = tf.keras.preprocessing.image.img_to_array(img)
    img = tf.keras.applications.resnet50.preprocess_input(img)
    img = np.expand_dims(img, axis=0)
    features = resnet.predict(img)
    return features

# Example usage
image_path = '/content/AI IMAGE.jpg'
image_features = extract_image_features(image_path)




STEP 9: GENERATE CAPTIONS

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Example word index mapping (replace with actual mappings)
word_index = {'startseq': 1, 'endseq': 2, 'a': 3, 'cat': 4, 'sits': 5, 'on': 6, 'the': 7, 'mat': 8}
index_word = {index: word for word, index in word_index.items()}
vocab_size = len(word_index) + 1

max_caption_length = 20  # Example value

# Model architecture
embedding_dim = 256
lstm_units = 512

image_input = Input(shape=(2048,), name="image_input")
caption_input = Input(shape=(None,), name="caption_input")

embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(caption_input)
lstm = LSTM(lstm_units)(embedding)

# Transform image features to the same shape as the LSTM output
image_features_transformed = Dense(lstm_units, activation='relu')(image_input)

decoder1 = add([image_features_transformed, lstm])
decoder2 = Dense(lstm_units, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[image_input, caption_input], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

# Assuming the model has been trained and weights loaded
# model.load_weights('path_to_weights.h5')

def generate_caption(image_feature):
    caption = [word_index['startseq']]
    for _ in range(max_caption_length):
        sequence = pad_sequences([caption], maxlen=max_caption_length)
        y_pred = model.predict([image_feature, sequence], verbose=0)
        y_pred = np.argmax(y_pred)
        word = index_word[y_pred]
        caption.append(y_pred)
        if word == 'endseq':
            break
    return ' '.join([index_word[idx] for idx in caption if idx not in [word_index['startseq'], word_index['endseq']]])

# Example usage with dummy image features
features = np.random.rand(1, 2048)  # Example feature
caption = generate_caption(features)
print(caption)


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 caption_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 image_input (InputLayer)    [(None, 2048)]               0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            2304      ['caption_input[0][0]']       
                                                                                                  
 dense_3 (Dense)             (None, 512)                  1049088   ['image_input[0][0]']         
                                                                                            

STEP 10: EVALUATE THE MODEL

In [10]:
def generate_caption(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Example usage
caption = generate_caption(model, tokenizer, features, max_length)
print(caption)


startseq a a a a a a a a


STEP 11: FINE-TUNE THE MODEL

In [11]:
print(tokenizer.word_index)


{'a': 1, 'horse': 2, 'man': 3, 'riding': 4, 'person': 5, 'on': 6, 'in': 7, 'field': 8}


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize tokenizer
tokenizer = Tokenizer()

# Example training data including special tokens
texts = [
    'startseq this is an example sentence endseq',
    'startseq another example here endseq'
]

# Fit tokenizer on your texts
tokenizer.fit_on_texts(texts)

# Add special tokens manually if needed
start_token = 'startseq'
end_token = 'endseq'

# Ensure they are in the tokenizer
if start_token not in tokenizer.word_index:
    tokenizer.word_index[start_token] = len(tokenizer.word_index) + 1
if end_token not in tokenizer.word_index:
    tokenizer.word_index[end_token] = len(tokenizer.word_index) + 1

print(tokenizer.word_index)


{'startseq': 1, 'example': 2, 'endseq': 3, 'this': 4, 'is': 5, 'an': 6, 'sentence': 7, 'another': 8, 'here': 9}


STEP 12: DEPLOY THE MODEL

In [13]:
# Save the model
model.save('image_captioning_model.h5')

# Load the model
loaded_model = tf.keras.models.load_model('image_captioning_model.h5')



  saving_api.save_model(


STEP 13: CONTINUOUS IMPROVEMENT

Collect feedback, retrain with new data, and iteratively improve the model based on performance.

STEP 14: HANDLING ERRORS AND EDGE CASES

In [14]:
try:
    # Code that might fail
    caption = generate_caption(features)
except Exception as e:
    print(f'Error generating caption: {e}')


Error generating caption: generate_caption() missing 3 required positional arguments: 'tokenizer', 'photo', and 'max_length'
