In [16]:
import tensorflow as tf
import import_ipynb
import cv2
import numpy as np
from Attention_Model1 import encoder, decoder, tokenizer, greedy_search_predict

# Reset Keras' global state before building, so re-running this cell starts
# from a clean slate.
# NOTE(review): the inference cell looks layers up by name (e.g. 'concatenate');
# if any of those are auto-generated names they presumably depend on this reset
# and on the construction order below -- confirm before reordering.
tf.keras.backend.clear_session()

# Symbolic inputs: two 224x224 three-channel images and a length-26 caption
# sequence (presumably token ids -- TODO confirm against the tokenizer).
image1 = tf.keras.layers.Input(shape=(224, 224, 3))
image2 = tf.keras.layers.Input(shape=(224, 224, 3))
caption = tf.keras.layers.Input(shape=(26,))

# Feed both image inputs to the encoder imported from Attention_Model1.
encoder_output = encoder(image1, image2)  # passed as two positional tensors, not as a list

# Instantiate the imported decoder and call it on the encoder output and the caption input.
output = decoder()(encoder_output, caption)  # again two positional arguments, not a list

# Wrap the graph in a single Model; the input order here is [image1, image2, caption].
model = tf.keras.Model(inputs=[image1, image2, caption], outputs=output)

# Print the layer-by-layer summary of the assembled model.
model.summary()


In [21]:
from grammify import restructure
def _read_image_as_batch(path, size=(224, 224)):
    """Load one image file and return it as a scaled, resized batch of one.

    The file is read with ``cv2.IMREAD_UNCHANGED``, divided by 255, resized to
    ``size`` with nearest-neighbour interpolation, and given a leading axis of
    length 1 via ``tf.expand_dims``.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``path``.
    """
    pixels = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if pixels is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail here with a message that names the path.
        raise FileNotFoundError("Could not read image file: {!r}".format(path))
    scaled = pixels / 255
    resized = cv2.resize(scaled, size, interpolation=cv2.INTER_NEAREST)
    return tf.expand_dims(resized, axis=0)


def greedy_search_predict(image1, image2, model=model, weights_file='Encoder_Decoder_global_attention.h5'):
    """Generate a caption for a pair of image files by greedy decoding.

    Note: this definition shadows the ``greedy_search_predict`` imported from
    ``Attention_Model1`` in the notebook's first cell.

    Args:
        image1: path to the first image file.
        image2: path to the second image file.
        model: Keras model exposing the layers 'image_encoder', 'bkdense',
            'concatenate', 'encoder_batch_norm', 'encoder_dropout' and
            'decoder' (the latter with a ``onestepdecoder`` method).
        weights_file: path of the weights file loaded into ``model`` before
            predicting. Previously this argument was ignored and the default
            file name was always loaded.

    Returns:
        Whatever ``restructure`` returns for the decoded token text.

    Raises:
        FileNotFoundError: if either image cannot be read.
    """
    image1 = _read_image_as_batch(image1)
    image2 = _read_image_as_batch(image2)

    # Honour the caller-supplied weights path (default unchanged).
    model.load_weights(weights_file)

    # Run each view through the shared backbone, then the shared dense layer,
    # in the same order as the original implementation.
    image1 = model.get_layer('image_encoder')(image1)
    image2 = model.get_layer('image_encoder')(image2)
    image1 = model.get_layer('bkdense')(image1)
    image2 = model.get_layer('bkdense')(image2)

    concat = model.get_layer('concatenate')([image1, image2])
    enc_op = model.get_layer('encoder_batch_norm')(concat)
    enc_op = model.get_layer('encoder_dropout')(enc_op)

    # Initial decoder state: zeros shaped like one slice of the encoder output.
    decoder_h = tf.zeros_like(enc_op[:, 0])

    # The token '0' is used both to seed decoding and as the stop marker.
    # NOTE(review): confirm this matches the start/end tokens used in training.
    boundary_seq = tokenizer.texts_to_sequences(['0'])
    caption = np.array(boundary_seq)
    stop_token = np.squeeze(boundary_seq)

    decoder_layer = model.get_layer('decoder')
    token_ids = []
    # 26 decoding steps at most, matching the caption Input shape of the model.
    for _ in range(26):
        output, decoder_h, _attention = decoder_layer.onestepdecoder(caption, enc_op, decoder_h)

        # Greedy choice: index of the highest-scoring entry becomes the next input.
        next_token = tf.argmax(output, axis=-1)
        caption = np.array([next_token])
        if next_token == stop_token:
            break
        token_ids.append(tf.squeeze(next_token).numpy())
    return restructure(tokenizer.sequences_to_texts([token_ids])[0])
# Demo call: the same file ('image001.png') is passed for both image arguments.
print("The generated caption is: ", greedy_search_predict('image001.png', 'image001.png'))






The generated caption is:  Impression:
- Stable subsegmental atelectasis and slight left basilar airspace disease.
- The left basilar silhouette represents congestion and edema of the lobes.


In [None]:
import sys
import cv2
import numpy as np
from Attention_Model1 import greedy_search_predict

def _load_normalized_image(image_path):
    """Read an image file, resize it to 224x224 and scale it to float32 in [0, 1].

    Raises:
        FileNotFoundError: if OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for a missing/unreadable file; without this
        # check the failure surfaces later as an opaque cv2.resize error.
        raise FileNotFoundError("Could not read image file: {!r}".format(image_path))
    image = cv2.resize(image, (224, 224))
    return image.astype(np.float32) / 255.0


def generate_caption(image_path1, image_path2):
    """Load two image files, preprocess them, and return the predicted caption.

    Args:
        image_path1: path to the first image file.
        image_path2: path to the second image file.

    Returns:
        The value returned by ``greedy_search_predict`` for the two
        preprocessed images.

    Raises:
        FileNotFoundError: if either image cannot be read.
    """
    # NOTE(review): the greedy_search_predict defined earlier in this notebook
    # takes file paths and does its own loading. This cell re-imports the name
    # from Attention_Model1, whose signature is not visible here -- confirm it
    # accepts already-preprocessed arrays.
    image1 = _load_normalized_image(image_path1)
    image2 = _load_normalized_image(image_path2)

    return greedy_search_predict(image1, image2)

if __name__ == "__main__":
    # Command-line entry point: expects exactly two image paths after the
    # program name.
    # NOTE(review): when executed as a notebook cell rather than as a script,
    # sys.argv presumably holds the kernel's launch arguments, not image
    # paths -- confirm this cell is meant to be exported as generate_caption.py.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python generate_caption.py <image_path1> <image_path2>")
        sys.exit(1)

    # Unpack the two paths and echo them back to the user.
    image_path1, image_path2 = cli_args
    print("Image1: ", image_path1)
    print("Image2: ", image_path2)

    # Run the captioning pipeline and report the result.
    caption = generate_caption(image_path1, image_path2)
    print("The generated caption is:", caption)
