In [5]:
#Importing libraries
import os
import glob
import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, Input, Embedding, SimpleRNN, LSTM
from tensorflow.keras.models import Model
import tensorflow as tf

In [3]:
print(os.listdir('/kaggle/input'))

['indian-monuments-image-dataset']


In [6]:
def load_data_and_preprocess(base_path):
    class_names = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    IMG_SIZE = 224
    data = []
    
    for class_name in class_names:
        try:
            files = glob.glob(os.path.join(base_path, class_name, "*"))
            for f in files:
                img = cv2.imread(f)
                if img is None or img.size == 0:
                    print(f"Error loading image: {f}")
                    continue
                
                img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
                data.append([np.array(img), class_names.index(class_name), f])
        except Exception as e:
            print(f"Error processing class {class_name}: {str(e)}")
    
    np.random.shuffle(data)
    return data, class_names

# Load Training and Testing Data
train_base_path = '/kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/train'
test_base_path = '/kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/test'

training_data, training_class_names = load_data_and_preprocess(train_base_path)
testing_data, testing_class_names = load_data_and_preprocess(test_base_path)

print(training_class_names)


Error loading image: /kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/train/mysore_palace/84.html
Error loading image: /kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/test/Khajuraho/Chhota_Imambara
Error loading image: /kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/test/Khajuraho/mysore_palace
Error loading image: /kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/test/Khajuraho/lotus_temple
['tajmahal', 'iron_pillar', 'Ellora Caves', 'Sun Temple Konark', 'Fatehpur Sikri', 'hawa mahal pics', 'alai_darwaza', 'charminar', 'Khajuraho', 'tanjavur temple', 'qutub_minar', 'Humayun_s Tomb', 'alai_minar', 'jamali_kamali_tomb', 'victoria memorial', 'basilica_of_bom_jesus', 'Chhota_Imambara', 'golden temple', 'Ajanta Caves', 'mysore_palace', 'Gateway of India', 'lotus_temple', 'Charar-E- Sharif', 'India gate pics']


In [7]:
# Image Preprocessing Function
def preprocess_image(img):
    img = img / 255.0
    return img

# Preprocess Training and Testing Data
for i, (img, label, _) in enumerate(training_data):
    training_data[i][0] = preprocess_image(img)

for i, (img, label, _) in enumerate(testing_data):
    testing_data[i][0] = preprocess_image(img)

# Separate and Encode Labels
training_labels = np.array([label for _, label, _ in training_data])
testing_labels = np.array([label for _, label, _ in testing_data])

encoder = OneHotEncoder(sparse_output=False)
encoder.fit(training_labels.reshape(-1, 1))

training_labels_onehot = encoder.transform(training_labels.reshape(-1, 1))
testing_labels_onehot = encoder.transform(testing_labels.reshape(-1, 1))

In [8]:
# Generate captions for images
def generate_captions_for_data(data):
    captions = {}
    
    for img, _, image_file in data:
        class_name = os.path.basename(os.path.dirname(image_file))
        caption = f'{class_name} located in India.'  # Modify this line as needed to generate actual captions
        captions[image_file] = caption
    
    return captions

# Generate captions for training and testing images
train_captions = generate_captions_for_data(training_data)
test_captions = generate_captions_for_data(testing_data)


In [9]:
# Combine train and test captions
all_captions = {**train_captions, **test_captions}

# Create captions.txt file
captions_file_path = '/kaggle/working/captions.txt'

with open(captions_file_path, 'w') as file:
    for image_path, caption in all_captions.items():
        file.write(f"{image_path}\t{caption}\n")

print(f"Captions file created at: {captions_file_path}")

Captions file created at: /kaggle/working/captions.txt


In [10]:
# Load Captions
captions_data = pd.read_csv(captions_file_path, sep='\t', header=None, names=['image_path', 'caption'])

# Text Preprocessing
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(captions_data['caption'])
vocab_size = len(tokenizer.word_index) + 1

# Convert captions to sequences and pad them
max_length = max(len(caption.split()) for caption in captions_data['caption'])
captions_sequences = tokenizer.texts_to_sequences(captions_data['caption'])
captions_padded = pad_sequences(captions_sequences, maxlen=max_length, padding='post')


In [11]:
# Load Captions
captions_data = pd.read_csv(captions_file_path, sep='\t', header=None, names=['image_path', 'caption'])

# Text Preprocessing
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(captions_data['caption'])
vocab_size = len(tokenizer.word_index) + 1

# Convert captions to sequences and pad them
max_length = max(len(caption.split()) for caption in captions_data['caption'])
captions_sequences = tokenizer.texts_to_sequences(captions_data['caption'])
captions_padded = pad_sequences(captions_sequences, maxlen=max_length, padding='post')


In [12]:
# Ensure the data is correctly split and prepared
images_train = np.array([item[0] for item in training_data])
captions_train = captions_padded[:len(training_data)]
labels_train = training_labels_onehot

images_val = np.array([item[0] for item in testing_data])
captions_val = captions_padded[len(training_data):]
labels_val = testing_labels_onehot

In [13]:
# Print dataset sizes
print("Training set sizes:")
print("images_train:", len(images_train))
print("captions_train:", len(captions_train))
print("labels_train:", len(labels_train))

print("\nValidation set sizes:")
print("images_val:", len(images_val))
print("captions_val:", len(captions_val))
print("labels_val:", len(labels_val))

Training set sizes:
images_train: 3746
captions_train: 3746
labels_train: 3746

Validation set sizes:
images_val: 1059
captions_val: 1059
labels_val: 1059


In [14]:
# Data generators for efficient batch processing
datagen = ImageDataGenerator(rescale=0.2)

train_generator = datagen.flow(images_train, labels_train, batch_size=32)
validation_generator = datagen.flow(images_val, labels_val, batch_size=32)


In [15]:
# Define CNN-RNN Model
def create_cnn_rnn_model(input_shape, vocab_size, max_length):
    cnn_input = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), activation='relu')(cnn_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    
    rnn_input = Input(shape=(max_length,))
    y = Embedding(vocab_size, 128)(rnn_input)
    y = SimpleRNN(128)(y)
    
    combined = tf.keras.layers.concatenate([x, y])
    z = Dense(64, activation='relu')(combined)
    z = Dropout(0.5)(z)
    z = Dense(len(training_class_names), activation='softmax')(z)
    
    model = Model(inputs=[cnn_input, rnn_input], outputs=z)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

input_shape = (224, 224, 3)
cnn_rnn_model = create_cnn_rnn_model(input_shape, vocab_size, max_length)
cnn_rnn_model.summary()

In [16]:
# Define CNN-LSTM Model
def create_cnn_lstm_model(input_shape, vocab_size, max_length):
    cnn_input = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), activation='relu')(cnn_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    
    lstm_input = Input(shape=(max_length,))
    y = Embedding(vocab_size, 128)(lstm_input)
    y = LSTM(128)(y)
    
    combined = tf.keras.layers.concatenate([x, y])
    z = Dense(64, activation='relu')(combined)
    z = Dropout(0.5)(z)
    z = Dense(len(training_class_names), activation='softmax')(z)
    
    model = Model(inputs=[cnn_input, lstm_input], outputs=z)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

cnn_lstm_model = create_cnn_lstm_model(input_shape, vocab_size, max_length)
cnn_lstm_model.summary()


In [17]:
# Train and evaluate CNN-RNN Model
cnn_rnn_model.fit(
    [images_train, captions_train],
    labels_train,
    epochs=5,
    validation_data=([images_val, captions_val], labels_val)
)

cnn_rnn_model.evaluate([images_val, captions_val], labels_val)


Epoch 1/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 2s/step - accuracy: 0.4163 - loss: 2.6620 - val_accuracy: 0.3267 - val_loss: 5.7848
Epoch 2/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 2s/step - accuracy: 0.9791 - loss: 0.1499 - val_accuracy: 0.3267 - val_loss: 7.8356
Epoch 3/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 2s/step - accuracy: 0.9939 - loss: 0.0573 - val_accuracy: 0.3267 - val_loss: 9.2098
Epoch 4/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 2s/step - accuracy: 0.9906 - loss: 0.0430 - val_accuracy: 0.3267 - val_loss: 10.2095
Epoch 5/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 2s/step - accuracy: 0.9961 - loss: 0.0233 - val_accuracy: 0.3267 - val_loss: 10.8948
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 386ms/step - accuracy: 0.3244 - loss: 10.9320


[10.894835472106934, 0.3267233371734619]

In [18]:
# Train and evaluate CNN-LSTM Model
cnn_lstm_model.fit(
    [images_train, captions_train],
    labels_train,
    epochs=5,
    validation_data=([images_val, captions_val], labels_val)
)

cnn_lstm_model.evaluate([images_val, captions_val], labels_val)

Epoch 1/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 2s/step - accuracy: 0.1324 - loss: 3.9267 - val_accuracy: 0.2087 - val_loss: 4.9289
Epoch 2/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 2s/step - accuracy: 0.8312 - loss: 0.5975 - val_accuracy: 0.3267 - val_loss: 7.9591
Epoch 3/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 2s/step - accuracy: 0.9652 - loss: 0.1638 - val_accuracy: 0.3267 - val_loss: 9.7467
Epoch 4/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 2s/step - accuracy: 0.9837 - loss: 0.0756 - val_accuracy: 0.3267 - val_loss: 10.7932
Epoch 5/5
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 2s/step - accuracy: 0.9919 - loss: 0.0490 - val_accuracy: 0.3267 - val_loss: 11.6505
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 402ms/step - accuracy: 0.3244 - loss: 11.8734


[11.65054988861084, 0.3267233371734619]

In [19]:
def generate_caption(image_path):
    img = cv2.imread(image_path)
    # Perform image preprocessing if necessary
    caption = f"A sample caption for image {image_path}"  # Replace this with your actual caption generation logic
    return caption
def predict_cnn_rnn_model(model, tokenizer, image_path):
    # Load and preprocess the image
    img = cv2.imread(image_path)
    img = cv2.resize(img, (224, 224))
    img = preprocess_image(img)
    img = np.expand_dims(img, axis=0)
    
    # Generate caption for the image
    caption = generate_caption(image_path)
    
    # Tokenize and pad the caption
    caption_sequence = tokenizer.texts_to_sequences([caption])
    caption_padded = pad_sequences(caption_sequence, maxlen=max_length, padding='post')
    
    # Predict using the model
    predictions = model.predict([img, caption_padded])
    predicted_class_idx = np.argmax(predictions)
    predicted_class = training_class_names[predicted_class_idx]
    
    return predicted_class

# Example usage:
image_path = '/kaggle/input/indian-monuments-image-dataset/Indian-monuments/images/train/Ellora Caves/(10).jpg'  # Replace with the path to your test image
predicted_class_cnn_rnn = predict_cnn_rnn_model(cnn_rnn_model, tokenizer, image_path)
print("Predicted class (CNN-RNN):", predicted_class_cnn_rnn)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341ms/step
Predicted class (CNN-RNN): charminar
