# Load drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Step 1 — Importing required libraries for Image Captioning.

In [None]:
# Install the sentencepiece library, often used for text tokenization
!pip install sentencepiece

In [None]:
# Install the sentencepiece library, often used for text tokenization
!pip install sentencepiece

# Import necessary libraries
import os  # For operating system operations like file handling
import pickle  # For serializing and deserializing Python objects
import numpy as np  # For numerical computations
from tqdm.notebook import tqdm  # For displaying progress bars in notebooks
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
from textwrap import wrap  # For wrapping text
import cv2  # OpenCV library for image processing
import re  # For regular expressions
import random  # For generating random numbers
from PIL import Image, ImageFilter  # For image processing using the Pillow library

# Import TensorFlow and Keras libraries for deep learning
from tensorflow.keras.preprocessing.text import Tokenizer  # For text tokenization
from keras import applications  # For accessing pre-trained models and other utilities
from tensorflow.keras.preprocessing.image import load_img, img_to_array  # For loading and converting images
from tensorflow.keras.applications.efficientnet import EfficientNetB7, preprocess_input  # For using EfficientNetB7 model and preprocessing

# Import additional Keras utilities for processing sequences and categorical data
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences to the same length
from tensorflow.keras.utils import to_categorical, plot_model  # For converting labels to categorical and plotting the model architecture
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, Reshape, concatenate  # For building neural network layers
from tensorflow.keras.models import Model  # For creating a Keras model
from keras import callbacks  # For using callbacks during training
from keras.models import Sequential  # For creating a sequential Keras model
from tensorflow.keras.models import load_model  # For loading a saved Keras model
from keras.layers import Bidirectional  # For creating bidirectional LSTM layers

# The following sections would typically include:
# - Data loading and preprocessing
# - Model architecture definition
# - Compilation and training of the model
# - Evaluation and inference steps


# Step 2 - Visualization of flickr dataset

In [None]:
# path to the image dataset
img_dir='/content/drive/MyDrive/Mini Project/data/Images'
# path to the caption dataset
cap_dir='/content/drive/MyDrive/Mini Project/youtube/captions.txt'

In [None]:
def readImage(path):
    """
    Load an image file as a 250x250 RGB float array scaled to [0, 1].

    Args:
    path (str): Location of the image file on disk.

    Returns:
    numpy.ndarray: The array from ``img_to_array`` divided by 255.
    """
    # Decode + resize in one call, then convert the PIL image to an array
    pixels = img_to_array(load_img(path, color_mode='rgb', target_size=(250, 250)))
    # 8-bit intensities -> unit range so plt.imshow treats them as floats
    return pixels / 255.0

def display_images(temp_df):
    """
    Displays up to 15 images and their captions in a 5x5 subplot grid.

    Args:
    temp_df (pandas.DataFrame): A DataFrame with an ``image`` column (file names
        relative to the global ``img_dir``) and a ``caption`` column.

    Only the first 15 rows are shown; frames with fewer rows are displayed in
    full instead of raising a KeyError on the missing index.
    """
    # Reset the index so rows can be addressed positionally as 0..n-1
    temp_df = temp_df.reset_index(drop=True)
    plt.figure(figsize=(15, 15))
    # Never index past the end of the frame
    n_images = min(15, len(temp_df))
    for i in range(n_images):
        # Subplot positions are 1-based
        plt.subplot(5, 5, i + 1)
        image = readImage(img_dir + '/' + temp_df.image[i])
        plt.imshow(image)
        # Wrap long captions at 20 characters so titles do not overlap
        plt.title("\n".join(wrap(temp_df.caption[i], 20)))
        plt.axis("off")
    # Spacing is a figure-level setting: apply it once after all axes exist
    plt.subplots_adjust(hspace=0.7, wspace=0.3)


In [None]:
# Read the CSV file containing image captions into a DataFrame
captionlist = pd.read_csv(cap_dir)

# Sample 15 random entries from the DataFrame and display the corresponding images and captions
display_images(captionlist.sample(15))


In [None]:
# Save the current `features` dictionary to Drive.
# NOTE(review): `features` is first assigned in later cells (Step 3 extraction /
# Step 4 load), so this cell raises NameError on a fresh top-to-bottom run.
# Confirm whether this leftover cell is still needed.
print('length of feature vectors for training : ',len(features))

#saving in drive
print('saving.....')
# Context manager guarantees the file is flushed and closed even if dump fails
with open('/content/drive/MyDrive/Mini Project/minorproject/vgg16placeshybridfeatures.pkl', 'wb') as f:
    pickle.dump(features, f)
print('Shape of a feature vector : ',features['2513260012_03d33305cf'].shape)
print('saved!')

# Step 3 — Extract features from the images using EfficientNetB7.

In [None]:
# No of images in the dataset
print(len(os.listdir('/content/drive/MyDrive/Mini Project/data/Images')))

In [None]:
# EfficientNetB7 is used as a fixed feature extractor. The classification head is
# dropped (include_top=False) and global average pooling is applied to the final
# feature map (pooling='avg'), so each image yields one flat vector per batch row.
# The caption model defined later in this notebook takes Input(shape=(2560,)) and
# the generator feeds it features[key][0], so a pooled vector is required here;
# the previous `layers[-3]` of the headless network is a spatial feature map.
model = EfficientNetB7(weights='imagenet', include_top=False, pooling='avg')

# Freeze the layers to prevent them from being trainable
for layer in model.layers:
    layer.trainable = False

# summary() prints the architecture itself and returns None, so do not wrap it in print()
model.summary()


In [None]:
from keras.preprocessing.image import load_img, img_to_array
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt

# Path to your image
img_path = '/content/drive/MyDrive/Mini Project/data/Images/2513260012_03d33305cf.jpg'

# Load the image resized to 600x600 using Lanczos interpolation (not bicubic)
img = load_img(img_path, target_size=(600, 600), interpolation='lanczos')

# Convert the image to a NumPy array
img_array = img_to_array(img)

# Convert the NumPy array back to a PIL Image (cast to uint8 first) so PIL filters can be applied
pil_img = Image.fromarray(img_array.astype('uint8'))

# Apply edge enhancement filter on the image
enhanced_image = pil_img.filter(ImageFilter.EDGE_ENHANCE)

# Convert the enhanced image back to NumPy array
enhanced_img_array = img_to_array(enhanced_image)

# Display the original and enhanced images side by side
plt.subplot(1, 2, 1)
plt.imshow(img_array / 255.0)  # Normalize pixel values for display
plt.title('Original Image')

plt.subplot(1, 2, 2)
plt.imshow(enhanced_img_array / 255.0)  # Normalize pixel values for display
plt.title('Enhanced Image (Edge Enhancement)')

plt.show()


In [None]:
# to show the resize image. demo purpose only

from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt

# Path to your image
img_path = '/content/drive/MyDrive/Mini Project/data/Images/2513260012_03d33305cf.jpg'

# Load the original image at its native resolution
img2 = load_img(img_path)

# Load the image resized to 600x600 pixels with Lanczos interpolation
img = load_img(img_path, target_size=(600, 600), interpolation='lanczos')

# Convert the resized image to a NumPy array
img_array = img_to_array(img)

# Display the original and resized images side by side
plt.subplot(1, 2, 1)
plt.imshow(img2)
plt.title('Original Image')

plt.subplot(1, 2, 2)
plt.imshow(img_array / 255.0)  # Normalize pixel values for display
# Title now names the interpolation that is actually used above
plt.title('Resized Image (Lanczos Interpolation)')

plt.show()


In [None]:
# Build {image_id: feature array} for every file in the image directory
features = {}
directory = '/content/drive/MyDrive/Mini Project/data/Images'

for img_name in tqdm(os.listdir(directory)):
    # Resize to 600x600 (Lanczos) while loading
    resized = load_img(directory + '/' + img_name, target_size=(600, 600), interpolation='lanczos')

    # Round-trip through PIL (uint8) to apply the edge-enhancement filter
    sharpened = Image.fromarray(img_to_array(resized).astype('uint8')).filter(ImageFilter.EDGE_ENHANCE)

    # Add a leading batch axis: (H, W, C) -> (1, H, W, C)
    batch = np.expand_dims(img_to_array(sharpened), axis=0)

    # EfficientNet preprocessing followed by a forward pass through the extractor
    extracted = model.predict(preprocess_input(batch), verbose=0)

    # Key is the file name up to its first '.', matching how caption IDs are derived
    features[img_name.split('.')[0]] = extracted


In [None]:
# Report how many images have an extracted feature
print('length of feature vectors for training : ', len(features))

# Saving the feature vectors to Google Drive
print('saving.....')
# Context manager guarantees the pickle is flushed and the handle closed
with open('/content/drive/MyDrive/Mini Project/minorproject/edgeehancedefficient.pkl', 'wb') as f:
    pickle.dump(features, f)

# Print the shape of a specific feature vector for verification
print('Shape of a feature vector : ', features['2513260012_03d33305cf'].shape)

# Confirm the features have been saved
print('saved!')


# Step 4 — Load the feature


In [None]:
# Load the pre-extracted image features (a dict that later cells index by image ID).
# NOTE(review): this path ('efficientnetb7/allextractedfeature.pkl') is not the file
# written by the Step 3 save cell ('minorproject/edgeehancedefficient.pkl') — confirm
# which extraction run produced it.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/Mini Project/efficientnetb7/allextractedfeature.pkl', 'rb') as f:
    features = pickle.load(f)


In [None]:
# Print the shape of a specific feature vector
print('Shape of a feature vector : ', features['2513260012_03d33305cf'].shape)

# Print the total number of images (keys) in the features dictionary
print("Total images : ", len(features.keys()))


Skip Step 5 and step 6 if paraphrased already.

# Step 5 — Load descriptions.

Only need to run Step 5 once, then run Step 6 to paraphrase.

In [None]:
# Read captions.txt from Google Drive, skipping the header row
with open('/content/drive/MyDrive/Mini Project/youtube/captions.txt', 'r') as f:
    next(f)  # Skip the first line (header)
    captions_doc = f.read()

# Show only a short preview: printing the whole file floods the notebook output
print(captions_doc[:1000])


In [None]:
# image_id -> list of raw caption strings
mapping = {}

for line in tqdm(captions_doc.split('\n')):
    fields = line.split(',')

    # Only lines with both an image name and some caption text are usable
    if len(fields) >= 2:
        # Image ID is the file name up to its first '.'
        key = fields[0].split('.')[0]
        # Commas inside the caption were split apart above; rejoin the pieces with spaces
        mapping.setdefault(key, []).append(" ".join(fields[1:]))


In [None]:
len(mapping)

In [None]:
print('Before cleaning : \n')
mapping['1000268201_693b08cb0e']

In [None]:
with open('/content/drive/MyDrive/Mini Project/minorproject/paramapping.pkl', 'wb') as f:
    # Pickle the dictionary
    pickle.dump(mapping, f)

# Step 6 — Paraphrase sentences.

In [None]:
# Open the file in binary read mode
with open('/content/drive/MyDrive/Mini Project/minorproject/paramapping.pkl', 'rb') as f:
    # Unpickle the dictionary
    paramapping = pickle.load(f)

# Print the dictionary
print(paramapping['1000268201_693b08cb0e'])

In [None]:
# Number of images that already carry exactly 10 descriptions
# (each True from the comparison counts as 1 in the sum)
i = sum(len(descriptions) == 10 for descriptions in tqdm(paramapping.values()))

# Print the count of images with 10 descriptions
print(i)


In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text):
    """
    Paraphrase one sentence with the Pegasus model loaded in this cell.

    Args:
    input_text (str): Sentence to paraphrase.

    Returns:
    list[str]: Decoded output of ``model.generate`` (one sequence is requested).
    """
    # Encode as a one-element batch, truncated to 60 tokens, on the active device
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)

    # NOTE(review): temperature is passed without do_sample; the transformers docs
    # describe temperature as a sampling parameter — confirm whether sampling was intended.
    generated = model.generate(
        **encoded,
        max_length=60,
        num_beams=1,
        num_return_sequences=1,
        temperature=1.5,
    )

    # Token ids -> text, without special tokens
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


In [None]:
def paraphrase_and_yield(mapping):
    """
    Generator that appends paraphrases to every image whose caption list does
    NOT already contain exactly 10 entries.

    For each such image, every existing description is passed through
    ``get_response`` and all returned strings are appended to that image's list
    in ``mapping`` (mutated in place). The whole ``mapping`` is yielded after
    each processed image so the caller can checkpoint it.
    """
    for image_id, descriptions in tqdm(mapping.items()):
        # Images that already have 10 descriptions are left untouched
        if len(descriptions) == 10:
            continue

        # get_response returns a list per description; flatten them into one list.
        # The list is fully built before the in-place extend below.
        extra = [text for description in descriptions for text in get_response(description)]

        mapping[image_id].extend(extra)

        yield mapping


In [None]:
# Checkpoint the mapping after every paraphrased image so an interrupted
# session can resume from the file.
checkpoint_path = '/content/drive/MyDrive/Mini Project/minorproject/paramapping.pkl'
tmp_path = checkpoint_path + '.tmp'

for updated_mapping in paraphrase_and_yield(paramapping):
    # Write to a temporary file first and swap it in with os.replace: if the
    # runtime dies mid-write, the previous checkpoint is still intact instead
    # of being left truncated.
    with open(tmp_path, 'wb') as f:
        pickle.dump(updated_mapping, f)
    os.replace(tmp_path, checkpoint_path)


# Step 7 — Clean and Save image descriptions.

In [None]:
# Open the file in binary read mode
with open('/content/drive/MyDrive/Mini Project/minorproject/paramapping.pkl', 'rb') as f:
    # Unpickle the dictionary
    mapping = pickle.load(f)

# Print the dictionary
print(mapping['1000268201_693b08cb0e'])

In [None]:
# function to clean the description

def clean(mapping):
    """
    Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of everything except ASCII letters
    and spaces, reduced to words longer than one character, and wrapped as
    ``'startseq <words> endseq'``.

    Existing ``startseq`` / ``endseq`` tokens are dropped before wrapping, so
    running this function again on already-cleaned captions (e.g. re-executing
    the notebook cell) does not stack the markers.

    Args:
    mapping (dict): image_id -> list of caption strings; the lists are mutated.

    Returns:
    None
    """
    markers = {'startseq', 'endseq'}
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc. (raw string: no escape warnings)
            caption = re.sub(r'[^A-Za-z ]', '', caption)
            # split() collapses runs of spaces, so no separate whitespace pass is needed
            words = [word for word in caption.split()
                     if len(word) > 1 and word not in markers]
            # add start and end tags to the caption
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
print('Before cleaning : \n')
mapping['1000268201_693b08cb0e']

In [None]:
# preprocess the text
clean(mapping)

In [None]:
print('After cleaning : \n')

mapping['1000268201_693b08cb0e']

# Step 8 - Tokenize the captions (all captions are tokenized in full)

In [None]:
# Flatten the per-image caption lists into a single list, preserving dict order
all_captions = [caption for key in mapping for caption in mapping[key]]


In [None]:
len(all_captions)

In [None]:
all_captions[:10]

In [None]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
def search_sentences(sentences, target_word):
    """
    Return the sentences that contain ``target_word`` as a case-insensitive
    substring (so "four" also matches "fourteen"), in their original order.
    """
    return [
        sentence
        for sentence in sentences
        if target_word.lower() in sentence.lower()
    ]

# Example usage with all_captions and target_word
target_word = "four"

matching_sentences = search_sentences(all_captions, target_word)

# Print the matching sentences
print(f"Sentences containing the word '{target_word}':")
for sentence in matching_sentences:
    print(sentence)


In [None]:
'seven' in tokenizer.word_index.keys()

In [None]:
vocab_size

In [None]:
# Calculate the maximum length of captions in terms of number of words
max_length = max(len(caption.split()) for caption in all_captions)

# Print the maximum length
print(max_length)


In [None]:
# count the words
# Get the word counts
word_counts = tokenizer.word_counts

# Convert to DataFrame
word_counts_df = pd.DataFrame(list(word_counts.items()), columns=['Word', 'Count'])

# Sort the DataFrame by count in descending order
word_counts_df = word_counts_df.sort_values(by='Count', ascending=False)
# Reset the index to keep the original index
word_counts_df = word_counts_df.reset_index(drop=True)

# Print the DataFrame
print(word_counts_df)


In [None]:
# display the count of the words

topn = 50


# Function to plot histogram
def plthist(dfsub, title="The top 50 most frequently appearing words"):
    """
    Draw a bar chart of word counts.

    Args:
    dfsub (pandas.DataFrame): Frame with a 'Word' and a 'Count' column; its
        index is used for the bar positions and 'Word' for the tick labels.
    title (str): Title shown above the chart.
    """
    # Wide, short figure so many rotated word labels fit on the x axis
    plt.figure(figsize=(30,3))
    plt.bar(dfsub.index,dfsub['Count'],color ='r')
    plt.yticks(fontsize=20,color ='b')
    # Replace the numeric bar positions with the words themselves
    plt.xticks(dfsub.index,dfsub["Word"],rotation=90,fontsize=20,color ='b')
    plt.title(title,fontsize=20)
    plt.show()


# Plot the top 50 most frequent words
plthist(word_counts_df.iloc[:topn,:],
        title="The top 50 most frequently appearing words")

# Plot the least 50 most frequent words
plthist(word_counts_df.iloc[-topn:,:],
        title="The least 50 most frequently appearing words")

# Step 9 - Embedding words with GloVe.6B

In [None]:
tokenizer.word_index['car']

In [None]:
# emb_mat[119]

In [None]:
emb_dim=200
#len(word_map)=9385
emb_mat= np.zeros((vocab_size,emb_dim))

In [None]:
emb_mat.shape

In [None]:
# Load GloVe embeddings and fill emb_mat rows for words in the tokenizer vocabulary.
# Words without a GloVe vector keep their all-zero row.
word_index = tokenizer.word_index  # looked up once instead of on every line
# Explicit encoding so decoding does not depend on the platform default
with open('/content/drive/MyDrive/Mini Project/data/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word, *emb = parts
        row = word_index.get(word)
        if row is not None:
            emb_mat[row] = np.array(emb, dtype="float32")[:emb_dim]

# Step 10 - Train Test Split

In [None]:
type(mapping)

In [None]:
# Get list of image_ids from mapping dictionary keys
image_ids = list(mapping.keys())

# Define the split ratio
split_ratio = 0.889878

# Calculate the index to split the data
split_index = int(len(image_ids) * split_ratio)

# Split image_ids into training and test sets
train = image_ids[:split_index]
test = image_ids[split_index:]

# Print the lengths of the training and test sets
print("Length of test set:", len(test))
print("Length of train set:", len(train))


In [None]:
train[0]

# Step 11 — Data generator with batching to reduce memory usage

In [None]:
# '<start> girl going into wooden building <end>'
# X                                                      y
# <start>                                                girl
# <start> girl                                           going
# <start> girl going                                     into
# <start> girl going into                                wooden
# <start> girl going into wooden                         building
# <start> girl going into wooden building                <end>
# <start> girl going into wooden building <end>

In [None]:
# JUST FOR CHECKING?
cap=mapping['386656845_4e77c3e3da'][0]
print(cap)
seq=tokenizer.texts_to_sequences([cap])[0]
print(seq, '\n\n')
for i in range(1, len(seq)):
  #  split into input and output pairs
  in_seq, out_seq = seq[:i], seq[i]
  # pad input sequence
  in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
  print('in_seq : ',in_seq)
  # # encode out sequence
  out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
  print('out_seq : ',len(out_seq)) #here we use to_categorical to make one hot encoding of output having size of vocab_size


In [None]:
type(features[train[0]])

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """
    Endless generator of training batches for the captioning model.

    Every caption of an image is expanded into (prefix -> next word) pairs:
    the prefix is left-padded to ``max_length`` and the next word is one-hot
    encoded over ``vocab_size``. A batch is emitted once ``batch_size`` IMAGES
    (not samples) have been accumulated, so the number of rows per batch varies.

    Yields:
    ([image_features, padded_prefixes], one_hot_targets) as NumPy arrays.
    """
    img_rows, seq_rows, target_rows = [], [], []
    images_seen = 0
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                # integer-encode the caption
                encoded = tokenizer.texts_to_sequences([caption])[0]
                # one training pair per position: words [0, cut) predict word `cut`
                for cut in range(1, len(encoded)):
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]

                    # features[key] carries a leading batch axis; [0] removes it
                    img_rows.append(features[key][0])
                    seq_rows.append(prefix)
                    target_rows.append(target)

            # Emit and reset once enough images have been collected.
            # Accumulators persist across passes of the outer loop, so a partial
            # batch at the end of data_keys rolls over into the next pass.
            if images_seen == batch_size:
                yield [np.array(img_rows), np.array(seq_rows)], np.array(target_rows)
                img_rows, seq_rows, target_rows = [], [], []
                images_seen = 0

# Step 12 - Model Creation and saving

In [None]:
# Image input: one 2560-dim feature vector per image
inputs1 = Input(shape=(2560,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
fe3 = Reshape((1, 256))(fe2)  # (batch, 1, 256): a single "time step" for concatenation

# Text input: padded word-index sequence, embedded with the frozen GloVe matrix
inputs2 = Input(shape=(max_length,))
emb1 = Embedding(input_dim=vocab_size, output_dim=emb_dim, weights=[emb_mat], trainable=False)(inputs2)
dr1 = Dropout(0.2)(emb1)
lstm1 = LSTM(128, return_sequences=True)(dr1)
dr2 = Dropout(0.2)(lstm1)
lstm2 = LSTM(256, return_sequences=False)(dr2)  # final state only: (batch, 256)
# Concatenate requires tensors of equal rank. lstm2 is rank 2 while fe3 is rank 3,
# so give the text summary the same (batch, 1, 256) layout before joining.
# NOTE(review): the alternative reading is return_sequences=True on lstm2 — confirm
# which architecture the saved checkpoint was trained with.
txt3 = Reshape((1, 256))(lstm2)

# Stack image and text vectors along the time axis -> (batch, 2, 256)
concatenated = concatenate([fe3, txt3], axis=1)

# Fuse the two-step sequence into a single 256-dim vector
conc1 = LSTM(256)(concatenated)
conc2 = Dropout(0.4)(conc1)

# Decoder layers
decoder1 = add([fe2, conc2])  # Skip connection from the projected image features
decoder2 = Dense(1000, activation='relu')(decoder1)
output = Dense(vocab_size, activation='softmax')(decoder2)

# Define the model
model = Model(inputs=[inputs1, inputs2], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Plot the model architecture
plot_model(model, show_shapes=True)


In [None]:
# train the model
epochs = 20
batch_size = 41
steps = len(train) // batch_size

for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# save the model
model.save('/content/drive/MyDrive/Mini Project/minorproject'+'/bluescore90highest.keras')

In [None]:
model.save_weights('/content/drive/MyDrive/Mini Project/minorproject'+'/bluescore90highestweight.keras')

# Step 13 - Load the final Model

In [None]:
# load model
model=load_model('/content/drive/MyDrive/Mini Project/minorproject'+'/bluescore90highest.keras')

In [None]:
model.load_weights('/content/drive/MyDrive/Mini Project/minorproject'+'/bluescore90highestweight.keras')

# Step 14 - Generate Captions for the Image

In [None]:
def idx_to_word(integer, tokenizer):
    """
    Map a vocabulary index back to its word.

    Uses the tokenizer's ``index_word`` reverse map when it is available
    (constant-time lookup, important because this runs once per generated
    token) and falls back to scanning ``word_index`` otherwise.

    Args:
    integer (int): Vocabulary index, e.g. the argmax of a prediction.
    tokenizer: Object exposing ``word_index`` (word -> index) and optionally
        ``index_word`` (index -> word).

    Returns:
    str | None: The word, or None when the index is not in the vocabulary.
    """
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        word = index_word.get(integer)
        if word is not None:
            return word
    # Fallback keeps the original behaviour for tokenizers without a reverse map
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """
    Greedily decode a caption for one image feature.

    Starting from 'startseq', the most probable next word is appended until
    'endseq' is produced, an index maps to no word, or ``max_length`` words
    have been generated.

    Returns:
    str: The generated text, including the leading 'startseq' (and the
    trailing 'endseq' when it was reached).
    """
    caption = 'startseq'
    for _ in range(max_length):
        # encode and pad the text generated so far
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], max_length)
        # greedy choice: index of the highest-probability word
        next_index = np.argmax(model.predict([image, padded], verbose=0))
        next_word = idx_to_word(next_index, tokenizer)
        # an index with no word (e.g. padding) ends decoding
        if next_word is None:
            break
        caption = caption + " " + next_word
        # the end tag is kept in the output, then decoding stops
        if next_word == 'endseq':
            break

    return caption

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# validate with test data
# Collect tokenised references and predictions for every test image
actual, predicted = [], []

for key in tqdm(test):
    # reference captions for this image
    captions = mapping[key]
    # model prediction, split into tokens
    hypothesis = predict_caption(model, features[key], tokenizer, max_length).split()
    # references as lists of tokens
    references = [caption.split() for caption in captions]
    actual.append(references)
    predicted.append(hypothesis)



In [None]:
from nltk.translate.bleu_score import corpus_bleu
# Rebuild only the tokenised reference captions for the test images.
# `predicted` is deliberately NOT reset here: it holds the model outputs
# computed in the previous cell, and clearing it would leave the metric cells
# below with zero hypotheses.
actual = []

for key in tqdm(test):
    # get actual captions and split each into words
    actual.append([caption.split() for caption in mapping[key]])



In [None]:
# to take only 5 captions for testing
result = [
    list_of_lists[::2]
    for list_of_lists in actual
]

# Printing the result
print(result)

In [None]:
# to make list into sentences

actualsentences = [
    [
        ' '.join(inner_list)
        for inner_list in outer_list
    ]
    for outer_list in result
]

predictedsentences = [
        ' '.join(inner_list)
    for inner_list in predicted
]

# Printing the sentences
print(actualsentences)
print(predictedsentences)
print(len(actualsentences))
print(len(predictedsentences))

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# from nltk.translate.glue_score import corpus_glue
from nltk.translate.chrf_score import corpus_chrf
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')


# --- BLEU -------------------------------------------------------------------
# Individual n-gram BLEU puts the whole weight on a single n-gram order.
# Only for n = 1 is this identical to the cumulative score.
individual_weights = {
    1: (1, 0, 0, 0),
    2: (0, 1, 0, 0),
    3: (0, 0, 1, 0),
    4: (0, 0, 0, 1),
}
print("Individual n-gram BLEU (all weight on one n-gram order)")
for order, weights in individual_weights.items():
    print('Corpus BLEU individual %d-gram: %f' % (order, corpus_bleu(result, predicted, weights=weights)))
print("\n")

# Cumulative n-gram BLEU spreads the weight evenly over orders 1..n.
# 1/3 is used instead of 0.33 so the weights sum to exactly 1.
cumulative_weights = {
    1: (1, 0, 0, 0),
    2: (0.5, 0.5, 0, 0),
    3: (1 / 3, 1 / 3, 1 / 3, 0),
    4: (0.25, 0.25, 0.25, 0.25),
}
print("Cumulative n-gram BLEU (equal weight on orders 1..n)")
for order, weights in cumulative_weights.items():
    print('Corpus BLEU cumulative %d-gram: %f' % (order, corpus_bleu(result, predicted, weights=weights)))

# --- GLEU -------------------------------------------------------------------
# The previous "Glue" line re-printed default corpus_bleu (same as cumulative
# 4-gram above). NLTK's GLEU implementation lives in nltk.translate.gleu_score.
from nltk.translate.gleu_score import corpus_gleu
print("\n\n")
print('Corpus GLEU Score: %f' % corpus_gleu(result, predicted))

# --- METEOR -----------------------------------------------------------------
# Sentence-level METEOR (rounded to 4 decimals) averaged over all predictions
print("\n\n")
totmeteor = 0
for sen in range(len(predicted)):
  totmeteor+=float(round(meteor_score(result[sen],predicted[sen]),4))
print('Meteor Score: ', totmeteor/len(predicted))
print("\n\n")

# --- ROUGE ------------------------------------------------------------------
import evaluate
rouge = evaluate.load('rouge')
roguescore = rouge.compute(predictions=predictedsentences, references=actualsentences)['rougeL']
print('ROUGE-L score: ',roguescore)



# Step 15 - Visualize the Results

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """
    Show an image from the dataset, print its reference captions and the
    caption predicted by the trained model.

    Args:
    image_name (str): File name inside the Images directory,
        e.g. "1001773457_577c3a7d70.jpg".
    """
    # image ID is the file name up to its first '.', matching the mapping keys
    image_id = image_name.split('.')[0]
    img_path = os.path.join('/content/drive/MyDrive/Mini Project/data/Images/', image_name)
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('---------------------Actual---------------------')
    for caption in captions:
        # print this image's caption (previously printed the global `actual` list)
        print(caption)
    # predict the caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [None]:
generate_caption("1001773457_577c3a7d70.jpg")

In [None]:
generate_caption("1002674143_1b742ab4b8.jpg")

In [None]:
generate_caption("101669240_b2d3e7f17b.jpg")

# Product - Test with Real Image

In [None]:
efficientnet_model = EfficientNetB7()
# restructure the model
efficientnet_model = Model(inputs=efficientnet_model.inputs, outputs=efficientnet_model.layers[-3].output)
for layer in efficientnet_model.layers:
    layer.trainable = False

In [None]:
# to convert the caption to malayalam english to malayalam api

!pip install deep-translator
from deep_translator import GoogleTranslator
translated = GoogleTranslator(source='auto', target='ml').translate("keep it up, you are awesome")

def translate_to_malayalam(text):
    """Translate English ``text`` to Malayalam ('ml') with deep_translator's GoogleTranslator."""
    translator = GoogleTranslator(source='en', target='ml')
    return translator.translate(text)


In [None]:
# image_path = '/content/drive/MyDrive/Mini Project/data/fortesting.jpeg'
image_path = '/content/drive/MyDrive/Mini Project/data/fortest/Copy of 000000581585.jpg'


img = Image.open(image_path)
plt.imshow(img)
# load image resized to 600x600 (Lanczos)
image = load_img(image_path, target_size=(600, 600), interpolation='lanczos')
# convert image pixels to numpy array
image = img_to_array(image)
# Convert the NumPy array to PIL Image
pil_img = Image.fromarray(image.astype('uint8'))

# Apply edge enhancement filter on the image
enhanced_image = pil_img.filter(ImageFilter.EDGE_ENHANCE)

# Convert the enhanced image back to NumPy array
image = img_to_array(enhanced_image)
# add the batch axis expected by the model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for EfficientNet
image = preprocess_input(image)
# extract features
feature = efficientnet_model.predict(image, verbose=0)
# predict from the trained model
pr=predict_caption(model, feature, tokenizer, max_length)
# Remove only the marker tokens. Slicing [1:-1] would also drop the last real
# word whenever decoding stops at max_length without producing 'endseq'.
pr=' '.join(word for word in pr.split(' ') if word not in ('startseq', 'endseq'))
print(pr)
print(translate_to_malayalam(pr))

In [None]:
plt.rcParams["font.family"] = "sans-serif"

test_dir = '/content/drive/MyDrive/Mini Project/data/fortest'
fig, axs = plt.subplots(2, 5, figsize=(15, 6))
grid = axs.ravel()
malayalam_caption = {}

# Select image files first so grid positions follow the images that are actually
# shown; indexing by the raw directory position left gaps for non-image entries
# and raised IndexError once the folder held more than 10 entries.
image_files = [
    filename for filename in os.listdir(test_dir)
    if filename.endswith(".jpg") or filename.endswith(".png")
]

# zip stops at the grid size, so extra images are skipped rather than crashing
for ax, filename in zip(grid, image_files):
    # Load and process the image exactly like the training features
    image_path = os.path.join(test_dir, filename)
    img = Image.open(image_path)
    image = load_img(image_path, target_size=(600, 600), interpolation='lanczos')
    image = img_to_array(image)
    pil_img = Image.fromarray(image.astype('uint8'))
    enhanced_image = pil_img.filter(ImageFilter.EDGE_ENHANCE)
    image = img_to_array(enhanced_image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)

    # Extract features
    feature = efficientnet_model.predict(image, verbose=0)

    # Predict caption and strip only the marker tokens ([1:-1] would drop the
    # last real word when 'endseq' was never generated)
    pr = predict_caption(model, feature, tokenizer, max_length)
    pr = ' '.join(word for word in pr.split(' ') if word not in ('startseq', 'endseq'))

    # Translate to Malayalam (keyed by the English caption)
    malayalam_caption[pr] = translate_to_malayalam(pr)

    # Display the image with the predicted caption as the title
    ax.imshow(img)
    ax.set_title(pr, fontsize=8)
    ax.axis('off')

# Hide any grid cells that received no image
for ax in grid[len(image_files):]:
    ax.axis('off')

# Adjust layout to prevent clipping of titles
plt.tight_layout()
plt.show()

In [None]:
malayalam_caption