In [1]:
import tkinter as tk
from PIL import Image, ImageTk
from tkinter import filedialog

import webbrowser
from gtts import gTTS

import os
import sys
import pathlib

import joblib

import numpy as np
import keras
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings 
warnings.filterwarnings("ignore")

# Report the interpreter and framework versions this cell runs against.
print(f"python {sys.version}")
print(f"keras version {keras.__version__}")
del keras  # drop the bare module name once its version has been printed
print(f"tensorflow version {tf.__version__}")


# Configuring gpu for notebook: let this process use up to 95% of the memory
# of GPU "0" and install the resulting TF1-style session as the Keras session.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(
        per_process_gpu_memory_fraction=0.95,
        visible_device_list="0",
    )
)
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

# Loading the image-processing network: VGG16 including its dense head,
# initialised from a locally stored weights file.
modelvgg = tf.keras.applications.VGG16(include_top=True, weights="C:\\Users\\Dell\\Downloads\\vgg16_weights_tf_dim_ordering_tf_kernels (1).h5")
# Re-wire the network so that it ends at its second-to-last layer: the final
# layer is left out and the model emits that layer's activations instead.
# NOTE: a former `modelvgg.layers.pop()` call was removed here. In tf.keras the
# `layers` property returns a fresh list on every access, so popping from it
# never modified the model; the truncation is done entirely by the line below.
modelvgg = tf.keras.Model(inputs=modelvgg.inputs, outputs=modelvgg.layers[-2].output)
modelvgg.summary()

# Loading the caption generator from feature map
model = load_model("lstm1.hdf5")
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that only appended a stray "None" line to the output).
model.summary()

# Loading the tokenizer for processing the output
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load tokenizer files that come from a trusted source.
tokenizer = joblib.load("transf (1).sav")

# Initializing the values
# maxlen is used by predict_caption both as the padded sequence length and as
# the maximum number of decoding steps.
maxlen = 30
# Reverse lookup (token index -> word) built from the tokenizer's vocabulary.
index_word = {index: word for word, index in tokenizer.word_index.items()}

def upload_img():
    """Ask the user for an image file and show a 300px-wide preview in ``frame``.

    On success the module-level ``image_data`` is set to the chosen path and
    ``img`` to the Tk preview image, the previous contents of ``frame`` are
    removed, and the upper-cased file name plus the preview are packed into
    ``frame``.

    If the dialog is cancelled nothing changes: the current preview and the
    previously selected ``image_data`` are kept.

    Raises:
        OSError: if the chosen file cannot be opened as an image (raised by
            ``Image.open``); the existing preview is left untouched.
    """
    global img, image_data

    chosen = filedialog.askopenfilename(
        initialdir=os.getcwd(), title="Choose an image",
        filetypes=(("all files", "*.*"), ("png files", "*.png"), ("jpg files", "*.jpg")))
    if not chosen:
        # Dialog was cancelled (empty result): keep the current state instead
        # of failing on Image.open('') after the old preview was wiped.
        return

    basewidth = 300
    # Open and scale first, so a file that is not an image cannot leave the
    # frame empty or `image_data` pointing at an unusable path.
    with Image.open(chosen) as picture:
        width, height = picture.size
        scale = basewidth / float(width)
        # Keep the aspect ratio; never let the height collapse to 0 pixels.
        new_height = max(1, int(float(height) * scale))
        # Image.LANCZOS is the same filter as the old Image.ANTIALIAS alias,
        # which no longer exists in Pillow 10+.
        preview = picture.resize((basewidth, new_height), Image.LANCZOS)

    for child in frame.winfo_children():
        child.destroy()

    image_data = chosen
    # Stored in a module-level name; presumably so the Tk image stays
    # referenced while the label displays it.
    img = ImageTk.PhotoImage(preview)
    tk.Label(frame, text=os.path.basename(chosen).upper()).pack()
    tk.Label(frame, image=img).pack()

def predict_caption(image):
    """Greedily decode a caption for one image feature array.

    Starting from the text ``'startseq'``, the current text is tokenised,
    padded to ``maxlen`` and fed to ``model`` together with ``image``; the
    word with the highest score is appended. This repeats for at most
    ``maxlen`` steps or until the word ``'endseq'`` is produced.

    Args:
        image: feature array passed as the first input of ``model``.
            NOTE(review): the original note stated ``image.shape = (1,4462)``
            -- TODO confirm against the feature extractor's output size.

    Returns:
        The generated text. It always starts with ``'startseq'``; it ends
        with ``'endseq'`` only if the model produced that word within
        ``maxlen`` steps.
    """
    in_text = 'startseq'

    for _ in range(maxlen):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen)
        yhat = model.predict([image, sequence], verbose=0)
        word_id = int(np.argmax(yhat))
        # The argmax can be an index with no vocabulary entry (for example an
        # index that is not a key of index_word, such as one reserved for
        # padding). Stop decoding instead of raising KeyError.
        newword = index_word.get(word_id)
        if newword is None:
            break
        in_text += " " + newword
        if newword == "endseq":
            break
    return in_text

    
def caption():
    """Caption the currently selected image, display it and save it as speech.

    Reads the module-level ``image_data`` path, resizes the image to 224x224,
    runs it through ``modelvgg`` and ``predict_caption``, packs a
    ``"Caption: ..."`` label into ``frame`` and writes the spoken caption to
    ``welcome.mp3`` in the saved-caption folder via gTTS.

    If no image has been selected yet, a hint label is shown and nothing else
    happens. If the generated caption is empty, the label is still shown but
    no audio file is written.
    """
    # `image_data` only exists after a successful upload; avoid a NameError
    # (or opening an empty path) when the button is pressed too early.
    source_path = globals().get("image_data")
    if not source_path:
        tk.Label(frame, text="Please choose an image first.", font=("Helvetica", 12)).pack()
        return

    with Image.open(source_path) as original:
        # Force 3 channels so grayscale / RGBA / palette files produce the
        # (224, 224, 3) array the network input requires. Image.LANCZOS is the
        # same filter as the removed Image.ANTIALIAS alias.
        resized = original.convert("RGB").resize((224, 224), Image.LANCZOS)
    nimage = preprocess_input(img_to_array(resized))

    # Add the leading batch axis expected by predict().
    feature = modelvgg.predict(np.expand_dims(nimage, axis=0))
    generated = predict_caption(feature)

    # Strip the start/end marker words only when they are really present; a
    # fixed [9:-7] slice cuts real text when the caption has no 'endseq'.
    words = generated.split()
    if words and words[0] == "startseq":
        words = words[1:]
    if words and words[-1] == "endseq":
        words = words[:-1]
    spoken_text = " ".join(words)

    tk.Label(frame, text="Caption: " + spoken_text, font=("Helvetica", 12)).pack()

    if not spoken_text:
        # gTTS refuses empty text; there is nothing to speak.
        return

    out_dir = os.path.join(os.getcwd(), "C:\\Users\\Dell\\Desktop\\Dataset\\saved_caption_folder")
    os.makedirs(out_dir, exist_ok=True)  # saving fails if the folder is missing
    tts = gTTS(text=spoken_text, lang='en', slow=False)
    tts.save(os.path.join(out_dir, "welcome.mp3"))

def callback(url):
    """Open ``url`` with the default web browser, asking for a new window."""
    # new=1 requests a new browser window, which is what open_new() does.
    webbrowser.open(url, new=1)

# --- Main window -----------------------------------------------------------
root = tk.Tk()
root.title('AUDIO CAPTION GENERATOR FOR IMAGES')
root.resizable(width=False, height=False)

# Heading label. pack() returns None, so `tit` holds None, not the widget.
tit = tk.Label(
    root,
    text="AUDIO CAPTION GENERATOR FOR IMAGES",
    font=("", 12),
    padx=25,
    pady=6,
).pack()

# Coloured backdrop with a white frame placed over its centre; the frame is
# where the preview image and the caption labels are packed.
canvas = tk.Canvas(root, bg='#29afbd', width=600, height=400)
canvas.pack()
frame = tk.Frame(root, bg='white')
frame.place(relx=0.1, rely=0.1, relwidth=0.8, relheight=0.8)

# Appearance shared by both action buttons.
_button_style = dict(padx=35, pady=10, fg="black", bg="pink", activebackground="#add8e6")

chose_image = tk.Button(root, text='Choose Image', command=upload_img, **_button_style)
chose_image.pack(side=tk.LEFT)

caption_image = tk.Button(root, text='Generate caption', command=caption, **_button_style)
caption_image.pack(side=tk.RIGHT)


def _open_saved_audio(_event):
    """Open the saved caption audio file through ``callback`` as a file URI."""
    audio_path = os.path.join(os.getcwd(), "C:\\Users\\Dell\\Desktop\\Dataset\\saved_caption_folder", "welcome.mp3")
    callback(pathlib.Path(audio_path).as_uri())


# Clickable text that opens the generated audio file.
link1 = tk.Label(root, text="CLICK HERE FOR AUDIO CAPTION ", fg="blue", cursor="hand2")
link1.pack()
link1.bind("<Button-1>", _open_saved_audio)


root.mainloop()

python 3.10.9 (tags/v3.10.9:1dd9be6, Dec  6 2022, 20:01:21) [MSC v.1934 64 bit (AMD64)]
keras version 2.10.0
tensorflow version 2.10.1

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14