<a href="https://colab.research.google.com/github/RohanS2003/notabot/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing the necessary modules
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# Load the stock VGG16 network with its default arguments.
model1 = VGG16()

# Re-wire the model so that it outputs the activations of its second-to-last
# layer instead of the final prediction layer; model1 is used further down
# as the image feature extractor.
model1 = Model(inputs=model1.inputs, outputs=model1.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Root of the project data and the folder holding the images to encode.
# Both are blank placeholders and must be set before this cell is run.
BASE_DIR = ''
directory = ''
features = {}
#directory = os.path.join(BASE_DIR, 'frames')

# Push every file in `directory` through the truncated VGG16 model (model1)
# and store the resulting feature array under the file name up to its first
# dot -- the same key derivation the caption mapping uses.
for i in tqdm(os.listdir(directory)):
    # os.path.join builds the path portably and, unlike `directory + '/' + i`,
    # never produces a doubled separator or a root-anchored '/name' path.
    img_path = os.path.join(directory, i)
    image = load_img(img_path, target_size=(224, 224))
    image = img_to_array(image)
    # Add a leading batch axis of size 1 before handing the image to predict().
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model1.predict(image, verbose=0)
    image_id = i.split('.')[0]
    features[image_id] = feature

#pickle.dump(features, open(os.path.join(BASE_DIR, 'features.pkl'), 'wb'))

In [None]:
# Map each image id to the list of descriptions written for it.
# Every line of `desc_doc` is split on commas: the first field is taken as the
# image file name and the remaining fields as the description.
# NOTE(review): desc_doc is not defined in the visible cells -- presumably the
# raw text of the captions file; confirm where it is loaded and whether it
# starts with a header row (a header would end up as an entry in `mapping`).
mapping = {}
for each_desc in tqdm(desc_doc.split('\n')):
    tokens = each_desc.split(',')
    # Skip empty and one-character lines.
    if len(each_desc) < 2:
        continue
    image_id, desc_of = tokens[0], tokens[1:]
    # Keep only the part before the first dot, so the key has the same form
    # as the keys of `features`.
    image_id = image_id.split('.')[0]
    # Re-join the description fields; commas inside the description therefore
    # become single spaces.
    desc_of = " ".join(desc_of)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(desc_of)

In [None]:
# Editing the descriptions: lower-case, strip non-letters, drop one-letter
# words and add the beginning / ending markers
def edit_description(mapping):
    """Normalise every description in *mapping* in place.

    Each description is lower-cased, stripped of every character that is
    neither an ASCII letter nor whitespace, reduced to its words of more than
    one character, and wrapped in the sentinel words ``beginning`` and
    ``ending``.

    Args:
        mapping: dict whose values are lists of description strings. The
            lists are modified in place.

    Returns:
        None.
    """
    import re  # local import so the function does not rely on a file-level one

    for captions in mapping.values():
        for position, text in enumerate(captions):
            text = text.lower()
            # str.replace() treats its argument as literal text, so the old
            # '[^A-Za-z]' / '\s+' calls never matched anything. Use a real
            # regex, and keep whitespace so neighbouring words are not fused.
            text = re.sub(r'[^a-z\s]', '', text)
            # split() already collapses any run of whitespace.
            words = [word for word in text.split() if len(word) > 1]
            captions[position] = 'beginning ' + " ".join(words) + ' ending'

In [None]:
# Normalise all descriptions; edit_description rewrites the lists stored in
# `mapping` in place and returns nothing.
edit_description(mapping)

In [None]:
# Flatten the descriptions of all images into one list.
# NOTE(review): the earlier comment stated that each image has 5 descriptions;
# nothing in this notebook enforces that -- confirm against the captions file.
img_desc = []
for key in mapping:
    for caption in mapping[key]:
        img_desc.append(caption)

In [None]:
# Fit a Keras Tokenizer on all descriptions to build the word -> integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(img_desc)
# One more than the number of distinct words: Keras word indices start at 1,
# which leaves index 0 free (it is the value pad_sequences pads with).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Length of the longest description, counted in whitespace-separated words;
# used below as the fixed length that input sequences are padded to.
max_length = max(len(text.split()) for text in img_desc)

In [None]:
# Split the image ids into training and test sets: the first 90% (in the
# insertion order of `mapping`, without shuffling) are used for training and
# the remainder for testing.
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

In [None]:
# Generating the data from the image features and descriptions and passing it to the model
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches of (image features, word prefix) -> next word.

    Every description is expanded into one sample per word position: the
    words before that position (padded to *max_length*) form the text input
    and the word at that position, one-hot encoded over *vocab_size*, is the
    target. The image input is row 0 of ``features[key]``.

    A batch is emitted after *batch_size* image keys have been processed, so
    *batch_size* counts images, not samples. The key counter is not reset
    between passes over *data_keys*; keys left over at the end of one pass are
    completed by the start of the next.

    Yields:
        ``[X1, X2], y`` as numpy arrays: image features, padded input
        sequences and one-hot targets.
    """
    image_rows, prefix_rows, target_rows = [], [], []
    keys_seen = 0
    while True:
        for key in data_keys:
            keys_seen += 1
            for description in mapping[key]:
                encoded = tokenizer.texts_to_sequences([description])[0]
                for cut in range(1, len(encoded)):
                    # Words before `cut` are the input, the word at `cut` is the target.
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
                    image_rows.append(features[key][0])
                    prefix_rows.append(prefix)
                    target_rows.append(target)
            if keys_seen != batch_size:
                continue
            yield [np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)
            image_rows, prefix_rows, target_rows = [], [], []
            keys_seen = 0

In [None]:
# Build the captioning network: an image-feature branch and a text branch
# whose outputs are added together and decoded into a distribution over the
# vocabulary.

# Image branch: a 4096-long feature vector per image (presumably the output of
# model1 -- confirm), with dropout, projected down to 256 units.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: a word-index sequence of length max_length, embedded into 256
# dimensions (index 0 is masked), with dropout, summarised by an LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: element-wise sum of both 256-unit branches, one dense layer, then
# a softmax over the vocabulary for the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train the model for 20 epochs. batch_size is passed to data_generator, where
# it is the number of images grouped into one batch.
epochs = 20
batch_size = 32
# Number of generator batches drawn per epoch.
steps = len(train) // batch_size

for i in range(epochs):
    # A new generator is created for every epoch, so each epoch starts again
    # from the first training id.
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

# Save the trained model as best_model.h5 (path relative to the working directory).
model.save('best_model.h5')

In [None]:
def mapping_toword(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals *integer*.

    The index is searched in iteration order and the first match wins.
    Returns None when no word carries that index.
    """
    matches = (word for word, idx in tokenizer.word_index.items() if idx == integer)
    return next(matches, None)

In [None]:
def predict_description(model, image, tokenizer, max_length):
    """Generate a description for *image* one word at a time.

    Starting from the sentinel word ``beginning``, the text so far is encoded
    with *tokenizer*, padded to *max_length* and fed to *model* together with
    *image*; the highest-scoring index is mapped back to a word and appended.
    Generation stops after *max_length* steps, when the predicted index has no
    word, or once ``ending`` has been appended.

    Returns:
        The generated text, including the ``beginning`` sentinel and, if it
        was produced, the ``ending`` sentinel.
    """
    words = ['beginning']
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)
        scores = model.predict([image, padded], verbose=0)
        next_word = mapping_toword(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'ending':
            break
    return " ".join(words)

In [None]:
# For every test image, generate a description and store it, split into
# words, next to the word lists of that image's reference descriptions.
actual, predicted = list(), list()

for key in tqdm(test):
    desc = mapping[key]
    y_pred = predict_description(model, features[key], tokenizer, max_length)
    # Split each reference description of THIS image. The previous code split
    # `caption`, a stale variable left over from an earlier cell, so every
    # entry of actual_desc was the same unrelated description.
    actual_desc = [text.split() for text in desc]
    y_pred = y_pred.split()
    actual.append(actual_desc)
    predicted.append(y_pred)

In [None]:
!pip install pyttsx3

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_text(image_name):
    """Show the image *image_name* and return its generated description.

    The image is read from ``BASE_DIR/Images/<image_name>`` and drawn with
    matplotlib; the description is predicted from the features stored under
    the file name up to its first dot.

    Args:
        image_name: file name of the image, including its extension.

    Returns:
        The generated description as a string, sentinel words included.

    Raises:
        KeyError: if no features are stored for the image id.
    """
    image_id = image_name.split('.')[0]
    img_path = os.path.join(BASE_DIR, "Images", image_name)
    # The unused reference-caption lookup was dropped: it only served to raise
    # KeyError for images that have features but no reference captions.
    # The context manager closes the image file once it has been drawn.
    with Image.open(img_path) as image:
        y_pred = predict_description(model, features[image_id], tokenizer, max_length)
        plt.imshow(image)

    return str(y_pred)

In [None]:
# Installing the required modules

!pip3 install gTTS pyttsx3 playsound pygobject

In [None]:
pip install SpeechRecognition

In [None]:
# Setting up the engine for voice to text for input commands
import os
import speech_recognition as sr
import pyttsx3

def voice_output(command):
    """Speak the text *command* aloud through a newly initialised pyttsx3 engine."""
    speaker = pyttsx3.init()
    speaker.say(command)
    # Process the queued utterance before returning.
    speaker.runAndWait()
# Recognizer instance from the speech_recognition package.
r = sr.Recognizer()
# NOTE(review): x is not used anywhere in the visible cells -- purpose unclear.
x = 0


In [None]:
!pip install gTTs

In [None]:
# Optionally reload a previously saved model. The load call is commented out,
# so the `model` object already present in the notebook is the one used below.
from tensorflow import keras
#model = keras.models.load_model('/path/to/trained/model.h5')

# Load a single image from a hard-coded Kaggle Flickr8k path, resized to
# 224x224, and turn it into a one-image batch preprocessed for VGG16.
from tensorflow.keras.preprocessing.image import load_img, img_to_array
image_path = '/kaggle/input/flickr8k/Images/1032460886_4a598ed535.jpg'
image = load_img(image_path, target_size=(224, 224))
image = img_to_array(image)
# Add a leading batch axis of size 1.
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
image = preprocess_input(image)

# Extract the image features with the truncated VGG16 model (model1).
feature_vector = model1.predict(image, verbose=0)

# generate the caption
def predict_caption(model, tokenizer, feature_vector, max_length):
    """Generate a description for the image features in *feature_vector*.

    Thin wrapper around predict_description with this function's own argument
    order. The previous body called ``tokenizer.encode`` / ``tokenizer.decode``,
    which the Keras Tokenizer used in this notebook does not provide, and fed
    the model an unpadded sequence; predict_description encodes with
    ``texts_to_sequences`` and pads to *max_length* instead.

    Args:
        model: the trained captioning model.
        tokenizer: the fitted Keras Tokenizer.
        feature_vector: image features as returned by the feature extractor.
        max_length: padded sequence length and maximum number of generated words.

    Returns:
        The generated text, including the ``beginning`` sentinel and, if it
        was produced, the ``ending`` sentinel.
    """
    return predict_description(model, feature_vector, tokenizer, max_length)


# Generate the description for the image whose features were extracted above.
caption = predict_caption(model, tokenizer, feature_vector, max_length)


In [None]:
from gtts import gTTS
from IPython.display import Audio
from PIL import Image
import matplotlib.pyplot as plt

# Print the raw generated caption (sentinel words included) and show the image.
print(caption)

image = Image.open(image_path)
plt.imshow(image)

# Remove the sentinels only when they are actually present. The previous
# split(' ', 1)[1] raised IndexError on a one-word caption, and
# rsplit(' ', 1)[0] cut off a real word whenever generation stopped at
# max_length without producing 'ending'.
words = caption.split()
if words and words[0] == 'beginning':
    words = words[1:]
res = ' '.join(words)
if words and words[-1] == 'ending':
    words = words[:-1]
text = ' '.join(words)

# Synthesise the caption with gTTS and play it inline.
# NOTE(review): gTTS presumably writes MP3 data, so the .wav extension may be
# misleading -- confirm before changing the file name.
tts = gTTS(text)

sound_file = 'info.wav'
tts.save(sound_file)
Audio(sound_file, autoplay=True)