In [1]:
!pwd

/content


In [3]:
%cd /content/drive/MyDrive/PROJECTS/VIDEO_CAPTIONING/TENSORFLOW_IMPLEMENTATION

/content/drive/MyDrive/PROJECTS/VIDEO_CAPTIONING/TENSORFLOW_IMPLEMENTATION


In [4]:
import os
import time
from google.colab.patches import cv2_imshow
import joblib
import numpy as np
import cv2
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
import config

In [5]:
class VideoDescriptionRealTime:
    """Generate captions for test videos from saved features and saved models.

    The class loads a pickled tokenizer, a saved inference encoder and the
    weights of an inference decoder from ``config.save_model_path``, reads
    feature arrays (``.npy``) from ``<config.test_path>/feat`` and decodes a
    caption for one video per call to :meth:`test`.
    """

    def __init__(self, config):
        """Copy hyper-parameters and paths from ``config``; nothing is loaded here.

        :param config: object exposing ``latent_dim``, ``num_encoder_tokens``,
            ``num_decoder_tokens``, ``time_steps_encoder``, ``max_probability``,
            ``save_model_path``, ``test_path`` and ``search_type``
        """
        self.latent_dim = config.latent_dim
        self.num_encoder_tokens = config.num_encoder_tokens
        self.num_decoder_tokens = config.num_decoder_tokens
        self.time_steps_encoder = config.time_steps_encoder
        self.max_probability = config.max_probability

        # models
        self.encoder_model = None
        self.decoder_model = None
        self.inf_encoder_model = None
        self.inf_decoder_model = None
        self.save_model_path = config.save_model_path
        self.test_path = config.test_path
        self.search_type = config.search_type
        self.tokenizer = None
        # index (into the sorted video listing) of the next video to caption
        self.num = 0

    def load_inference_models(self):
        """Load the tokenizer, the inference encoder and the inference decoder.

        The decoder graph is rebuilt here and only its weights are read from
        ``decoder_model_weights.h5``.
        """
        # load tokenizer
        # NOTE(review): joblib.load unpickles the file; only point
        # save_model_path at artifacts you trust.
        with open(os.path.join(self.save_model_path, 'tokenizer' + str(self.num_decoder_tokens)), 'rb') as file:
            self.tokenizer = joblib.load(file)

        # inference encoder model
        self.inf_encoder_model = load_model(os.path.join(self.save_model_path, 'encoder_model.h5'))

        # inference decoder model
        # NOTE(review): layer creation order is left exactly as it was, in case
        # weight loading without by_name is sensitive to it - confirm before
        # reordering.
        decoder_inputs = Input(shape=(None, self.num_decoder_tokens))
        decoder_dense = Dense(self.num_decoder_tokens, activation='softmax')
        decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)

        decoder_state_input_h = Input(shape=(self.latent_dim,))
        decoder_state_input_c = Input(shape=(self.latent_dim,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)

        self.inf_decoder_model = Model(
            [decoder_inputs] + decoder_states_inputs,
            [decoder_outputs] + decoder_states)
        self.inf_decoder_model.load_weights(os.path.join(self.save_model_path, 'decoder_model_weights.h5'))

    def _one_hot(self, index):
        """Return a (1, 1, num_decoder_tokens) array with a single 1 at ``index``."""
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, index] = 1
        return target_seq

    def greedy_search(self, f, max_len=100):
        """
        :param f: the loaded numpy array after creating videos to frames and extracting features
        :param max_len: maximum number of decoder steps to run (default 100)
        :return: the final sentence which has been predicted greedily; every
            word is followed by a single space
        """
        inv_map = self.index_to_word()
        # Shapes come from the config instead of the former literals 80/4096/1500.
        # NOTE(review): assumes config.time_steps_encoder and
        # config.num_encoder_tokens describe the saved encoder's input - confirm.
        states_value = self.inf_encoder_model.predict(
            f.reshape(-1, self.time_steps_encoder, self.num_encoder_tokens))
        target_seq = self._one_hot(self.tokenizer.word_index['bos'])
        final_sentence = ''
        for _ in range(max_len):
            output_tokens, h, c = self.inf_decoder_model.predict([target_seq] + states_value)
            states_value = [h, c]
            y_hat = int(np.argmax(output_tokens.reshape(self.num_decoder_tokens)))
            if y_hat == 0:
                # Index 0 is skipped: the previous input token is fed again, but
                # with the updated states (presumably 0 is the padding index -
                # confirm against the tokenizer).
                continue
            # .get() so that an index without a word ends decoding instead of
            # raising KeyError.
            word = inv_map.get(y_hat)
            if word is None or word == 'eos':
                break
            final_sentence = final_sentence + word + ' '
            target_seq = self._one_hot(y_hat)
        return final_sentence

    def index_to_word(self):
        """Return the tokenizer vocabulary inverted to ``{index: word}``."""
        return {index: word for word, index in self.tokenizer.word_index.items()}

    def get_test_data(self):
        """Load the features of the next test video and advance the cursor.

        Videos are taken from ``<test_path>/video`` in sorted order and the
        cursor wraps back to the first video after the last one.

        :return: ``(features, file_name)``
        :raises FileNotFoundError: if there are no videos, or if
            ``<test_path>/feat/<file_name>.npy`` does not exist
        """
        video_dir = os.path.join(self.test_path, 'video')
        # os.listdir gives no ordering guarantee; sort so runs are repeatable.
        file_list = sorted(os.listdir(video_dir))
        if not file_list:
            raise FileNotFoundError('No videos found in ' + video_dir)

        position = self.num % len(file_list)
        file_name = file_list[position]
        # Advance before loading so a missing feature file does not pin the
        # cursor on the same video; the modulo keeps the index in range.
        self.num = (position + 1) % len(file_list)

        path = os.path.join(self.test_path, 'feat', file_name + '.npy')
        if not os.path.exists(path):
            raise FileNotFoundError('Wrong path given: ' + path)
        return np.load(path), file_name

    def test(self):
        """Caption the next test video.

        :return: ``(sentence_predicted, filename)``
        :raises NotImplementedError: if ``search_type`` is not ``'greedy'``
        """
        X_test, filename = self.get_test_data()
        print("Video Filename = ", filename)
        # generate inference test outputs
        if self.search_type != 'greedy':
            raise NotImplementedError('Other searching algorithm is not implemented yet')
        # greedy_search reshapes the features itself
        sentence_predicted = self.greedy_search(X_test)
        return sentence_predicted, filename

In [6]:
if __name__ == "__main__":
    # Build the captioner from the project config and load its saved models.
    video_to_text = VideoDescriptionRealTime(config)
    video_to_text.load_inference_models()

    # Caption one video per iteration until the user answers 'n'.
    while True:
        print('.........................\nGenerating Caption:\n')

        # start/end bracket the captioning call; they are only consumed by the
        # disabled timing report further down.
        start = time.time()
        video_caption, file = video_to_text.test()
        end = time.time()

        # Reveal the caption word by word, printing the growing sentence
        # after a separator each time.
        sentence = ''
        print(sentence)
        for text in video_caption.split():
            sentence += ' ' + text
            print('\n.........................\n')
            print(sentence)
        print('\n.........................\n')
        print('\nFinal Caption generated for the video is : ', sentence)
        #print('\nIt took {:.2f} seconds to generate caption'.format(end-start))

        predict_next = input('\nShould I continue to generate caption for next videos ? Press y for yes & n for no ')
        answer = predict_next.lower()
        if answer == 'n':
            break
        if answer != 'y':
            # Unrecognised input: complain, then carry on with the next video.
            print('Could not understand type (y) for yes and (n) for no')

.........................
Generating Caption:

Video Filename =  dfOuTx66bJU_34_39.avi


.........................

 a

.........................

 a man

.........................

 a man is

.........................

 a man is walking

.........................

 a man is walking in

.........................

 a man is walking in the

.........................

 a man is walking in the rain

.........................


Final Caption generated for the video is :   a man is walking in the rain

Should I continue to generate caption for next videos ? Press y for yes & n for no y
.........................
Generating Caption:

Video Filename =  glrijRGnmc0_211_215.avi


.........................

 a

.........................

 a man

.........................

 a man is

.........................

 a man is eating

.........................

 a man is eating a

.........................

 a man is eating a of

.........................

 a man is eating a of of

.......................