In [1]:
import tensorflow as tf
import numpy as np
import gensim

2023-07-07 10:58:26.244277: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
hindi_kv = gensim.models.KeyedVectors.load('KeyedVectors/hindi_kv')
english_kv = gensim.models.KeyedVectors.load('KeyedVectors/english_kv')

2023-07-07 10:58:34,679 : INFO : loading KeyedVectors object from KeyedVectors/hindi_kv
2023-07-07 10:58:35,615 : INFO : loading vectors from KeyedVectors/hindi_kv.vectors.npy with mmap=None
2023-07-07 10:58:49,473 : INFO : KeyedVectors lifecycle event {'fname': 'KeyedVectors/hindi_kv', 'datetime': '2023-07-07T10:58:49.473450', 'gensim': '4.3.1', 'python': '3.10.12 (main, Jul  5 2023, 18:54:27) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'loaded'}
2023-07-07 10:58:49,474 : INFO : loading KeyedVectors object from KeyedVectors/english_kv
2023-07-07 10:58:49,790 : INFO : loading vectors from KeyedVectors/english_kv.vectors.npy with mmap=None
2023-07-07 10:58:53,394 : INFO : KeyedVectors lifecycle event {'fname': 'KeyedVectors/english_kv', 'datetime': '2023-07-07T10:58:53.394217', 'gensim': '4.3.1', 'python': '3.10.12 (main, Jul  5 2023, 18:54:27) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'loaded'}


In [4]:
MAX_SEQ_LEN = 22
BATCH_SIZE = 32

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
hindi_test_data = []
english_test_data = []
with open("Data/test_processed.hi") as f_input, open("Data/test_processed.en") as f_target:
    for input_line, target_line in zip(f_input, f_target):
        if input_line == "" or target_line == "":
            continue
        input_tokens = word_tokenize(input_line)
        target_tokens = word_tokenize(target_line)
        if len(input_tokens) > MAX_SEQ_LEN or len(target_tokens) > MAX_SEQ_LEN:
            continue
        hindi_test_data.append(input_line.strip())
        english_test_data.append(target_line.strip())

In [7]:
len(hindi_test_data)

61100

In [9]:
from keras import Input, Model
from keras.layers import LSTM, Bidirectional, Embedding, Concatenate, Dense, Attention, TimeDistributed
from keras.optimizers import Adam
tf.keras.backend.clear_session()

In [10]:
LATENT_DIM = 256

encoder_inputs = Input(shape=(MAX_SEQ_LEN,), name = "Encoder_Inputs")
enc_emb_layer = Embedding(input_dim=hindi_kv.vectors.shape[0], output_dim=hindi_kv.vectors.shape[1], weights=[hindi_kv.vectors], trainable = False, mask_zero = True, name = "Encoder_Embedding_Layer")
enc_emb = enc_emb_layer(encoder_inputs)
encoder_lstm = LSTM(LATENT_DIM, return_sequences = True, return_state=True, name = "Encoder_STM_Layer")

encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

concat_layer = Concatenate(name = "Concatenate_Layer")

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape = (MAX_SEQ_LEN,), name = "Decoder_Inputs")
dec_emb_layer = Embedding(input_dim=english_kv.vectors.shape[0], output_dim=english_kv.vectors.shape[1], weights=[english_kv.vectors], trainable = True, mask_zero = True, name = "Decoder_Embedding_Layer")
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name = "Decoder_LSTM_Layer")

decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state = encoder_states)
decoder_attention_layer = Attention(name = "Attention_Layer")
attn_outputs = decoder_attention_layer([decoder_outputs, encoder_outputs])

attn_decoder_outputs = concat_layer([decoder_outputs, attn_outputs])

decoder_dense = TimeDistributed(Dense(english_kv.vectors.shape[0], activation='softmax'), name = "Dense_Layer")
final_outputs = decoder_dense(attn_decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], final_outputs)

2023-07-07 11:02:03.220629: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10766 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0001:00:00.0, compute capability: 3.7
2023-07-07 11:02:03.506762: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 600263200 exceeds 10% of free system memory.


In [11]:
model.load_weights('lstmModelWeights/model.h5')

2023-07-07 11:02:54.963282: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 600263200 exceeds 10% of free system memory.
2023-07-07 11:02:55.437910: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 819208192 exceeds 10% of free system memory.


In [13]:
#encoder model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

#inference decoder model
inf_decoder_input = Input(shape = (1,))
inf_decoder_input_emb = dec_emb_layer(inf_decoder_input)

inf_decoder_state_h = Input(shape = (LATENT_DIM,))
inf_decoder_state_c = Input(shape = (LATENT_DIM,))

inf_encoder_outputs = Input(shape = (MAX_SEQ_LEN, LATENT_DIM,))

inf_decoder_output, inf_state_h_output, inf_state_c_output = decoder_lstm(inf_decoder_input_emb, initial_state = [inf_decoder_state_h, inf_decoder_state_c])

inf_context_vector = decoder_attention_layer([inf_decoder_output, inf_encoder_outputs])
inf_output_with_attention = concat_layer([inf_decoder_output, inf_context_vector])

inf_decoder_output = decoder_dense(inf_output_with_attention)

inf_decoder_model = Model([inf_decoder_input, inf_decoder_state_h, inf_decoder_state_c, inf_encoder_outputs], [inf_decoder_output, inf_state_h_output, inf_state_c_output])

In [14]:
encoder_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Encoder_Inputs (InputLayer)  [(None, 22)]             0         
                                                                 
 Encoder_Embedding_Layer (Em  (None, 22, 200)          150065800 
 bedding)                                                        
                                                                 
 Encoder_STM_Layer (LSTM)    [(None, 22, 256),         467968    
                              (None, 256),                       
                              (None, 256)]                       
                                                                 
Total params: 150,533,768
Trainable params: 467,968
Non-trainable params: 150,065,800
_________________________________________________________________


In [15]:
inf_decoder_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 Decoder_Embedding_Layer (Embed  multiple            40000400    ['input_5[0][0]']                
 ding)                                                                                            
                                                                                                  
 input_6 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 256)]        0           []                         

In [16]:
def get_input_sequence(input_line):
    input_tokens = word_tokenize(input_line)
    input_tokens = ['<sos>'] + input_tokens + ['<eos>']
    for idx, token in enumerate(input_tokens):
        if token in hindi_kv:
            input_tokens[idx] = hindi_kv.key_to_index[token]
        else:
            input_tokens[idx] = hindi_kv.key_to_index['<unk>']        
    input_tokens = input_tokens + [hindi_kv.key_to_index['<pad>']] * (MAX_SEQ_LEN - len(input_tokens))
    input_tokens = np.expand_dims(input_tokens, axis = 0)
    return input_tokens

In [84]:
def get_predicted_sequence(input_seq):
    enc_out, s_h, s_c = encoder_model.predict(input_seq, verbose = 0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = english_kv.key_to_index['<sos>']

    stop_flag = False
    predicted_seq = []

    while not stop_flag:
        dec_output, s_h, s_c = inf_decoder_model.predict([target_seq, s_h, s_c, enc_out], verbose = 0)

        max_idx = np.argmax(dec_output)
        predicted_seq.append(max_idx)

        target_seq[0, 0] = max_idx

        if max_idx == english_kv.key_to_index['<eos>'] or len(predicted_seq) == MAX_SEQ_LEN:
            stop_flag = True

    return predicted_seq
    
def get_english_translation(predicted_seq):
    translation = []
    for token in predicted_seq:
        if token == english_kv.key_to_index['<eos>']:
            break
        translation.append(english_kv.index_to_key[token])

    translation = " ".join(translation)

    return translation



In [80]:
list_of_references = english_test_data

In [81]:
input_sequences = [get_input_sequence(line) for line in hindi_test_data]

In [None]:
hypothesis = []
references = []
for idx, sequence in enumerate(input_sequences):
    hypothesis.append(get_english_translation(get_predicted_sequence(sequence)))
    references.append(english_test_data[idx])
    if idx % 500 == 0:
        print(idx)

In [223]:
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.translate.nist_score import corpus_nist

In [215]:
hypothesis1 = [line.split() for line in hypothesis]
references1 = [line.split() for line in references]

In [172]:
meteor_scores = []
for idx, _ in enumerate(hypothesis1):
    meteor_scores.append(meteor_score([references1[idx]], hypothesis1[idx]))

In [177]:
np.mean(meteor_scores)

0.30459389894137345

In [179]:
chencherry = SmoothingFunction()

In [212]:
bleu_score = corpus_bleu(references1, hypothesis1, smoothing_function = chencherry.method5, weights = (1, 0, 0, 0))

In [213]:
print(bleu_score)

0.24610628949157196
