In [8]:
import xml.etree.ElementTree as ET 
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [9]:

# Load the data from the XML file
def load_data(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    input_texts = []
    target_texts = []
    for i, item in enumerate(root):
        input_text = item[0].text
        target_text = item[1].text
        input_texts.append(input_text)
        target_texts.append(target_text)
    return input_texts, target_texts

# Tokenize the text data
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

# Load and tokenize the data
input_texts, target_texts = load_data('NEWS2018_M-EnHi_trn.xml')
input_tensor, inp_lang_tokenizer = tokenize(input_texts)
target_tensor, targ_lang_tokenizer = tokenize(target_texts)

# Calculate max length of input and target tensors
max_length_inp, max_length_tar = input_tensor.shape[1], target_tensor.shape[1]

# Convert the target tensor to have the expected shape
target_tensor = tf.keras.utils.to_categorical(target_tensor, num_classes=len(targ_lang_tokenizer.word_index) + 1)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_inp, padding='post')

# Create the LSTM model
def create_model(inp_vocab_size, tar_vocab_size, max_length_inp, max_length_tar):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(inp_vocab_size, 150, input_length=max_length_inp))
    model.add(tf.keras.layers.LSTM(150, return_sequences=True))
    model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(tar_vocab_size, activation='softmax')))
    return model


model = create_model(len(inp_lang_tokenizer.word_index) + 1, len(targ_lang_tokenizer.word_index) + 1, max_length_inp, max_length_tar)






In [26]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
history = model.fit(input_tensor, target_tensor, epochs=45, batch_size=128, validation_split=0.2)

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


- Model's Accuracy on Training data is 
- Loss on Training data is 

In [28]:
# Saving the model

model.save('LSTM-wordTranslator.h5')


## Evaluating on Test Data

In [10]:
# # Loading the Model
from tensorflow.keras.models import load_model

model = load_model('LSTM-wordTranslator.h5')

## Calculating Model's loss and Accuray on Testing data

In [11]:
# Load and tokenize the test data
input_texts_test, target_texts_test = load_data('NEWS2018_M-EnHi_dev.xml')
input_tensor_test, input_lang_tokenizer = tokenize(input_texts_test)
target_tensor_test, targ_lang_tokenizer = tokenize(target_texts_test)

# Get the number of unique words in the target language
tar_vocab_size = len(targ_lang_tokenizer.word_index) + 1

# Convert the target tensor to have the expected shape
final_layer_output_shape = model.layers[-1].output_shape
target_tensor_test = tf.keras.preprocessing.sequence.pad_sequences(target_tensor_test, maxlen=final_layer_output_shape[1], padding='post')
target_tensor_test = tf.keras.utils.to_categorical(target_tensor_test, num_classes=final_layer_output_shape[2])

# Pad the input tensor to have the expected shape
input_tensor_test = tf.keras.preprocessing.sequence.pad_sequences(input_tensor_test, maxlen=final_layer_output_shape[1], padding='post')

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(input_tensor_test, target_tensor_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)



Test loss: 1.7046736478805542
Test accuracy: 0.8644999861717224


- Loss is 1.32
- Models accuracy is 

## How the Test file looks 

In [30]:
# Test file Input & Target
input_texts_decoded = []
for i in range(input_tensor_test.shape[0]):
    input_text = input_lang_tokenizer.sequences_to_texts([input_tensor_test[i]])
    input_texts_decoded.append(input_text[0])

# Decode the target tensor to see the target strings
target_texts_decoded = []
for i in range(target_tensor_test.shape[0]):
    target_text = targ_lang_tokenizer.sequences_to_texts([np.argmax(target_tensor_test[i], axis=1)])
    target_texts_decoded.append(target_text[0])

# Print the decoded input and target strings
for i in range(input_tensor_test.shape[0]):
    print("Input:", input_texts_decoded[i])
    print("Target",target_texts_decoded[i] )

Input: aachaaryanandana
Target आचार्यनंदना
Input: aachaarysut
Target आचार्यसुत
Input: aacharynandan
Target आचार्यनंदन
Input: aacharynandanaa
Target आचार्यनंदना
Input: aadamkhor
Target आदमखोर
Input: aag aur shola
Target आग और शोला
Input: aaj ki baat
Target आज की बात
Input: aaj ki taza khabar
Target आज की ताज़ा खबर
Input: aakanksha
Target आकांक्षा
Input: aan
Target आन
Input: aankhon dekhi
Target आँखों देखी
Input: aansoo ban gaye phool
Target आँसू बन गए फ़ूल
Input: aashaa
Target आशा
Input: aatank
Target आतंक
Input: ab kya hoga
Target अब क्या होगा
Input: abdul haakim
Target अब्दुल हाकिम
Input: abdul hannan
Target अब्दुल हन्नान
Input: abdul muhasin
Target अब्दुल मुहासिन
Input: abdul muiz
Target अब्दुल मुईज़
Input: abdul wakil
Target अब्दुल वकील
Input: aberdeen
Target एबरडीन
Input: abha
Target आभा
Input: abhiman
Target अभिमान
Input: abhinanda
Target अभिनन्दा
Input: abisso
Target एबीसो
Input: abu mansur daqiqi
Target अबू मंसूर दक़ीक़ी
Input: abul fazal
Target अबुल फजल
Input: acharysuta
Target आचार

## Model's prediction output on Test data
### Decoding the input_tensor_test & target_tensor_test to see the output generated by the Model 

In [12]:
# Inverse map the index to the word using the tokenizer
input_word_index = input_lang_tokenizer.word_index
input_index_word = dict([(index, word) for word, index in input_word_index.items()])

target_word_index = targ_lang_tokenizer.word_index
target_index_word = dict([(index, word) for word, index in target_word_index.items()])

# Get the predictions for the test data
predictions = model.predict(input_tensor_test)

# Function to convert the index to the word
def index_to_word(index):
  return target_index_word.get(index, '')

# Function to convert the prediction to the text
def prediction_to_text(prediction):
  text = ''
  for i in range(prediction.shape[0]):
    index = np.argmax(prediction[i])
    word = index_to_word(index)
    if word == 'endseq':
      return text
    text += word + ' '
  return text

# Get the predicted texts for the test data
predicted_texts = []
for i in range(predictions.shape[0]):
  prediction = predictions[i, :, :]
  text = prediction_to_text(prediction)
  predicted_texts.append(text)

# Print the input text, target text, and predicted text for 10 samples
for i in range(len(input_texts_test)):
  print("Input Text:", input_texts_test[i])
  #print("Target Text:", target_texts_test[i])
  print("Predicted Text:", predicted_texts[i])
  print()


Input Text: aachaaryanandana
Predicted Text: मेरी          

Input Text: aachaarysut
Predicted Text: सिटी          

Input Text: aacharynandan
Predicted Text: तूफान          

Input Text: aacharynandanaa
Predicted Text: फिफ्टी          

Input Text: aadamkhor
Predicted Text: सेंट          

Input Text: aag aur shola
Predicted Text: सन और शोएब        

Input Text: aaj ki baat
Predicted Text: मेरा अवार्ड आशा        

Input Text: aaj ki taza khabar
Predicted Text: मेरा अवार्ड आचार्यसुत        

Input Text: aakanksha
Predicted Text: चोर          

Input Text: aan
Predicted Text: एबीसो          

Input Text: aankhon dekhi
Predicted Text: प्रो ताज़ा         

Input Text: aansoo ban gaye phool
Predicted Text:   खबर        

Input Text: aashaa
Predicted Text: स्वान          

Input Text: aatank
Predicted Text: आन          

Input Text: ab kya hoga
Predicted Text: आँखों देखी आँसू        

Input Text: abdul haakim
Predicted Text: अब्दुल बन         

Input Text: abdul hannan
Predicted Text: अब्दुल

## Predicting a Word given by User Manually

In [17]:
# Inverse map the index to the word using the tokenizer
input_word_index = input_lang_tokenizer.word_index
input_index_word = dict([(index, word) for word, index in input_word_index.items()])

target_word_index = targ_lang_tokenizer.word_index
target_index_word = dict([(index, word) for word, index in target_word_index.items()])

# Function to convert the index to the word
def index_to_word(index):
  return target_index_word.get(index, '')

# Function to convert the prediction to the text
def prediction_to_text(prediction):
  text = ''
  for i in range(prediction.shape[0]):
    index = np.argmax(prediction[i])
    word = index_to_word(index)
    if word == 'endseq':
      return text
    text += word + ' '
  return text

# Get manual input from the user
t = int(input("Enter number of words you want to predict"))

print("Model Predictions: ")

for i in range(t):
    input_text = input("Enter a word: ")


# Convert the manual input to a tensor
    input_tensor = input_lang_tokenizer.texts_to_sequences([input_text])
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=input_tensor_test.shape[0], padding='post')
    input_tensor = np.array(input_tensor)

# Get the prediction for the manual input
    prediction = model.predict(input_tensor)



# # Get the predicted text for the manual input
    predicted_text = prediction_to_text(prediction[0])

# # Print the manual input and the predicted text
    print("Input Text:", input_text)
    print("Predicted Text:", predicted_text)





Enter number of words you want to predict2
Model Predictions: 
Enter a word: daku
Input Text: daku
Predicted Text: डाकू          
Enter a word: guru
Input Text: guru
Predicted Text: गुरूद्वारा          
