In [1]:
import xml.etree.ElementTree as ET  #5
import numpy as np
import tensorflow as tf

# Load the data from the XML file
def load_data(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    input_texts = []
    target_texts = []
    for i, item in enumerate(root):
        input_text = item[0].text
        target_text = item[1].text
        input_texts.append(input_text)
        target_texts.append(target_text)
    return input_texts, target_texts

# Tokenize the text data
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

# Load and tokenize the data
input_texts, target_texts = load_data('NEWS2018_M-EnHi_trn.xml')
input_tensor, inp_lang_tokenizer = tokenize(input_texts)
target_tensor, targ_lang_tokenizer = tokenize(target_texts)

# Calculate max length of input and target tensors
max_length_inp, max_length_tar = input_tensor.shape[1], target_tensor.shape[1]

# Convert the target tensor to have the expected shape
target_tensor = tf.keras.utils.to_categorical(target_tensor, num_classes=len(targ_lang_tokenizer.word_index) + 1)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_inp, padding='post')

# Create the RNN model
def create_model(inp_vocab_size, tar_vocab_size, max_length_inp, max_length_tar):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(inp_vocab_size, 128, input_length=max_length_inp))
    model.add(tf.keras.layers.GRU(128, return_sequences=True))
    model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(tar_vocab_size, activation='softmax')))
    return model


model = create_model(len(inp_lang_tokenizer.word_index) + 1, len(targ_lang_tokenizer.word_index) + 1, max_length_inp, max_length_tar)






In [2]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
history = model.fit(input_tensor, target_tensor, epochs=40, batch_size=128, validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


- Model's Accuracy on Training data is 99%
- Loss on Training data is 0.067

In [3]:
# Saving the model

model.save('RNN-wordTranslator.h5')

## Evaluating on Test Data

In [4]:
# # Loading the Model
from tensorflow.keras.models import load_model

model = load_model('RNN-wordTranslator.h5')

## Calculating Model's loss and Accuray on Testing data

In [5]:
# Load and tokenize the test data
input_texts_test, target_texts_test = load_data('NEWS2018_M-EnHi_dev.xml')
input_tensor_test, input_lang_tokenizer = tokenize(input_texts_test)
target_tensor_test, targ_lang_tokenizer = tokenize(target_texts_test)

# Get the number of unique words in the target language
tar_vocab_size = len(targ_lang_tokenizer.word_index) + 1

# Convert the target tensor to have the expected shape
final_layer_output_shape = model.layers[-1].output_shape
target_tensor_test = tf.keras.preprocessing.sequence.pad_sequences(target_tensor_test, maxlen=final_layer_output_shape[1], padding='post')
target_tensor_test = tf.keras.utils.to_categorical(target_tensor_test, num_classes=final_layer_output_shape[2])

# Pad the input tensor to have the expected shape
input_tensor_test = tf.keras.preprocessing.sequence.pad_sequences(input_tensor_test, maxlen=final_layer_output_shape[1], padding='post')

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(input_tensor_test, target_tensor_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)



Test loss: 1.782827377319336
Test accuracy: 0.8644000291824341


- Loss on Testing data is 1.78
- Models accuracy on testing data is 0.8644

## Model's prediction output on Test data

In [7]:
# Inverse map the index to the word using the tokenizer
input_word_index = input_lang_tokenizer.word_index
input_index_word = dict([(index, word) for word, index in input_word_index.items()])

target_word_index = targ_lang_tokenizer.word_index
target_index_word = dict([(index, word) for word, index in target_word_index.items()])

# Get the predictions for the test data
predictions = model.predict(input_tensor_test)

# Function to convert the index to the word
def index_to_word(index):
  return target_index_word.get(index, '')

# Function to convert the prediction to the text
def prediction_to_text(prediction):
  text = ''
  for i in range(prediction.shape[0]):
    index = np.argmax(prediction[i])
    word = index_to_word(index)
    if word == 'endseq':
      return text
    text += word + ' '
  return text

# Get the predicted texts for the test data
predicted_texts = []
for i in range(predictions.shape[0]):
  prediction = predictions[i, :, :]
  text = prediction_to_text(prediction)
  predicted_texts.append(text)

# Print the input text, target text, and predicted text for 10 samples
for i in range(len(input_texts_test)):
  print("Input Text:", input_texts_test[i])
  print("Target Text:", target_texts_test[i])
  print("Predicted Text:", predicted_texts[i])
  print()


Input Text: aachaaryanandana
Target Text: आचार्यनंदना
Predicted Text: मेरी          

Input Text: aachaarysut
Target Text: आचार्यसुत
Predicted Text: सिटी          

Input Text: aacharynandan
Target Text: आचार्यनंदन
Predicted Text: तूफान          

Input Text: aacharynandanaa
Target Text: आचार्यनंदना
Predicted Text: फिफ्टी          

Input Text: aadamkhor
Target Text: आदमखोर
Predicted Text: सेंट          

Input Text: aag aur shola
Target Text: आग और शोला
Predicted Text: सन और शोएब        

Input Text: aaj ki baat
Target Text: आज की बात
Predicted Text: मेरा अवार्ड आशा        

Input Text: aaj ki taza khabar
Target Text: आज की ताज़ा खबर
Predicted Text: मेरा अवार्ड आचार्यसुत आचार्यनंदन       

Input Text: aakanksha
Target Text: आकांक्षा
Predicted Text: चोर          

Input Text: aan
Target Text: आन
Predicted Text: एबीसो          

Input Text: aankhon dekhi
Target Text: आँखों देखी
Predicted Text: प्रो ताज़ा         

Input Text: aansoo ban gaye phool
Target Text: आँसू बन गए फ़ूल
Predicted Tex