<a href="https://colab.research.google.com/github/VJsai45/English-to-Hindi-Transliteration/blob/main/SequenceModelAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from numpy import array
from numpy import argmax
from numpy import array_equal
from random import randint
import pandas as pd


In [25]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
# Download Dakshina dataset

In [None]:
!tar -xvf 'dakshina_dataset_v1.0.tar' -C '/content/drive/MyDrive/Deep Learning Data/Dakshina Dataset'
# untar and store data

In [26]:
all_hindi_eng_data = pd.read_csv('/content/drive/MyDrive/Deep Learning Data/Dakshina Dataset/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv',names=['all'])
# read data into dataframe

In [27]:
all_hindi_eng_data

Unnamed: 0,all
0,अं\tan\t3
1,अंकगणित\tankganit\t3
2,अंकल\tuncle\t4
3,अंकुर\tankur\t4
4,अंकुरण\tankuran\t3
...,...
44199,ह्वेनसांग\thiuentsang\t1
44200,ह्वेनसांग\thsuantsang\t1
44201,ह्वेनसांग\thyensang\t1
44202,ह्वेनसांग\txuanzang\t1


In [28]:
split_data = all_hindi_eng_data["all"].str.split("\t",n=2,expand=True)
# split the eng words, hindi words and length data

In [29]:
X_train_english = split_data[1]
y_train_hindi = split_data[0]
X_train_english, y_train_hindi
# read them into series

(0                an
 1          ankganit
 2             uncle
 3             ankur
 4           ankuran
             ...    
 44199    hiuentsang
 44200    hsuantsang
 44201      hyensang
 44202      xuanzang
 44203            om
 Name: 1, Length: 44204, dtype: object, 0               अं
 1          अंकगणित
 2             अंकल
 3            अंकुर
 4           अंकुरण
            ...    
 44199    ह्वेनसांग
 44200    ह्वेनसांग
 44201    ह्वेनसांग
 44202    ह्वेनसांग
 44203            ॐ
 Name: 0, Length: 44204, dtype: object)

In [30]:
X_train_english_array = np.array(X_train_english)
X_train_english_array
y_train_hindi_array = np.array(y_train_hindi)
y_train_hindi_array
# convert to arrays

array(['अं', 'अंकगणित', 'अंकल', ..., 'ह्वेनसांग', 'ह्वेनसांग', 'ॐ'],
      dtype=object)

In [31]:
X_train_english_chars = X_train_english.apply(lambda x: list(x)) 

In [32]:
y_train_hindi_chars = y_train_hindi.apply(lambda x: list(x))

In [33]:
input_words = X_train_english_array.copy()
target_words = [y+'>' for y in y_train_hindi_array] # add start token in target words
target_shifted_words = ['<'+y for y in y_train_hindi_array] # add end token in shifted target words(decoder input)


In [34]:
eng_chars = []
for word in input_words:
  for char in word:
    if char not in eng_chars:
      eng_chars.append(char)
eng_chars = sorted(eng_chars)
# create english characters dictionary

In [35]:
hindi_chars = ['<']
for word in target_words:
  for char in word:
    if char not in hindi_chars:
      hindi_chars.append(char)
hindi_chars = sorted(hindi_chars)
# create hindi characters dictionary

In [36]:
no_of_encoder_tokens = len(eng_chars)
no_of_decoder_tokens = len(hindi_chars)
max_encoder_sequence_length = max([len(word) for word in input_words])
max_decoder_sequence_length = max([len(word) for word in target_words])
# calculate number of encoder/decoder token and max encoder/deocder word length

In [37]:
input_token_index = dict([(char, i) for i, char in enumerate(eng_chars)])
target_token_index = dict([(char, i) for i, char in enumerate(hindi_chars)])
# create input and target character dictionaries

In [38]:
reverse_input_token_index = dict([(i, char) for i, char in enumerate(eng_chars)])
reverse_output_token_index = dict([(i, char) for i, char in enumerate(hindi_chars)])
# create reverse input and target character dictionaries while decoding

In [39]:
encoder_input_data = np.zeros((len(input_words),max_encoder_sequence_length,no_of_encoder_tokens),dtype='float32')
decoder_input_data = np.zeros((len(target_shifted_words),max_decoder_sequence_length,no_of_decoder_tokens),dtype='float32')
decoder_output_data = np.zeros((len(target_words),max_decoder_sequence_length,no_of_decoder_tokens),dtype='float32')
# create encoder/decoder input/data arrays

In [40]:
decoder_input_data.shape

(44204, 20, 65)

In [41]:
for i,(input_word,target_shifted_word,target_word) in enumerate(zip(input_words,target_shifted_words,target_words)):
  for t,char in enumerate(input_word):
    encoder_input_data[i,t,input_token_index[char]] = '1.0'
  for t,char in enumerate(target_shifted_word):
    decoder_input_data[i,t,target_token_index[char]] = '1.0'
  for t,char in enumerate(target_word):
    decoder_output_data[i,t,target_token_index[char]] = '1.0'

# one hot encode the data and fill up the arrays

In [42]:
# returns train, inference_encoder and inference_decoder models
def define_models(n_input, n_output, n_units):
	# define training encoder
	encoder_inputs = Input(shape=(None, n_input))
	encoder = LSTM(n_units, return_state=True)
	encoder_outputs, state_h, state_c = encoder(encoder_inputs)
	encoder_states = [state_h, state_c]
	# define training decoder
	decoder_inputs = Input(shape=(None, n_output))
	decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
	decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
	decoder_dense = Dense(n_output, activation='softmax')
	decoder_outputs = decoder_dense(decoder_outputs)
	model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
	# define inference encoder
	encoder_model = Model(encoder_inputs, encoder_states)
	# define inference decoder
	decoder_state_input_h = Input(shape=(n_units,))
	decoder_state_input_c = Input(shape=(n_units,))
	decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
	decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
	decoder_states = [state_h, state_c]
	decoder_outputs = decoder_dense(decoder_outputs)
	decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
	# return all models
	return model, encoder_model, decoder_model


In [43]:
latent_dimensions = 256

In [44]:
training_model, inference_encoder, inference_decoder = define_models(no_of_encoder_tokens, no_of_decoder_tokens, latent_dimensions)

In [45]:
training_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [46]:
def predict_word(input):
  one_hot_input = np.zeros((1,max_encoder_sequence_length,no_of_encoder_tokens),dtype="float32")
  for i,char in enumerate(input):
    one_hot_input[0,i,input_token_index[char]] = '1.0'
  encoder_output_state = inference_encoder.predict(one_hot_input)
  # encode the input
  start_token = np.zeros((1,1,no_of_decoder_tokens),dtype="float32")
  # create the start token for decoder
  start_token[0,0,target_token_index['<']] = '1.0'
  decoded_word = ""
  decoded_char = ''
  decoder_states = encoder_output_state
  decoder_input = start_token
  count = 0
  stop = False
  while not stop:
    # copy the encoder op state to decoder state
    decoder_output,decoder_state_hidden,decoder_state_cell = inference_decoder.predict(
       [decoder_input] + decoder_states 
    )
    # pass the start token and states to decoder
    decoded_char_index = np.argmax(decoder_output[0,-1,:])
    decoded_char = reverse_output_token_index[decoded_char_index]
    decoded_word += decoded_char
    # deocde the char and append to the word
    decoder_input = np.zeros((1,1,no_of_decoder_tokens),dtype="float32")
    decoder_input[0,0,target_token_index[decoded_char]] = '1.0'
    # update the decoder input
    decoder_states = [decoder_state_hidden,decoder_state_cell]
    count = count + 1
    if (decoded_char == '>' or count > 10):
      stop = True
    # update the decoder states
  return decoded_word



In [47]:
predict_word('hello')

'गगओओणैइैववई'

In [None]:
training_model.fit([encoder_input_data,decoder_input_data],decoder_output_data,
    batch_size=64,
    epochs=20,
    validation_split=0.2,)

Epoch 1/20
102/553 [====>.........................] - ETA: 1:47 - loss: 1.2575 - accuracy: 0.0639

# Number Sequence Encoder Decoder


In [None]:
# generate a sequence of random integers
def generate_sequence(length, n_unique):
	return [randint(1, n_unique-1) for _ in range(length)]

In [None]:
def predict_sequence(inferenceEncoder,inferenceDecoder,source_input,size_of_output,no_of_features):
  # encode the source_input
  state = inferenceEncoder.predict(source_input) # final state of encoder and initial state of decoder
  # start of sequence input(<START> token)
  target_seq = array([0.0 for _ in range(51)]).reshape(1, 1, 51)
  output = list()
  for t in range(size_of_output):
    # pass the start of seq and  state of decoder to get deocder output and states
    yDecoder,h,c = inferenceDecoder.predict([target_seq] + state)
    # build the output
    output.append(yDecoder[0,0,:])
    # update the state with new decoder state
    state = [h,c]
    # update the target seq
    target_seq = yDecoder
  return array(output)

In [None]:
argmax(array([0,1,0.1,0.2,0.8]))

1

In [None]:
# To decode the number series
def one_hot_decode(encoded_seq):
	return [argmax(vector) for vector in encoded_seq]


In [None]:
# prepare data for the LSTM
def get_dataset(n_in, n_out, cardinality, n_samples):
	X1, X2, y = list(), list(), list()
	for _ in range(n_samples):
		# generate source sequence
		source = generate_sequence(n_in, cardinality)
		# define padded target sequence
		target = source.copy()
		target.reverse()
		# create padded input target sequence
		target_in = [0] + target[:-1]
		# encode
		src_encoded = to_categorical([source], num_classes=cardinality).reshape(len(source),cardinality)
		tar_encoded = to_categorical([target], num_classes=cardinality).reshape(len(target),cardinality)
		tar2_encoded = to_categorical([target_in], num_classes=cardinality).reshape(len(target_in),cardinality)
		# store
		X1.append(src_encoded)
		X2.append(tar2_encoded)
		y.append(tar_encoded)
	return array(X1), array(X2), array(y)

In [None]:
# configure problem
n_features = 50 + 1
n_steps_in = 6
n_steps_out = 6

In [None]:
# define model
train, infenc, infdec = define_models(n_features, n_features, 256)
train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# generate training dataset
X1, X2, y = get_dataset(n_steps_in, n_steps_out, n_features, 100000)



In [None]:
source = generate_sequence(10, 51)
to_categorical([source], num_classes=51).reshape(10,51).shape


(10, 51)

In [None]:
# train model
train.fit([X1, X2], y, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f5b1aae5810>

In [None]:
total, correct = 100, 0
for _ in range(total):
	X1, X2, y = get_dataset(n_steps_in, n_steps_out, 51, 1)
	target = predict_sequence(infenc, infdec, X1, n_steps_out, 51)
	if array_equal(one_hot_decode(y[0]), one_hot_decode(target)):
		correct += 1
print('Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))

Accuracy: 97.00%


In [None]:
# spot check some examples
for _ in range(10):
	X1, X2, y = get_dataset(n_steps_in, n_steps_out, n_features, 1)
	target = predict_sequence(infenc, infdec, X1, n_steps_out, n_features)
	print('X=%s y=%s, yhat=%s' % (one_hot_decode(X1[0]), one_hot_decode(y[0]), one_hot_decode(target)))