In [5]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Input, LSTM, GRU, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ------------------------
# Custom Config
# ------------------------
embed_sz = 32
units = 64
depth = 1
cell_kind = 'LSTM'  # 'GRU' or 'RNN'

# ------------------------
# Read Data from the TSV file
# ------------------------
file_path = 'hi.translit.sampled.train.tsv'

# Load data using pandas
df = pd.read_csv(file_path, sep='\t', header=None, names=['Hindi', 'Transliteration', 'Frequency'])

# Drop rows with any missing values
df = df.dropna(subset=['Hindi', 'Transliteration'])

# Convert to string (in case there are numbers/floats)
hin = df['Hindi'].astype(str).values
eng = df['Transliteration'].astype(str).values


# ------------------------
# Tokenization
# ------------------------
def tokenize(char_list):
    chars = sorted(set(''.join(char_list)))
    c2i = {c: i+1 for i, c in enumerate(chars)}
    i2c = {i: c for c, i in c2i.items()}
    return c2i, i2c

src_vocab, rev_src = tokenize(eng)
tgt_vocab, rev_tgt = tokenize(hin)

src_vocab_len = len(src_vocab) + 1
tgt_vocab_len = len(tgt_vocab) + 1

# ------------------------
# Encode and Pad Sequences
# ------------------------
def encode_sequence(word_list, mapper):
    return [[mapper[ch] for ch in word] for word in word_list]

src_encoded = encode_sequence(eng, src_vocab)
tgt_encoded = encode_sequence(hin, tgt_vocab)

sos_token = tgt_vocab_len
eos_token = tgt_vocab_len + 1

# Padding sequences to handle varying lengths
dec_input = pad_sequences([[sos_token] + seq for seq in tgt_encoded], padding='post')
dec_output = pad_sequences([seq + [eos_token] for seq in tgt_encoded], padding='post')
dec_output = np.expand_dims(dec_output, -1)
src_input = pad_sequences(src_encoded, padding='post')

# ------------------------
# RNN Selector
# ------------------------
def make_rnn(units, name, return_sequences=False, return_state=True):
    if cell_kind == 'GRU':
        return GRU(units, name=name, return_sequences=return_sequences, return_state=return_state)
    elif cell_kind == 'RNN':
        return SimpleRNN(units, name=name, return_sequences=return_sequences, return_state=return_state)
    else:
        return LSTM(units, name=name, return_sequences=return_sequences, return_state=return_state)

# ------------------------
# Model Assembly
# ------------------------
enc_input_layer = Input(shape=(None,), name="src_input")
enc_embed = Embedding(input_dim=src_vocab_len, output_dim=embed_sz, mask_zero=True, name="src_embed")(enc_input_layer)

# Encoder Stack
enc_out = enc_embed
states = []
for layer_num in range(depth):
    rnn = make_rnn(units, name=f"enc_rnn_{layer_num}")
    if cell_kind == 'LSTM':
        enc_out, state_h, state_c = rnn(enc_out)
        states = [state_h, state_c]
    else:
        enc_out, state_h = rnn(enc_out)
        states = [state_h]

dec_input_layer = Input(shape=(None,), name="tgt_input")
dec_embed = Embedding(input_dim=tgt_vocab_len + 2, output_dim=embed_sz, mask_zero=True, name="tgt_embed")(dec_input_layer)

# Decoder Stack
dec_out = dec_embed
for layer_num in range(depth):
    rnn = make_rnn(units, name=f"dec_rnn_{layer_num}", return_sequences=True)
    if cell_kind == 'LSTM':
        dec_out, _, _ = rnn(dec_out, initial_state=states)
    else:
        dec_out, _ = rnn(dec_out, initial_state=states)

final_dense = Dense(tgt_vocab_len + 2, activation='softmax', name="out_layer")(dec_out)

model = Model(inputs=[enc_input_layer, dec_input_layer], outputs=final_dense)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ------------------------
# Train
# ------------------------
model.fit([src_input, dec_input], dec_output, batch_size=2, epochs=10)


Epoch 1/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 15ms/step - accuracy: 0.1506 - loss: 2.1486
Epoch 2/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 16ms/step - accuracy: 0.2708 - loss: 0.7987
Epoch 3/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 15ms/step - accuracy: 0.2893 - loss: 0.6328
Epoch 4/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 15ms/step - accuracy: 0.2965 - loss: 0.5646
Epoch 5/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 15ms/step - accuracy: 0.3013 - loss: 0.5209
Epoch 6/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 15ms/step - accuracy: 0.3048 - loss: 0.4936
Epoch 7/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 15ms/step - accuracy: 0.3066 - loss: 0.4710
Epoch 8/10
[1m22101/22101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m375s[0m 14ms/step - accuracy: 0.3101

<keras.src.callbacks.history.History at 0x7fe7ac8cb490>

In [13]:
def predict(input_word, model, src_vocab, tgt_vocab, rev_tgt, embed_sz=32):
    # Tokenize the input word (convert to sequence)
    input_seq = [[src_vocab.get(char, 0) for char in input_word]]  # Use 0 for unknown chars
    input_seq = pad_sequences(input_seq, padding='post')

    # Create the decoder input sequence with the SOS token
    sos_token = tgt_vocab_len
    dec_input = np.array([[sos_token]])

    # Generate prediction sequence
    predicted_seq = []

    # Start predicting one token at a time until EOS token is reached
    for _ in range(50):  # Limiting the prediction to 50 tokens max
        # Predict the next token probabilities
        pred_probs = model.predict([input_seq, dec_input])

        # Get the predicted token (taking the max probability)
        pred_token = np.argmax(pred_probs[0, -1, :])  # Only consider the last prediction

        # If EOS token is predicted, stop the prediction
        if pred_token == tgt_vocab_len + 1:
            break

        # Add the predicted token to the sequence
        predicted_seq.append(pred_token)

        # Update decoder input for the next time step
        dec_input = np.array([predicted_seq]).reshape(1, len(predicted_seq))

    # Convert the predicted token sequence back to a Hindi word
    predicted_word = ''.join([rev_tgt[token] for token in predicted_seq])

    return predicted_word

# Example usage:
input_word = "anj"  # The input English word you want to transliterate
predicted_hindi = predict(input_word, model, src_vocab, tgt_vocab, rev_tgt)

print(f"Predicted Hindi Transliteration for '{input_word}': {predicted_hindi}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Predicted Hindi Transliteration for 'anj': अंज
