Step 1: import modules


In [3]:
!pip install -U tensorflow -q

# Importing all required modules
import tensorflow as tf
import numpy as np
import string
import re
import io
import os
import zipfile
import requests
from sklearn.model_selection import train_test_split


print("Using TensorFlow version:", tf.__version__)




[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m787.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.18.1 requires tensorflow<2.19,>=2.18.0, but you have tensorflow 2.19.0 which is incompatible.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.19.0 which is incompatible.
tensorflow-decision-forests 1.11.0 requires tensorflow==2.18.0, but you have tensorflow 2.19.0 which is incompatible.[0m[31m
[0mUsing TensorFlow version: 2.18.0


In [9]:
import os
print(os.getcwd())
print(os.listdir('.'))



/content
['.config', 'drive', 'hi.translit.sampled.train.tsv', 'sample_data']


Step 2:Load dataset



In [12]:
import os
print("Working directory:", os.getcwd())
print("Files here:", os.listdir('.'))


print("\n--- File Preview (first 5 lines) ---")
with open('hi.translit.sampled.train.tsv', 'r', encoding='utf-8') as f:
    for i in range(5):
        line = f.readline()
        if not line:
            break
        print(f"{i+1:>2}: {repr(line)}")


train_path = 'hi.translit.sampled.train.tsv'
input_texts, target_texts = [], []

with open(train_path, 'r', encoding='utf-8') as f:
    for line in f:

        raw = line.rstrip('\n')

        if '\t' in raw:
            parts = raw.split('\t')
        else:

            parts = raw.split()
        if len(parts) >= 2:
            latin, dev = parts[0], parts[1]
            input_texts.append(latin)
            target_texts.append('\t' + dev + '\n')


print(f"\nTotal pairs loaded: {len(input_texts)}")
for i in range(min(5, len(input_texts))):
    print(f"{i+1}. Latin = {input_texts[i]:10} → Deva = {target_texts[i]!r}")




Working directory: /content
Files here: ['.config', 'drive', 'hi.translit.sampled.train.tsv', 'sample_data']

--- File Preview (first 5 lines) ---
 1: 'अं\tan\t3\n'
 2: 'अंकगणित\tankganit\t3\n'
 3: 'अंकल\tuncle\t4\n'
 4: 'अंकुर\tankur\t4\n'
 5: 'अंकुरण\tankuran\t3\n'

Total pairs loaded: 44204
1. Latin = अं         → Deva = '\tan\n'
2. Latin = अंकगणित    → Deva = '\tankganit\n'
3. Latin = अंकल       → Deva = '\tuncle\n'
4. Latin = अंकुर      → Deva = '\tankur\n'
5. Latin = अंकुरण     → Deva = '\tankuran\n'


Step 3: preprocess and vectorize

In [13]:
import numpy as np


all_src_chars = sorted({ch for txt in input_texts for ch in txt})
all_tgt_chars = sorted({ch for txt in target_texts for ch in txt})

num_src_tokens = len(all_src_chars)
num_tgt_tokens = len(all_tgt_chars)


char2src = {ch: i for i, ch in enumerate(all_src_chars)}
char2tgt = {ch: i for i, ch in enumerate(all_tgt_chars)}
src2char = {i: ch for ch, i in char2src.items()}
tgt2char = {i: ch for ch, i in char2tgt.items()}


max_src_len = max(len(txt) for txt in input_texts)
max_tgt_len = max(len(txt) for txt in target_texts)


encoder_input_data = np.zeros(
    (len(input_texts), max_src_len), dtype="int32"
)

decoder_input_data = np.zeros(
    (len(target_texts), max_tgt_len), dtype="int32"
)

decoder_target_data = np.zeros(
    (len(target_texts), max_tgt_len, num_tgt_tokens),
    dtype="float32"
)


for i, (src, tgt) in enumerate(zip(input_texts, target_texts)):
    for t, ch in enumerate(src):
        encoder_input_data[i, t] = char2src[ch]
    for t, ch in enumerate(tgt):
        decoder_input_data[i, t] = char2tgt[ch]

        if t > 0:
            decoder_target_data[i, t-1, char2tgt[ch]] = 1.0

print("≻ Vocabulary sizes:", num_src_tokens, "→", num_tgt_tokens)
print("≻ Sequence lengths:", max_src_len, "→", max_tgt_len)


≻ Vocabulary sizes: 63 → 28
≻ Sequence lengths: 19 → 22


Step 4 : define seq2seq model

In [14]:
import tensorflow as tf

# Hyperparameters
embedding_dim = 128
hidden_units  = 256

# --- Encoder ---
enc_inputs   = tf.keras.Input(shape=(None,), name="enc_inputs")
enc_emb_layer = tf.keras.layers.Embedding(
    input_dim=num_src_tokens,
    output_dim=embedding_dim,
    name="enc_embedding"
)
enc_embedded = enc_emb_layer(enc_inputs)
enc_lstm     = tf.keras.layers.LSTM(
    hidden_units, return_state=True, name="enc_lstm"
)
_, enc_state_h, enc_state_c = enc_lstm(enc_embedded)
encoder_states = [enc_state_h, enc_state_c]

# --- Decoder ---
dec_inputs   = tf.keras.Input(shape=(None,), name="dec_inputs")
dec_emb_layer = tf.keras.layers.Embedding(
    input_dim=num_tgt_tokens,
    output_dim=embedding_dim,
    name="dec_embedding"
)
dec_embedded = dec_emb_layer(dec_inputs)
dec_lstm     = tf.keras.layers.LSTM(
    hidden_units, return_sequences=True, return_state=True, name="dec_lstm"
)
dec_outputs, _, _ = dec_lstm(
    dec_embedded, initial_state=encoder_states
)
dec_dense = tf.keras.layers.Dense(
    num_tgt_tokens, activation="softmax", name="dec_dense"
)
dec_outputs = dec_dense(dec_outputs)


seq2seq_model = tf.keras.Model(
    [enc_inputs, dec_inputs], dec_outputs, name="seq2seq"
)
seq2seq_model.compile(
    optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
)
seq2seq_model.summary()


Step 5: train the model

In [16]:
batch_size = 32
epochs     = 5

history = seq2seq_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    shuffle=True
)

Epoch 1/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 175ms/step - accuracy: 0.2958 - loss: 0.2728 - val_accuracy: 0.2420 - val_loss: 0.4846
Epoch 2/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 175ms/step - accuracy: 0.3039 - loss: 0.2455 - val_accuracy: 0.2562 - val_loss: 0.4442
Epoch 3/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 175ms/step - accuracy: 0.3122 - loss: 0.2192 - val_accuracy: 0.2552 - val_loss: 0.4402
Epoch 4/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 176ms/step - accuracy: 0.3149 - loss: 0.2060 - val_accuracy: 0.2669 - val_loss: 0.3969
Epoch 5/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 179ms/step - accuracy: 0.3177 - loss: 0.1943 - val_accuracy: 0.2684 - val_loss: 0.4001


Step 6 :Inference models

In [18]:
# 1) Encoder inference
inf_encoder = tf.keras.Model(enc_inputs, encoder_states)

# 2) Decoder inference
#    - inputs: previous char + previous states
dec_state_input_h = tf.keras.Input(shape=(hidden_units,), name="state_h")
dec_state_input_c = tf.keras.Input(shape=(hidden_units,), name="state_c")
dec_states_inputs = [dec_state_input_h, dec_state_input_c]

dec_embed_inf = dec_emb_layer(dec_inputs)
dec_outputs_inf, state_h_inf, state_c_inf = dec_lstm(
    dec_embed_inf, initial_state=dec_states_inputs
)
dec_states_outputs = [state_h_inf, state_c_inf]
dec_outputs_inf = dec_dense(dec_outputs_inf)

inf_decoder = tf.keras.Model(
    [dec_inputs] + dec_states_inputs,
    [dec_outputs_inf] + dec_states_outputs,
    name="inf_decoder"
)


Step 7: decode and test

In [19]:
def transliterate_sequence(src_seq):
    # Encode input sequence to get initial states
    states_val = inf_encoder.predict(src_seq)

    tgt_seq = np.array([[char2tgt['\t']]])
    output_str = ""


    for _ in range(max_tgt_len):
        preds, h, c = inf_decoder.predict([tgt_seq] + states_val)
        idx = np.argmax(preds[0, -1, :])
        char = tgt2char[idx]
        output_str += char
        if char == '\n':
            break

        tgt_seq = np.array([[idx]])
        states_val = [h, c]
    return output_str


for i in range(10):

    test_input = encoder_input_data[i : i+1]
    pred = transliterate_sequence(test_input)
    print(f"Src   = {input_texts[i]}")
    print(f"True  = {target_texts[i].strip()}")
    print(f"Pred  = {pred.strip()}")
    print("-" * 30)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Src   = अं
True  = an
Pred  = an
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
