<a href="https://colab.research.google.com/github/SOWMYASRI7/DL-ASSIGNMENT-2/blob/main/DL_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd
import unicodedata
import re

In [None]:
import tensorflow as tf

# Load data (with Devanagari → Latin → freq columns)
def load_data(file_path, num_samples=None):
    """Read tab-separated transliteration pairs from ``file_path``.

    For every line with at least two tab-separated fields, field 1 becomes
    the (Latin) input text and field 0 the (Devanagari) target text; both are
    passed through ``preprocess_sentence``. Reading stops as soon as the
    zero-based line index reaches ``num_samples`` (when it is truthy).

    NOTE(review): ``preprocess_sentence`` is not defined in the visible part
    of this notebook -- confirm it exists before calling this version.

    Returns:
        (input_texts, target_texts): two parallel lists of strings.
    """
    input_texts, target_texts = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f):
            if num_samples and line_no >= num_samples:
                break
            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                # Not enough fields to form an (input, target) pair.
                continue
            source_text = preprocess_sentence(columns[1])
            target_text = preprocess_sentence(columns[0])
            input_texts.append(source_text)
            target_texts.append(target_text)
    return input_texts, target_texts


# Tokenize character-wise
def tokenize_char(sequences):
    """Fit a character-level Keras ``Tokenizer`` on ``sequences`` and encode them.

    ``filters=''`` is passed so that no characters are filtered out.

    Returns:
        (encoded, tokenizer): the texts as lists of integer ids, and the
        fitted tokenizer.
    """
    char_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    char_tokenizer.fit_on_texts(sequences)
    return char_tokenizer.texts_to_sequences(sequences), char_tokenizer

# Pad sequences
def pad_sequences(sequences, maxlen=None):
    """Pad ``sequences`` at the end (``padding='post'``) via the Keras helper.

    ``maxlen`` is forwarded unchanged to the Keras ``pad_sequences`` function.
    """
    keras_pad = tf.keras.preprocessing.sequence.pad_sequences
    return keras_pad(sequences, padding='post', maxlen=maxlen)

# Full pipeline
def preprocess_dataset(file_path, num_samples=None):
    """Load, tokenize (per character) and pad a transliteration dataset.

    Returns:
        (input_tensor, target_tensor, inp_tokenizer, targ_tokenizer,
        max_input_len, max_target_len), where the tensors are the padded id
        arrays and the lengths are the longest encoded input/target.
    """
    input_texts, target_texts = load_data(file_path, num_samples)

    input_ids, inp_tokenizer = tokenize_char(input_texts)
    target_ids, targ_tokenizer = tokenize_char(target_texts)

    # Longest encoded sequence on each side determines the padded width.
    max_input_len = max(map(len, input_ids))
    max_target_len = max(map(len, target_ids))

    return (
        pad_sequences(input_ids, max_input_len),
        pad_sequences(target_ids, max_target_len),
        inp_tokenizer,
        targ_tokenizer,
        max_input_len,
        max_target_len,
    )

In [None]:
# Cell 4: Display Sample of Preprocessed Data
# Relies on input_tensor / target_tensor / inp_tokenizer / targ_tokenizer
# already existing in the kernel (they come from preprocess_dataset).
num_samples_to_view = 5

print("Sample of Preprocessed Data:")
for i in range(num_samples_to_view):
    input_row = input_tensor[i]
    target_row = target_tensor[i]
    # Id 0 is padding, so it is skipped when mapping ids back to characters.
    decoded_input = ''.join([inp_tokenizer.index_word.get(idx, '') for idx in input_row if idx != 0])
    decoded_target = ''.join([targ_tokenizer.index_word.get(idx, '') for idx in target_row if idx != 0])
    print(f"Latin Input (Encoded): {input_tensor[i]}")
    print(f"Latin Input (Decoded): {decoded_input}")
    print(f"Devanagari Target (Encoded): {target_tensor[i]}")
    print(f"Devanagari Target (Decoded): {decoded_target}")
    print('-' * 50)



Sample of Preprocessed Data:
Latin Input (Encoded): [1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Latin Input (Decoded): an
Devanagari Target (Encoded): [ 1 10  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
Devanagari Target (Decoded): 	अं

--------------------------------------------------
Latin Input (Encoded): [ 1  3  4 19  1  3  2  7  0  0  0  0  0  0  0  0  0  0]
Latin Input (Decoded): ankganit
Devanagari Target (Encoded): [ 1 10  8  4 28 40  9 11  2  0  0  0  0  0  0  0  0  0  0  0]
Devanagari Target (Decoded): 	अंकगणित

--------------------------------------------------
Latin Input (Encoded): [11  3 16 13  8  0  0  0  0  0  0  0  0  0  0  0  0  0]
Latin Input (Decoded): uncle
Devanagari Target (Encoded): [ 1 10  8  4 13  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
Devanagari Target (Decoded): 	अंकल

--------------------------------------------------
Latin Input (Encoded): [ 1  3  4 11  6  0  0  0  0  0  0  0  0  0  0  0  0  0]
Latin Input (Decoded): ankur
Devanagari Target (Enc

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for validation; the fixed seed keeps the split
# identical across runs.
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor,
    target_tensor,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining input shape: {input_tensor_train.shape}")
print(f"Validation input shape: {input_tensor_val.shape}")
print(f"Training target shape: {target_tensor_train.shape}")
print(f"Validation target shape: {target_tensor_val.shape}")



Training input shape: (8000, 18)
Validation input shape: (2000, 18)
Training target shape: (8000, 20)
Validation target shape: (2000, 20)


In [None]:
# Encoder hyper-parameters. The decoder and seq2seq cells read embedding_dim,
# hidden_units and cell_type from these globals.
embedding_dim = 64
hidden_units = 128
cell_type = 'lstm'  # You can experiment with 'rnn' or 'gru'
num_encoder_layers = 1

# Fail fast: an unrecognised cell_type would otherwise skip every branch below
# and leave the encoder without any recurrent layer or state outputs.
if cell_type not in ('lstm', 'gru', 'rnn'):
    raise ValueError(f"Unsupported cell_type {cell_type!r}; expected 'lstm', 'gru' or 'rnn'.")

encoder_inputs = tf.keras.layers.Input(shape=(None,), name='encoder_input')
# +1 because tokenizer ids start at 1; id 0 is the padding value.
encoder_embedding = tf.keras.layers.Embedding(len(inp_tokenizer.word_index) + 1, embedding_dim, name='encoder_embedding')(encoder_inputs)
encoder_outputs = encoder_embedding
encoder_states = []  # flat list: [h, c] per LSTM layer, [h] per GRU/RNN layer

for i in range(num_encoder_layers):
    if cell_type == 'lstm':
        encoder_lstm = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True, name=f'encoder_lstm_{i}')
        encoder_outputs, state_h, state_c = encoder_lstm(encoder_outputs)
        encoder_states.extend([state_h, state_c])
    elif cell_type == 'gru':
        encoder_gru = tf.keras.layers.GRU(hidden_units, return_sequences=True, return_state=True, name=f'encoder_gru_{i}')
        encoder_outputs, state_h = encoder_gru(encoder_outputs)
        encoder_states.append(state_h)
    elif cell_type == 'rnn':
        encoder_rnn = tf.keras.layers.SimpleRNN(hidden_units, return_sequences=True, return_state=True, name=f'encoder_rnn_{i}')
        encoder_outputs, state_h = encoder_rnn(encoder_outputs)
        encoder_states.append(state_h)

# The encoder model exposes only the final states, not the output sequence.
encoder = tf.keras.models.Model(encoder_inputs, encoder_states)


In [None]:
# Cell 7: Define the Decoder Model
num_decoder_layers = 1

# Fail fast: with an unrecognised cell_type every branch below is skipped and
# the "decoder" would silently become embedding -> dense with no recurrence.
if cell_type not in ('lstm', 'gru', 'rnn'):
    raise ValueError(f"Unsupported cell_type {cell_type!r}; expected 'lstm', 'gru' or 'rnn'.")

decoder_inputs = tf.keras.layers.Input(shape=(None,), name='decoder_input')
# +1 because tokenizer ids start at 1; id 0 is the padding value.
decoder_embedding = tf.keras.layers.Embedding(len(targ_tokenizer.word_index) + 1, embedding_dim, name='decoder_embedding')(decoder_inputs)
decoder_outputs = decoder_embedding
decoder_states_inputs = []  # explicit state Inputs, one set per layer
decoder_states = []         # matching output states, in the same order

for i in range(num_decoder_layers):
    if cell_type == 'lstm':
        state_h_in = tf.keras.layers.Input(shape=(hidden_units,), name=f'decoder_state_h_in_{i}')
        state_c_in = tf.keras.layers.Input(shape=(hidden_units,), name=f'decoder_state_c_in_{i}')
        decoder_states_inputs.extend([state_h_in, state_c_in])
        decoder_lstm = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True, name=f'decoder_lstm_{i}')
        decoder_outputs, state_h_out, state_c_out = decoder_lstm(decoder_outputs, initial_state=[state_h_in, state_c_in])
        decoder_states.extend([state_h_out, state_c_out])
    elif cell_type == 'gru':
        state_h_in = tf.keras.layers.Input(shape=(hidden_units,), name=f'decoder_state_h_in_{i}')
        decoder_states_inputs.append(state_h_in)
        decoder_gru = tf.keras.layers.GRU(hidden_units, return_sequences=True, return_state=True, name=f'decoder_gru_{i}')
        decoder_outputs, state_h_out = decoder_gru(decoder_outputs, initial_state=state_h_in)
        decoder_states.append(state_h_out)
    elif cell_type == 'rnn':
        state_h_in = tf.keras.layers.Input(shape=(hidden_units,), name=f'decoder_state_h_in_{i}')
        decoder_states_inputs.append(state_h_in)
        decoder_rnn = tf.keras.layers.SimpleRNN(hidden_units, return_sequences=True, return_state=True, name=f'decoder_rnn_{i}')
        decoder_outputs, state_h_out = decoder_rnn(decoder_outputs, initial_state=state_h_in)
        decoder_states.append(state_h_out)

# Softmax over the target vocabulary at every time step.
decoder_dense = tf.keras.layers.Dense(len(targ_tokenizer.word_index) + 1, activation='softmax', name='decoder_output')
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: token ids + incoming states. Outputs: token probabilities + new states.
decoder = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)


In [None]:
# Cell 8: Define the Seq2Seq Model for Training
model_inputs = [encoder_inputs, decoder_inputs]

# An LSTM layer contributes two state tensors (h, c); GRU / SimpleRNN one (h).
if cell_type == 'lstm':
    initial_decoder_states = encoder_states[:2 * num_decoder_layers]
else:
    initial_decoder_states = encoder_states[:num_decoder_layers]

# Ensure initial_decoder_states is a list of tensors
if not isinstance(initial_decoder_states, list):
    initial_decoder_states = [initial_decoder_states]

# The decoder model returns [token_probabilities, *states]; training only
# needs the probabilities (index 0).
model_outputs = decoder([decoder_inputs, *initial_decoder_states])[0]
model = tf.keras.models.Model(model_inputs, model_outputs)

In [None]:
optimizer = tf.keras.optimizers.Adam()
# The decoder's final Dense layer already applies softmax, so the loss must be
# fed probabilities (from_logits=False). The targets produced by the
# preprocessing pipeline are integer id arrays rather than one-hot vectors,
# hence the sparse variant of categorical cross-entropy.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer=optimizer, loss=loss_fn)

# Print model summary (optional)
model.summary()

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
from tensorflow.keras.callbacks import ModelCheckpoint

# --- Define Preprocessing Functions ---
def unicode_to_ascii(s):
    """Return ``s`` NFKD-normalized with all non-spacing marks removed.

    Only characters whose Unicode category is ``Mn`` are dropped (e.g. the
    combining accent left after decomposing an accented letter); every other
    character is kept as-is.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_latin(w):
    """Normalize a Latin-script string and wrap it in start/end markers.

    Steps: lowercase and trim, strip combining marks, put spaces around
    ``? . ! , ¿``, replace double quotes/backslashes and any run of characters
    outside ``a-zA-Z?.!,¿`` with a single space, trim again, then add a
    leading tab and a trailing newline.
    """
    cleaned = unicode_to_ascii(w.lower().strip())
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'["\\]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return '\t' + cleaned.strip() + '\n'

def preprocess_devanagari(w):
    """Trim surrounding whitespace and wrap ``w`` in start/end markers.

    The result begins with a tab and ends with a newline; ``decode_sequence``
    uses the tab as its first decoder token and stops on the newline.
    """
    return '\t' + w.strip() + '\n'

def load_data(file_path, num_samples=None):
    """Read tab-separated transliteration pairs from ``file_path``.

    For every line with at least two tab-separated fields, field 1 is cleaned
    with ``preprocess_latin`` (model input) and field 0 with
    ``preprocess_devanagari`` (model target). Reading stops as soon as the
    zero-based line index reaches ``num_samples`` (when it is truthy), so
    skipped lines still count towards the limit.

    Returns:
        (input_texts, target_texts): two parallel lists of strings.
    """
    latin_texts = []
    devanagari_texts = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if num_samples and index >= num_samples:
                break
            fields = raw_line.strip().split('\t')
            if len(fields) < 2:
                # Not enough fields to form an (input, target) pair.
                continue
            latin = preprocess_latin(fields[1])
            devanagari = preprocess_devanagari(fields[0])
            latin_texts.append(latin)
            devanagari_texts.append(devanagari)
    return latin_texts, devanagari_texts

def tokenize_char(sequences):
    """Build a per-character vocabulary from ``sequences`` and encode them.

    A Keras ``Tokenizer`` is created with ``char_level=True`` and an empty
    ``filters`` string, fitted on ``sequences``, and then used to convert the
    same texts to lists of integer ids.

    Returns:
        (encoded, tokenizer)
    """
    tok = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    tok.fit_on_texts(sequences)
    encoded = tok.texts_to_sequences(sequences)
    return encoded, tok

def pad_sequences(sequences, maxlen=None, padding='post'):
    """Thin wrapper over the Keras ``pad_sequences`` helper.

    Differs from calling Keras directly only in that ``padding`` defaults to
    ``'post'`` here; ``maxlen`` is forwarded unchanged.
    """
    pad_fn = tf.keras.preprocessing.sequence.pad_sequences
    return pad_fn(sequences, padding=padding, maxlen=maxlen)

def preprocess_dataset(file_path, num_samples=None):
    """Load, tokenize (per character) and pad a transliteration dataset.

    Args:
        file_path: path of the tab-separated data file passed to ``load_data``.
        num_samples: optional line limit forwarded to ``load_data``.

    Returns:
        (input_tensor, target_tensor, inp_tokenizer, targ_tokenizer,
        max_input_len, max_target_len), where the tensors are the padded id
        arrays and the lengths are the longest encoded input/target.

    Raises:
        ValueError: if no usable (input, target) pair was read. Previously
            this surfaced as an opaque "max() arg is an empty sequence".
    """
    input_texts, target_texts = load_data(file_path, num_samples)
    if not input_texts:
        raise ValueError(
            f"No usable samples found in {file_path!r}: expected lines with at "
            "least two tab-separated fields."
        )

    input_tensor_raw, inp_tokenizer = tokenize_char(input_texts)
    target_tensor_raw, targ_tokenizer = tokenize_char(target_texts)

    # Longest encoded sequence on each side determines the padded width.
    max_input_len = max(len(seq) for seq in input_tensor_raw)
    max_target_len = max(len(seq) for seq in target_tensor_raw)

    input_tensor = pad_sequences(input_tensor_raw, max_input_len)
    target_tensor = pad_sequences(target_tensor_raw, max_target_len)
    return input_tensor, target_tensor, inp_tokenizer, targ_tokenizer, max_input_len, max_target_len

# --- Build Seq2Seq Model ---
def build_model(num_encoder_tokens, num_decoder_tokens, embedding_dim=256, latent_dim=256):
    """Build the LSTM encoder-decoder training model.

    The encoder embeds the input ids and runs one LSTM whose final (h, c)
    states seed the decoder LSTM. The decoder embeds its input ids, runs its
    LSTM over the whole sequence, and a softmax Dense layer yields per-step
    probabilities over ``num_decoder_tokens``.

    Layer names are fixed because ``build_inference_models`` looks the layers
    up by name.

    Returns:
        A Keras ``Model`` mapping [encoder ids, decoder ids] to probabilities.
    """
    # --- Encoder: only the final states are kept; the output is discarded ---
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    enc_embedded = Embedding(num_encoder_tokens, embedding_dim, name='encoder_embedding')(encoder_inputs)
    _, enc_h, enc_c = LSTM(latent_dim, return_state=True, name='encoder_lstm')(enc_embedded)

    # --- Decoder: starts from the encoder states ---
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    dec_embedded = Embedding(num_decoder_tokens, embedding_dim, name='decoder_embedding')(decoder_inputs)
    dec_rnn = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    dec_sequence, _, _ = dec_rnn(dec_embedded, initial_state=[enc_h, enc_c])
    token_probs = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')(dec_sequence)

    return Model([encoder_inputs, decoder_inputs], token_probs)

# --- Inference Models ---
def build_inference_models(model, latent_dim, num_encoder_tokens, num_decoder_tokens):
    """Derive step-wise inference models from a trained ``build_model`` model.

    The layers are fetched from ``model`` by name and re-applied, so both
    returned models share the trained weights. ``num_encoder_tokens`` and
    ``num_decoder_tokens`` are accepted but not used here.

    Returns:
        (encoder_model, decoder_model):
        * encoder_model maps input ids to the encoder's [h, c] states;
        * decoder_model maps [decoder ids, h, c] to
          [token probabilities, new h, new c].
    """
    enc_in, dec_in = model.input[0], model.input[1]

    # Encoder: input ids -> final LSTM states.
    enc_embedded = model.get_layer('encoder_embedding')(enc_in)
    _, enc_h, enc_c = model.get_layer('encoder_lstm')(enc_embedded)
    encoder_model = Model(enc_in, [enc_h, enc_c])

    # Decoder: states become explicit inputs so they can be fed back per step.
    state_inputs = [
        Input(shape=(latent_dim,), name='input_h'),
        Input(shape=(latent_dim,), name='input_c'),
    ]
    dec_embedded = model.get_layer('decoder_embedding')(dec_in)
    dec_sequence, dec_h, dec_c = model.get_layer('decoder_lstm')(dec_embedded, initial_state=state_inputs)
    token_probs = model.get_layer('decoder_dense')(dec_sequence)
    decoder_model = Model([dec_in] + state_inputs, [token_probs, dec_h, dec_c])

    return encoder_model, decoder_model

# --- Decode Sequence ---
def decode_sequence(input_seq, encoder_model, decoder_model, targ_tokenizer, max_target_len):
    """Greedily decode one encoded input sequence into target text.

    The encoder states seed the decoder, which is then run one token at a
    time starting from the tab start marker. At each step the most probable
    token id is chosen (argmax) and fed back in together with the new states.

    Decoding stops when a newline is produced or after ``max_target_len + 1``
    steps. The bound is on steps, not on output length: an id with no
    character (e.g. the padding id 0) adds nothing to the string, so a
    length-based bound could loop forever.

    Args:
        input_seq: batch of exactly one encoded input sequence.
        encoder_model: model whose ``predict`` returns the [h, c] states.
        decoder_model: model whose ``predict`` takes [token, h, c] and returns
            (token probabilities, h, c).
        targ_tokenizer: fitted target tokenizer exposing ``word_index``.
        max_target_len: longest target length seen during preprocessing.

    Returns:
        The decoded string, including the trailing newline when one was
        produced.

    Raises:
        KeyError: if the tab start marker is not in the target vocabulary.
    """
    index_to_char = {index: char for char, index in targ_tokenizer.word_index.items()}

    # verbose=0: predict() would otherwise print a progress bar on every step.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = targ_tokenizer.word_index['\t']

    decoded_sentence = ''
    # Always take at least one step, as the original loop did.
    max_steps = max(1, max_target_len + 1)
    for _ in range(max_steps):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = index_to_char.get(sampled_token_index, '')
        decoded_sentence += sampled_char

        if sampled_char == '\n':
            break

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# --- Train the Model ---
def train_model(file_path, num_samples=10000, embedding_dim=256, latent_dim=256, batch_size=64, epochs=30, random_state=42):
    """Preprocess the data, train the seq2seq model and build inference models.

    Args:
        file_path: tab-separated training file passed to ``preprocess_dataset``.
        num_samples: line limit forwarded to ``preprocess_dataset``.
        embedding_dim: embedding size for both encoder and decoder.
        latent_dim: LSTM state size for both encoder and decoder.
        batch_size: training batch size.
        epochs: number of training epochs.
        random_state: seed for the train/validation split so runs are
            reproducible; pass ``None`` for a different split on every call.

    Returns:
        (model, encoder_model, decoder_model, inp_tokenizer, targ_tokenizer,
        max_target_len)

    Notes:
        * The checkpoint callback writes the best-``val_loss`` model to
          ``model_weights.h5``; the returned ``model`` keeps the weights from
          the last epoch, not necessarily the best one.
        * NOTE(review): padded positions (id 0) are not masked, so they
          contribute to the loss -- confirm this is intended.
    """
    input_tensor, target_tensor, inp_tokenizer, targ_tokenizer, max_input_len, max_target_len = preprocess_dataset(file_path, num_samples)

    # Split only once
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2, random_state=random_state
    )

    # +1 because tokenizer ids start at 1; id 0 is the padding value.
    num_encoder_tokens = len(inp_tokenizer.word_index) + 1
    num_decoder_tokens = len(targ_tokenizer.word_index) + 1

    # Teacher forcing: the decoder sees the target without its last position
    # and is trained to predict the target shifted left by one.
    decoder_input_train = target_tensor_train[:, :-1]
    decoder_target_train = target_tensor_train[:, 1:]
    decoder_input_val = target_tensor_val[:, :-1]
    decoder_target_val = target_tensor_val[:, 1:]

    model = build_model(num_encoder_tokens, num_decoder_tokens, embedding_dim, latent_dim)
    # Sparse loss: targets are integer ids and the model outputs softmax probabilities.
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

    model.summary()

    # Save the model whenever validation loss improves.
    checkpoint_callback = ModelCheckpoint('model_weights.h5', save_best_only=True, monitor='val_loss', mode='min', verbose=1)

    model.fit([input_tensor_train, decoder_input_train], decoder_target_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([input_tensor_val, decoder_input_val], decoder_target_val),
              callbacks=[checkpoint_callback],
              verbose=1)

    encoder_model, decoder_model = build_inference_models(model, latent_dim, num_encoder_tokens, num_decoder_tokens)
    return model, encoder_model, decoder_model, inp_tokenizer, targ_tokenizer, max_target_len


In [4]:
# Train on the first 10,000 lines of the training file; the six returned
# objects are used by the inference cells.
(model, encoder_model, decoder_model,
 inp_tokenizer, targ_tokenizer, max_target_len) = train_model(
    '/content/hi.translit.sampled.train.tsv',
    num_samples=10000,
)


Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step - loss: 1.6092
Epoch 1: val_loss improved from inf to 1.15609, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 305ms/step - loss: 1.6068 - val_loss: 1.1561
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step - loss: 1.1333
Epoch 2: val_loss improved from 1.15609 to 1.06506, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 292ms/step - loss: 1.1331 - val_loss: 1.0651
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 264ms/step - loss: 1.0309
Epoch 3: val_loss improved from 1.06506 to 1.00790, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 285ms/step - loss: 1.0308 - val_loss: 1.0079
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266ms/step - loss: 0.9701
Epoch 4: val_loss improved from 1.00790 to 0.93309, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 308ms/step - loss: 0.9700 - val_loss: 0.9331
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step - loss: 0.9141
Epoch 5: val_loss improved from 0.93309 to 0.90250, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 305ms/step - loss: 0.9141 - val_loss: 0.9025
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step - loss: 0.8871
Epoch 6: val_loss did not improve from 0.90250
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 323ms/step - loss: 0.8870 - val_loss: 0.9037
Epoch 7/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - loss: 0.8606
Epoch 7: val_loss improved from 0.90250 to 0.85424, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 293ms/step - loss: 0.8606 - val_loss: 0.8542
Epoch 8/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - loss: 0.8361
Epoch 8: val_loss improved from 0.85424 to 0.83124, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 293ms/step - loss: 0.8361 - val_loss: 0.8312
Epoch 9/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step - loss: 0.8194
Epoch 9: val_loss improved from 0.83124 to 0.81907, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 292ms/step - loss: 0.8193 - val_loss: 0.8191
Epoch 10/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - loss: 0.7962
Epoch 10: val_loss improved from 0.81907 to 0.80532, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 294ms/step - loss: 0.7962 - val_loss: 0.8053
Epoch 11/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - loss: 0.7748
Epoch 11: val_loss improved from 0.80532 to 0.78438, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 308ms/step - loss: 0.7747 - val_loss: 0.7844
Epoch 12/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step - loss: 0.7535
Epoch 12: val_loss improved from 0.78438 to 0.76634, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 293ms/step - loss: 0.7534 - val_loss: 0.7663
Epoch 13/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.7272
Epoch 13: val_loss improved from 0.76634 to 0.75580, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 294ms/step - loss: 0.7272 - val_loss: 0.7558
Epoch 14/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.7110
Epoch 14: val_loss improved from 0.75580 to 0.72667, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 295ms/step - loss: 0.7109 - val_loss: 0.7267
Epoch 15/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.6810
Epoch 15: val_loss improved from 0.72667 to 0.70355, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 294ms/step - loss: 0.6810 - val_loss: 0.7036
Epoch 16/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step - loss: 0.6611
Epoch 16: val_loss improved from 0.70355 to 0.69032, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 298ms/step - loss: 0.6611 - val_loss: 0.6903
Epoch 17/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266ms/step - loss: 0.6356
Epoch 17: val_loss improved from 0.69032 to 0.66054, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 292ms/step - loss: 0.6356 - val_loss: 0.6605
Epoch 18/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step - loss: 0.6232
Epoch 18: val_loss improved from 0.66054 to 0.64049, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 312ms/step - loss: 0.6232 - val_loss: 0.6405
Epoch 19/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step - loss: 0.5952
Epoch 19: val_loss improved from 0.64049 to 0.64012, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 295ms/step - loss: 0.5952 - val_loss: 0.6401
Epoch 20/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - loss: 0.5738
Epoch 20: val_loss improved from 0.64012 to 0.60313, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 296ms/step - loss: 0.5738 - val_loss: 0.6031
Epoch 21/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step - loss: 0.5454
Epoch 21: val_loss improved from 0.60313 to 0.58599, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 318ms/step - loss: 0.5454 - val_loss: 0.5860
Epoch 22/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step - loss: 0.5260
Epoch 22: val_loss improved from 0.58599 to 0.57052, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 288ms/step - loss: 0.5260 - val_loss: 0.5705
Epoch 23/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - loss: 0.5056
Epoch 23: val_loss improved from 0.57052 to 0.56138, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 291ms/step - loss: 0.5056 - val_loss: 0.5614
Epoch 24/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step - loss: 0.4823
Epoch 24: val_loss improved from 0.56138 to 0.53855, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 313ms/step - loss: 0.4823 - val_loss: 0.5386
Epoch 25/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.4530
Epoch 25: val_loss improved from 0.53855 to 0.50602, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 296ms/step - loss: 0.4531 - val_loss: 0.5060
Epoch 26/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step - loss: 0.4369
Epoch 26: val_loss improved from 0.50602 to 0.49260, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 290ms/step - loss: 0.4369 - val_loss: 0.4926
Epoch 27/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step - loss: 0.4113
Epoch 27: val_loss improved from 0.49260 to 0.47062, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 301ms/step - loss: 0.4113 - val_loss: 0.4706
Epoch 28/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step - loss: 0.3842
Epoch 28: val_loss improved from 0.47062 to 0.45031, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 292ms/step - loss: 0.3843 - val_loss: 0.4503
Epoch 29/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - loss: 0.3673
Epoch 29: val_loss improved from 0.45031 to 0.43046, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 329ms/step - loss: 0.3673 - val_loss: 0.4305
Epoch 30/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step - loss: 0.3401
Epoch 30: val_loss improved from 0.43046 to 0.42762, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 293ms/step - loss: 0.3401 - val_loss: 0.4276


In [None]:
if __name__ == '__main__':
    # File paths
    train_file = '/content/hi.translit.sampled.train.tsv'
    test_file = '/content/hi.translit.sampled.test.tsv'
    num_samples = 10000

    # Train the model on training data. train_model returns exactly six
    # values; unpacking eight raised ValueError once training had finished.
    (model, encoder_model, decoder_model,
     inp_tokenizer, targ_tokenizer, max_target_len) = \
        train_model(train_file, num_samples)

    # Load the test data (all samples) and encode the inputs with the TRAINING
    # tokenizer. Fitting a fresh tokenizer on the test set would assign
    # different ids to characters than the ones the model was trained on.
    # NOTE(review): the encoder does not mask padding, so padding to the test
    # set's own max length (rather than the training one) may shift
    # predictions slightly -- consider exposing max_input_len from train_model.
    test_input_texts, test_target_texts = load_data(test_file, None)
    test_input_ids = inp_tokenizer.texts_to_sequences(test_input_texts)
    test_input_tensor = pad_sequences(test_input_ids) if test_input_ids else []
    num_test = len(test_input_texts)

    # --- Sample predictions and test accuracy in one pass ---
    # Each sample is decoded once; the first five are printed as they are
    # produced, and every sample counts towards exact-match accuracy.
    print("Sample predictions on test set:")
    correct = 0
    for i in range(num_test):
        seq = test_input_tensor[i:i+1]
        latin = test_input_texts[i].strip()
        # strip() removes the tab/newline markers added during preprocessing.
        true_txt = test_target_texts[i].strip()
        pred_txt = decode_sequence(seq, encoder_model, decoder_model, targ_tokenizer, max_target_len).strip()
        if i < 5:
            print(f"{i+1}. Latin: {latin}  | True: {true_txt}  | Pred: {pred_txt}")
        if pred_txt == true_txt:
            correct += 1

    # --- Compute Test Accuracy ---
    accuracy = correct / num_test if num_test > 0 else 0
    print(f'Test set accuracy over {num_test} samples: {accuracy:.2%}')


Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step - loss: 1.6172
Epoch 1: val_loss improved from inf to 1.14825, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 307ms/step - loss: 1.6147 - val_loss: 1.1482
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step - loss: 1.1315
Epoch 2: val_loss improved from 1.14825 to 1.03900, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 293ms/step - loss: 1.1313 - val_loss: 1.0390
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step - loss: 1.0237
Epoch 3: val_loss improved from 1.03900 to 0.99654, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 292ms/step - loss: 1.0236 - val_loss: 0.9965
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.9744
Epoch 4: val_loss improved from 0.99654 to 0.94884, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 298ms/step - loss: 0.9744 - val_loss: 0.9488
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - loss: 0.9336
Epoch 5: val_loss improved from 0.94884 to 0.90298, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 309ms/step - loss: 0.9336 - val_loss: 0.9030
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step - loss: 0.8942
Epoch 6: val_loss improved from 0.90298 to 0.88533, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 296ms/step - loss: 0.8942 - val_loss: 0.8853
Epoch 7/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - loss: 0.8665
Epoch 7: val_loss improved from 0.88533 to 0.84769, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 296ms/step - loss: 0.8664 - val_loss: 0.8477
Epoch 8/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - loss: 0.8431
Epoch 8: val_loss improved from 0.84769 to 0.82468, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 295ms/step - loss: 0.8431 - val_loss: 0.8247
Epoch 9/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - loss: 0.8201
Epoch 9: val_loss improved from 0.82468 to 0.82388, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 288ms/step - loss: 0.8201 - val_loss: 0.8239
Epoch 10/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step - loss: 0.7925
Epoch 10: val_loss improved from 0.82388 to 0.79733, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 312ms/step - loss: 0.7925 - val_loss: 0.7973
Epoch 11/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 289ms/step - loss: 0.7775
Epoch 11: val_loss improved from 0.79733 to 0.77405, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 308ms/step - loss: 0.7775 - val_loss: 0.7741
Epoch 12/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step - loss: 0.7464
Epoch 12: val_loss improved from 0.77405 to 0.74198, saving model to model_weights.h5




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 299ms/step - loss: 0.7464 - val_loss: 0.7420
Epoch 13/30
[1m 70/125[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m15s[0m 284ms/step - loss: 0.7208

In [None]:
# Preprocess the dataset
file_path = '/content/hi.translit.sampled.train.tsv'  # Update with the correct file path
input_tensor, target_tensor, inp_tokenizer, targ_tokenizer, max_input_len, max_target_len = preprocess_dataset(file_path, num_samples=10000)

# Example: check the preprocessed data. These are the padded integer id
# arrays, so they are labelled "Encoded" (the old "Decoded" label was wrong).
print("Sample Latin Input (Encoded):", input_tensor[0])
print("Sample Devanagari Target (Encoded):", target_tensor[0])

# Now, you can use the preprocessed data for training your Seq2Seq model.


Sample Latin Input (Decoded): [2 1 5 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Sample Devanagari Target (Decoded): [ 1 10  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [None]:
# Transliterate one Latin example. Requires encoder_model, decoder_model and
# targ_tokenizer from a completed train_model(...) call in this kernel.
input_seq = input_tensor[:1]  # slice (not index) keeps the leading batch axis
decoded_sentence = decode_sequence(
    input_seq,
    encoder_model,
    decoder_model,
    targ_tokenizer,
    max_target_len,
)

# Print the predicted Devanagari transliteration
print("Predicted Devanagari Transliteration:", decoded_sentence)


NameError: name 'encoder_model' is not defined