In [32]:
# Importing libraries
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [33]:
# Importing Data
df = pd.read_csv("/content/Dataset_English_Hindi.csv")
df.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [34]:
df = df.copy().dropna().sample(5000)
df.shape

(5000, 2)

In [35]:
# Sample dataset (10 English-Hindi pairs)
english_sentences = df['English']
hindi_sentences = df['Hindi']

In [36]:
# Add start/end tokens to target sequences
START_TOKEN = '<start>'
END_TOKEN = '<end>' # Stopping indication for decoder output
hindi_sentences = [f'{START_TOKEN} {s} {END_TOKEN}' for s in hindi_sentences]

In [37]:
# Tokenization and preprocessing
eng_tokenizer = Tokenizer(char_level = False, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = " ", oov_token = "UNK")
hin_tokenizer = Tokenizer(char_level = False, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = False, split = " ", oov_token = "UNK")

# Fitting
eng_tokenizer.fit_on_texts(english_sentences)
hin_tokenizer.fit_on_texts(hindi_sentences)

In [38]:
len(hin_tokenizer.word_index)

14415

In [39]:
len(eng_tokenizer.word_index)

12206

In [40]:
# Vocabulary length
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

In [41]:
# Convert text to sequences
encoder_inputs = eng_tokenizer.texts_to_sequences(english_sentences)
decoder_inputs = hin_tokenizer.texts_to_sequences(hindi_sentences)

In [42]:
# Pad sequences
encoder_inputs = pad_sequences(encoder_inputs, padding = 'post')
decoder_inputs = pad_sequences(decoder_inputs, padding = 'post')

In [43]:
encoder_inputs.shape

(5000, 245)

In [44]:
decoder_inputs.shape

(5000, 215)

In [45]:
# Prepare target data (shifted by one timestep)
decoder_targets = []
for seq in decoder_inputs:
    decoder_targets.append(seq[1:])

decoder_targets = np.array(decoder_targets)

In [46]:
decoder_inputs = decoder_inputs[:, :-1].copy() # Removing <end> token

In [None]:
decoder_targets = pad_sequences(decoder_targets, padding='post')

In [50]:
# Model parameters
latent_dim = 500  # LSTM dimensionality
embedding_dim = 250  # Embedding dimension

In [51]:
# Encoder
encoder_inputs_layer = Input(shape = (encoder_inputs.shape[1],))
encoder_embedding = Embedding(input_dim = eng_vocab_size, output_dim = embedding_dim)(encoder_inputs_layer) # Each word will be represented by 250D

encoder_lstm = LSTM(units = latent_dim, return_state = True, dropout = 0.25)
_, last_state_h, last_state_c = encoder_lstm(encoder_embedding)

encoder_states = [last_state_c, last_state_h] # After the final timestep - passing into decoder

In [52]:
# Decoder
decoder_inputs_layer = Input(shape = (decoder_inputs.shape[1],))
decoder_embedding = Embedding(hin_vocab_size, embedding_dim)(decoder_inputs_layer)

decoder_lstm = LSTM(units = latent_dim, return_sequences = True, return_state = True) # Returns all the hidden states as well (return_sequences = True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state = encoder_states) # initial_state -> Your cell and hidden state instead of default random

decoder_dense = Dense(hin_vocab_size, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [53]:
# Define training model
model = Model([encoder_inputs_layer, decoder_inputs_layer], decoder_outputs)
model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [54]:
# Print model summary
model.summary()

In [58]:
# Train the model (using dummy data for demonstration)
# Note: For real usage, use the prepared sequences
model.fit(
    [encoder_inputs, decoder_inputs], # X_train
    np.expand_dims(decoder_targets, -1), # y_train
    batch_size = 32,
    epochs = 10,
    validation_split = 0.2
)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 243ms/step - accuracy: 0.8739 - loss: 2.0270 - val_accuracy: 0.9165 - val_loss: 0.6285
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 242ms/step - accuracy: 0.9173 - loss: 0.6289 - val_accuracy: 0.9213 - val_loss: 0.6061
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 247ms/step - accuracy: 0.9172 - loss: 0.6307 - val_accuracy: 0.9214 - val_loss: 0.6042
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 250ms/step - accuracy: 0.9190 - loss: 0.6152 - val_accuracy: 0.9218 - val_loss: 0.6026
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 252ms/step - accuracy: 0.9199 - loss: 0.6072 - val_accuracy: 0.9218 - val_loss: 0.6030
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 268ms/step - accuracy: 0.9187 - loss: 0.6142 - val_accuracy: 0.9218 - val_loss: 0.6027
Epoch 7/10

<keras.src.callbacks.history.History at 0x7d3cd9f62f10>

In [59]:
# Inference setup

# Creating a different model for inferance with just encoder inputs
encoder_model = Model(encoder_inputs_layer, encoder_states) # Input - encoder_inputs | Outputs - [hidden_state, cell_state]

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c] # This has to be passed into decoder as input

decoder_inputs_single = Input(shape=(1,)) # shape = 1 because 1 word at each timestep
dec_emb_single = Embedding(input_dim = hin_vocab_size, output_dim = embedding_dim)(decoder_inputs_single)
decoder_outputs_single, state_h_single, state_c_single = decoder_lstm(
    dec_emb_single, initial_state = decoder_states_inputs
)

decoder_states_single = [state_h_single, state_c_single]
decoder_outputs_single = decoder_dense(decoder_outputs_single)
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_single] + decoder_states_single
)

In [65]:
# Translation function
def translate(input_text):
    # Tokenize and pad input
    input_seq = eng_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=encoder_inputs.shape[1], padding='post')

    # Encode input
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence
    target_seq = np.zeros((1, 1))
    # Start token
    target_seq[0, 0] = hin_tokenizer.word_index['start']

    decoded_sentence = []
    for _ in range(decoder_inputs.shape[1]):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value
        )

        # Sample token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = hin_tokenizer.index_word.get(sampled_token_index, '?')

        if sampled_word == 'end':
            break

        decoded_sentence.append(sampled_word)

        # Update target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [70]:
# Test translation
print(translate(english_sentences[127081]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
और के के के
