In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from sklearn.model_selection import train_test_split

In [26]:
# Path of the English/Hindi sentence-pair CSV that the rest of the notebook works on.
dataset_path = '/content/Dataset_English_Hindi.csv'
df = pd.read_csv(dataset_path)


In [27]:
# Display the freshly loaded frame (columns: 'Unnamed: 0', 'English', 'Hindi').
df

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।
...,...,...
130471,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
130473,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
130474,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [28]:
# Data cleaning in one pass:
#   1. discard every row that contains a NaN,
#   2. cast all remaining columns to str.
df = df.dropna().astype(str)

In [29]:
# Pull the two text columns out as plain Python lists (source first, target second).
english, Hindi = (df[column].tolist() for column in ('English', 'Hindi'))

In [30]:
# One independent tokenizer per language, both with default settings.
english_token, Hindi_token = Tokenizer(), Tokenizer()

In [31]:
# Build each tokenizer's vocabulary from its own corpus (English first, then Hindi).
for tokenizer, corpus in ((english_token, english), (Hindi_token, Hindi)):
    tokenizer.fit_on_texts(corpus)

In [32]:
# Turn every sentence into a list of integer word ids using the fitted vocabularies.
english_sequences, hindi_sequences = (
    english_token.texts_to_sequences(english),
    Hindi_token.texts_to_sequences(Hindi),
)

In [33]:
def add_start_end_tokens(sequences, start_token, end_token):
    """Return a copy of ``sequences`` with every sequence wrapped in markers.

    Args:
        sequences: Iterable of lists of token ids.
        start_token: Id placed in front of every sequence.
        end_token: Id placed after every sequence.

    Returns:
        A new list of new lists; the input lists are not modified.
    """
    wrapped = []
    for seq in sequences:
        wrapped.append([start_token] + seq + [end_token])
    return wrapped

In [34]:
# Reserve the two ids that come right after the fitted Hindi vocabulary
# for the sentence-boundary markers.
hindi_vocab_size = len(Hindi_token.word_index)
start_token, end_token = hindi_vocab_size + 1, hindi_vocab_size + 2

In [35]:
# Register the boundary markers in both lookup directions of the Hindi tokenizer
# so ids can be mapped back to '<start>' / '<end>' while decoding.
for marker, marker_id in (('<start>', start_token), ('<end>', end_token)):
    Hindi_token.word_index[marker] = marker_id
    Hindi_token.index_word[marker_id] = marker

In [36]:
# Wrap every Hindi id sequence as: <start> ... <end>
Hindi_sequences = add_start_end_tokens(
    sequences=hindi_sequences,
    start_token=start_token,
    end_token=end_token,
)

In [37]:
# Longest sequence on each side; these become the padded widths.
max_english_length = max(len(seq) for seq in english_sequences)
# Measure the *wrapped* sequences (Hindi_sequences, which include <start>/<end>),
# not the raw hindi_sequences. Using the raw ones makes the width 2 too small,
# and pad_sequences then truncates the longest sentences from the front,
# which drops their <start> token.
max_hindi_length = max(len(seq) for seq in Hindi_sequences)

In [38]:
# Pad both sides to a fixed width; padding='post' appends the fill value
# after the real tokens.
english_sequences = pad_sequences(
    english_sequences,
    maxlen=max_english_length,
    padding='post',
)
Hindi_sequences = pad_sequences(
    Hindi_sequences,
    maxlen=max_hindi_length,
    padding='post',
)

In [39]:
# Hold out 20% of the sentence pairs for validation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    english_sequences,
    Hindi_sequences,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Model hyperparameters (both reduced from 256):
#   embedding_dim - width of the word embeddings
#   lstm_units    - width of the LSTM state
embedding_dim, lstm_units = 128, 128

In [41]:
# Encoder: token ids -> embeddings -> LSTM. Only the final hidden/cell states
# are kept; they initialise the decoder.
encoder_vocab_size = len(english_token.word_index) + 1
encoder_inputs = Input(shape=(max_english_length,))
encoder_embedding = Embedding(
    input_dim=encoder_vocab_size,
    output_dim=embedding_dim,
)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [42]:

# Decoder: token ids -> embeddings -> LSTM seeded with the encoder states ->
# per-timestep softmax over the Hindi vocabulary.
decoder_vocab_size = len(Hindi_token.word_index) + 1
decoder_inputs = Input(shape=(max_hindi_length,))
decoder_embedding_layer = Embedding(input_dim=decoder_vocab_size, output_dim=embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
# The LSTM and Dense layer objects are kept in variables so they can be called again later.
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(decoder_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_outputs)


In [43]:
# Training model: (English ids, Hindi decoder ids) -> per-timestep Hindi word distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
# Sparse loss: targets are integer ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Fit the model with teacher forcing.
#
# At timestep t the decoder is fed token t of the reference translation and
# must predict token t+1. The target is therefore the decoder input shifted
# one step to the left, with the freed last column filled with the padding
# id 0. Using the unshifted sequence as both input and target would only
# teach the decoder to copy its input, so at inference <start> would map to <start>.
decoder_target_train = np.zeros_like(y_train)
decoder_target_train[:, :-1] = y_train[:, 1:]
decoder_target_val = np.zeros_like(y_test)
decoder_target_val[:, :-1] = y_test[:, 1:]

history = model.fit(
    [X_train, y_train],
    # The trailing axis is added to match the original target layout
    # (one id per timestep, shape (..., 1)).
    np.expand_dims(decoder_target_train, -1),
    epochs=1,
    batch_size=16,  # Further reduced batch size
    validation_data=([X_test, y_test], np.expand_dims(decoder_target_val, -1))
)



In [None]:
# Inference-time encoder: maps an English id sequence to the [h, c] LSTM states.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

In [None]:
# Placeholders for the decoder LSTM state (h first, then c) that is fed back
# in on every decoding step.
decoder_state_input_h, decoder_state_input_c = (
    Input(shape=(lstm_units,)) for _ in range(2)
)
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [None]:
# Inference decoder graph.
#
# This must reuse the layers that were trained inside `model`. Creating new
# Embedding/LSTM layers here would give them fresh random weights, so the
# translation would ignore everything learned during fit().
#
# `decoder_embedding` is already the trained embedding applied to
# `decoder_inputs`, so it is reused directly. `decoder_lstm` is the trained
# LSTM layer object; it is called again with externally supplied states.
# NOTE(review): `decoder_inputs` was declared with a fixed length, while
# decode_sequence feeds one token per step. Depending on the Keras version
# this may warn or be rejected - confirm, and consider a dedicated
# Input(shape=(None,)) for inference.
decoder_embedding_inf = decoder_embedding
decoder_lstm_inf = decoder_lstm  # same layer object, hence same trained weights
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(decoder_embedding_inf, initial_state=decoder_states_inputs)
decoder_states_inf = [state_h_inf, state_c_inf]
# The trained softmax projection is shared as well.
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

In [None]:
# Inference-time decoder step:
#   inputs  - token ids plus the previous [h, c] states
#   outputs - word distribution plus the updated [h, c] states
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs_inf, *decoder_states_inf],
)

In [None]:


def decode_sequence(input_seq, verbose=False):
    """Greedily translate one encoded English sentence into Hindi text.

    The encoder states seed the decoder; starting from ``start_token`` the
    decoder is run one token at a time, always feeding back the most probable
    token, until ``'<end>'`` is produced or ``max_hindi_length`` steps have
    been taken.

    Args:
        input_seq: Batch of one padded English id sequence, as accepted by
            ``encoder_model.predict``.
        verbose: When True, print every sampled token (debugging aid).

    Returns:
        The decoded words joined by single spaces. Ids with no entry in
        ``Hindi_token.index_word`` (e.g. the padding id 0) are skipped.
    """
    # predict() yields the [h, c] pair; list() so it can be concatenated below.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Decoder input for a single step: a 1x1 batch holding the previous token id.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    decoded_tokens = []
    # Bound the loop by the number of *steps*, not by the number of decoded
    # words. Padding/unknown ids decode to '' and never increase the word
    # count, so a word-count stop condition can spin forever.
    for _ in range(max_hindi_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable word at the last (only) timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = Hindi_token.index_word.get(sampled_token_index, '')

        if verbose:
            print(f'Sampled Token: {sampled_token}')

        if sampled_token == '<end>':
            break
        if sampled_token:
            decoded_tokens.append(sampled_token)

        # Feed the prediction and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_tokens)

In [None]:
# Example: translate the first English sentence. The [:1] slice (rather than
# [0]) keeps the leading axis, so the model receives a batch of one.
input_seq = english_sequences[:1]
decoded_sentence = decode_sequence(input_seq)
print(f'Translated Sentence: {decoded_sentence}')