In [29]:
!pip install tensorflow==2.12.0



In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention,Concatenate
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the English-Hindi parallel corpus from a parquet file.
# The path uses forward slashes only: the previous literal mixed in a backslash
# ("Projects\Machine-Translation"), which Python parses as the invalid escape
# sequence "\M" (deprecated, and an error in future Python versions).
# NOTE(review): this is an absolute local path; adjust it for your machine.
DATA_PATH = "D:/100devMl/Projects/Machine-Translation/artifacts/english-to-hindi/data/enTohin.parquet"
df = pd.read_parquet(DATA_PATH, engine="pyarrow")
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,"However, Paes, who was partnering Australia's ...",आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाल...
1,"Whosoever desires the reward of the world, wit...",और जो शख्स (अपने आमाल का) बदला दुनिया ही में च...
2,The value of insects in the biosphere is enorm...,"जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि ..."
3,Mithali To Anchor Indian Team Against Australi...,आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को
4,After the assent of the Honble President on 8t...,"8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍व..."


In [5]:
# Column holding the source-language (English) sentences.
x = "english_sentence"

# Peek at the first five source sentences.
df[x].head(n=5)

0    However, Paes, who was partnering Australia's ...
1    Whosoever desires the reward of the world, wit...
2    The value of insects in the biosphere is enorm...
3    Mithali To Anchor Indian Team Against Australi...
4    After the assent of the Honble President on 8t...
Name: english_sentence, dtype: object

In [6]:
# Seed NumPy and TensorFlow with the same value so runs are repeatable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [7]:
# Step 1: hold out 20% of the corpus as the test set.
train_val_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Step 2: carve 10% of the remaining rows out as the validation set.
train_data, val_data = train_test_split(train_val_data, test_size=0.1, random_state=42)

In [11]:
def tokenize_with_special_tokens(data, lang_key, num_words=None):
    """Fit a Keras ``Tokenizer`` on one language column plus the special tokens.

    Args:
        data: DataFrame-like object; ``data[lang_key]`` must support ``.tolist()``.
        lang_key: Name of the column whose sentences are used for fitting.
        num_words: Optional vocabulary cap forwarded to ``Tokenizer``.

    Returns:
        The fitted ``Tokenizer``. Text is lower-cased, no characters are
        filtered out (so ``<sos>`` / ``<eos>`` survive intact), and unknown
        words map to ``<unk>``.

    Note:
        ``<sos>`` and ``<eos>`` are each added to the fitting corpus only once.
        NOTE(review): with ``num_words`` set, such low-count entries may fall
        outside the kept vocabulary - confirm before using a cap.
    """
    # The special tokens go first, followed by every sentence of the column.
    corpus = ["<sos>", "<eos>"]
    corpus.extend(data[lang_key].tolist())

    fitted = Tokenizer(oov_token="<unk>", lower=True, filters='', num_words=num_words)
    fitted.fit_on_texts(corpus)
    return fitted

In [12]:
# Fit one tokenizer per language, on the training split only, so that no
# validation/test vocabulary leaks into the word indices.
en_tokenizer = tokenize_with_special_tokens(data=train_data, lang_key="english_sentence")
hi_tokenizer = tokenize_with_special_tokens(data=train_data, lang_key="hindi_sentence")

In [13]:
def preprocess_with_special_tokens(data, en_tokenizer, hi_tokenizer, max_seq_len):
    """Turn sentence pairs into padded integer arrays for teacher-forced training.

    Args:
        data: DataFrame-like object with ``'english_sentence'`` and
            ``'hindi_sentence'`` columns.
        en_tokenizer: Fitted tokenizer for the English column.
        hi_tokenizer: Fitted tokenizer for the Hindi column; its ``word_index``
            must contain ``'<sos>'`` and ``'<eos>'``.
        max_seq_len: Maximum number of content tokens kept per sentence.

    Returns:
        A tuple ``(en_sequences, decoder_input, decoder_target)``:
        ``en_sequences`` has ``max_seq_len`` columns; ``decoder_input`` and
        ``decoder_target`` each have ``max_seq_len + 1`` columns and are the
        same ``<sos> ... <eos>`` sequence shifted by one position.

    Raises:
        KeyError: If ``'<sos>'`` or ``'<eos>'`` is missing from
            ``hi_tokenizer.word_index``.
    """
    en_sequences = en_tokenizer.texts_to_sequences(data['english_sentence'])
    # pad_sequences truncates from the front by default, i.e. over-long
    # sentences keep their last `max_seq_len` tokens.
    en_sequences = pad_sequences(en_sequences, maxlen=max_seq_len, padding="post")

    sos_id = hi_tokenizer.word_index["<sos>"]
    eos_id = hi_tokenizer.word_index["<eos>"]

    # Truncate the sentence content *before* adding the markers. Previously the
    # markers were added first and pad_sequences then cut over-long sequences
    # from the front, silently dropping <sos>; the decoder input for those rows
    # no longer started with the token that inference always starts from.
    # Keeping the last `max_seq_len` tokens mirrors the front-truncation that is
    # applied to the English side.
    hi_token_ids = hi_tokenizer.texts_to_sequences(data['hindi_sentence'])
    hi_sequences = [
        [sos_id] + (list(ids[-max_seq_len:]) if max_seq_len > 0 else []) + [eos_id]
        for ids in hi_token_ids
    ]
    # Every sequence is now at most max_seq_len + 2 long, so this only pads.
    hi_sequences = pad_sequences(hi_sequences, maxlen=max_seq_len + 2, padding="post")

    # Teacher forcing: the decoder sees tokens [0 .. n-1] and must predict [1 .. n].
    decoder_input = hi_sequences[:, :-1]
    decoder_target = hi_sequences[:, 1:]

    return en_sequences, decoder_input, decoder_target

In [14]:
# Maximum number of tokens kept per sentence (before <sos>/<eos> are added).
max_seq_len = 20

# Training split: encoder input plus shifted decoder input/target.
en_train, dec_train_input, dec_train_target = preprocess_with_special_tokens(
    train_data, en_tokenizer, hi_tokenizer, max_seq_len
)

# Validation split, same layout as the training arrays.
en_val, dec_val_input, dec_val_target = preprocess_with_special_tokens(
    val_data, en_tokenizer, hi_tokenizer, max_seq_len
)

# Test split: only the encoder input is kept here.
en_test, _, _ = preprocess_with_special_tokens(
    test_data, en_tokenizer, hi_tokenizer, max_seq_len
)

In [17]:
# Sanity check: (rows, max_seq_len) for the encoder, (rows, max_seq_len + 1) for the decoder arrays.
tuple(arr.shape for arr in (en_train, dec_train_input, dec_train_target))

((91947, 20), (91947, 21), (91947, 21))

In [None]:
# Model hyperparameters.
embedding_dim = 128   # size of each token embedding vector
hidden_units = 256    # LSTM state size, shared by encoder and decoder

# Vocabulary sizes: +1 because Keras word indices start at 1 and id 0 is
# used as the padding value by pad_sequences.
vocab_size_en = 1 + len(en_tokenizer.word_index)
vocab_size_hi = 1 + len(hi_tokenizer.word_index)

In [None]:
# Encoder: token ids -> embeddings -> LSTM.
# The time dimension is left as None so any sequence length is accepted.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(
    input_dim=vocab_size_en,
    output_dim=embedding_dim,
)(encoder_inputs)

# return_sequences: per-step outputs are needed as attention values.
# return_state: the final (h, c) pair initialises the decoder.
encoder_lstm = LSTM(units=hidden_units, return_state=True, return_sequences=True)
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)

In [None]:
# Decoder: token ids -> embeddings -> LSTM started from the encoder's final state.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(
    input_dim=vocab_size_hi,
    output_dim=embedding_dim,
)(decoder_inputs)

decoder_lstm = LSTM(units=hidden_units, return_state=True, return_sequences=True)

# Only the per-step outputs are used; the decoder's own final states are discarded.
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=[encoder_state_h, encoder_state_c],
)

In [None]:
# Attention over the encoder outputs: decoder steps act as the query,
# encoder steps as the value (Keras expects the list order [query, value]).
attention_layer = Attention()
query_and_value = [decoder_outputs, encoder_outputs]
context_vector = attention_layer(query_and_value)

In [None]:
# Join each decoder step with its attention context along the feature axis.
decoder_combined_context = Concatenate(axis=-1)(
    [context_vector, decoder_outputs]
)

# Per-step softmax over the Hindi vocabulary.
# Note: `decoder_outputs` is rebound here from the LSTM outputs to the final
# probability tensor.
decoder_dense = Dense(units=vocab_size_hi, activation="softmax")
decoder_outputs = decoder_dense(decoder_combined_context)

In [None]:
# Training model: (encoder token ids, decoder token ids) -> next-token probabilities.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 128)    13194880    ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 128)    12033664    ['input_4[0][0]']                
                                                                                            

In [None]:
# Training configuration.
batch_size = 64
epochs = 1

# Teacher-forced training: the decoder input is the target shifted right by one.
history = model.fit(
    x=[en_train, dec_train_input],
    y=dec_train_target,
    validation_data=([en_val, dec_val_input], dec_val_target),
    epochs=epochs,
    batch_size=batch_size,
)

 111/1437 [=>............................] - ETA: 3:16:22 - loss: 6.1623 - accuracy: 0.3287

KeyboardInterrupt: 

In [None]:
# Persist the model in its current state to an HDF5 file in the working directory.
model.save(filepath="machineTranslation_1.h5")

In [None]:
def predict_translation(input_text, model, en_tokenizer, hi_tokenizer, max_seq_len):
    """Greedily decode a Hindi translation for one English sentence.

    Args:
        input_text: English sentence to translate.
        model: Model taking ``[encoder_token_ids, decoder_token_ids]`` and
            returning per-step probabilities over the Hindi vocabulary.
        en_tokenizer: Fitted tokenizer for the English side.
        hi_tokenizer: Fitted tokenizer for the Hindi side; its ``word_index``
            must contain ``'<sos>'``.
        max_seq_len: Length the encoder input is padded/truncated to; decoding
            runs for at most ``max_seq_len + 2`` steps.

    Returns:
        The decoded words joined by single spaces (without ``<sos>``/``<eos>``).

    Raises:
        KeyError: If ``'<sos>'`` is missing from ``hi_tokenizer.word_index``.
    """
    input_seq = en_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_len, padding="post")

    # Ids generated so far, starting with <sos>. The whole prefix is fed back on
    # every step: the model restarts its decoder LSTM from the encoder state on
    # each call, so feeding only the last token (as before) discards all
    # decoding history and makes the model repeat the same word indefinitely.
    generated_ids = [hi_tokenizer.word_index['<sos>']]
    decoded_sentence = []

    for _ in range(max_seq_len + 2):
        target_seq = np.array([generated_ids])
        predictions = model.predict([input_seq, target_seq], verbose=0)
        # Greedy choice for the most recent time step.
        predicted_token = int(np.argmax(predictions[0, -1, :]))

        # Id 0 is the padding value, not a word: treat it as end of output
        # instead of emitting a spurious '<unk>'.
        if predicted_token == 0:
            break

        sampled_word = hi_tokenizer.index_word.get(predicted_token, '<unk>')
        if sampled_word == '<eos>':
            break

        decoded_sentence.append(sampled_word)
        generated_ids.append(predicted_token)

    return ' '.join(decoded_sentence)

In [None]:
# Quick smoke test: translate a single short sentence.
input_text = "who are you"
predicted_translation = predict_translation(
    input_text,
    model,
    en_tokenizer,
    hi_tokenizer,
    max_seq_len,
)

print(f"Input: {input_text}")
print(f"Predicted Translation: {predicted_translation}")

Input: who are you
Predicted Translation: और और और और और और और और और और और और और और और और और और और और और और


In [None]:
def evaluate_bleu_score(model, data, en_tokenizer, hi_tokenizer, max_seq_len, max_samples=None):
    """Compute corpus-level BLEU of the model's translations against references.

    Args:
        model: Translation model passed through to ``predict_translation``.
        data: DataFrame with ``'english_sentence'`` and ``'hindi_sentence'``
            columns.
        en_tokenizer: Fitted tokenizer for the English side.
        hi_tokenizer: Fitted tokenizer for the Hindi side.
        max_seq_len: Sequence length passed through to ``predict_translation``.
        max_samples: Optional cap on the number of rows evaluated (the first
            ``max_samples`` rows are used). ``None`` evaluates every row; each
            row costs one full decoding pass, so the complete set can be slow.

    Returns:
        The ``corpus_bleu`` score, or ``0.0`` when there are no rows to score.
    """
    if max_samples is not None:
        data = data.head(max_samples)

    # If the target tokenizer lower-cases its text, every hypothesis word is
    # lower-case; normalise the references the same way so that casing alone
    # never counts as a mismatch.
    lowercase_refs = bool(getattr(hi_tokenizer, "lower", False))

    references = []
    hypotheses = []

    # Iterate over the two columns directly instead of building a Series per row.
    for input_text, reference_text in zip(data['english_sentence'], data['hindi_sentence']):
        if lowercase_refs:
            reference_text = reference_text.lower()
        reference = reference_text.split()

        predicted_translation = predict_translation(input_text, model, en_tokenizer, hi_tokenizer, max_seq_len)
        hypothesis = predicted_translation.split()

        # corpus_bleu expects a list of reference translations per hypothesis.
        references.append([reference])
        hypotheses.append(hypothesis)

    # Nothing to score: return a defined value instead of calling corpus_bleu
    # with empty inputs.
    if not hypotheses:
        return 0.0

    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score

In [None]:

# Score the model on the held-out test split (one decoding pass per row).
bleu_score = evaluate_bleu_score(
    model,
    test_data,
    en_tokenizer,
    hi_tokenizer,
    max_seq_len,
)
print(f"Test BLEU Score: {bleu_score}")

KeyboardInterrupt: 

In [None]:
# Leftover scratch cell: prints a single blank line and nothing else.
print()