<a href="https://colab.research.google.com/github/SouravDasz/seq-2-seq-projects/blob/main/Untitled63.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

In [59]:
df=pd.read_csv("/content/en_bn.csv")

In [60]:
df.head()

Unnamed: 0,en_text,bn_text
0,"Hello, good to see you!","হ্যালো, আপনাকে দেখে ভালো লাগলো!"
1,"Goodbye, have a great day!","বিদায়, আপনার দিন শুভ হোক!"
2,Thank you very much for your help.,আপনার সহায়তার জন্য অনেক ধন্যবাদ।
3,I appreciate it.,আমি এটি প্রশংসা করি।
4,You're welcome!,আপনার স্বাগতম!


# inporting library

In [6]:
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense,LSTM,Embedding,Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras .models import Model

In [7]:
df.shape

(120000, 2)

In [8]:
import re
def process(x):
  x=re.sub("[^a-zA-Z]"," ",x)
  x=x.lower()
  return x

In [9]:
df["en_text"]=df["en_text"].apply(process)

In [10]:
def process_bn(x):
    if not isinstance(x, str):
        return ""
    x = re.sub(r"[^a-zA-Z\u0980-\u09FF]", " ", x)
    x = x.lower()
    return x


In [11]:
df["bn_text"]=df["bn_text"].apply(process_bn)
df["bn_text"]=df["bn_text"].apply(lambda x:'start_'+x+"_end")

In [12]:
df

Unnamed: 0,en_text,bn_text
0,hello good to see you,start_হ্যালো আপনাকে দেখে ভালো লাগলো _end
1,goodbye have a great day,start_বিদায় আপনার দিন শুভ হোক _end
2,thank you very much for your help,start_আপনার সহায়তার জন্য অনেক ধন্যবাদ _end
3,i appreciate it,start_আমি এটি প্রশংসা করি _end
4,you re welcome,start_আপনার স্বাগতম _end
...,...,...
119995,the adventure began on a misty morning,start_অভিযান শুরু হয়েছিল কুয়াশাচ্ছন্ন সকালে ...
119996,greetings it is an honor to meet you,start_শুভেচ্ছা আপনাকে সম্মান জানাতে পেরে আমি ...
119997,hey what s up,start_আরে কেমন আছ _end
119998,please accept my heartfelt congratulations,start_অনুগ্রহ করে আমার আন্তরিক অভিনন্দন গ্রহণ ...


In [13]:
eng_tokenizer=Tokenizer()
eng_tokenizer.fit_on_texts(df["en_text"])
df["en_text"] =eng_tokenizer.texts_to_sequences(df["en_text"])

In [14]:
ben_tokenizer=Tokenizer()
ben_tokenizer.fit_on_texts(df["bn_text"])
df["bn_text"] =ben_tokenizer.texts_to_sequences(df["bn_text"])

In [15]:
df

Unnamed: 0,en_text,bn_text
0,"[25, 14, 6, 26, 1]","[1, 24, 10, 25, 11, 26, 2]"
1,"[27, 15, 2, 28, 16]","[1, 27, 3, 12, 28, 29, 2]"
2,"[29, 1, 30, 31, 17, 5, 32]","[1, 3, 30, 31, 32, 13, 2]"
3,"[7, 33, 8]","[1, 5, 14, 33, 34, 2]"
4,"[1, 34, 35]","[1, 3, 35, 2]"
...,...,...
119995,"[3, 105, 106, 107, 2, 108, 109]","[1, 110, 111, 112, 113, 114, 2]"
119996,"[110, 8, 4, 111, 112, 6, 113, 1]","[1, 115, 10, 116, 117, 118, 5, 119, 2]"
119997,"[114, 11, 24, 115]","[1, 120, 7, 121, 2]"
119998,"[12, 116, 117, 118, 119]","[1, 122, 16, 123, 124, 125, 18, 6, 2]"


In [16]:
eng_max_len=max(len(i) for i in df["en_text"])
print("max len of english sequence  -->",eng_max_len)
ben_max_len=max(len(i) for i in df["bn_text"])
print("max len of bengali sequence  -->",ben_max_len)


max len of english sequence  --> 12
max len of bengali sequence  --> 12


In [26]:
encoder_input=Input(shape=(None,))
encoder_embedding_layer = Embedding(input_dim=len(eng_tokenizer.word_index)+1,output_dim=50)
enc_embedding_output = encoder_embedding_layer(encoder_input)
enc_output,state_h,state_c=LSTM(128,return_state=True)(enc_embedding_output)
encoder_states=[state_h,state_c]

In [27]:
decoder_input=Input(shape=(None,))
decoder_embedding_layer=Embedding(input_dim=len(ben_tokenizer.word_index)+1,output_dim=50)
dec_embedding_output=decoder_embedding_layer(decoder_input)
dec_lstm=LSTM(128,return_state=True,return_sequences=True)
dec_output,_,_=dec_lstm(dec_embedding_output,initial_state=encoder_states)
dec_dense=Dense(len(ben_tokenizer.word_index)+1,activation="softmax")
decoder_output=dec_dense(dec_output)

In [19]:
model=Model([encoder_input,decoder_input],decoder_output)
model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])


In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad English sequences for encoder input
encoder_input_data = pad_sequences(df["en_text"], maxlen=eng_max_len, padding="post")

# Prepare decoder input data:
# - We need to remove the '_end' token from the target sequence for the decoder input.
# - Then pad the sequences to the maximum Bengali sequence length.
decoder_input_data = pad_sequences([seq[:-1] for seq in df["bn_text"]], maxlen=ben_max_len -1, padding="post")

# Prepare decoder target data:
# - We need to remove the 'start_' token from the target sequence for the decoder output.
# - Then pad the sequences to the maximum Bengali sequence length.
decoder_target_data = pad_sequences([seq[1:] for seq in df["bn_text"]], maxlen=ben_max_len -1, padding="post")

print("Shape of encoder_input_data:", encoder_input_data.shape)
print("Shape of decoder_input_data:", decoder_input_data.shape)
print("Shape of decoder_target_data:", decoder_target_data.shape)


Shape of encoder_input_data: (120000, 12)
Shape of decoder_input_data: (120000, 11)
Shape of decoder_target_data: (120000, 11)


In [24]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=200, epochs=15, validation_split=0.2)

Epoch 1/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6730 - loss: 1.7687 - val_accuracy: 1.0000 - val_loss: 0.0177
Epoch 2/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0116 - val_accuracy: 1.0000 - val_loss: 0.0038
Epoch 3/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0030 - val_accuracy: 1.0000 - val_loss: 0.0016
Epoch 4/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 1.0000 - val_loss: 9.0481e-04
Epoch 5/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 1.0000 - loss: 8.0197e-04 - val_accuracy: 1.0000 - val_loss: 5.5893e-04
Epoch 6/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 1.0000 - loss: 5.0483e-04 - val_accuracy: 1.0000 - val_loss: 3.6961e-04
Epo

<keras.src.callbacks.history.History at 0x7e92afaa5fd0>

In [30]:
encoder_model_inference = Model(encoder_input, encoder_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_ing_emb = decoder_embedding_layer(decoder_input)
dec_output, dec_state_h, dec_state = dec_lstm(dec_ing_emb,initial_state=decoder_states_inputs)
decoder_outputs = dec_dense(dec_output)
decoder_model_inference = Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + [dec_state_h, dec_state])

In [33]:
reverse_ben_word_index = dict(map(reversed, ben_tokenizer.word_index.items()))

def translate_sentence(input_sentence):
    # 1. Preprocess the input English sentence
    processed_sentence = process(input_sentence) # Assuming 'process' function is defined
    input_sequence = eng_tokenizer.texts_to_sequences([processed_sentence])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=eng_max_len, padding='post')

    # 2. Get the initial states from the encoder
    states_value = encoder_model_inference.predict(padded_input_sequence)

    # 3. Initialize the decoder with a 'start' token (corrected from 'start_')
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ben_tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''

    # 4. Loop to predict the next word until an 'end' token is predicted or max length is reached
    while not stop_condition:
        output_tokens, h, c = decoder_model_inference.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_ben_word_index.get(sampled_token_index, '')

        if sampled_word == 'end': # Corrected from '_end'
            stop_condition = True
        elif sampled_word != 'start': # Avoid adding 'start' to the output (corrected from 'start_')
            decoded_sentence += ' ' + sampled_word

        # Exit condition: Either hit max length or stop token
        if len(decoded_sentence.split()) >= ben_max_len - 1: # -1 to account for 'start' and 'end'
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

In [34]:
sentence="how are you"
translate_sentence(sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


'স্থল পৌঁছাতে পৌঁছাতে আমাকে আমাকে আমাকে একটি একটি একটি তখন একটি'

### Improving Translation with Attention Mechanism

While our previous model achieved high accuracy on the training data, the generated translations were repetitive and often nonsensical. This is a common issue with basic Encoder-Decoder models, where the fixed-size context vector (the final hidden states of the encoder) struggles to capture all the necessary information for longer sequences.

An **Attention Mechanism** addresses this by allowing the decoder to look at the entire encoder output sequence (not just the last hidden states) at each decoding step. It computes a weighted sum of the encoder's hidden states, where the weights determine how much 'attention' the decoder pays to each part of the input. This dynamic focus helps the model generate more relevant and accurate translations.

In [35]:
from keras.layers import Attention, Concatenate

# --- Encoder Definition (with return_sequences=True) for Attention ---
encoder_input_attention = Input(shape=(None,))
encoder_embedding_layer_attention = Embedding(input_dim=len(eng_tokenizer.word_index)+1, output_dim=50)
enc_embedding_output_attention = encoder_embedding_layer_attention(encoder_input_attention)
# Encoder LSTM now returns full output sequence along with states
encoder_output_seq, encoder_h, encoder_c = LSTM(128, return_state=True, return_sequences=True)(enc_embedding_output_attention)
encoder_states_attention = [encoder_h, encoder_c]

# --- Decoder Definition with Attention ---
decoder_input_attention = Input(shape=(None,))
decoder_embedding_layer_attention = Embedding(input_dim=len(ben_tokenizer.word_index)+1, output_dim=50)
dec_embedding_output_attention = decoder_embedding_layer_attention(decoder_input_attention)

# Decoder LSTM (initialized with encoder states)
decoder_lstm_attention = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs_seq, _, _ = decoder_lstm_attention(dec_embedding_output_attention, initial_state=encoder_states_attention)

# Attention layer
attention_layer = Attention()
attention_output = attention_layer([decoder_outputs_seq, encoder_output_seq])

# Concatenate attention output with decoder LSTM output
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs_seq, attention_output])

# Dense output layer
decoder_dense_attention = Dense(len(ben_tokenizer.word_index)+1, activation='softmax')
decoder_output_attention = decoder_dense_attention(decoder_concat_input)

# --- Assemble the new Attention Model ---
model_attention = Model([encoder_input_attention, decoder_input_attention], decoder_output_attention)
model_attention.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("New attention-based model summary:")
model_attention.summary()

New attention-based model summary:


In [36]:
# Retrain the new attention-based model
print("Training the attention-based model...")
model_attention.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=200, epochs=15, validation_split=0.2)

Training the attention-based model...
Epoch 1/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.6919 - loss: 1.6370 - val_accuracy: 1.0000 - val_loss: 0.0108
Epoch 2/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0072 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 3/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0011
Epoch 4/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 1.0000 - loss: 9.1735e-04 - val_accuracy: 1.0000 - val_loss: 5.9001e-04
Epoch 5/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 1.0000 - loss: 5.2247e-04 - val_accuracy: 1.0000 - val_loss: 3.6528e-04
Epoch 6/15
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 1.0000 - loss: 3.3006e-04 - val_accu

<keras.src.callbacks.history.History at 0x7e92a01e5fd0>

In [37]:
# --- Define Inference Models for Attention-based Seq2Seq ---

# Encoder Inference Model
encoder_model_inference_attention = Model(encoder_input_attention, [encoder_output_seq, encoder_h, encoder_c])

# Decoder Inference Model
decoder_state_input_h_attention = Input(shape=(128,))
decoder_state_input_c_attention = Input(shape=(128,))
encoder_output_seq_input = Input(shape=(None, 128)) # To feed encoder_output_seq to decoder

decoder_states_inputs_attention = [decoder_state_input_h_attention, decoder_state_input_c_attention]

dec_embedding_output_inference = decoder_embedding_layer_attention(decoder_input_attention)

# Pass the encoder output sequence to the decoder LSTM along with the initial states
dec_output_inference, dec_state_h_inference, dec_state_c_inference = decoder_lstm_attention(
    dec_embedding_output_inference, initial_state=decoder_states_inputs_attention
)

# Attention in inference
attention_output_inference = attention_layer([dec_output_inference, encoder_output_seq_input])
decoder_concat_input_inference = Concatenate(axis=-1)([dec_output_inference, attention_output_inference])

decoder_outputs_inference = decoder_dense_attention(decoder_concat_input_inference)

decoder_model_inference_attention = Model(
    [decoder_input_attention, encoder_output_seq_input] + decoder_states_inputs_attention,
    [decoder_outputs_inference, dec_state_h_inference, dec_state_c_inference]
)

print("Attention-based inference models created.")

Attention-based inference models created.


In [38]:
# --- Updated translate_sentence function using Attention-based models ---

def translate_sentence_attention(input_sentence):
    # 1. Preprocess the input English sentence
    processed_sentence = process(input_sentence)
    input_sequence = eng_tokenizer.texts_to_sequences([processed_sentence])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=eng_max_len, padding='post')

    # 2. Get the initial states and encoder output sequence from the encoder
    encoder_outputs_from_inference, h, c = encoder_model_inference_attention.predict(padded_input_sequence)
    states_value = [h, c]

    # 3. Initialize the decoder with a 'start' token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ben_tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''

    # 4. Loop to predict the next word
    while not stop_condition:
        output_tokens, h, c = decoder_model_inference_attention.predict(
            [target_seq, encoder_outputs_from_inference] + states_value
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_ben_word_index.get(sampled_token_index, '')

        if sampled_word == 'end':
            stop_condition = True
        elif sampled_word != 'start':
            decoded_sentence += ' ' + sampled_word

        # Exit condition: Either hit max length or stop token
        if len(decoded_sentence.split()) >= ben_max_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

print("Updated 'translate_sentence_attention' function defined.")

Updated 'translate_sentence_attention' function defined.


In [63]:
sentences = df["en_text"].sample(10, random_state=42)

for i in range(len(sentences)):
    src_sentence = sentences.iloc[i]

    print(f"Original sentence: {src_sentence}")
    print(f"Translated sentence (with attention): {translate_sentence_attention(src_sentence)}")
    print("-" * 50)


Original sentence: Hey, what's up?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Translated sentence (with attention): আরে কেমন আছ
--------------------------------------------------
Original sentence: Bengaluru is known as the Silicon Valley of India.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[