In [2]:
#importing Libraries

import numpy as np
import pandas as pd

# Importing deep-learning (Keras) and data-splitting (scikit-learn) modules

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Load the sentence-pair dataset from CSV into a DataFrame.

df = pd.read_csv(filepath_or_buffer='thedataset.csv')


In [3]:
# Preview the last five rows as a quick sanity check of the load.
df.tail(n=5)

Unnamed: 0,Eng,Beng
10251,Tom told Mary that he was going to kill himsel...,"টম মেরিকে বললো যে ও নিজেকে হত্যা করতে চলেছিলো,..."
10252,Tom's an irritating person to work with becaus...,টমের সঙ্গে কাজ করা খুব বিরক্তিকর কারণ ও কখনই ম...
10253,"I thought doing this would be easy, but we've ...","আমি ভেবেছিলাম এটা করা সহজ হবে, কিন্তু আমরা সার..."
10254,"I thought that doing this would be easy, but w...","আমি ভেবেছিলাম এটা করা সহজ হবে, কিন্তু আমরা সার..."
10255,"January, February, March, April, May, June, Ju...","বছরের বারোটা মাস হলো জানুয়ারি, ফেব্রুয়ারি, ম..."


In [4]:
# Pull the two text columns out of the DataFrame as plain Python lists
# (row order is preserved, so index i in one list pairs with i in the other).

english_sentences, bengali_sentences = (
    df[column].tolist() for column in ("Eng", "Beng")
)

In [5]:
# English side: build the word -> integer-id vocabulary from the corpus,
# then encode every sentence as a list of those ids.

tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(texts=english_sentences)
eng_seq = tokenizer_eng.texts_to_sequences(texts=english_sentences)

In [6]:
# Bengali side: a separate tokenizer, fitted only on the Bengali sentences,
# used to encode each sentence as a list of integer word ids.

tokenizer_beng = Tokenizer()
tokenizer_beng.fit_on_texts(texts=bengali_sentences)
beng_seq = tokenizer_beng.texts_to_sequences(texts=bengali_sentences)

In [7]:
# The Keras Tokenizer never assigns index 0 to a word, so the id range is
# 0..len(word_index); hence one extra slot on top of the word count.
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_beng = 1 + len(tokenizer_beng.word_index)

In [None]:
# Padding: find the longest sequence across both languages and right-pad
# every sequence with zeros up to that single shared length.
max_length = max(map(len, eng_seq + beng_seq))
eng_seq_padded, beng_seq_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (eng_seq, beng_seq)
)
eng_seq_padded

In [9]:
# Hyperparameters: width of the word embeddings and size of the LSTM state.
embedding_dim, units = 256, 512

In [10]:
# Encoder: embed the padded English ids and run them through an LSTM.
# The per-call output is not used further; the final hidden and cell states
# are what get handed to the decoder.
# NOTE(review): the Embedding does not set mask_zero, so the LSTM also steps
# over the trailing padding zeros before producing its final state -- confirm
# this is intended.

encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(vocab_size_eng, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, *encoder_states = encoder_lstm(enc_emb)
state_h, state_c = encoder_states

In [11]:
# Decoder: an LSTM that starts from the encoder's final states and emits,
# for every timestep, a softmax distribution over the Bengali vocabulary.

# Layers (kept in named variables so they can be reused later).
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(vocab_size_beng, embedding_dim)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_dense = Dense(vocab_size_beng, activation='softmax')

# Wiring: embed -> LSTM (seeded with encoder states) -> per-step softmax.
dec_emb = dec_emb_layer(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
output = decoder_dense(decoder_outputs)

In [12]:
# Model: two inputs (encoder ids, decoder ids) -> per-timestep word
# probabilities from the decoder's softmax layer.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

In [13]:
# Compile the model. The sparse cross-entropy loss takes integer word ids
# as targets, so the padded id arrays can be used directly as labels.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [14]:
# Hold out 20% of the sentence pairs for validation. The split is seeded so
# that a fresh "Restart & Run All" trains and validates on the same rows;
# without a seed the partition (and the reported metrics) changes every run.
X_train, X_val, y_train, y_val = train_test_split(
    eng_seq_padded,
    beng_seq_padded,
    test_size=0.2,
    random_state=42,
)

In [None]:
model.fit([X_train, X_train], y_train, validation_data=([X_val, X_val], y_val), epochs=100, batch_size=64)

In [16]:
def translate_sentence(sentence):
    seq = tokenizer_eng.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    translated = np.argmax(model.predict([padded, padded]), axis=-1)
    print(seq)
    
    translated_sentence = []
    for i in translated[0]:
        if i in tokenizer_beng.index_word:
            translated_sentence.append(tokenizer_beng.index_word[i])
        else:
            translated_sentence.append(' ')  
        
    return ' '.join(translated_sentence)

In [None]:
# Interactive demo: read one sentence from the user, pass it to
# translate_sentence(), and print the returned string.
input_sentence = input('Enter your sentence:')
translated_sentence = translate_sentence(input_sentence)
print(f"Translated: {translated_sentence}")