RNN Encoder-Decoder model with attention that can translate English text to Arabic, given one file that contains English and its translated Arabic text

# Importing Required Library

In [35]:
import numpy as np
import pandas as pd 
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten,LSTM, Bidirectional,Concatenate , dot ,Activation, Concatenate,Dot
from tensorflow.keras import Input, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import RMSprop

# Reading Data and converting it to DataFrame


In [20]:
English = []
Arabic = []
with open('/content/ara_eng.txt', 'r',encoding='utf-8') as file:
     lines = file.read().split("\n")[:-1]
     for line in lines:
       English.append(line.split("\t")[0])
       Arabic.append(line.split("\t")[1])

data = {"English": English, "Arabic": Arabic}
df = pd.DataFrame(data)
df = df.iloc[0:10000]
df["Arabic"] = "<start>" + df["Arabic"] + "<end>"

# Tokenization

In [21]:
English_Tokenizer = Tokenizer()
English_Tokenizer.fit_on_texts(df['English'])
English_Encoded = English_Tokenizer.texts_to_sequences(df['English'])

Arabic_Tokenizer = Tokenizer()
Arabic_Tokenizer.fit_on_texts(df['Arabic'])
Arabic_Encoded = Arabic_Tokenizer.texts_to_sequences(df['Arabic'])

# Extracting Vocab size


In [22]:
English_Vocab_Size = len(English_Tokenizer.word_counts)+1
Arabic_Vocab_Size = len(Arabic_Tokenizer.word_counts)+1

#Extracting Maximum Sequence Length

In [23]:

Arabic_Seq_Len = 0
for i in range(len(Arabic_Encoded)):
  if len(Arabic_Encoded[i]) > Arabic_Seq_Len:
    Arabic_Seq_Len= len(Arabic_Encoded[i])
English_Seq_Len = Arabic_Seq_Len

# Padding

In [24]:

English_Padding = pad_sequences(English_Encoded, maxlen=Arabic_Seq_Len, padding='post')
Arabic_Padding = pad_sequences(Arabic_Encoded, maxlen=English_Seq_Len, padding='post')

English_Padding= np.array(English_Padding)
Arabic_Padding= np.array(Arabic_Padding)

# Splitting Data

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(English_Padding, Arabic_Padding, test_size=0.1, random_state=0)


# Encoder Model with Bidirectional LSTM and Concatenation

In [26]:
Encoder_Input = Input(shape=(English_Seq_Len,)) 
Encoder_Embedding = Embedding(English_Vocab_Size, 128)(Encoder_Input)

Encoder_BiLstm = Bidirectional(LSTM(256, return_sequences=True, return_state=True))
Encoder_Result = Encoder_BiLstm(Encoder_Embedding)
Encoder_Output = Encoder_Result[0]
S1 = Concatenate()([Encoder_Result[1], Encoder_Result[3]])
S2 = Concatenate()([Encoder_Result[2], Encoder_Result[4]])



# Decoder Model with LSTM and Initial State Initialization

In [27]:
Decoder_Input = Input(shape=(Arabic_Seq_Len,)) 
Decoder_Embedding = Embedding(Arabic_Vocab_Size, 128)(Decoder_Input) 


Decoder_Lstm = LSTM(512, return_sequences=True, return_state=True) 
Decoder_Result = Decoder_Lstm(Decoder_Embedding, initial_state=[S1,S2])
Decoder_Output = Decoder_Result[0]

# Attention Mechanism in Decoder Model for Sequence-to-Sequence Tasks

In [28]:
attention = Dense(1, activation='tanh')(Encoder_Output)
attention = Activation('softmax')(attention)

context = Concatenate(axis=2)([Encoder_Output, attention])
context = Dense(512)(context)

Decoder_Output = Concatenate(axis=-1)([context, Decoder_Output])

decoder_dense = Dense(Arabic_Vocab_Size, activation="softmax")
Decoder_Output = decoder_dense(Decoder_Output)


# Model


In [29]:
model = Model([Encoder_Input, Decoder_Input], Decoder_Output) 

In [30]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 16, 128)      464768      ['input_3[0][0]']                
                                                                                                  
 bidirectional_1 (Bidirectional  [(None, 16, 512),   788480      ['embedding_2[0][0]']            
 )                               (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256),                                               

# Model Traning

In [31]:
encoder_input_data = X_train
decoder_input_data = y_train
decoder_target_data =  y_train

encoder_input_test = X_test
decoder_input_test = y_test
decoder_target_test=  y_test


In [32]:

checkpoint = ModelCheckpoint("give Your path to save check points", monitor='val_accuracy')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
callbacks_list = [checkpoint, early_stopping]

In [34]:
from keras.optimizers import RMSprop

model.compile(
    optimizer=RMSprop(lr=0.01),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=128,
    epochs=10,
    validation_data = ([encoder_input_test, decoder_input_test],decoder_target_test),
                    callbacks= callbacks_list)


                   

Epoch 1/10


  super().__init__(name, **kwargs)






Epoch 2/10



Epoch 3/10



Epoch 4/10



Epoch 5/10



Epoch 6/10



Epoch 7/10



Epoch 8/10



Epoch 9/10



Epoch 10/10





<keras.callbacks.History at 0x7f02d2b74a90>

# Sentence Translation

In [38]:
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = ''
    
    predictions = np.argmax(logits, axis=1)
    predicted_words = [index_to_words[prediction] for prediction in predictions]
    predicted_sentence = ' '.join(predicted_words)
    
    return predicted_sentence


# Example sentence translation
index = 1000
print("The English sentence is",df['English'][index])
print("The Arabic sentence is: ",df['Arabic'][index])
print('The predicted Arabic sentence is:')
predicted_sentence = logits_to_sentence(
    model.predict([English_Padding[index:index + 1], Arabic_Padding[index:index + 1]])[0],
    Arabic_Tokenizer)
print(predicted_sentence)


The English sentence is Tom looks pale.
The Arabic sentence is:  <start>يبدو توم شاحب الوجه.<end>
The predicted Arabic sentence is:
start يبدو توم شاحب الوجه end          
