## Load The Data

In [1]:
import pandas as pd
import numpy as np

lines = pd.read_csv('C:/Users/suras/Downloads/archive/merge_df.csv', header=0)
lines=lines.iloc[1:5000] 
lines

Unnamed: 0,english,arabic
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!
5,Go on.,داوم.
...,...,...
4995,Tom is reading the Bible.,توم يقرأ الإنجيل.
4996,Tom is wanted for murder.,توم مطلوب بتهمة القتل.
4997,Tom married a local girl.,توم تجوز فتاة محلية
4998,Tom often goes to Boston.,يذهب توم إلى بوسطن كثيرًا.


## Design an Encoder & Decoder for the Model
## Preprocessing & Tokenizing the Data

In [2]:
# Encoder Input :: English Sentences
from tensorflow.keras import preprocessing
eng_lines = list()
for line in lines.english:
  eng_lines.append(line)
    
tokenizer= preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(eng_lines)
tokenized_eng_lines =tokenizer.texts_to_sequences(eng_lines)

length_list=list()
for tokens in tokenized_eng_lines:
  length_list.append(len(tokens))
    
max_input_length= np.array(length_list).max()

print("English sentence max length ::", max_input_length)
padded_eng_lines= preprocessing.sequence.pad_sequences(tokenized_eng_lines, maxlen=max_input_length, padding='post')

encoder_input_data=np.array(padded_eng_lines)
print('Encoder Input Data shape ::', encoder_input_data.shape)

eng_word_dict= tokenizer.word_index
num_eng_words= len(eng_word_dict)+1

print('Number of English tokens ::', num_eng_words) 

English sentence max length :: 7
Encoder Input Data shape :: (4999, 7)
Number of English tokens :: 2011


In [3]:
# Decoder Input :: Arabic Sentences
ara_lines = list()
for line in lines.arabic:
    ara_lines = ['<START> ' + line + ' <END>' for line in lines.arabic]


tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(ara_lines)
tokenized_ara_lines = tokenizer.texts_to_sequences(ara_lines)

length_list = list()
for tokens in tokenized_ara_lines:
    length_list.append(len(tokens))

max_output_length = np.array(length_list).max()
print('Arabic sentence max length ::', max_output_length)


padded_ara_lines = preprocessing.sequence.pad_sequences(tokenized_ara_lines, maxlen=max_output_length, padding='post')

decoder_input_data=np.array(padded_ara_lines)
print('Decoder Input Data Shape ::', decoder_input_data.shape)

ara_word_dict = tokenizer.word_index
num_ara_tokens = len(ara_word_dict) + 1

print('Number of Arabic tokens ::', num_ara_tokens)


Arabic sentence max length :: 10
Decoder Input Data Shape :: (4999, 10)
Number of Arabic tokens :: 5041


In [4]:
# Decoder Output :: Arabic Sentences -Preprocessed-
from tensorflow.keras import utils
decoder_output_data=list()
for token in tokenized_ara_lines:
   decoder_output_data.append(token[1:])

padded_ara_lines=preprocessing.sequence.pad_sequences(decoder_output_data,maxlen=max_output_length,padding='post')
onehot_ara_lines= utils.to_categorical(padded_ara_lines,num_ara_tokens)

decoder_output_data=np.array(onehot_ara_lines)
print('Decoder Output Data Shape ::',decoder_output_data.shape)


Decoder Output Data Shape :: (4999, 10, 5041)


## Building LSTM model



In [5]:
import tensorflow.keras as k
from tensorflow.keras import layers, activations, models, preprocessing, utils
encoder_inputs= k.layers.Input(shape=(None,))
encoder_embedding= k.layers.Embedding(num_eng_words, 256, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c= k.layers.LSTM(128, return_state=True)(encoder_embedding)
encoder_states= [state_h, state_c]

decoder_inputs= k.layers.Input (shape=(None,))
decoder_embedding= k.layers.Embedding(num_ara_tokens, 256, mask_zero=True)(decoder_inputs)
decoder_lstm= k.layers.LSTM(128, return_state=True, return_sequences=True)
decoder_outputs,_,_= decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense= k.layers.Dense(num_ara_tokens, activation= k.activations.softmax)
output= decoder_dense(decoder_outputs)

model= k.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer= k.optimizers.RMSprop(), loss='categorical_crossentropy')
model.summary()

## Train The Model

In [6]:
model.fit([encoder_input_data,decoder_input_data],decoder_output_data,batch_size=128,epochs=200)

Epoch 1/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 143ms/step - loss: 8.1172
Epoch 2/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 141ms/step - loss: 5.3386
Epoch 3/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 143ms/step - loss: 5.0386
Epoch 4/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 147ms/step - loss: 4.7693
Epoch 5/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 158ms/step - loss: 4.6357
Epoch 6/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 170ms/step - loss: 4.5757
Epoch 7/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 145ms/step - loss: 4.5306
Epoch 8/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 143ms/step - loss: 4.4975
Epoch 9/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 144ms/step - loss: 4.4743
Epoch 10/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 143ms

<keras.src.callbacks.history.History at 0x18e176d5760>

In [7]:
model.save('model.keras')

In [8]:
def make_reference_models():
    encoder_model_reference=k.models.Model(encoder_inputs,encoder_states)

    decoder_state_h=k.layers.Input(shape=(128,))
    decoder_state_c=k.layers.Input(shape=(128,))
    decoder_inputs_states=[decoder_state_h,decoder_state_c]

    decoder_outputs,state_h,state_c=decoder_lstm(decoder_embedding,initial_state=decoder_inputs_states)

    decoder_states=[state_h,state_c]
    decoder_outputs=decoder_dense(decoder_outputs)
    decoder_model_reference=k.models.Model([decoder_inputs]+decoder_inputs_states,[decoder_outputs]+decoder_states)


    return encoder_model_reference,decoder_model_reference

## Functions :: Convert a Sentence from a String to Tokens

In [9]:
def str_to_tokens(sentence:str):
    words= sentence.lower().split()
    token_list= list()
    for word in words:
        try:
            token_list.append(eng_word_dict[word])
        except:
            print("The sentence is not recognised, please try again")
            run()
    return preprocessing.sequence.pad_sequences([token_list], maxlen=max_input_length, padding='post')

## The Testing

In [12]:
def run():    
    enc_model, dec_model= make_reference_models()
    
    for sen in range(encoder_input_data.shape[0]):
        states_values= enc_model.predict(str_to_tokens (input('Enter an English sentence: ')))
        
        empty_target_seq= np.zeros((1,1))
        empty_target_seq[0,0]= ara_word_dict['start']
        stop_condition= False
        decoded_translation=''
        
        while not stop_condition:
            dec_output, h, c= dec_model.predict([empty_target_seq]+ states_values)
            sampled_word_index= np.argmax(dec_output[0,-1,:])
            sampled_word= None
            for word, index in ara_word_dict.items():
                if sampled_word_index== index:
                    decoded_translation+= ' {}'.format(word)
                    sampled_word=word
                if sampled_word== 'end' or len(decoded_translation.split()) > max_output_length:
                    stop_condition = True
            empty_target_seq= np.zeros((1,1))
            empty_target_seq[0,0]= sampled_word_index
            states_values= [h, c]
            
        print(decoded_translation[:-3])

In [13]:
run()

Enter an English sentence:  Run


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
 اركض 


Enter an English sentence:  Help


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
 النجدة 


Enter an English sentence:  Tom married a local girl


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
 توم رجل فتاة محلية 


Enter an English sentence:  Tom is reading the Bible


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
 توم يقرأ كتاباً 


Enter an English sentence:  Go on


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
 استمر 


Enter an English sentence:  free palestine 


The sentence is not recognised, please try again


KeyboardInterrupt: Interrupted by user