In [None]:
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Dot, Concatenate, Bidirectional, Permute
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy


In [None]:
# Path to the parallel-corpus text file.
path_to_data = '/content/ara_eng.txt'

# Read the whole file as UTF-8 text; the context manager closes the handle
# even if read() raises.
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()


In [None]:
# Split the corpus into lines under a new name, leaving `raw_data` as the
# original string so this cell can be re-run without failing on list.split.
raw_lines = raw_data.split('\n')

# Each usable line has tab-separated columns. Lines without a tab (for example
# the empty string left after a trailing newline) are dropped, because a
# one-element pair would raise IndexError when column 1 is read.
pairs = [line.split('\t') for line in raw_lines if '\t' in line]

# Keep at most 10,000 pairs, skipping the first 1,000.
pairs = pairs[1000:11000]


In [None]:
import string

# Deletion table for clean_sentence: ASCII punctuation, the inverted Spanish
# marks, and the Arabic question mark, comma and semicolon.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + "¡" + '¿' + "؟،؛")


def clean_sentence(sentence):
    """Lower-case a sentence and delete its punctuation characters.

    Args:
        sentence: The raw sentence text (English or Arabic).

    Returns:
        The lower-cased sentence with every character of
        ``string.punctuation``, ``¡``, ``¿`` and the Arabic marks ``؟``, ``،``
        and ``؛`` removed. All other characters, including whitespace, are
        kept unchanged.
    """
    # A single translate() pass deletes every listed character at once.
    return sentence.lower().translate(_PUNCTUATION_TABLE)

In [None]:
def tokenize(sentences):
    """Fit a fresh ``Tokenizer`` on ``sentences`` and integer-encode them.

    Args:
        sentences: The texts used both to build the vocabulary and to encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer


In [None]:
# Column 0 of every pair is the English sentence and column 1 is its Arabic
# translation; both are cleaned before tokenization.
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
arabic_sentences = [clean_sentence(pair[1]) for pair in pairs]


In [None]:
# Integer-encode each language with its own tokenizer, so English and Arabic
# get separate vocabularies. tokenize() returns (sequences, fitted tokenizer).
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
ara_text_tokenized, ara_text_tokenizer = tokenize(arabic_sentences)


In [None]:
# Vocabulary sizes used as the Embedding/Dense dimensions. word_index has one
# entry per known word with indices starting at 1; the extra slot covers
# index 0, which pad_sequences uses as its default padding value.
english_vocab = len(eng_text_tokenizer.word_index) + 1
arabic_vocab = len(ara_text_tokenizer.word_index) + 1


In [None]:
english_vocab

4086

In [None]:
arabic_vocab

11891

In [None]:
# Length of the longest tokenized sentence in each language; these become the
# fixed sequence lengths used for padding.
max_english_len = max(map(len, eng_text_tokenized))
max_arabic_len = max(map(len, ara_text_tokenized))


'\n These lines calculate the maximum length of the tokenized English and Arabic sentences and convert them to integers.\nThese maximum lengths will be later used for padding the tokenized sequences to a fixed length, ensuring all sentences have the same shape and can be processed by the model efficiently.\n'

In [None]:
# Pad every sequence at the end ("post") up to the longest sentence of its
# language, so each language becomes one rectangular array.
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_english_len, padding="post")
ara_pad_sentence = pad_sequences(ara_text_tokenized, maxlen=max_arabic_len, padding="post")

'\n padding="post": This parameter specifies the padding position. It can take one of two values: "pre" or "post". Here, "post" is specified, indicating that padding will\n  be added at the end (or right) of each sequence.\n Use "post" padding when:\n\nThe order of the elements in the sequence is important, and the padding should be added at the end of the sequence.\nThe end of the sequence carries more significance or represents a meaningful boundary.\nFor tasks like language modeling, where the model needs to predict the next word in the sequence, it is common to pad the sequences at the end.\n'

In [None]:
# Reshape the padded English and Arabic sentences to add a third dimension of
# size 1: (samples, timesteps) -> (samples, timesteps, 1).
# Only the first two axes are unpacked, so re-running this cell leaves an
# already-reshaped array unchanged instead of appending yet another axis.
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape[:2], 1)
ara_pad_sentence = ara_pad_sentence.reshape(*ara_pad_sentence.shape[:2], 1)


'\nUsing the * operator in this context simplifies the code and makes it more readable by avoiding the need to manually extract the dimensions from the shape tuple.\n\nIn summary, the * operator is used to unpack the elements of an iterable and pass them as separate arguments to a function or method, providing a convenient way to work with tuples or lists of values.\n'

In [None]:
# Define the input layer (input_sequence) and apply an embedding layer to it.
# NOTE(review): the encoder section of the following cell rebinds both
# `input_sequence` and `embedding` (with output_dim=256) before the model is
# built, so the layers created here are not part of the trained model.
input_sequence = Input(shape=(max_english_len,), dtype='int32')
embedding = Embedding(input_dim=english_vocab, output_dim=128)(input_sequence)

In [None]:
# Encoder
# English token ids -> 256-d embeddings -> bidirectional LSTM (256 units per direction).
input_sequence = Input(shape=(max_english_len,))
embedding = Embedding(input_dim=english_vocab, output_dim=256)(input_sequence)
# return_sequences=True keeps the per-timestep outputs used by the attention
# below; return_state=True also returns the final h/c state of each direction.
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(256, return_sequences=True, return_state=True))(embedding)
# Join forward and backward states (256 + 256 = 512) to seed the decoder LSTM.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# Decoder
# Arabic token ids -> 256-d embeddings -> LSTM initialised with the encoder state.
decoder_inputs = Input(shape=(max_arabic_len,))
decoder_embedding = Embedding(input_dim=arabic_vocab, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)  # 512 units to match the concatenated encoder state
# The decoder's own final states are not needed here, hence the two `_`.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention
# Dot product over the feature axis of every decoder step with every encoder
# step: scores of shape (batch, max_arabic_len, max_english_len).
attention = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs])
# Softmax over the last axis turns the scores into weights over encoder steps.
attention = Activation('softmax', name='attention')(attention)
# Weighted sum of encoder outputs for each decoder step: (batch, max_arabic_len, 512).
context = Dot(axes=[2, 1])([attention, encoder_outputs])

# Each decoder step sees its attention context next to its own LSTM output.
decoder_combined_context = Concatenate()([context, decoder_outputs])

# Output layer
# Per-timestep probability distribution over the Arabic vocabulary.
decoder_dense = Dense(arabic_vocab, activation='softmax')
output = decoder_dense(decoder_combined_context)

# model
# Two inputs (English ids, Arabic decoder ids) -> Arabic token probabilities.
enc_dec_model = Model([input_sequence, decoder_inputs], output)

learning_rate = 0.01
optimizer = Adam(learning_rate)

# The sparse loss takes integer token ids as targets rather than one-hot vectors.
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                      optimizer=optimizer,
                      metrics=['accuracy'])
enc_dec_model.summary()

batch_size = 32

# Validation: hold out the last `val_size` pairs and train only on the rest,
# so the validation metrics are measured on sentences the optimizer never saw.
val_size = 1000
eng_pad_train = eng_pad_sentence[:-val_size]
ara_pad_train = ara_pad_sentence[:-val_size]
eng_pad_val = eng_pad_sentence[-val_size:]
ara_pad_val = ara_pad_sentence[-val_size:]

# Only full mini-batches are used; a trailing partial batch is skipped.
num_batches = len(eng_pad_train) // batch_size
assert num_batches > 0, "not enough sentence pairs left for training after the validation split"

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0

    for batch in range(num_batches):
        # Extract a mini-batch from the training split
        start_index = batch * batch_size
        end_index = (batch + 1) * batch_size
        eng_batch = eng_pad_train[start_index:end_index]
        ara_batch = ara_pad_train[start_index:end_index]

        # One gradient step on this mini-batch.
        # NOTE(review): the decoder input and the target are the same array, so
        # at every timestep the network is shown the token it is asked to
        # predict. A proper fix needs start/end tokens added before
        # tokenization, a decoder input shifted right by one step, and
        # step-by-step decoding at inference time.
        loss, accuracy = enc_dec_model.train_on_batch([eng_batch, ara_batch], ara_batch)

        total_loss += loss
        total_accuracy += accuracy

    # Score the held-out pairs once per epoch and average the batch metrics.
    val_loss, val_accuracy = enc_dec_model.evaluate([eng_pad_val, ara_pad_val], ara_pad_val, verbose=0)
    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches

    print("Epoch: {}/{} - Avg. Loss: {:.4f} - Avg. Accuracy: {:.4f} - Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(
        epoch + 1, num_epochs, avg_loss, avg_accuracy, val_loss, val_accuracy))



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 20, 256)      1046016     ['input_2[0][0]']                
                                                                                                  
 input_3 (InputLayer)           [(None, 17)]         0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 20, 512),    1050624     ['embedding_1[0][0]']            
                                 (None, 256),                                                 

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-timestep scores into a space-separated sentence.

    Args:
        logits: 2-D array-like indexed as (timestep, vocabulary index); the
            highest-scoring index of each row is taken as the predicted token.
        tokenizer: Object exposing ``word_index``, a mapping from word to its
            integer index.

    Returns:
        The predicted words joined by single spaces. Timesteps whose best
        index is 0 (the padding slot, which has no word) are skipped, so the
        result carries no trailing or doubled spaces.

    Raises:
        KeyError: If a predicted non-zero index is missing from
            ``tokenizer.word_index``.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    token_ids = np.argmax(logits, 1)
    return ' '.join(index_to_words[token_id] for token_id in token_ids if token_id != 0)

# Example sentence translation
index = 10
# One-element slices keep the leading batch axis for predict().
source_batch = eng_pad_sentence[index:index + 1]
reference_batch = ara_pad_sentence[index:index + 1]

print("The English sentence is: {}".format(english_sentences[index]))
print("The Arabic sentence is: {}".format(arabic_sentences[index]))
print('The predicted Arabic sentence is:')
# NOTE(review): the reference Arabic sentence is passed as the decoder input,
# so the model sees the expected answer; this does not demonstrate translation
# from the English sentence alone.
prediction = enc_dec_model.predict([source_batch, reference_batch])
predicted_sentence = logits_to_sentence(prediction[0], ara_text_tokenizer)
print(predicted_sentence)


The English sentence is: were obedient
The Arabic sentence is: نحن مطيعون
The predicted Arabic sentence is:
نحن مطيعون               
