#**Libraries**

In [45]:
import string
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers

# **Loading the Data and Configuration**


In [2]:
# Read the text file
with open('ara.txt', 'r', encoding='utf-8') as file:
    data = file.read()  # Read the entire file content

In [30]:
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

# **Text Cleaning**

In [12]:
# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
data = data.translate(translator)

# Convert the text to lowercase
data = data.lower()  # Convert all characters to lowercase

# Define a function to remove diacritics (tashkeel)
def remove_diacritics(text):
    # Diacritics pattern: this regex matches common Arabic diacritics
    diacritics_pattern = r'[\u064B-\u0652\u0654-\u0655]'
    # Substitute diacritics with an empty string
    return re.sub(diacritics_pattern, '', text)

# Clean the Arabic text
data = remove_diacritics(data)

# Save the lowercase text to a new file (optional)
with open('cleaned_text.txt', 'w', encoding='utf-8') as lowercase_file:
    lowercase_file.write(data)

In [13]:
# Printing first 15 lines in data file
num_lines = 15

# Read the text file
with open('cleaned_text.txt', 'r', encoding='utf-8') as file:
    # Using list comprehension to read the first `num_lines`
    lines = [next(file) for _ in range(num_lines)]

# Step 2: Print the lines
for line in lines:
    print(line, end='')  # The `end=''` prevents adding extra new lines

hi	مرحبا	ccby 20 france attribution tatoebaorg 538123 cm  629296 samer
run	اركض	ccby 20 france attribution tatoebaorg 906328 papabear  1245450 saeb
duck	اخفض رأسك	ccby 20 france attribution tatoebaorg 280158 cm  9036391 keeichi
duck	اخفضي رأسك	ccby 20 france attribution tatoebaorg 280158 cm  9036392 keeichi
duck	اخفضوا رؤوسكم	ccby 20 france attribution tatoebaorg 280158 cm  9036393 keeichi
help	النجدة	ccby 20 france attribution tatoebaorg 435084 lukaszpp  371293 saeb
jump	اقفز	ccby 20 france attribution tatoebaorg 1102981 jamessilver  6009426 damascene
stop	قف	ccby 20 france attribution tatoebaorg 448320 cm  1245447 saeb
stop	توقف 	ccby 20 france attribution tatoebaorg 448320 cm  5496702 wildflower81
wait	إنتظر	ccby 20 france attribution tatoebaorg 1744314 belgavox  5496709 wildflower81
go on	داوم	ccby 20 france attribution tatoebaorg 2230774 ck  5118652 damascene
go on	استمر	ccby 20 france attribution tatoebaorg 2230774 ck  5118653 damascene
hello	مرحبا	ccby 20 france attribution tato

# **Prepare the data**


In [33]:
# Data Initialization / Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

In [34]:
# Read Cleaned and Process the File
with open('cleaned_text.txt', "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

In [35]:
# Populate Input and Target Texts
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n" # start of the sequence (SOS) is /t and EOS /n
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [36]:
# Character Information
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters) # Store the number of unique tokens in the input texts.
num_decoder_tokens = len(target_characters) # Store the number of unique tokens in the target texts.
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [37]:
# Summary of Information
print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 38
Number of unique output tokens: 74
Max sequence length for inputs: 35
Max sequence length for outputs: 53


In [38]:
# Character Index Maps
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [39]:
# Initialize Sequence Arrays
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype="float32",
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

In [40]:
# Populate the Input and Output Arrays
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0

# **Model Buliding and Training**

In [46]:
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [48]:
model.summary()

In [47]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Save model
model.save("seq2seq_arabic2english_model.keras")

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 453ms/step - accuracy: 0.6898 - loss: 1.5874 - val_accuracy: 0.6009 - val_loss: 1.6918
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 580ms/step - accuracy: 0.7249 - loss: 1.0829 - val_accuracy: 0.6007 - val_loss: 1.5348
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 466ms/step - accuracy: 0.7296 - loss: 1.0130 - val_accuracy: 0.5997 - val_loss: 1.4514
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 449ms/step - accuracy: 0.7453 - loss: 0.9413 - val_accuracy: 0.6287 - val_loss: 1.3569
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 446ms/step - accuracy: 0.7577 - loss: 0.9071 - val_accuracy: 0.6543 - val_loss: 1.2825
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 447ms/step - accuracy: 0.7644 - loss: 0.8665 - val_accuracy: 0.6553 - val_loss: 1.2690
Epoch 7/10

# **Run inference (Sampling)**


In [49]:
# Define sampling models
# Restore the model and construct the encoder and decoder.
model = keras.models.load_model("seq2seq_arabic2english_model.keras")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [50]:
for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: hi
Decoded sentence: أنا توم المارة

-
Input sentence: run
Decoded sentence: أنا توم المارة

-
Input sentence: duck
Decoded sentence: أنا توم المارة

-
Input sentence: duck
Decoded sentence: أنا توم المارة

-
Input sentence: duck
Decoded sentence: أنا توم المارة

-
Input sentence: help
Decoded sentence: أنا أن أن أن أن أن المار

-
Input sentence: jump
Decoded sentence: أنا توم المارة

-
Input sentence: stop
Decoded sentence: أنا توم المارة

-
Input sentence: stop
Decoded sentence: أنا توم المارة

-
Input sentence: wait
Decoded sentence: أنا توم المارة

-
Input sentence: go on
Decoded sentence: أنا توم المارة

-
Input sentence: go on
Decoded sentence: أنا توم المارة

-
Input sentence: hello
Decoded sentence: أنا أن أن أن أن أن المار

-
Input sentence: hello
Decoded sentence: أنا أن أن أن أن أن المار

-
Input sentence: hello
Decoded sentence: أنا أن أن أن أن أن المار

-
Input sentence: hurry
Decoded sentence: أنا أن أن أن أن أن المار

-
Input sentence: hurry
Decoded sen