In [None]:
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
!pip install transformers
from transformers import TFAutoModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Location of the tab-separated English/Arabic parallel corpus.
path_to_data = '/content/ara_eng.txt'

# Read the whole corpus into memory. The context manager closes the file
# handle even if reading raises (for example on a decoding error).
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()


In [None]:
# Split the corpus into lines without rebinding `raw_data`, so that this cell
# can be re-run without failing on a list that has no `.split` method.
corpus_lines = raw_data.split('\n')

# Every line is expected to hold tab-separated fields.
all_pairs = [line.split('\t') for line in corpus_lines]

# Keep the window of lines 1000..10999 and drop rows that lack a second field
# (such as a blank trailing line); those rows would raise IndexError as soon as
# the second column is read.
pairs = [pair for pair in all_pairs[1000:11000] if len(pair) >= 2]




In [None]:
import string

def clean_sentence(sentence, extra_punctuation="\u00a1\u00bf\u061f\u060c\u061b"):
    """Lower-case a sentence and remove punctuation characters from it.

    Args:
        sentence: The text to normalise.
        extra_punctuation: Characters removed in addition to
            ``string.punctuation``. The default covers the inverted Spanish
            marks (``¡``, ``¿``) and the Arabic question mark (U+061F),
            comma (U+060C) and semicolon (U+061B), none of which are part of
            ``string.punctuation``.

    Returns:
        The lower-cased sentence with every listed punctuation character
        deleted. Whitespace is left untouched.
    """
    # A single translate() pass deletes every listed character at once instead
    # of rescanning the sentence once per punctuation character.
    removal_table = str.maketrans('', '', string.punctuation + extra_punctuation)
    return sentence.lower().translate(removal_table)


In [None]:
def tokenize(sentences):
    """Fit a fresh ``Tokenizer`` on ``sentences`` and encode them with it.

    Args:
        sentences: Texts used both to fit the tokenizer and to be encoded.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer


In [None]:
# Column 0 of every pair is the English sentence and column 1 the Arabic one;
# both are normalised with clean_sentence before tokenization.
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
arabic_sentences = [clean_sentence(pair[1]) for pair in pairs]


In [None]:
# Fit one tokenizer per language on the cleaned sentences. Each call returns
# the encoded sequences together with the fitted tokenizer.
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
ara_text_tokenized, ara_text_tokenizer = tokenize(arabic_sentences)


In [None]:
# Vocabulary sizes. Adding 1 to the number of known words reserves index 0 for
# the special token (presumably the padding value — confirm).
english_vocab = len(eng_text_tokenizer.word_index) + 1
arabic_vocab = len(ara_text_tokenizer.word_index) + 1



In [None]:
# Display the English vocabulary size (word count plus the reserved index 0).
english_vocab


4086

In [None]:
# Display the Arabic vocabulary size (word count plus the reserved index 0).
arabic_vocab

11891

In [None]:
# Length of the longest tokenized sentence in each language; these values are
# used as the target length when padding.
max_english_len = max(len(sequence) for sequence in eng_text_tokenized)
max_arabic_len = max(len(sequence) for sequence in ara_text_tokenized)


In [None]:
# Post-pad every sequence up to the longest sentence of its own language.
eng_pad_sentence = pad_sequences(
    eng_text_tokenized,
    maxlen=max_english_len,
    padding="post",
)
ara_pad_sentence = pad_sequences(
    ara_text_tokenized,
    maxlen=max_arabic_len,
    padding="post",
)


In [None]:
# Add a third dimension of size 1 to the padded English and Arabic arrays.
# The rank check keeps this cell idempotent: without it, every re-run would
# append yet another trailing axis to arrays that were already reshaped.
if eng_pad_sentence.ndim == 2:
    eng_pad_sentence = eng_pad_sentence[..., np.newaxis]
if ara_pad_sentence.ndim == 2:
    ara_pad_sentence = ara_pad_sentence[..., np.newaxis]



In [None]:
# Encoder input layer: one int32 token id per position, max_english_len positions.
input_sequence = Input(shape=(max_english_len,), dtype='int32')

# Embedding layer applied to the encoder input; it maps each token id to a
# 128-dimensional dense vector.
encoder_embedding_layer = Embedding(input_dim=english_vocab, output_dim=128)
embedding = encoder_embedding_layer(input_sequence)

^^^^^^^
`output_dim=128`: the `output_dim` parameter sets the dimensionality of the dense embedding vectors. Here the embedding layer produces a dense vector of size 128 for each word in the input sequence, so every word in the vocabulary is represented by a vector of length 128.
A value of 128 (or something similar) is often chosen as a practical balance between expressiveness and model size.


**BERT model**


In [None]:
from transformers import TFAutoModel
# Load the pretrained "bert-base-multilingual-cased" checkpoint as a TensorFlow model.
transformer_model = TFAutoModel.from_pretrained("bert-base-multilingual-cased")

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Run the encoder input through the pretrained transformer and keep the first
# element of what it returns.
# NOTE(review): `encoder_outputs` is rebound to the LSTM encoder's output in the
# cell that builds the LSTMs, so this transformer output does not appear to
# feed the final model — confirm whether that is intended.
# NOTE(review): `input_sequence` is fed with ids produced by the Keras
# Tokenizer, not by the checkpoint's own tokenizer — confirm the ids are
# meaningful to the transformer.
encoder_outputs = transformer_model(input_sequence)[0]
# Decoder input layer: one int32 token id per position, max_arabic_len positions.
decoder_inputs = Input(shape=(max_arabic_len,), dtype='int32')

In [None]:
# Embedding layer for the decoder inputs: maps each Arabic token id to a
# 128-dimensional dense vector.
decoder_embedding = Embedding(input_dim=arabic_vocab, output_dim=128)(decoder_inputs)

# Encoder LSTM with 64 units, applied to the English embedding. It returns its
# output sequence plus two state tensors, bound here to state_h and state_c.
# NOTE(review): this rebinds `encoder_outputs`, replacing the value computed
# from the transformer earlier — confirm the transformer is meant to be unused.
encoder_lstm = LSTM(64, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(embedding)

# Decoder LSTM with 64 units, started from the encoder's two state tensors.
# Only its output sequence is kept; its own returned states are discarded.
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

In [None]:
# Dense (fully connected) layer for the decoder output: a softmax over
# arabic_vocab units, applied to the decoder LSTM's output sequence.
decoder_dense = Dense(arabic_vocab, activation='softmax')
output = decoder_dense(decoder_outputs)


In [None]:

# Assemble the model: encoder and decoder inputs in, softmax output out.
enc_dec_model = Model([input_sequence, decoder_inputs], output)

# optimizer Adam with a learning rate of 0.001
optimizer = Adam(learning_rate=0.001)


# Compile with sparse categorical cross-entropy as the loss, track accuracy,
# and print the layer summary.
enc_dec_model.compile(optimizer=optimizer, loss=sparse_categorical_crossentropy, metrics=['accuracy'])
enc_dec_model.summary()


# Number of full batches in the padded English array; floor division drops any
# trailing partial batch.
batch_size = 32
num_batches = len(eng_pad_sentence) // batch_size

# Define the validation set: the last val_size rows of each padded array.
# NOTE(review): these rows come from the same arrays that num_batches spans, so
# a loop over all num_batches batches also covers the validation rows — confirm
# that the training code excludes them.
val_size = 1000
eng_pad_val = eng_pad_sentence[-val_size:]
ara_pad_val = ara_pad_sentence[-val_size:]

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 17)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 128)      523008      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 17, 128)      1522048     ['input_2[0][0]']                
                                                                                              

In [None]:
num_epochs = 15

# Train only on the rows that precede the held-out validation slice (the last
# `val_size` rows). Iterating over every row would also train on the samples
# that are used for validation below.
num_train_batches = (len(eng_pad_sentence) - val_size) // batch_size
if num_train_batches <= 0:
    raise ValueError("Not enough samples left for training after holding out the validation set")

for epoch in range(num_epochs):
    # Reset the running sums at the start of every epoch so the reported
    # averages describe this epoch only instead of growing across epochs.
    total_loss = 0
    total_accuracy = 0

    for batch in range(num_train_batches):
        start_index = batch * batch_size
        end_index = start_index + batch_size
        eng_batch = eng_pad_sentence[start_index:end_index]
        ara_batch = ara_pad_sentence[start_index:end_index]

        # NOTE(review): the decoder input and the training target are the same
        # array (no one-step shift between them) — confirm this is intended.
        loss, accuracy = enc_dec_model.train_on_batch([eng_batch, ara_batch], ara_batch)

        total_loss += loss
        total_accuracy += accuracy

    # Mean of the per-batch metrics over this epoch's training batches.
    avg_loss = total_loss / num_train_batches
    avg_accuracy = total_accuracy / num_train_batches

    # Validation loss and accuracy
    val_loss, val_accuracy = enc_dec_model.evaluate([eng_pad_val, ara_pad_val], ara_pad_val, verbose=0)

    print("Epoch: {}/{} - Avg. Loss: {:.4f} - Avg. Accuracy: {:.4f} - Val Loss: {:.4f} - Val Accuracy: {:.4f}".format(
        epoch + 1, num_epochs, avg_loss, avg_accuracy, val_loss, val_accuracy))


Epoch: 1/15 - Avg. Loss: 3.3035 - Avg. Accuracy: 0.7275 - Val Loss: 3.6027 - Val Accuracy: 0.5478


In [None]:
# convert logits to a sentence
def logits_to_sentence(logits, tokenizer):
    """Turn per-position scores into a space-separated string of words.

    Args:
        logits: Two-dimensional scores; the arg-max is taken along axis 1, so
            each row yields one predicted index.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The words for the predicted indices joined by single spaces. Index 0
        is rendered as an empty string, so it still contributes a separator.
    """
    vocabulary = {}
    for word, idx in tokenizer.word_index.items():
        vocabulary[idx] = word
    # Index 0 maps to the empty string.
    vocabulary[0] = ''

    predicted_ids = np.argmax(logits, axis=1)
    return ' '.join(vocabulary[token_id] for token_id in predicted_ids)