## Machine translation from english to arabic Project

## Import Libraries

In [4]:
# prompt: install dependencies for datasets to use load_dataset

# ## Machine translation from english to arabic Project
!pip install datasets transformers[sentencepiece] sacrebleu





In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
from nltk.translate.bleu_score import corpus_bleu
from datasets import load_dataset

## Search for Data

In [5]:
# Loading the parallel-sentences-global-voices dataset from Hugging Face
dataset = load_dataset("sentence-transformers/parallel-sentences-global-voices", name="en-ar", split="train", trust_remote_code=True)

# Extract English and Arabic sentences
data = {
    'english': [example['english'] for example in dataset],
    'arabic': [example['non_english'] for example in dataset]
}
df = pd.DataFrame(data)
data = df[['english', 'arabic']].dropna()
data['english'] = data['english'].astype(str).str.strip()
data['arabic'] = data['arabic'].astype(str).str.strip()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Perform Preprocessing

Tokenizing and padding the sentences, adding start/end tokens for Arabic

In [6]:
# Add start and end tokens to Arabic sentences
data['arabic'] = data['arabic'].apply(lambda x: '<start> ' + x + ' <end>')

# Limit to 10000 samples for computational efficiency, or use all if fewer
data = data.sample(n=min(10000, len(data)), random_state=42).reset_index(drop=True)


In [7]:
def tokenize_and_pad(texts, max_len=None):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded, tokenizer, max_len

In [8]:
# Tokenize and pad English and Arabic sentences
eng_padded, eng_tokenizer, eng_max_len = tokenize_and_pad(data['english'])
ar_padded, ar_tokenizer, ar_max_len = tokenize_and_pad(data['arabic'])

In [9]:
# Split data into train, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(
    eng_padded, ar_padded, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

## Build The Model

Defining custom attention mechanisms and Transformer components

In [10]:
class CustomMultiHeadAttention(Layer):
    def __init__(self, embed_dim, num_heads):
        super(CustomMultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.depth = embed_dim // num_heads

        self.wq = Dense(embed_dim)
        self.wk = Dense(embed_dim)
        self.wv = Dense(embed_dim)
        self.dense = Dense(embed_dim)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, value, key=None, training=None):
        if key is None:
            key = value
        batch_size = tf.shape(query)[0]

        q = self.wq(query)  # (batch_size, seq_len, embed_dim)
        k = self.wk(key)    # (batch_size, seq_len, embed_dim)
        v = self.wv(value)  # (batch_size, seq_len, embed_dim)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len, depth)

        scaled_attention = tf.matmul(q, k, transpose_b=True)  # (batch_size, num_heads, seq_len, seq_len)
        scaled_attention = scaled_attention / tf.math.sqrt(tf.cast(self.depth, tf.float32))
        attention_weights = tf.nn.softmax(scaled_attention, axis=-1)

        output = tf.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len, depth)
        output = tf.transpose(output, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, depth)
        output = tf.reshape(output, (batch_size, -1, self.embed_dim))  # (batch_size, seq_len, embed_dim)
        output = self.dense(output)
        return output

In [11]:
class AdditiveAttention(Layer):
    def __init__(self, units):
        super(AdditiveAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query: (batch_size, dec_seq_len, embed_dim)
        # values: (batch_size, enc_seq_len, embed_dim)

        # Expand dimensions for broadcasting
        query_exp = tf.expand_dims(query, 2)  # (batch_size, dec_seq_len, 1, embed_dim)
        values_exp = tf.expand_dims(values, 1)  # (batch_size, 1, enc_seq_len, embed_dim)

        # Compute score
        score = self.V(tf.nn.tanh(self.W1(query_exp) + self.W2(values_exp)))  # (batch_size, dec_seq_len, enc_seq_len, 1)
        score = tf.squeeze(score, axis=-1)  # (batch_size, dec_seq_len, enc_seq_len)

        # Compute attention weights
        attention_weights = tf.nn.softmax(score, axis=-1)  # (batch_size, dec_seq_len, enc_seq_len)

        # Compute context vector
        context_vector = tf.matmul(attention_weights, values)  # (batch_size, dec_seq_len, embed_dim)

        return context_vector, attention_weights

In [12]:
class Encoder(Layer):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, attention_type='multihead', rate=0.1):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.attention_type = attention_type
        if attention_type == 'multihead':
            self.attention = CustomMultiHeadAttention(embed_dim, num_heads)
        else:
            self.attention = AdditiveAttention(embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        emb = self.embedding(inputs)  # (batch_size, enc_seq_len, embed_dim)
        if self.attention_type == 'multihead':
            attn_output = self.attention(emb, emb)  # (batch_size, enc_seq_len, embed_dim)
        else:
            attn_output, _ = self.attention(emb, emb)  # (batch_size, enc_seq_len, embed_dim)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(emb + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [13]:
class Decoder(Layer):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, attention_type='multihead', rate=0.1):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.attention_type = attention_type
        if attention_type == 'multihead':
            self.self_attention = CustomMultiHeadAttention(embed_dim, num_heads)
            self.enc_attention = CustomMultiHeadAttention(embed_dim, num_heads)
        else:
            self.self_attention = AdditiveAttention(embed_dim)
            self.enc_attention = AdditiveAttention(embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, inputs, enc_output, training):
        emb = self.embedding(inputs)  # (batch_size, dec_seq_len, embed_dim)
        if self.attention_type == 'multihead':
            self_attn_output = self.self_attention(emb, emb)  # (batch_size, dec_seq_len, embed_dim)
        else:
            self_attn_output, _ = self.self_attention(emb, emb)  # (batch_size, dec_seq_len, embed_dim)
        self_attn_output = self.dropout1(self_attn_output, training=training)
        out1 = self.layernorm1(emb + self_attn_output)

        if self.attention_type == 'multihead':
            enc_attn_output = self.enc_attention(out1, enc_output)  # (batch_size, dec_seq_len, embed_dim)
        else:
            enc_attn_output, _ = self.enc_attention(out1, enc_output)  # (batch_size, dec_seq_len, embed_dim)
        enc_attn_output = self.dropout2(enc_attn_output, training=training)
        out2 = self.layernorm2(out1 + enc_attn_output)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)

In [14]:
def build_transformer_model(vocab_size_enc, vocab_size_dec, max_len_enc, max_len_dec, attention_type='multihead', embed_dim=256, num_heads=8, ff_dim=512, training=None):
    encoder_inputs = Input(shape=(max_len_enc,))
    decoder_inputs = Input(shape=(max_len_dec,))

    encoder = Encoder(vocab_size_enc, embed_dim, num_heads, ff_dim, attention_type)
    enc_output = encoder(encoder_inputs, training=training)

    decoder = Decoder(vocab_size_dec, embed_dim, num_heads, ff_dim, attention_type)
    dec_output = decoder(decoder_inputs, enc_output, training=training)

    outputs = Dense(vocab_size_dec, activation='softmax')(dec_output)
    model = Model([encoder_inputs, decoder_inputs], outputs)
    return model

## Tune Parameters and Compare Attention Mechanisms

Training models with MultiHead and Additive attention, using early stopping

In [15]:
attention_types = ['multihead', 'additive']
bleu_scores = {}
models = {}

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [16]:
# Callback to evaluate and print test loss and accuracy per epoch
test_callback = LambdaCallback(
    on_epoch_end=lambda epoch, logs: print(f"\nTest Loss: {model.evaluate([X_test, y_test[:, :-1]], y_test[:, 1:, np.newaxis], verbose=0)[0]:.4f}, "
                                          f"Test Accuracy: {model.evaluate([X_test, y_test[:, :-1]], y_test[:, 1:, np.newaxis], verbose=0)[1]:.4f}")
)

In [None]:
for attention_type in attention_types:
    print(f"\nTraining model with {attention_type} attention")
    model = build_transformer_model(
        vocab_size_enc=len(eng_tokenizer.word_index) + 1,
        vocab_size_dec=len(ar_tokenizer.word_index) + 1,
        max_len_enc=eng_max_len,
        max_len_dec=ar_max_len,
        attention_type=attention_type
    )
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Pad the decoder inputs to match max_len_dec
    dec_input_train = pad_sequences(y_train[:, :-1], maxlen=ar_max_len, padding='post')
    dec_input_val = pad_sequences(y_val[:, :-1], maxlen=ar_max_len, padding='post')

    # Pad the target data to match max_len_dec
    target_train = pad_sequences(y_train[:, 1:], maxlen=ar_max_len, padding='post', value=0)
    target_val = pad_sequences(y_val[:, 1:], maxlen=ar_max_len, padding='post', value=0)

    # Train the model with callbacks for test evaluation
    history = model.fit(
        [X_train, dec_input_train], target_train[:, :, np.newaxis],
        validation_data=([X_val, dec_input_val], target_val[:, :, np.newaxis]),
        epochs=10,
        batch_size=64,
        callbacks=[early_stopping, test_callback],
        verbose=1
    )
    models[attention_type] = model


Training model with multihead attention
Epoch 1/20
[1m  9/100[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:30:37[0m 60s/step - accuracy: 0.6329 - loss: 9.6615

## Evaluate The Model

Evaluating both models using BLEU score on test data

In [None]:
def translate_sentence(model, sentence, eng_tokenizer, ar_tokenizer, eng_max_len, ar_max_len):
    seq = eng_tokenizer.texts_to_sequences([sentence])
    enc_input = pad_sequences(seq, maxlen=eng_max_len, padding='post')

    dec_input = np.zeros((1, ar_max_len))
    dec_input[0, 0] = ar_tokenizer.word_index['<start>']

    for i in range(1, ar_max_len):
        pred = model.predict([enc_input, dec_input], verbose=0)
        next_token = np.argmax(pred[0, i-1, :])
        dec_input[0, i] = next_token
        if next_token == ar_tokenizer.word_index['<end>']:
            break

    ar_words = []
    for idx in dec_input[0]:
        if idx == 0 or idx == ar_tokenizer.word_index['<start>'] or idx == ar_tokenizer.word_index['<end>']:
            continue
        word = ar_tokenizer.index_word.get(idx, '')
        if word:
            ar_words.append(word)
    return ' '.join(ar_words)

In [24]:
# Calculate BLEU scores for both models
for attention_type in attention_types:
    model = models[attention_type]
    references = [[ar_tokenizer.sequences_to_texts([y_test[i]])[0].replace('<start>', '').replace('<end>', '').strip().split()] for i in range(len(y_test))]
    candidates = []
    for i in range(len(X_test)):
        eng_sentence = eng_tokenizer.sequences_to_texts([X_test[i]])[0]
        pred_sentence = translate_sentence(model, eng_sentence, eng_tokenizer, ar_tokenizer, eng_max_len, ar_max_len)
        candidates.append(pred_sentence.split())

    bleu_score = corpus_bleu(references, candidates)
    bleu_scores[attention_type] = bleu_score
    print(f'BLEU Score on Test Set ({attention_type} attention): {bleu_score:.4f}')

KeyError: 'multihead'

In [25]:
# Compare attention mechanisms
print("\nAttention Mechanism Comparison:")
for attention_type, score in bleu_scores.items():
    print(f"{attention_type.capitalize()} Attention BLEU Score: {score:.4f}")


Attention Mechanism Comparison:


## Add Test Data with Notebook

Testing with sample sentences using the better-performing model

In [None]:
best_attention = max(bleu_scores, key=bleu_scores.get)
best_model = models[best_attention]
print(f"\nUsing {best_attention} attention model for test translations")

In [None]:
test_sentences = [
    "I love to read books.",
    "The weather is nice today.",
    "Can you help me?",
    "This is a beautiful house."
]


In [None]:
print("\nTest Translations:")
for sentence in test_sentences:
    translation = translate_sentence(best_model, sentence, eng_tokenizer, ar_tokenizer, eng_max_len, ar_max_len)
    print(f"English: {sentence}")
    print(f"Arabic: {translation}\n")