<a href="https://colab.research.google.com/github/RubaHesham11/machine-translation/blob/main/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the repository that holds the parallel corpus file (ara_.txt) loaded by
# the preprocessing cell, which reads it from
# /content/nmt-with-attention-for-ar-to-en/ — i.e. this is expected to run with
# /content as the working directory.
!git clone https://github.com/SamirMoustafa/nmt-with-attention-for-ar-to-en.git

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the tab-separated parallel corpus. Field 0 of each line is used as the
# source sentence and field 1 as the target sentence.
# NOTE(review): which language sits in which column is not visible from this
# cell — confirm against ara_.txt before relying on the source/target naming.
with open('/content/nmt-with-attention-for-ar-to-en/ara_.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split data into source and target sentences. Blank or malformed lines (fewer
# than two tab-separated fields) are skipped rather than raising IndexError.
source_sentences = []
target_sentences = []
for line in lines:
    parts = line.strip().split('\t')
    if len(parts) < 2:
        continue
    source_sentences.append(parts[0])
    target_sentences.append(parts[1])

if not source_sentences:
    # Fail early with a clear message instead of an opaque max() error below.
    raise ValueError('No tab-separated sentence pairs were found in ara_.txt')

# Tokenize the sentences and convert them to sequences of integers
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_sentences)

# The target filter set deliberately omits '<' and '>' so that '<start>' is
# kept as a single token instead of being stripped down to 'start'.
target_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
target_sentences_with_start = ['<start> ' + text for text in target_sentences]  # Add '<start>' to the beginning of each sentence
target_tokenizer.fit_on_texts(target_sentences_with_start)

# fit_on_texts has already assigned '<start>' an id and built a matching
# index_word table, so it must NOT be re-registered by hand: doing so moved
# word_index['<start>'] to an id that has no index_word entry and overwrote the
# index_word entry of the last real word, corrupting id -> word decoding.
if '<start>' not in target_tokenizer.word_index:
    raise ValueError("'<start>' token is missing from the target vocabulary")

# Id 0 is never assigned to a word by Tokenizer and is what pad_sequences fills
# with. The explicit '<pad>' entry is kept so existing lookups of
# word_index['<pad>'] keep working; note it adds one entry to len(word_index).
target_tokenizer.word_index['<pad>'] = 0

# NOTE(review): no end-of-sentence token is appended to the targets, so the
# decoder is never trained to emit an explicit stop symbol — confirm this is
# intended for whatever inference loop consumes the model.
source_sequences = source_tokenizer.texts_to_sequences(source_sentences)
target_sequences = target_tokenizer.texts_to_sequences(target_sentences_with_start)

# Pad sequences (with trailing zeros) to the longest sentence on each side
max_source_length = max(len(seq) for seq in source_sequences)
max_target_length = max(len(seq) for seq in target_sequences)
padded_source_sequences = pad_sequences(source_sequences, maxlen=max_source_length, padding='post')
padded_target_sequences = pad_sequences(target_sequences, maxlen=max_target_length, padding='post')

# Create numpy arrays for training data. The decoder input and the decoder
# target are the same padded matrix offset by one column, so the target at
# position t is the token that follows the decoder input at position t.
encoder_input_data = np.array(padded_source_sequences)
decoder_input_data = np.array(padded_target_sequences[:, :-1])  # Drop the last column
decoder_target_data = np.array(padded_target_sequences[:, 1:])   # Drop the first column ('<start>')


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs as a test set. The three arrays are passed
# in a single call so each is split with the same row selection, and
# random_state pins the shuffle so the split is repeatable.
(
    encoder_input_train, encoder_input_test,
    decoder_input_train, decoder_input_test,
    decoder_target_train, decoder_target_test,
) = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every resulting array (train arrays first, then test)
for _label, _split in (
    ("Shape of encoder input train set:", encoder_input_train),
    ("Shape of decoder input train set:", decoder_input_train),
    ("Shape of decoder target train set:", decoder_target_train),
    ("Shape of encoder input test set:", encoder_input_test),
    ("Shape of decoder input test set:", decoder_input_test),
    ("Shape of decoder target test set:", decoder_target_test),
):
    print(_label, _split.shape)


In [None]:
# Experimental seq2seq model: bidirectional-LSTM encoder, LSTM decoder, and dot-product attention

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Attention, Dropout

# Define model hyperparameters
embedding_dim = 256
hidden_units = 1024
source_vocab_size = len(source_tokenizer.word_index) + 1
# Size the target vocabulary from the largest id actually in use (+1 for id 0)
# rather than from len(word_index): the target word_index is edited by hand
# upstream, so its length is not guaranteed to track the highest id.
target_vocab_size = max(target_tokenizer.word_index.values()) + 1
dropout_rate = 0.2  # Adjust as needed

# Encoder: token ids -> embeddings -> bidirectional LSTM.
# mask_zero=True marks padding id 0 as masked so the recurrent layers skip the
# padded timesteps and the returned states come from the last real token.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(source_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
# Join the forward and backward final states into one 2 * hidden_units state
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# Decoder: sized 2 * hidden_units so it can be initialised directly from the
# concatenated encoder states. Padding is masked here as well.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(target_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(hidden_units*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention mechanism: decoder outputs are the queries, encoder outputs the
# values. With the masks propagated from the embeddings, padded encoder
# positions are excluded from the attention weights.
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])

# Concatenate context vector and decoder output
decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, context_vector])

# Apply dropout for regularization
decoder_dropout = Dropout(dropout_rate)
decoder_outputs = decoder_dropout(decoder_combined_context)

# Output layer: per-timestep probability distribution over the target vocabulary
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
# NOTE(review): whether the propagated mask is also applied to the compiled
# loss/metrics depends on the installed Keras version — verify before comparing
# accuracy figures with runs made without masking.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [None]:
# Configure training: Adam optimiser, cross-entropy over integer token ids, and
# accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training split. The model receives the source ids together with
# the decoder input ids and is scored against the decoder target ids;
# validation_split sets aside 20% of the training arrays for validation.
history = model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the single metric configured at compile time.
loss, accuracy = model.evaluate(
    x=[encoder_input_test, decoder_input_test],
    y=decoder_target_test,
)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.214596152305603
Test Accuracy: 0.9015691876411438
