<a href="https://colab.research.google.com/github/Rakitin11/assignment9/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, dot, concatenate, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import requests


In [2]:
# Load dataset from GitHub directly
url = "https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt"
response = requests.get(url)
lines = response.text.strip().split('\n')

arabic_sentences = []
english_sentences = []

for line in lines:
    if '\t' in line:
        eng, ara = line.split('\t')
        arabic_sentences.append(ara)
        english_sentences.append(f"<start> {eng} <end>")


In [3]:
def tokenize_and_pad(sentences, maxlen=None):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(sentences)
    tensor = tokenizer.texts_to_sequences(sentences)
    tensor = pad_sequences(tensor, padding='post', maxlen=maxlen)
    return tensor, tokenizer

input_tensor, inp_tokenizer = tokenize_and_pad(arabic_sentences)
target_tensor, targ_tokenizer = tokenize_and_pad(english_sentences)

input_vocab_size = len(inp_tokenizer.word_index) + 1
target_vocab_size = len(targ_tokenizer.word_index) + 1


In [5]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2)
embedding_dim = 256
units = 512

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_emb = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(units, return_sequences=True, return_state=True)(encoder_emb)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

# Attention mechanism
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax')(attention)
context = dot([attention, encoder_outputs], axes=[2, 1])
decoder_combined_context = concatenate([context, decoder_outputs])

# Output layer
output = Dense(target_vocab_size, activation='softmax')(decoder_combined_context)

# Model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [6]:
# Shift the target sequences for training the decoder
decoder_target_data = np.zeros_like(target_tensor_train)
decoder_target_data[:, :-1] = target_tensor_train[:, 1:]
decoder_target_data[:, -1] = 0


In [8]:
model.fit([input_tensor_train, target_tensor_train],
          np.expand_dims(decoder_target_data, -1),
          batch_size=64,
          epochs=5,
          validation_split=0.2)


Epoch 1/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m711s[0m 7s/step - accuracy: 0.8253 - loss: 2.0345 - val_accuracy: 0.8488 - val_loss: 1.0796
Epoch 2/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m752s[0m 7s/step - accuracy: 0.8475 - loss: 1.0490 - val_accuracy: 0.8534 - val_loss: 1.0123
Epoch 3/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m715s[0m 7s/step - accuracy: 0.8507 - loss: 0.9762 - val_accuracy: 0.8570 - val_loss: 0.9647
Epoch 4/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m737s[0m 7s/step - accuracy: 0.8575 - loss: 0.8903 - val_accuracy: 0.8585 - val_loss: 0.9391
Epoch 5/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 7s/step - accuracy: 0.8596 - loss: 0.8323 - val_accuracy: 0.8615 - val_loss: 0.9166


<keras.src.callbacks.history.History at 0x7d8eb7211050>