# Main Library

In [1]:
# Reading data
import pandas as pd
import numpy as np
import os, nltk

# Data Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Building Deep Learning Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Input
from tensorflow.keras.models import Model, load_model
import tensorflow.keras as k

# Reading Data

In [2]:
# Tab-separated sentence pairs. The file's third column is read under the
# placeholder name 'none' and discarded straight away.
path = r'D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\Data.txt'
data = (
    pd.read_csv(path, sep='\t', names=['english', 'arabic', 'none'])
    .drop(columns='none')
    .iloc[:4000]  # work with the first 4000 pairs only
)
data.head()

Unnamed: 0,english,arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Duck!,اخفض رأسك!
3,Duck!,اخفضي رأسك!
4,Duck!,اخفضوا رؤوسكم!


In [3]:
# Number of missing values in each column.
data.isna().sum()

english    0
arabic     0
dtype: int64

In [4]:
# Dimensions of `data` as (rows, columns).
data.shape

(4000, 2)

# Data Preprocessing

## Encoder Input: English Sentences

In [5]:
# Fit a tokenizer on the English column and turn every sentence into a list of word ids.
english_text = data['english'].tolist()

token = Tokenizer()
token.fit_on_texts(english_text)
english_seq = token.texts_to_sequences(english_text)

# The longest tokenised sentence determines the padded width.
max_eng = max(len(sen) for sen in english_seq)
print('The MAx Length of English Sentence is ---> ', max_eng)

# Right-pad every sequence to `max_eng` ids.
english_pad = pad_sequences(english_seq, maxlen=max_eng, padding='post')
english_pad_array = np.array(english_pad)
print('The Shape of English padding is ---> ', english_pad_array.shape)

# word -> id table; the size used for the embedding is one larger than the
# number of distinct words (presumably to leave id 0 for padding — confirm).
english_dict = token.word_index
num_eng_words = 1 + len(english_dict)
print("The Number of English Word is --> ", num_eng_words)

The MAx Length of English Sentence is --->  7
The Shape of English padding is --->  (4000, 7)
The Number of English Word is -->  1643


# Decoder Input: Arabic Text

In [6]:
# Wrap every Arabic sentence in start/end markers for the decoder.
# NOTE(review): the markers are glued to the text without spaces. This appears to
# rely on the tokenizer treating '<' and '>' as separators so that "start" and
# "end" become tokens of their own (the inference code looks up
# arabic_dict["start"] and compares against "end") — confirm.
arabic_text = ['<START>' + text + '<END>' for text in data.arabic]

# A separate tokenizer for Arabic. This rebinds `token`; the English vocabulary
# remains reachable through `english_dict`.
token = Tokenizer()
token.fit_on_texts(arabic_text)
arabic_seq = token.texts_to_sequences(arabic_text)

# The longest tokenised sentence (markers included) determines the padded width.
max_ar = max(len(sen) for sen in arabic_seq)
print('The MAx Length of Arabic Sentence is ---> ', max_ar)

# Right-pad every sequence to `max_ar` ids.
arabic_pad = pad_sequences(arabic_seq, padding='post', maxlen=max_ar)
arabic_pad_array = np.array(arabic_pad)

print('The Shape of Arabic padding is ---> ', arabic_pad_array.shape)

# word -> id table; the size used for the embedding and the softmax layer is one
# larger than the number of distinct words.
arabic_dict = token.word_index
num_ar_words = len(arabic_dict) + 1

print("The Number of Arabic Word is --> ", num_ar_words)

The MAx Length of English Sentence is --->  10
The Shape of English padding is --->  (4000, 10)
The Number of English Word is -->  3996


# Decoder Output: Arabic Targets

In [7]:
# Decoder targets are the decoder inputs shifted one step to the left: dropping
# the first id of each sequence means position t of the target holds the token
# that follows position t of the input.
decoder_output_data = [seq[1:] for seq in arabic_seq]

# Pad back to `max_ar` so targets line up with the padded decoder inputs.
decoder_padding = pad_sequences(decoder_output_data, padding='post', maxlen=max_ar)

# One-hot encode over the Arabic vocabulary. to_categorical already returns a
# NumPy array, so it is used directly; wrapping it in np.array() would only
# create a second full copy of this (sentences, max_ar, num_ar_words) tensor.
decoder_output_data = to_categorical(decoder_padding, num_ar_words)

# Building Encoder Decoder Model

In [8]:
# Encoder: English word ids -> 256-dim embeddings -> LSTM with 128 units.
# Variable-length input (shape=(None,)).

encoder_input = Input(shape=(None, ))
encoder_embeding = Embedding(num_eng_words, 256, mask_zero=True)(encoder_input)
# return_state=True exposes the final hidden and cell states; encoder_output
# itself is not used further in this cell.
encoder_output, state_h, state_c = LSTM(128, return_state=True)(encoder_embeding)
# The two final states are what the decoder is initialised with.
encoder_states = [state_h, state_c]

# Decoder: Arabic word ids -> 256-dim embeddings -> LSTM with 128 units -> softmax
# over the Arabic vocabulary.
decoder_input = Input(shape=(None, ))
decoder_embeding = Embedding(num_ar_words, 256, mask_zero=True)(decoder_input)
# The LSTM and Dense layers are kept as named objects because
# make_refrece_model() calls them again to build the inference decoder.
# The 128 units must match the Input(shape=(128, )) state placeholders there.
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
# During training the decoder's own returned states are discarded.
decoder_output, _, _ = decoder_lstm(decoder_embeding, initial_state=encoder_states)
decoder_dence = Dense(num_ar_words, activation='softmax')
output = decoder_dence(decoder_output)

# Training model: [encoder ids, decoder ids] -> per-step distribution over Arabic words.
model = Model([encoder_input, decoder_input], output)
# categorical_crossentropy is used because the targets are one-hot encoded
# with to_categorical.
model.compile(optimizer=k.optimizers.RMSprop(), loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    420608      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    1022976     ['input_2[0][0]']                
                                                                                              

In [9]:
# Train on (English ids, Arabic ids) -> one-hot Arabic targets shifted by one step.
model.fit(
    x=[english_pad_array, arabic_pad_array],
    y=decoder_output_data,
    batch_size=300,
    epochs=200,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155

Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x2369380b9c8>

In [14]:
def make_refrece_model():
    """Build the two inference-time models from the layers of the training graph.

    Returns:
        tuple: ``(encoder_model, decoder_model)``.
            The encoder model maps ``encoder_input`` to ``encoder_states``.
            The decoder model maps ``[decoder_input, state_h, state_c]`` to
            ``[softmax output, new state_h, new state_c]``.
    """
    encoder_only = Model(encoder_input, encoder_states)

    # Placeholders through which the caller feeds the recurrent state at each
    # decoding step (128 matches the LSTM size used when the model was built).
    h_in = Input(shape=(128, ))
    c_in = Input(shape=(128, ))
    state_inputs = [h_in, c_in]

    # Re-apply the existing decoder layers, this time seeded from the
    # placeholders and with the updated states kept.
    lstm_seq, h_out, c_out = decoder_lstm(decoder_embeding, initial_state=state_inputs)
    word_probabilities = decoder_dence(lstm_seq)

    decoder_only = Model(
        [decoder_input, h_in, c_in],
        [word_probabilities, h_out, c_out],
    )
    return encoder_only, decoder_only

In [25]:
def str_to_token(sen):
    """Convert a raw English sentence into a padded row of vocabulary ids.

    The sentence is lower-cased and stripped of punctuation before lookup, so
    that input such as "Hi." can match the punctuation-free keys of
    ``english_dict``. Words missing from the vocabulary are reported and
    skipped; they no longer restart the interactive loop.

    Args:
        sen (str): English sentence typed by the user.

    Returns:
        numpy.ndarray: array of shape ``(1, max_eng)`` holding the ids of the
        known words, right-padded by ``pad_sequences``.
    """
    # Characters the Keras Tokenizer removes by default. english_dict was built
    # by a default Tokenizer, so the same clean-up is applied here.
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    cleaned = sen.lower().translate(str.maketrans(filters, ' ' * len(filters)))

    token_list = []
    unknown_words = []
    for word in cleaned.split():
        index = english_dict.get(word)
        if index is None:
            unknown_words.append(word)
        else:
            token_list.append(index)

    if unknown_words:
        print("Ignoring words that are not in the vocabulary: " + ", ".join(unknown_words))

    return pad_sequences([token_list], maxlen=max_eng, padding="post")

In [26]:
# Build the inference models and persist each one to its own directory.
enc_model, dec_model = make_refrece_model()
enc_model.save(r"D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\encoder_model")
# The decoder directory must receive the decoder model, not a second copy of the encoder.
dec_model.save(r"D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\decoder_model")





INFO:tensorflow:Assets written to: D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\encoder_model\assets


INFO:tensorflow:Assets written to: D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\encoder_model\assets






INFO:tensorflow:Assets written to: D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\decoder_model\assets


INFO:tensorflow:Assets written to: D:\Courses language programming\LLM - Transformer - NLP\NLP - Complete Course\Data\ara-eng\decoder_model\assets


In [31]:
def run():
    """Interactively translate English sentences typed by the user into Arabic.

    For each sentence, the encoder model produces the initial LSTM states and
    the decoder model is then stepped one token at a time, always feeding back
    the most probable word (greedy decoding), until the "end" token is produced
    or ``max_ar + 1`` steps have been taken. The translation is printed.

    The loop prompts at most ``english_pad_array.shape[0]`` times; entering a
    blank line stops it early.
    """
    enc_model, dec_model = make_refrece_model()

    # id -> word table built once, instead of scanning arabic_dict for every
    # decoded token.
    index_to_word = {index: word for word, index in arabic_dict.items()}
    start_index = arabic_dict["start"]

    for _ in range(english_pad_array.shape[0]):
        sentence = input("Enter an English Sentence:  ")
        if not sentence.strip():
            # Nothing to translate: treat a blank line as "quit".
            break

        states_values = enc_model.predict(str_to_token(sentence))

        # The decoder is fed a single token per step, starting with "start".
        empty_target = np.zeros((1, 1))
        empty_target[0, 0] = start_index

        decoded_words = []
        # Bound the loop by step count rather than by the number of emitted
        # words: an id with no vocabulary entry (e.g. 0) emits nothing, so a
        # word-count guard alone could loop forever.
        for _step in range(max_ar + 1):
            dec_output, h, c = dec_model.predict([empty_target] + states_values)
            sampled_word_index = int(np.argmax(dec_output[0, -1, :]))
            sampled_word = index_to_word.get(sampled_word_index)

            if sampled_word == "end":
                break
            if sampled_word is not None:
                decoded_words.append(sampled_word)

            # Feed the chosen token and the updated states into the next step.
            empty_target = np.zeros((1, 1))
            empty_target[0, 0] = sampled_word_index
            states_values = [h, c]

        # The end marker is never added, so no slicing is needed to remove it.
        print(" ".join(decoded_words))

In [33]:
# Start the interactive translation session; run() waits on input() for each sentence.
run()