# Import Libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, SimpleRNN, GRU
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from keras.models import load_model

# Load Datasets

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train = pd.read_csv('/content/drive/MyDrive/nlp_project/data_cleaning/preprocessed_train.csv')
train.drop(columns='Unnamed: 0', inplace=True)
train

Unnamed: 0,text,label
0,فين الذكور يا رجالة,LB
1,هه انت تشجع فنادي مش حاضرهم شايلين شامبيون اسي...,LY
2,فكرة حلوة و ممكن رسالة ع الموبايل,EG
3,ياا يالميدان كنت فين من زماان يارب كملها على خ...,EG
4,هما اعلامي التوك شو راحوا فين كله اعادة او مش ...,EG
...,...,...
118175,والله تذكرت يوم مات الحسن التاني كان من اول ال...,MA
118176,ممكن بس حد يدلني هو محماا علي ده ازاي حصل علي ...,EG
118177,ساكت ليه يا مجدي قوول وفضفض,EG
118178,سامعة الصراخ اللي طالع من جوه متخافيش دول صحاب...,EG


In [5]:
test = pd.read_csv('/content/drive/MyDrive/nlp_project/data_cleaning/preprocessed_test.csv')
test.drop(columns='Unnamed: 0', inplace=True)
test

Unnamed: 0,text,label
0,احنا بيقنا الصبح استاذ مجدي يومك بيضحك,EG
1,يا مشحبطيني يا اني,LB
2,زي النهارده السادات كان يشعر بالحرب مع مبارك و...,EG
3,عطاهم عصيير في كاس كبيير,MA
4,ولا ما سافل وحقير الا انتم عايزين الراجل يتسجن...,EG
...,...,...
29540,بعلمك فيه أزمة بعد كام ساعة بيتفقوا شوو هالمسخ...,LB
29541,حاجات ممكن تغيب عنك بسيطة وسهلة معناها لكن باز...,LY
29542,وبعدين ليا هلبة وقت ممشيتش لهون,LY
29543,فنان هايل وكوميدي من الدرجة الاولي وطبيعي جدا ...,EG


# Tokenization

In [7]:
# Convert all entries in the 'text' column to strings
train['text'] = train['text'].astype(str)

# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['text'])

# Convert text data to sequences of indices
train_sequences = tokenizer.texts_to_sequences(train['text'])
test_sequences = tokenizer.texts_to_sequences(test['text'])

# Adjust indices to start from 1
for sequences in [train_sequences, test_sequences]:
    for sequence in sequences:
        for i in range(len(sequence)):
            sequence[i] += 1

# saving tokenizer

In [13]:
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

In [14]:
# Load the tokenizer from the JSON file
with open('tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()

# Convert JSON string back to tokenizer configuration
tokenizer = tokenizer_from_json(tokenizer_json)

# finding the max length for padding

In [15]:
max_sequence_len = 0
for sentence in train_sequences:
    max_sequence_len = max(len(sentence), max_sequence_len)
print(max_sequence_len)

67


In [16]:
max_sequence_len = 0
for sentence in test_sequences:
    max_sequence_len = max(len(sentence), max_sequence_len)
print(max_sequence_len)

61


In [17]:
max_sequence_length = 67
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Label Encoding

In [19]:
encoder = LabelEncoder()
y_encoded_train = encoder.fit_transform(train['label'])
y_encoded_train = to_categorical(y_encoded_train)
y_encoded_test = encoder.fit_transform(test['label'])
y_encoded_test = to_categorical(y_encoded_test)

In [None]:
y_encoded_train

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

# finding model input

In [20]:
num_words = len(tokenizer.word_index)
num_words

202304

# Building GRU Model

In [None]:
gru_bi = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_words, 128, input_length=max_sequence_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
gru_bi.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', 'precision', 'recall', 'f1_score'])

# Training GRU Model

In [None]:
gru_bi.fit(train_padded, y_encoded_train, epochs=10, batch_size=128,
                    validation_data=(test_padded, y_encoded_test),
                    validation_steps=30)

Epoch 1/10
[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 35ms/step - accuracy: 0.6675 - f1_score: 0.5671 - loss: 0.8760 - precision: 0.7959 - recall: 0.5244 - val_accuracy: 0.8276 - val_f1_score: 0.7959 - val_loss: 0.4955 - val_precision: 0.8699 - val_recall: 0.7906
Epoch 2/10
[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 34ms/step - accuracy: 0.9303 - f1_score: 0.9146 - loss: 0.2192 - precision: 0.9464 - recall: 0.9174 - val_accuracy: 0.8151 - val_f1_score: 0.7725 - val_loss: 0.5581 - val_precision: 0.8432 - val_recall: 0.7995
Epoch 3/10
[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 34ms/step - accuracy: 0.9738 - f1_score: 0.9691 - loss: 0.0892 - precision: 0.9786 - recall: 0.9701 - val_accuracy: 0.8273 - val_f1_score: 0.7843 - val_loss: 0.6041 - val_precision: 0.8452 - val_recall: 0.8146
Epoch 4/10
[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 34ms/step - accuracy: 0.9855 - f1_score: 0.9835 - loss: 0.

  self.gen.throw(typ, value, traceback)


[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 34ms/step - accuracy: 0.9961 - f1_score: 0.9955 - loss: 0.0118 - precision: 0.9969 - recall: 0.9957 - val_accuracy: 0.7943 - val_f1_score: 0.7585 - val_loss: 1.1829 - val_precision: 0.7998 - val_recall: 0.7906
Epoch 10/10
[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 34ms/step - accuracy: 0.9963 - f1_score: 0.9957 - loss: 0.0105 - precision: 0.9972 - recall: 0.9956 - val_accuracy: 0.7977 - val_f1_score: 0.7567 - val_loss: 1.3230 - val_precision: 0.8040 - val_recall: 0.7945


<keras.src.callbacks.history.History at 0x7d9f14387ee0>

# Evaluating GRU Model

In [None]:
evaluation = gru_bi.evaluate(test_padded, y_encoded_test)

print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])
print("Test Precision:", evaluation[2])
print("Test Recall:", evaluation[3])
print("Test F1 Score:", evaluation[4])

[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.7974 - f1_score: 0.7600 - loss: 1.2394 - precision: 0.8042 - recall: 0.7943
Test Loss: 1.2023732662200928
Test Accuracy: 0.8017261624336243
Test Precision: 0.80796217918396
Test Recall: 0.7988830804824829
Test F1 Score: tf.Tensor([0.857251   0.81870186 0.7840827  0.7087182  0.64971346], shape=(5,), dtype=float32)


# Saving GRU Model

In [None]:
gru_bi.save("GRU_bi_tokenizer.h5")

In [None]:
loaded_model = load_model("GRU_bi_tokenizer.h5")

# Testing GRU Model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tnkeeh as tn
import re
from keras.models import load_model

tokenizer = Tokenizer()
train['text'] = train['text'].astype(str)
tokenizer.fit_on_texts(train['text'])

encoder = LabelEncoder()
y_encoded_train = encoder.fit_transform(train['label'])

loaded_model = load_model("/content/drive/MyDrive/nlp_project/dl_models/model_lstm_tokenizer.h5")

def predict_label(text, model, tokenizer, max_sequence_length):

    # data preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Convert the input text to a sequence of indices
    sequence = tokenizer.texts_to_sequences([text])

    # Adjust indices to start from 1
    for i in range(len(sequence[0])):
        sequence[0][i] += 1

    # Pad the sequence to the maximum length
    padded_sequence = pad_sequences(sequence, maxlen=67, padding='post')

    # Predict the label
    prediction = model.predict(padded_sequence)

    # Get the label with the highest probability
    predicted_label_index = np.argmax(prediction, axis=1)

    # Decode the label back to its original form
    predicted_label = encoder.inverse_transform(predicted_label_index)

    return predicted_label[0]

# Example usage
text_to_predict = ""
predicted_label = predict_label(text_to_predict,loaded_model, tokenizer, encoder)
print("Predicted Label:", predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step
Predicted Label: SD


# Building LSTM Model

In [None]:
lstm_bi = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_words, 64, input_length=max_sequence_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
lstm_bi.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', 'precision', 'recall', 'f1_score'])

# Training LSTM Model

In [None]:
history = lstm_bi.fit(train_padded, y_encoded_train, epochs=10, batch_size=64,
                    validation_data=(test_padded, y_encoded_test),
                    validation_steps=30)

Epoch 1/10
[1m1847/1847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 24ms/step - accuracy: 0.6786 - f1_score: 0.5819 - loss: 0.8416 - precision: 0.8089 - recall: 0.5462 - val_accuracy: 0.8125 - val_f1_score: 0.7869 - val_loss: 0.5428 - val_precision: 0.8617 - val_recall: 0.7594
Epoch 2/10
[1m1847/1847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.9261 - f1_score: 0.9078 - loss: 0.2301 - precision: 0.9448 - recall: 0.9110 - val_accuracy: 0.8484 - val_f1_score: 0.8212 - val_loss: 0.4738 - val_precision: 0.8712 - val_recall: 0.8172
Epoch 3/10
[1m1847/1847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.9692 - f1_score: 0.9635 - loss: 0.1007 - precision: 0.9750 - recall: 0.9638 - val_accuracy: 0.8073 - val_f1_score: 0.7698 - val_loss: 0.6513 - val_precision: 0.8321 - val_recall: 0.7875
Epoch 4/10
[1m1847/1847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.9823 - f1_score: 0.9792 - 

# Evaluating LSTM Model

In [None]:
evaluation = lstm_bi.evaluate(test_padded, y_encoded_test)

print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])
print("Test Precision:", evaluation[2])
print("Test Recall:", evaluation[3])
print("Test F1 Score:", evaluation[4])

[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.7940 - f1_score: 0.7566 - loss: 1.2619 - precision: 0.7996 - recall: 0.7915
Test Loss: 1.2180308103561401
Test Accuracy: 0.8011846542358398
Test Precision: 0.8060216903686523
Test Recall: 0.7982738018035889
Test F1 Score: tf.Tensor([0.85552084 0.827953   0.78626895 0.702851   0.6433416 ], shape=(5,), dtype=float32)


# Testing LSTM Model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tnkeeh as tn
import re
from keras.models import load_model

tokenizer = Tokenizer()
train['text'] = train['text'].astype(str)
tokenizer.fit_on_texts(train['text'])

encoder = LabelEncoder()
y_encoded_train = encoder.fit_transform(train['label'])

loaded_model = load_model("/content/drive/MyDrive/nlp_project/dl_models/model_lstm_tokenizer.h5")

def predict_label(text, model, tokenizer, max_sequence_length):

    # data preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Convert the input text to a sequence of indices
    sequence = tokenizer.texts_to_sequences([text])

    # Adjust indices to start from 1
    for i in range(len(sequence[0])):
        sequence[0][i] += 1

    # Pad the sequence to the maximum length
    padded_sequence = pad_sequences(sequence, maxlen=67, padding='post')

    # Predict the label
    prediction = model.predict(padded_sequence)

    # Get the label with the highest probability
    predicted_label_index = np.argmax(prediction, axis=1)

    # Decode the label back to its original form
    predicted_label = encoder.inverse_transform(predicted_label_index)

    return predicted_label[0]

# Example usage
text_to_predict = "يازول"
predicted_label = predict_label(text_to_predict,loaded_model, tokenizer, encoder)
print("Predicted Label:", predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step
Predicted Label: SD
