# Import Libraries

In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences




# Load corpus

### Load Positives

In [3]:
train_tweets, train_labels = [], []

pos = os.getcwd() + '/corpus/arabic_tweets/pos/'  # Replace with the actual directory path

# Iterate over each file in the directory
for filename in os.listdir(pos):
    if filename.endswith('.txt'):  # Select only text files
        file_path = os.path.join(pos, filename)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            file_content = file.read()
            train_tweets.append(file_content)
            train_labels.append("positive")

### Load Negatives

In [4]:
# Get the txt file negative tweet
pos = os.getcwd() + '/corpus/arabic_tweets/neg/'  # Replace with the actual directory path

# Iterate over each file in the directory
for filename in os.listdir(pos):
    if filename.endswith('.txt'):  # Select only text files
        file_path = os.path.join(pos, filename)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            file_content = file.read()
            train_tweets.append(file_content)
            train_labels.append("negative")

### Build a dataframe

In [5]:
train_dic = {
    'Tweets' : train_tweets,
    'Labels' : train_labels
}

train_corpus = pd.DataFrame(train_dic)
train_corpus.head()

Unnamed: 0,Tweets,Labels
0,نحن الذين يتحول كل ما نود أن نقوله إلى دعاء لل...,positive
1,وفي النهاية لن يبقىٰ معك آحدإلا من رأىٰ الجمال...,positive
2,نمش ننوم ما دا ديل ولادنا 💚\n,positive
3,تعدل النت وشفتها ✌\n,positive
4,"🎥 المهمة الأولى في ""جدة"" ✔💪🏼 💙 #الهلال #فيديو_...",positive


# EDA

##### Explore your dataset

In [6]:
# Write your code ^_^
train_corpus.tail()

Unnamed: 0,Tweets,Labels
58746,#أمي فقيدتي وأن مرت الأيام.. وبدأ الجميع بنسيا...,negative
58747,مره في السنه ما كل اسبوع عاد 😢\n,negative
58748,#يوم_الجمعه اسال الله عز وجل في هذا اليوم الفض...,negative
58749,يعني الغاء العقود الاولي كانت تسكيته لنا شسالف...,negative
58750,الفار 🐀 في عهد خليل جلال 😲\n,negative


In [7]:
train_corpus.describe().T

Unnamed: 0,count,unique,top,freq
Tweets,58751,36723,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتوي...,479
Labels,58751,2,positive,29849


In [8]:
train_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58751 entries, 0 to 58750
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweets  58751 non-null  object
 1   Labels  58751 non-null  object
dtypes: object(2)
memory usage: 918.1+ KB


# Data Preprocessing

### Shuffle all rows

In [9]:
# Write your code ^_^
df = train_corpus.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,Tweets,Labels
0,صباحك ورد 🌹\n,positive
1,اهاا كنت احسبك اتحادي 😂\n,positive
2,#الهلال_الاهلي اسئل الله لكل من ساعدني برتويت ...,negative
3,قوتنا بعد الله دائما أنتم🌹💙 الف مبرووك 💙 الحمد...,positive
4,جزائريون ضد التطبيع من مسيرات جمعة البارحة معا...,negative
...,...,...
58746,احد يفهمني شصاير هنا 😭\n,negative
58747,ارجووكم اتوجهو للقياده في تمثيليه بتاعت ضرب رص...,negative
58748,يارب فرج همومنا وارح قلوبنا واغفر ذنوبنا واشف ...,positive
58749,#ساعه_استجابه اللهم طمانينه ونورا فب قلوبنا ال...,positive


### Data cleaning

**Hint: remove URLs, Hashtags, alphanumeric characters, punctuation marks, stop words, extra spaces**

In [10]:
URL_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
hashtag_pattern = r"#\w+"
mention_pattern = r"@\w+"
alphanumeric_pattern = r"\w*\d\w*"
punctuation_pattern = r"[^\w\s]"
retweet_pattern = r"^RT[\s]+"

In [11]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
    return frozenset(stop_set)

def process_text(text, stop_words):
    # Remove URLs
    text = re.sub(URL_pattern, '', text)
    
    # Remove hashtags
    text = re.sub(hashtag_pattern, '', text)
    
    # Remove mention
    text = re.sub(mention_pattern, '', text)

    # Remove alphanumeric characters
    text = re.sub(alphanumeric_pattern, '', text)

    # Remove punctuation marks
    text = re.sub(punctuation_pattern, '', text)
    
    # Remove Retweet marks
    text = re.sub(retweet_pattern, '', text)

    # Remove stop words using the provided set
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    text = ' '.join(text.split())
    return text

#### Now Clean your text using above function or implement it from scrach

In [12]:
df.head()

Unnamed: 0,Tweets,Labels
0,صباحك ورد 🌹\n,positive
1,اهاا كنت احسبك اتحادي 😂\n,positive
2,#الهلال_الاهلي اسئل الله لكل من ساعدني برتويت ...,negative
3,قوتنا بعد الله دائما أنتم🌹💙 الف مبرووك 💙 الحمد...,positive
4,جزائريون ضد التطبيع من مسيرات جمعة البارحة معا...,negative


In [13]:
# Write your code ^_^
stop=load_stopwords(R'C:\Users\Admin\Desktop\BootCamp\RNN_Lab1\corpus\Stop_Words.txt')
for i in range(len(df['Tweets'])):
    df['Tweets'][i] = process_text(df['Tweets'][i], stop)
df

Unnamed: 0,Tweets,Labels
0,صباحك ورد,positive
1,اهاا كنت احسبك اتحادي,positive
2,اسئل الله لكل ساعدني برتويت الله يفرجها فرج يت...,negative
3,قوتنا الله دائما أنتم مبرووك الحمد لله الشكر,positive
4,جزائريون التطبيع مسيرات جمعة البارحة معا للتحر...,negative
...,...,...
58746,يفهمني شصاير هنا,negative
58747,ارجووكم اتوجهو للقياده تمثيليه بتاعت ضرب رصاص ...,negative
58748,يارب فرج همومنا وارح قلوبنا واغفر ذنوبنا واشف ...,positive
58749,اللهم طمانينه ونورا فب قلوبنا اللهم ازهر ضفاف ...,positive


#### Extra: you could do stemming or lemmatization before training

# Wrong Split

In [None]:
X = df.drop(columns=['Labels'])
y = df.drop(columns=['Tweets'])

In [None]:
# Write your code ^_^
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [None]:
X_train

# Tokenizer

In [14]:
corpus = df["Tweets"].astype(str).tolist()
len(corpus)

58751

In [15]:
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(corpus) 
word_index = tokenizer.word_index

# Text to sequence

In [16]:
# Write your code ^_^
sequences = tokenizer.texts_to_sequences(corpus)
num_classes = len(tokenizer.word_index) + 1

print("Total number of words: ", num_classes)

Total number of words:  77383


In [17]:
input_sequences = []
labels = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        n_gram_sequence = sequence[:i+1]
        input_sequences.append(n_gram_sequence[:-1])
        labels.append(n_gram_sequence[-1])

# Pad sequence

In [18]:
maxSeq = max([len(seq) for seq in input_sequences])
pad_seq = pad_sequences(input_sequences, maxlen=maxSeq,padding='pre')

___

# Split data to train and test

In [19]:
split_ratio = 0.8 
split_index = int(split_ratio * len(input_sequences))
x_train, y_train = input_sequences[:split_index], labels[:split_index]
x_test, y_test = input_sequences[split_index:], labels[split_index:] # 20 for the test

# RNN Model

In [20]:
modelRNN = Sequential()
modelRNN.add(Embedding(input_dim = num_classes, output_dim = 100,input_length=maxSeq))
modelRNN.add(SimpleRNN(100,return_sequences =True))
modelRNN.add(SimpleRNN(100))
modelRNN.add(Dense(units = num_classes, activation = "softmax"))

modelRNN.compile(
    optimizer = "adam",loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)





# LSTM Model

In [21]:
modelLSTM = Sequential()
modelLSTM.add(Embedding(input_dim = num_classes, output_dim = 100,input_length=maxSeq))
modelLSTM.add(SimpleRNN(100,return_sequences =True))
modelLSTM.add(SimpleRNN(100))
modelLSTM.add(Dense(units = num_classes, activation = "softmax"))

modelLSTM.compile(
    optimizer = "adam",loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [22]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, tokenizer, sequences, labels, batch_size, max_sequence_length, num_classes):
        self.tokenizer = tokenizer
        self.sequences = sequences
        self.labels = labels
        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.num_classes = num_classes

    def __len__(self):
        return len(self.sequences) // self.batch_size

    def __getitem__(self, index):
        batch_indices = np.random.choice(len(self.sequences), size=self.batch_size, replace=False)
        batch_sequences = [self.sequences[i] for i in batch_indices]
        batch_labels = [self.labels[i] for i in batch_indices]
        x = pad_sequences(batch_sequences, maxlen=self.max_sequence_length)
        y = self.one_hot_encode(batch_labels)

        return x, y

    def one_hot_encode(self, labels):
        encoded_labels = np.zeros((len(labels), self.num_classes), dtype=np.float32)
        for i, label in enumerate(labels):
            encoded_labels[i, label] = 1.0
        return encoded_labels

In [23]:
train_data_generator = DataGenerator(tokenizer, x_train, y_train, 64, maxSeq, num_classes)
test_data_generator = DataGenerator(tokenizer, x_test, y_test, 64, maxSeq, num_classes)

# Evaulation and Comparsion

___

- trying to run on my gpu but it does not detect the gpu

In [24]:
import tensorflow as tf

# Check for GPU availability
if tf.test.gpu_device_name():
    print('GPU is available')
else:
    print('GPU is NOT available')

# Explicitly set GPU device
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)


GPU is NOT available


In [25]:
with tf.device("/gpu:0"):
    modelRNN.fit(train_data_generator,epochs=10,batch_size=64)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
with tf.device("/gpu:0"):
    modelLSTM.fit(train_data_generator,epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
import pickle
pickle_out = open('modelRNN1.pkl','wb')
pickle.dump(modelRNN,pickle_out)
pickle_out.close()


In [30]:
pickle_out = open('modelLSTM1.pkl','wb')
pickle.dump(modelLSTM,pickle_out)
pickle_out.close()


In [32]:
from tensorflow import keras

In [33]:
keras.models.save_model(modelRNN, "modelRNN.h5")

In [34]:
keras.models.save_model(modelLSTM, "modelLSTM.h5")

In [27]:
loss, accuracy = modelRNN.evaluate(test_data_generator)
print("Loss:", loss)
print("Accuracy:", accuracy)

Loss: 6.32572078704834
Accuracy: 0.4853692352771759


In [28]:
loss, accuracy = modelLSTM.evaluate(test_data_generator)
print("Loss:", loss)
print("Accuracy:", accuracy)

Loss: 6.315305233001709
Accuracy: 0.4855680763721466


| EVA\Model | RNN | LSTM |
|----------|----------|----------|
| Accuracy | 48.54 | 48.56 |
| Loss | 6.326 | 6.315 |

