# Import Libraries

In [23]:
import os
import re

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder


# Load corpus

### Load Positives

In [2]:
train_tweets, train_labels = [], []

pos = os.getcwd() + '/corpus/arabic_tweets/pos/'  # Replace with the actual directory path

# Iterate over each file in the directory
for filename in os.listdir(pos):
    if filename.endswith('.txt'):  # Select only text files
        file_path = os.path.join(pos, filename)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            file_content = file.read()
            train_tweets.append(file_content)
            train_labels.append("positive")

### Load Negatives

In [3]:
# Get the txt file negative tweet
pos = os.getcwd() + '/corpus/arabic_tweets/neg/'  # Replace with the actual directory path

# Iterate over each file in the directory
for filename in os.listdir(pos):
    if filename.endswith('.txt'):  # Select only text files
        file_path = os.path.join(pos, filename)
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            file_content = file.read()
            train_tweets.append(file_content)
            train_labels.append("negative")

### Build a dataframe

In [10]:
train_dic = {
    'Tweets' : train_tweets,
    'Labels' : train_labels
}

train_corpus = pd.DataFrame(train_dic)
train_corpus.head()

Unnamed: 0,Tweets,Labels
0,نحن الذين يتحول كل ما نود أن نقوله إلى دعاء لل...,positive
1,وفي النهاية لن يبقىٰ معك آحدإلا من رأىٰ الجمال...,positive
2,نمش ننوم ما دا ديل ولادنا 💚\n,positive
3,تعدل النت وشفتها ✌\n,positive
4,"🎥 المهمة الأولى في ""جدة"" ✔💪🏼 💙 #الهلال #فيديو_...",positive


# EDA

##### Explore your dataset

In [5]:
train_corpus.head(10)

Unnamed: 0,Tweets,Labels
0,نحن الذين يتحول كل ما نود أن نقوله إلى دعاء لل...,positive
1,وفي النهاية لن يبقىٰ معك آحدإلا من رأىٰ الجمال...,positive
2,نمش ننوم ما دا ديل ولادنا 💚\n,positive
3,تعدل النت وشفتها ✌\n,positive
4,"🎥 المهمة الأولى في ""جدة"" ✔💪🏼 💙 #الهلال #فيديو_...",positive
5,اللهم صيبآ نافعآ 🌹\n,positive
6,وضع الدوري هالسنه 😁\n,positive
7,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتوي...,positive
8,مفيش غيرك انتى و هو و عمالين نلف حوالين بعض 😂\n,positive
9,الله يخليكك مبحبش اكدب انا 😂 😂\n,positive


# Data Preprocessing

### Shuffle all rows

In [6]:
df=train_corpus.sample(frac = 1)
df.head()

Unnamed: 0,Tweets,Labels
57520,مو كل اصابيعك سوه ص(☑)ح ! 😟\n,negative
19240,كأنك مختار ايام دوامي 😏\n,positive
30259,الف مبرووك لكل عشاق الملكي 👏🏻👏🏻💙💙💪🏻 الحمدلله ع...,negative
55921,و صحي من اول منبه\n,negative
56779,"""تلاتين سنة بترقص .. الليلة رقصتنا"" أنا ببكي 😭...",negative


### Data cleaning

**Hint: remove URLs, Hashtags, alphanumeric characters, punctuation marks, stop words, extra spaces**

In [11]:
URL_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
hashtag_pattern = r"#\w+"
mention_pattern = r"@\w+"
alphanumeric_pattern = r"\w*\d\w*"
punctuation_pattern = r"[^\w\s]"
retweet_pattern = r"^RT[\s]+"

In [28]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
    return frozenset(stop_set)

def process_text(text, stop_words):
    # Remove URLs
    text = re.sub(URL_pattern, '', text)
    
    # Remove hashtags
    text = re.sub(hashtag_pattern, '', text)
    
    # Remove mention
    text = re.sub(mention_pattern, '', text)

    # Remove alphanumeric characters
    text = re.sub(alphanumeric_pattern, '', text)

    # Remove punctuation marks
    text = re.sub(punctuation_pattern, '', text)
    
    # Remove Retweet marks
    text = re.sub(retweet_pattern, '', text)

    # Remove stop words using the provided set
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    text = ' '.join(text.split())
    return text

#### Now Clean your text using above function or implement it from scrach

In [14]:
def load_stopwords(Stop_Words):
    with open(Stop_Words, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
    return frozenset(stop_set)

def process_text(text, stop_words):
    
    text = re.sub(URL_pattern, '', text)
    
    
    text = re.sub(hashtag_pattern, '', text)

    
    text = re.sub(mention_pattern, '', text)

    
    text = re.sub(alphanumeric_pattern, '', text)

    
    text = re.sub(punctuation_pattern, '', text)
    
    
    text = re.sub(retweet_pattern, '', text)

    
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    text = ' '.join(text.split())
    return text

In [15]:
stop = load_stopwords('corpus/Stop_Words.txt')

In [16]:
for i in range(train_corpus.shape[0]):
    train_corpus.loc[i,'Tweets'] = process_text(train_corpus.loc[i,'Tweets'],stop)

In [17]:
train_corpus.reset_index(inplace=True,drop=True)
train_corpus

Unnamed: 0,Tweets,Labels
0,نحن يتحول نود أن نقوله إلى دعاء لله تبحثوا فين...,positive
1,النهاية يبقى معك آحدإلا رأى الجمال روحك أماالم...,positive
2,نمش ننوم دا ديل ولادنا,positive
3,تعدل النت وشفتها,positive
4,المهمة الأولى جدة,positive
...,...,...
58746,فقيدتي وأن مرت الأيام وبدأ الجميع بنسيانك لاتق...,negative
58747,مره السنه اسبوع عاد,negative
58748,اسال الله عز وجل الفضيل ينصر جنودنا البواسل وي...,negative
58749,يعني الغاء العقود الاولي تسكيته لنا شسالفه احن...,negative


In [18]:
train_corpus.loc[:,'Tweets']


0        نحن يتحول نود أن نقوله إلى دعاء لله تبحثوا فين...
1        النهاية يبقى معك آحدإلا رأى الجمال روحك أماالم...
2                                   نمش ننوم دا ديل ولادنا
3                                         تعدل النت وشفتها
4                                        المهمة الأولى جدة
                               ...                        
58746    فقيدتي وأن مرت الأيام وبدأ الجميع بنسيانك لاتق...
58747                                  مره السنه اسبوع عاد
58748    اسال الله عز وجل الفضيل ينصر جنودنا البواسل وي...
58749    يعني الغاء العقود الاولي تسكيته لنا شسالفه احن...
58750                                  الفار عهد خليل جلال
Name: Tweets, Length: 58751, dtype: object

#### Extra: you could do stemming or lemmatization before training

# Split data to train and test

In [19]:
split_ratio = 0.8 # 80% for the train
split_index = int(split_ratio * len(train_corpus))
x_train, y_train = train_corpus.loc[:split_index,'Tweets'], train_corpus.loc[:split_index,'Labels']
x_test, y_test = train_corpus.loc[split_index:,'Tweets'], train_corpus.loc[split_index:,'Labels']

# Tokenizer

In [20]:
token = Tokenizer(oov_token='<OOV>')
token.fit_on_texts(x_train)
word_index = token.word_index

# Text to sequence

In [21]:
sequ = token.texts_to_sequences(x_train)


# Pad sequence

In [24]:
max_sequ = max(len(seq) for seq in sequ)
paded = pad_sequences(sequ,maxlen=max_sequ,padding='post')

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# RNN Model

In [25]:
model = Sequential()
model.add(Embedding(input_dim=len(token.word_index) + 1, output_dim=100, input_length=max_sequ))
model.add(SimpleRNN(units=100))
model.add(Dense(units=1, activation='sigmoid'))




In [26]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [27]:
model.fit(paded,y_train, epochs=1 ,batch_size=128)






<keras.src.callbacks.History at 0x1d14ae262d0>

# LSTM Model

In [28]:
mmodel = Sequential()
mmodel.add(Embedding(input_dim=len(token.word_index) + 1, output_dim=100, input_length=max_sequ))
mmodel.add(LSTM(units=100))
mmodel.add(Dense(units=1, activation='sigmoid'))

In [29]:
mmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [30]:
mmodel.fit(paded,y_train, epochs=1 ,batch_size=128)




<keras.src.callbacks.History at 0x1d1568318d0>

# Evaulation and Comparsion

In [31]:
token = Tokenizer(oov_token='<OOV>')
token.fit_on_texts(x_test)
word_index = token.word_index

sequ = token.texts_to_sequences(x_test)

paded = pad_sequences(sequ,maxlen=max_sequ,padding='post')
label_encoder = LabelEncoder()
y_test = label_encoder.fit_transform(y_test)
model.evaluate(x=paded,y=y_test)




[1.179868221282959, 0.26814737915992737]

In [32]:
mmodel.evaluate(paded,y_test)




[2.591974973678589, 0.1557314246892929]