<a href="https://colab.research.google.com/github/OtmaneDaoudi/Arabic-sentiment-analysis/blob/main/arabic_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [1]:
# !pip install emoji
# !pip install Arabic-Stopwords
# !pip install seaborn
# !pip install matplotlib
# !pip install soyclustering

# Libs imports

In [2]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import gensim
import emoji
import arabicstopwords.arabicstopwords as stp
import tqdm
import os

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from collections import defaultdict
from math import log

from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

nltk.download('stopwords')

SEED = 21

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing data

In [3]:
if not os.path.exists("preprocessed_data.csv"):
    data = pd.read_csv("./datasets/ASTD/Tweets.txt", header = None, sep = "\t", names = ["tweet", "class"])
else:
    data = pd.read_csv("preprocessed_data.csv", header = 0, names = ["tweet", "class"])
data.head(20)

Unnamed: 0,tweet,class
0,استقالة المحكمة رئيس الدستورية نتظر السود القضاء,OBJ
1,جمال الدين القيادي بمناسبة صدور أولى الدكتور ا...,POS
2,عصام العريان واشنطن بامريكا يستقوى برادعي يرسل...,NEG
3,استقصائي فيلم حرية لأول كواليس حقيقة أسرار وال...,OBJ
4,الفسق الشيشان اقولها بخاطري وكيلك الحن والفجور...,NEUTRAL
5,رئيسى انتخبوا العرص البرص مرسى,NEUTRAL
6,ستريكر صريح يتقال امير كاريوكي السكة عيد فعلا,POS
7,الفتيات شاب ستستمر غلطة حب كبير قصة واحدة احكي...,OBJ
8,اعداد الأربعاء ادعوكم التفاصيل مركز لحضور القا...,OBJ
9,جماعة السابق الحياة الماضي السياسية وضعها شبرا...,NEG


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9683 entries, 0 to 9693
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   9683 non-null   object
 1   class   9683 non-null   object
dtypes: object(2)
memory usage: 226.9+ KB


# Data preprocessing

Our preprocessing pipeline contains the following steps:

1.  Remove duplicat entries
2.  Replacing emojies & emoticons
3.  Remove mentions
4.  Remove Links
5.  Remove whitespaces
6.  Remove punctuations & Special chars
7.  Remove Consecutive characters
8.  Tokenization
9.  Remove foreign words
10. Remove stop words
11. Remove numbers
12. Stemming


## Removing duplicates

In [5]:
count = data.duplicated().sum()
print(f"{(count / data.shape[0]) * 100:.1f}% of the data are duplicats")

2.7% of the data are duplicats


In [6]:
if not os.path.exists("preprocessed_data.csv"):
    data.drop_duplicates(inplace = True)

## Replacing emojies

In [7]:
emojis = {
    "🙂":"يبتسم",
    "😂":"يضحك",
    "🤣" : "يضحك",
    "💔":"قلب حزين",
    "🙂":"يبتسم",
    "❤️":"حب",
    "🥰":"حب",
    "🤐":"سكوت",
    "🧡":"حب",
    "❤":"حب",
    "😍":"حب",
    "😭":"يبكي",
    "🤭":"يبتسم",
    "😢":"حزن",
    "😔":"حزن",
    "♥":"حب",
    "💜":"حب",
    "😅":"يضحك",
    "🙁":"حزين",
    "💕":"حب",
    "💙":"حب",
    "😞":"حزين",
    "😊":"سعادة",
    "👏":"يصفق",
    "👌":"احسنت",
    "😴":"ينام",
    "😀":"يضحك",
    "✅":"صحيح",
    "🤪":"يضحك",
    "🏡" : "بيت",
    "🤲" : "دعاء",
    "💰" : "مال",
    "😌":"حزين",
    "🎁":"هدية",
    "🌹":"وردة",
    "🥀":"وردة",
    "📿":"وردة",
    "✍":"كتابة",
    "🙈":"حب",
    "😄":"يضحك",
    "😐":"محايد",
    "✌":"منتصر",
    "✨":"نجمه",
    "🤔":"تفكير",
    "😏":"يستهزء",
    "😒":"يستهزء",
    "🙄":"ملل",
    "😕":"عصبية",
    "😃":"يضحك",
    "🌸":"وردة",
    "😓":"حزن",
    "💞":"حب",
    "💗":"حب",
    "😑":"منزعج",
    "💭":"تفكير",
    "😎":"ثقة",
    "💛":"حب",
    "😩":"حزين",
    "🥺":"حزين",
    "💪":"عضلات",
    "👍":"موافق",
    "🙏🏻":"رجاء طلب",
    "😳":"مصدوم",
    "👏🏼":"تصفيق",
    "🎶":"موسيقي",
    "🌚":"صمت",
    "💚":"حب",
    "🙏":"رجاء طلب",
    "💘":"حب",
    "🍃":"سلام",
    "☺":"يضحك",
    "🎊":"يهنئ",
    "💥":"إنفجار",
    "😝":"يسخر",
    "💯":"تمام",
    "🐸":"ضفدع",
    "🤦‍♂️":"غبي",
    "🤩":"معجب",
    "🤤":"جائع",
    "😶":"مصدوم",
    "✌️":"مرح",
    "✋🏻":"توقف",
    "😉":"غمزة",
    "🌷":"حب",
    "🙃":"مبتسم",
    "😫":"حزين",
    "😨":"مصدوم",
    "🎼 ":"موسيقي",
    "🍁":"مرح",
    "🍂":"مرح",
    "💟":"حب",
    "😪":"حزن",
    "😆":"يضحك",
    "😣":"استياء",
    "☺️":"حب",
    "😱":"كارثة",
    "😁":"يضحك",
    "😖":"استياء",
    "🏃🏼":"يجري",
    "😡":"غضب",
    "🚶":"يسير",
    "🤕":"مرض",
    "🤮" : "يتقيئ",
    "⛔": "حذر",
    "‼️":"تعجب",
    "🕊":"طائر",
    "👌🏻":"احسنت",
    "❣":"حب",
    "🙊":"مصدوم",
    "💃":"سعادة مرح",
    "💃🏼":"سعادة مرح",
    "😜":"مرح",
    "👊":"ضربة",
    "😟":"استياء",
    "💖":"حب",
    "😥":"حزن",
    "🎻":"موسيقي",
    "✒":"يكتب",
    "🚶🏻":"يسير",
    "💎":"الماظ",
    "😷":"وباء مرض",
    "☝":"واحد",
    "🚬":"تدخين",
    "💐" : "ورد",
    "🌻" : "ورد",
    "🌞" : "شمس",
    "👆" : "الاول",
    "⚠️" :"تحذير",
    "🤗" : "احتواء",
    "✖️": "غلط",
    "📍"  : "مكان",
    "👸" : "ملكه",
    "👑" : "تاج",
    "✔️" : "صح",
    "💌": "قلب",
    "😲" : "مندهش",
    "💦": "ماء",
    "🚫" : "خطا",
    "👏🏻" : "برافو",
    "🏊" :"يسبح",
    "👍🏻": "تمام",
    "⭕️" :"دائره كبيره",
    "🎷" : "ساكسفون",
    "👋": "تلويح باليد",
    "✌🏼": "علامه النصر",
    "🌝":"مبتسم",
    "➿"  : "عقده مزدوجه",
    "💪🏼" : "قوي",
    "📩":  "تواصل معي",
    "☕️": "قهوه",
    "😧" : "قلق و صدمة",
    "🗨": "رسالة",
    "❗️" :"تعجب",
    "🙆🏻": "اشاره موافقه",
    "👯" :"اخوات",
    "©" :  "رمز",
    "👵🏽" :"سيده عجوزه",
    "🐣": "كتكوت",
    "🙌": "تشجيع",
    "🙇": "شخص ينحني",
    "👐🏽":"ايدي مفتوحه",
    "👌🏽": "بالظبط",
    "⁉️" : "استنكار",
    "⚽️": "كوره",
    "🕶" :"حب",
    "🎈" :"بالون",
    "🎀":    "ورده",
    "💵":  "فلوس",
    "😋":  "جائع",
    "😛":  "يغيظ",
    "😠":  "غاضب",
    "✍🏻":  "يكتب",
    "🌾":  "ارز",
    "👣":  "اثر قدمين",
    "❌":"رفض",
    "🍟":"طعام",
    "👬":"صداقة",
    "🐰":"ارنب",
    "🦋" : "فراشة",
    "☂":"مطر",
    "⚜":"مملكة فرنسا",
    "🐑":"خروف",
    "🗣":"صوت مرتفع",
    "👌🏼":"احسنت",
    "☘":"مرح",
    "😮":"صدمة",
    "😦":"قلق",
    "⭕":"الحق",
    "✏️":"قلم",
    "ℹ":"معلومات",
    "🙍🏻":"رفض",
    "⚪️":"نضارة نقاء",
    "🐤":"حزن",
    "💫":"مرح",
    "💝":"حب",
    "🍔":"طعام",
    "❤︎":"حب",
    "✈️":"سفر",
    "🏃🏻‍♀️":"يسير",
    "🍳":"ذكر",
    "🎤":"مايك غناء",
    "🎾":"كره",
    "🐔":"دجاجة",
    "🙋":"سؤال",
    "📮":"بحر",
    "💉":"دواء",
    "🙏🏼":"رجاء طلب",
    "💂🏿 ":"حارس",
    "🎬":"سينما",
    "♦️":"مرح",
    "💡":"قكرة",
    "‼":"تعجب",
    "👼":"طفل",
    "🔑":"مفتاح",
    "♥️":"حب",
    "🌲" : "شجرة",
    "🌳" : "شجرة",
    "🚩" : "حذر",
    "🚨" : "حذر",
    "🛑" : "حذر",
    "🕋":"كعبة",
    "🐓":"دجاجة",
    "💩":"معترض",
    "👽":"فضائي",
    "☔️":"مطر",
    "🍷":"عصير",
    "🌟":"نجمة",
    "☁️":"سحب",
    "👃":"معترض",
    "🌺":"مرح",
    "🔪":"سكينة",
    "♨":"سخونية",
    "👊🏼":"ضرب",
    "✏":"قلم",
    "🚶🏾‍♀️":"يسير",
    "👊":"ضربة",
    "◾️":"وقف",
    "😚":"حب",
    "🔸":"مرح",
    "👎🏻":"لا يعجبني",
    "👊🏽":"ضربة",
    "😙":"حب",
    "🎥":"تصوير",
    "👉":"جذب انتباه",
    "👏🏽":"يصفق",
    "💪🏻":"عضلات",
    "🏴":"اسود",
    "🔥":"حريق",
    "😬":"عدم الراحة",
    "👊🏿":"يضرب",
    "📚" : "كتب",
    "📌" : "علق",
    "🌿":"ورقه شجره",
    "✋🏼":"كف ايد",
    "👐":"ايدي مفتوحه",
    "☠️":"وجه مرعب",
    "🎉":"يهنئ",
    "🔕" :"صامت",
    "😿":"وجه حزين",
    "☹️":"وجه يائس",
    "😘" :"حب",
    "😰" :"خوف و حزن",
    "🌼":"ورده",
    "💋": "بوسه",
    "👇":"لاسفل",
    "❣️":"حب",
    "🎧":"سماعات",
    "📝":"يكتب",
    "😇":"دايخ",
    "😈":"رعب",
    "🏃":"يجري",
    "✌🏻":"علامه النصر",
    "🔫":"يضرب",
    "❗️":"تعجب",
    "👎":"غير موافق",
    "🔐":"قفل",
    "👈":"لليمين",
    "™":"رمز",
    "🚶🏽":"يتمشي",
    "😯":"متفاجأ",
    "✊":"يد مغلقه",
    "😻":"اعجاب",
    "🙉" :"قرد",
    "👧":"طفله صغيره",
    "🔴":"دائره حمراء",
    "💪🏽":"قوه",
    "💤":"ينام",
    "👀":"ينظر",
    "✍🏻":"يكتب",
    "❄️":"تلج",
    "💀":"رعب",
    "😤":"وجه عابس",
    "🖋":"قلم",
    "🎩":"كاب",
    "☕️":"قهوه",
    "😹":"ضحك",
    "💓":"حب",
    "☄️":"نار",
    "👻":"رعب",
    "✋": "يد",
    "🌱": "نبتة",

    # Emoticons
    ":)" : "يبتسم",
    "(:" : "يبتسم",
    ":(" : "حزين",
    "xD" : "يضحك",
    ":=(": "يبكي",
    ":'(": "حزن",
    ":'‑(": "حزن",
    "XD" : "يضحك",
    ":D" : "يبتسم",
    "♬" : "موسيقي",
    "♡" : "حب",
    "☻"  : "يبتسم",
}

def replace_emojis(text):
    pattern = re.compile('|'.join(re.escape(key) for key in emojis.keys()))
    replaced_text = pattern.sub(lambda match: emojis[match.group(0)] + ' ', text)
    return emoji.replace_emoji(replaced_text, '')

if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: replace_emojis(document))

## Removing mentions

In [8]:
pattern = r'@[\w]+'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, '', document))

## Removing links

In [9]:
pattern = r'https?://\S+'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, '', document))

## Removing whitespaces
In this step we get rid of extra whitespaces as well as new lines

In [10]:
pattern = r'\s+|\n+|_'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))

## Remove foriegn words

The text includes english, japanese and words for other languages

In [11]:
pattern = r'[a-zA-Z]+'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, '', document))

## Remove punctuations & special chars

In [12]:
pattern = r'[^\w\s\u0600-\u06FF]+|ﷺ|۩|⓵|؟|؛|۞|ﷻ|ö'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, '', document))

## Remove consecutive characters

In [13]:
pattern = r'(.)\1+'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, r'\1', document))

## Removing numbers

In [14]:
pattern = r'\d+'
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, '', document))

## Tokenization

In [15]:
tokenizer = TweetTokenizer()
if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: tokenizer.tokenize(document))
    data["tweet"] = data["tweet"].apply(lambda document: " ".join(document).strip())

In [16]:
data.tail(3)

Unnamed: 0,tweet,class
9691,الحج إجراء لأداء تقدموا مواطن شروق القرعة المق...,OBJ
9692,صحتها النوم تقولها علشان هتحبك بحبك,POS
9693,حسبته الامتحان غلط كتبته شى,NEG


## Stemming


In [17]:
stemmer = ISRIStemmer()
if not os.path.exists("preprocessed_data.csv"):
    with tqdm.tqdm(range(data.shape[0])) as progress:
        for i in progress:
            data.iloc[i, 0] = stemmer.stem(data.iloc[i, 0])

## Removing stop words

In [18]:
arabic_stopwords = stopwords.words('arabic')
arabic_stopwords.extend(stp.stopwords_list())
stop_words = {entry for entry in arabic_stopwords}
stopwords_stemmer = ISRIStemmer()
with open("arabic_stopwords.txt", "r", encoding="UTF-8") as file:
    for word in file:
        pass
        stop_words.add(stopwords_stemmer.stem(word.strip()))

In [19]:
def remove_stop_words(tweet: str) -> str:
    words = set(tweet.split(sep = ' '))
    return " ".join(list((words - stop_words)))

if not os.path.exists("preprocessed_data.csv"):
    data["tweet"] = data["tweet"].apply(lambda document: remove_stop_words(document))

## Save preprocessed data

In [20]:
if not os.path.exists("preprocessed_data.csv"):
    # remove empty entries
    data.replace('', pd.NA, inplace=True)  # Replace empty strings with NA
    data.dropna(inplace=True)  # Drop rows with NA values
    data.to_csv("preprocessed_data.csv") # inspect the resulting file to validate the preprocessing

# Text representation

## BoW (Bag-of-Words)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify = data["class"])

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

X_train[0].shape

(29706,)

### Performance evaluation
#### Naive bayes

In [22]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 53.74%
Precesion : 53.74%
Recall : 53.74%
F1 score : 53.74%


#### Logistic regression

In [25]:
model = LogisticRegression(random_state = SEED, max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 69.39%
Precesion : 69.39%
Recall : 69.39%
F1 score : 69.39%


#### SVM

We were unable to train this model on our machines using the initial dataset, due to the **curse of dimensionality**, so we added dimensioanlity reduction

In [31]:
scaler = StandardScaler()
pca = PCA(n_components = 5000)
svm = SVC(random_state = SEED)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
X_train_SVM = pca.fit_transform(scaled_X_train)
X_test_SVM  = pca.transform(scaled_X_test)

svm.fit(X_train_SVM, y_train)
y_pred = svm.predict(X_test_SVM)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 66.96%
Precesion : 66.96%
Recall : 66.96%
F1 score : 66.96%


#### Random forest

In [None]:
rf = RandomForestClassifier(random_state = SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

## TF-IDF

In [35]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify= data["class"])

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

X_train[0].shape

(29706,)

### Performance evaluation
#### Naive bayes

In [37]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 53.95%
Precesion : 53.95%
Recall : 53.95%
F1 score : 53.95%


#### Logistic regression

In [39]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 68.20%
Precesion : 68.20%
Recall : 68.20%
F1 score : 68.20%


#### SVM

In [40]:
scaler = StandardScaler()
pca = PCA(n_components = 5000)
svm = SVC(random_state = SEED)

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)
X_train_SVM = pca.fit_transform(scaled_X_train)
X_test_SVM  = pca.transform(scaled_X_test)

svm.fit(X_train_SVM, y_train)
y_pred = svm.predict(X_test_SVM)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 66.96%
Precesion : 66.96%
Recall : 66.96%
F1 score : 66.96%


#### Random Forest

In [41]:
rf = RandomForestClassifier(random_state = SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 68.87%
Precesion : 68.87%
Recall : 68.87%
F1 score : 68.87%


## LDA

In [42]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify = data["class"])

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

lda = LatentDirichletAllocation(n_components = 170, random_state = SEED)
lda.fit(X_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

### Performance evaluation
#### Naive bayes

In [47]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 14.82%
Precesion : 26.71%
Recall : 26.32%
F1 score : 13.17%


#### Logistic regression

In [49]:
model = LogisticRegression(random_state = SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 66.70%
Precesion : 66.70%
Recall : 66.70%
F1 score : 66.70%


#### SVM

In [51]:
svm = SVC(random_state = SEED)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average="micro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 66.70%
Precesion : 66.70%
Recall : 66.70%
F1 score : 66.70%


#### Random Forest

In [53]:
rf = RandomForestClassifier(random_state = SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 65.93%
Precesion : 65.93%
Recall : 65.93%
F1 score : 65.93%


## LSA

In [54]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify = data["class"])

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

lsa = TruncatedSVD(n_components = 200, random_state = SEED)
X_train = lsa.fit_transform(X_train)
X_test = lsa.transform(X_test)

### Performance evaluation
#### Naive bayes

In [55]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 29.27%
Precesion : 29.27%
Recall : 29.27%
F1 score : 29.27%


#### Logistic regression

In [57]:
model = LogisticRegression(random_state = SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 68.25%
Precesion : 68.25%
Recall : 68.25%
F1 score : 68.25%


#### SVM

In [59]:
svm = SVC(random_state = SEED)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 68.40%
Precesion : 68.40%
Recall : 68.40%
F1 score : 68.40%


#### Random Forest

In [61]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 67.79%
Precesion : 67.79%
Recall : 67.79%
F1 score : 67.79%


## BoC (Bag-of-concepts)

The BOC (Bag-of-Concepts) method has been proposed as a solution to the problem of large dimensions and sparsity that traditional methods such as TF-IDF and Bag of words suffer from.

### Word embeddings
This is done using the AraVec model which is trained on arabic tweets

In [62]:
model = gensim.models.Word2Vec.load("./aravec/full_uni_cbow_100_twitter/full_uni_cbow_100_twitter.mdl")
word_vecs = {}
total, skipped = 0, 0
for tweet in data["tweet"]:
    for word in tweet.split(" "):
        total += 1
        try:
            word_vecs[word] = model.wv[word]
        except Exception:
            skipped += 1
print(f"total skipped : {skipped} ({(skipped / total) * 100 :.2f}%)")

total skipped : 29925 (28.70%)


### Clustering the words embeddings

In [63]:
NUM_CONCEPTS = 130
model = KMeans(n_clusters = NUM_CONCEPTS, random_state = SEED)
X = list(word_vecs.values())
model.fit(X)
concepts = model.predict(X)
print(len(X))

  super()._check_params_vs_input(X, default_n_init=10)


23290


### Concept extraction
Now that we related words to concepts, we can create a document representataion, in which we express the degree of which a document contains a certain concept, and instead of taking only the freuqncies, we consider an approach similar to TF-IDF called, CF-IDF.

CF-IDF is defined using the following formula : 

![image.png](attachment:image.png)


such that : 

    * |D| is the number of documents in the corpus
    * n_c is the number of occurences of concept c in document d
    * n_k is the total number of concepts in this document 

In [64]:
NUM_DOCS = data.shape[0]

# construct a word to concept mapping
word_concept = {}
for index, word in enumerate(word_vecs.keys()):
    word_concept[word] = concepts[index]
print(word_concept)

{'رئيس': 39, 'نتظر': 0, 'السود': 31, 'القضاء': 73, 'جمال': 101, 'الدين': 15, 'القيادي': 39, 'صدور': 57, 'الدكتور': 34, 'اهنئ': 57, 'بحزب': 39, 'روايته': 37, 'مصر': 22, 'عصام': 49, 'العريان': 49, 'واشنطن': 48, 'بامريكا': 22, 'برادعي': 127, 'يرسل': 119, 'مقرف': 109, 'شئ': 33, 'استقصائي': 57, 'فيلم': 99, 'كواليس': 76, 'يتناول': 32, 'شاهد': 35, 'تعرض': 47, 'الفسق': 15, 'الشيشان': 22, 'اقولها': 108, 'بخاطري': 9, 'وكيلك': 58, 'الحن': 42, 'والفجور': 106, 'تضحك': 12, 'اله': 123, 'تعطيني': 53, 'ليه': 65, 'يانبع': 86, 'مل': 42, 'انتخبوا': 127, 'العرص': 49, 'البرص': 127, 'صريح': 109, 'يتقال': 7, 'امير': 62, 'كاريوكي': 101, 'عيد': 76, 'فعلا': 33, 'الفتيات': 120, 'شاب': 3, 'ستستمر': 47, 'حب': 128, 'كبير': 109, 'احكي': 108, 'حدثت': 96, 'اعداد': 71, 'ادعوكم': 60, 'التفاصيل': 92, 'مركز': 77, 'لحضور': 76, 'السابق': 91, 'الماضي': 28, 'وضعها': 37, 'شبرا': 93, 'جرائم': 46, 'نتقدم': 66, 'تجعلنا': 19, 'العراق': 22, 'ليب': 42, 'اليمن': 22, 'الكويت': 22, 'قطر': 22, 'سوريا': 22, 'عمان': 22, 'علاج': 114, 'نتشور

In [65]:
# construct a concept to document count mapping
concept_docs = defaultdict(int)
for doc in data["tweet"]:
    doc_concepts = set()
    for word in doc.split(" "):
        try:
            doc_concepts.add(word_concept[word])
        except Exception:
            pass
    for concept in doc_concepts:
        concept_docs[concept] += 1
print(concept_docs)

defaultdict(<class 'int'>, {0: 2917, 73: 1058, 31: 519, 39: 988, 34: 510, 101: 859, 37: 333, 15: 619, 22: 1679, 57: 1148, 33: 773, 109: 624, 48: 513, 49: 2366, 119: 431, 127: 685, 32: 583, 99: 1170, 35: 391, 76: 656, 47: 435, 65: 1091, 9: 151, 42: 571, 106: 382, 108: 374, 12: 225, 53: 334, 86: 470, 58: 371, 123: 671, 7: 328, 62: 1305, 128: 855, 96: 299, 3: 827, 120: 593, 71: 1029, 77: 607, 92: 471, 60: 369, 66: 365, 46: 1454, 19: 86, 91: 947, 28: 439, 93: 887, 114: 328, 97: 219, 59: 412, 55: 399, 25: 477, 27: 346, 126: 694, 129: 641, 118: 8, 121: 197, 4: 545, 95: 310, 38: 386, 51: 920, 17: 414, 20: 463, 89: 606, 110: 385, 113: 433, 124: 226, 30: 693, 100: 333, 78: 855, 14: 217, 112: 1283, 6: 678, 2: 732, 18: 300, 45: 222, 116: 422, 69: 169, 84: 270, 125: 363, 104: 247, 115: 405, 11: 481, 41: 209, 75: 458, 26: 431, 83: 334, 74: 249, 72: 371, 52: 561, 54: 399, 94: 333, 122: 253, 80: 416, 67: 361, 40: 613, 85: 275, 29: 174, 90: 249, 10: 427, 70: 182, 79: 230, 44: 211, 24: 161, 61: 316, 21

In [66]:
def cf_idf(document: str):
    """ Returns the CD-IDF representataion of a document """
    res = [0 for _ in range(NUM_CONCEPTS)]
    concepts_counts = defaultdict(int)
    for word in document.split(" "):
        try:
            concepts_counts[word_concept[word]] += 1
        except:
            pass
    n_k = sum(concepts_counts.values()) # number of concepts present in the document (duplicates are considered!)
    for concept in range(NUM_CONCEPTS):
        if concepts_counts[concept] != 0:
            res[concept] = (concepts_counts[concept] / n_k) * log(NUM_DOCS / (1 + concept_docs[concept]))
    return res

In [67]:
X = [cf_idf(tweet) for tweet in data["tweet"]]
y = data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = SEED, stratify = y)

### Performance evaluation
#### Naive bayes

In [69]:
model = GaussianNB()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 22.41%
Precesion : 22.41%
Recall : 22.41%
F1 score : 22.41%


#### Logistic regression

In [71]:
model = LogisticRegression(random_state = SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 65.62%
Precesion : 65.62%
Recall : 65.62%
F1 score : 65.62%


#### SVM

In [73]:
model = SVC(random_state = SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 67.06%
Precesion : 67.06%
Recall : 67.06%
F1 score : 67.06%


#### Random Forest

In [75]:
rf = RandomForestClassifier(random_state = SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 67.42%
Precesion : 67.42%
Recall : 67.42%
F1 score : 67.42%
