<a href="https://colab.research.google.com/github/OtmaneDaoudi/Arabic-sentiment-analysis/blob/main/arabic_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [4]:
# !pip install emoji
# !pip install Arabic-Stopwords
# !pip install seaborn
# !pip install matplotlib
# !pip install soyclustering

# Libs imports

In [5]:
import os
import re
import nltk
import emoji
import pickle
import openpyxl

import arabicstopwords.arabicstopwords as stp
import pandas as pd
import pyarabic.araby as araby
import numpy as np 

from nltk.corpus import stopwords
from snowballstemmer import stemmer

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline

from typing import List

nltk.download('stopwords')

SEED = 21

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing data

In [6]:
data = pd.read_csv("./datasets/ASTC/data.tsv", header = 0, sep='\t', names = ["class", "tweet"]).sample(frac = 1, random_state = SEED)
data.head(20)

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,⛔ طريق #الدمام × #الخبر ابعدواا عنه 📣
37264,pos,احلى صباح 😂
35690,pos,مع بعض الأصدقاء 💕 شاطئ فلامنغو ، أروبا
44109,pos,ايوه صح بس ماتاخذني لمكان بعيد زيه 😂
24380,pos,صباحك خير وبركة نجاة 🌺 💗 🌹
23661,pos,تحية إلي أهل #ليبيا الكل وياعن دين زك أم اللي ...
39242,pos,ماعرفتك وانت لبناني 😅
42630,pos,سبحان الله الحمد لله لاإله الاالله الله أكبر ل...


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45274 entries, 33372 to 15305
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   45274 non-null  object
 1   tweet   45274 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


# Data preprocessing

Our preprocessing pipeline contains the following steps:

1.  Remove duplicat entries
2.  Replacing emojies & emoticons
3.  Remove mentions
4.  Remove Links
5.  Remove whitespaces
6.  Remove punctuations & Special chars
7.  Remove Consecutive characters
8.  Tokenization
9.  Remove foreign words
10. Remove stop words
11. Remove numbers
12. Stemming


## Removing duplicates

In [8]:
count = data.duplicated().sum()
print(f"{(count / data.shape[0]) * 100:.1f}% of the data are duplicats")

34.7% of the data are duplicats


In [9]:
data.drop_duplicates(inplace = True)

## Replacing emojies

In [10]:
emojis = {
    "🙂":"يبتسم",
    "😂":"يضحك",
    "🤣" : "يضحك",
    "💔":"قلب حزين",
    "🙂":"يبتسم",
    "❤️":"حب",
    "🥰":"حب",
    "🤐":"سكوت",
    "🧡":"حب",
    "❤":"حب",
    "😍":"حب",
    "😭":"يبكي",
    "🤭":"يبتسم",
    "😢":"حزن",
    "😔":"حزن",
    "♥":"حب",
    "💜":"حب",
    "😅":"يضحك",
    "🙁":"حزين",
    "💕":"حب",
    "💙":"حب",
    "😞":"حزين",
    "😊":"سعادة",
    "👏":"يصفق",
    "👌":"احسنت",
    "😴":"ينام",
    "😀":"يضحك",
    "✅":"صحيح",
    "🤪":"يضحك",
    "🏡" : "بيت",
    "🤲" : "دعاء",
    "💰" : "مال",
    "😌":"حزين",
    "🎁":"هدية",
    "🌹":"وردة",
    "🥀":"وردة",
    "📿":"وردة",
    "✍":"كتابة",
    "🙈":"حب",
    "😄":"يضحك",
    "😐":"محايد",
    "✌":"منتصر",
    "✨":"نجمه",
    "🤔":"تفكير",
    "😏":"يستهزء",
    "😒":"يستهزء",
    "🙄":"ملل",
    "😕":"عصبية",
    "😃":"يضحك",
    "🌸":"وردة",
    "😓":"حزن",
    "💞":"حب",
    "💗":"حب",
    "😑":"منزعج",
    "💭":"تفكير",
    "😎":"ثقة",
    "💛":"حب",
    "😩":"حزين",
    "🥺":"حزين",
    "💪":"عضلات",
    "👍":"موافق",
    "🙏🏻":"رجاء طلب",
    "😳":"مصدوم",
    "👏🏼":"تصفيق",
    "🎶":"موسيقي",
    "🌚":"صمت",
    "💚":"حب",
    "🙏":"رجاء طلب",
    "💘":"حب",
    "🍃":"سلام",
    "☺":"يضحك",
    "🎊":"يهنئ",
    "💥":"إنفجار",
    "😝":"يسخر",
    "💯":"تمام",
    "🐸":"ضفدع",
    "🤦‍♂️":"غبي",
    "🤩":"معجب",
    "🤤":"جائع",
    "😶":"مصدوم",
    "✌️":"مرح",
    "✋🏻":"توقف",
    "😉":"غمزة",
    "🌷":"حب",
    "🙃":"مبتسم",
    "😫":"حزين",
    "😨":"مصدوم",
    "🎼 ":"موسيقي",
    "🍁":"مرح",
    "🍂":"مرح",
    "💟":"حب",
    "😪":"حزن",
    "😆":"يضحك",
    "😣":"استياء",
    "☺️":"حب",
    "😱":"كارثة",
    "😁":"يضحك",
    "😖":"استياء",
    "🏃🏼":"يجري",
    "😡":"غضب",
    "🚶":"يسير",
    "🤕":"مرض",
    "🤮" : "يتقيئ",
    "⛔": "حذر",
    "‼️":"تعجب",
    "🕊":"طائر",
    "👌🏻":"احسنت",
    "❣":"حب",
    "🙊":"مصدوم",
    "💃":"سعادة مرح",
    "💃🏼":"سعادة مرح",
    "😜":"مرح",
    "👊":"ضربة",
    "😟":"استياء",
    "💖":"حب",
    "😥":"حزن",
    "🎻":"موسيقي",
    "✒":"يكتب",
    "🚶🏻":"يسير",
    "💎":"الماظ",
    "😷":"وباء مرض",
    "☝":"واحد",
    "🚬":"تدخين",
    "💐" : "ورد",
    "🌻" : "ورد",
    "🌞" : "شمس",
    "👆" : "الاول",
    "⚠️" :"تحذير",
    "🤗" : "احتواء",
    "✖️": "غلط",
    "📍"  : "مكان",
    "👸" : "ملكه",
    "👑" : "تاج",
    "✔️" : "صح",
    "💌": "قلب",
    "😲" : "مندهش",
    "💦": "ماء",
    "🚫" : "خطا",
    "👏🏻" : "برافو",
    "🏊" :"يسبح",
    "👍🏻": "تمام",
    "⭕️" :"دائره كبيره",
    "🎷" : "ساكسفون",
    "👋": "تلويح باليد",
    "✌🏼": "علامه النصر",
    "🌝":"مبتسم",
    "➿"  : "عقده مزدوجه",
    "💪🏼" : "قوي",
    "📩":  "تواصل معي",
    "☕️": "قهوه",
    "😧" : "قلق و صدمة",
    "🗨": "رسالة",
    "❗️" :"تعجب",
    "🙆🏻": "اشاره موافقه",
    "👯" :"اخوات",
    "©" :  "رمز",
    "👵🏽" :"سيده عجوزه",
    "🐣": "كتكوت",
    "🙌": "تشجيع",
    "🙇": "شخص ينحني",
    "👐🏽":"ايدي مفتوحه",
    "👌🏽": "بالظبط",
    "⁉️" : "استنكار",
    "⚽️": "كوره",
    "🕶" :"حب",
    "🎈" :"بالون",
    "🎀":    "ورده",
    "💵":  "فلوس",
    "😋":  "جائع",
    "😛":  "يغيظ",
    "😠":  "غاضب",
    "✍🏻":  "يكتب",
    "🌾":  "ارز",
    "👣":  "اثر قدمين",
    "❌":"رفض",
    "🍟":"طعام",
    "👬":"صداقة",
    "🐰":"ارنب",
    "🦋" : "فراشة",
    "☂":"مطر",
    "⚜":"مملكة فرنسا",
    "🐑":"خروف",
    "🗣":"صوت مرتفع",
    "👌🏼":"احسنت",
    "☘":"مرح",
    "😮":"صدمة",
    "😦":"قلق",
    "⭕":"الحق",
    "✏️":"قلم",
    "ℹ":"معلومات",
    "🙍🏻":"رفض",
    "⚪️":"نضارة نقاء",
    "🐤":"حزن",
    "💫":"مرح",
    "💝":"حب",
    "🍔":"طعام",
    "❤︎":"حب",
    "✈️":"سفر",
    "🏃🏻‍♀️":"يسير",
    "🍳":"ذكر",
    "🎤":"مايك غناء",
    "🎾":"كره",
    "🐔":"دجاجة",
    "🙋":"سؤال",
    "📮":"بحر",
    "💉":"دواء",
    "🙏🏼":"رجاء طلب",
    "💂🏿 ":"حارس",
    "🎬":"سينما",
    "♦️":"مرح",
    "💡":"قكرة",
    "‼":"تعجب",
    "👼":"طفل",
    "🔑":"مفتاح",
    "♥️":"حب",
    "🌲" : "شجرة",
    "🌳" : "شجرة",
    "🚩" : "حذر",
    "🚨" : "حذر",
    "🛑" : "حذر",
    "🕋":"كعبة",
    "🐓":"دجاجة",
    "💩":"معترض",
    "👽":"فضائي",
    "☔️":"مطر",
    "🍷":"عصير",
    "🌟":"نجمة",
    "☁️":"سحب",
    "👃":"معترض",
    "🌺":"مرح",
    "🔪":"سكينة",
    "♨":"سخونية",
    "👊🏼":"ضرب",
    "✏":"قلم",
    "🚶🏾‍♀️":"يسير",
    "👊":"ضربة",
    "◾️":"وقف",
    "😚":"حب",
    "🔸":"مرح",
    "👎🏻":"لا يعجبني",
    "👊🏽":"ضربة",
    "😙":"حب",
    "🎥":"تصوير",
    "👉":"جذب انتباه",
    "👏🏽":"يصفق",
    "💪🏻":"عضلات",
    "🏴":"اسود",
    "🔥":"حريق",
    "😬":"عدم الراحة",
    "👊🏿":"يضرب",
    "📚" : "كتب",
    "📌" : "علق",
    "🌿":"ورقه شجره",
    "✋🏼":"كف ايد",
    "👐":"ايدي مفتوحه",
    "☠️":"وجه مرعب",
    "🎉":"يهنئ",
    "🔕" :"صامت",
    "😿":"وجه حزين",
    "☹️":"وجه يائس",
    "😘" :"حب",
    "😰" :"خوف و حزن",
    "🌼":"ورده",
    "💋": "بوسه",
    "👇":"لاسفل",
    "❣️":"حب",
    "🎧":"سماعات",
    "📝":"يكتب",
    "😇":"دايخ",
    "😈":"رعب",
    "🏃":"يجري",
    "✌🏻":"علامه النصر",
    "🔫":"يضرب",
    "❗️":"تعجب",
    "👎":"غير موافق",
    "🔐":"قفل",
    "👈":"لليمين",
    "™":"رمز",
    "🚶🏽":"يتمشي",
    "😯":"متفاجأ",
    "✊":"يد مغلقه",
    "😻":"اعجاب",
    "🙉" :"قرد",
    "👧":"طفله صغيره",
    "🔴":"دائره حمراء",
    "💪🏽":"قوه",
    "💤":"ينام",
    "👀":"ينظر",
    "✍🏻":"يكتب",
    "❄️":"تلج",
    "💀":"رعب",
    "😤":"وجه عابس",
    "🖋":"قلم",
    "🎩":"كاب",
    "☕️":"قهوه",
    "😹":"ضحك",
    "💓":"حب",
    "☄️":"نار",
    "👻":"رعب",
    "✋": "يد",
    "🌱": "نبتة",

    # Emoticons
    ":)" : "يبتسم",
    "(:" : "يبتسم",
    ":(" : "حزين",
    "xD" : "يضحك",
    ":=(": "يبكي",
    ":'(": "حزن",
    ":'‑(": "حزن",
    "XD" : "يضحك",
    ":D" : "يبتسم",
    "♬" : "موسيقي",
    "♡" : "حب",
    "☻"  : "يبتسم",
}

def replace_emojis(text):
    pattern = re.compile('|'.join(re.escape(key) for key in emojis.keys()))
    replaced_text = pattern.sub(lambda match: emojis[match.group(0)] + ' ', text)
    return emoji.replace_emoji(replaced_text, '')

data["tweet"] = data["tweet"].apply(lambda document: replace_emojis(document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق #الدمام × #الخبر ابعدواا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو ، أروبا


## Removing mentions

In [11]:
pattern = r'@[\w]+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق #الدمام × #الخبر ابعدواا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو ، أروبا


## Removing links

In [12]:
pattern = r'https?://\S+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق #الدمام × #الخبر ابعدواا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو ، أروبا


## Remove foriegn words

The text includes english, japanese and words for other languages

In [13]:
pattern = r'[a-zA-Z]+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق #الدمام × #الخبر ابعدواا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو ، أروبا


## Remove punctuations & special chars

In [14]:
pattern = r'[^\w\s\u0600-\u06FF]+|_|ﷺ|۩|⓵|؟|؛|۞|ﷻ|،| ٰ'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدواا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Remove consecutive characters

In [15]:
pattern = r'(.)\1+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, r'\1', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Remove tatweel

In [16]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_tatweel(document))

## Remove numbers

In [17]:
pattern = r'\d+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Remove extra whitespaces
In this step we get rid of extra whitespaces as well as new lines

In [18]:
pattern = r'\s+|\n+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Remove harakat

In [19]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_tashkeel(document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Remove diactrics

In [20]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_diacritics(document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الأصدقاء حب شاطئ فلامنغو أروبا


## Normalize hamza

In [21]:
pattern = r"أ|إ|آ"
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, 'ا', document))
data.head()

Unnamed: 0,class,tweet
33372,pos,بخاخ فيجا الالماني انتصاب وتاخير دون تخدير
41159,pos,شهور و يوم تعال خذ الزرقاء من فريق التماسيح
18029,neg,حذر طريق الدمام الخبر ابعدوا عنه
37264,pos,احلى صباح يضحك
35690,pos,مع بعض الاصدقاء حب شاطئ فلامنغو اروبا


## Tokenization

In [22]:
data["tweet"] = data["tweet"].apply(lambda document: araby.tokenize(document))
data.head()

Unnamed: 0,class,tweet
33372,pos,"[بخاخ, فيجا, الالماني, انتصاب, وتاخير, دون, تخ..."
41159,pos,"[شهور, و, يوم, تعال, خذ, الزرقاء, من, فريق, ال..."
18029,neg,"[حذر, طريق, الدمام, الخبر, ابعدوا, عنه]"
37264,pos,"[احلى, صباح, يضحك]"
35690,pos,"[مع, بعض, الاصدقاء, حب, شاطئ, فلامنغو, اروبا]"


## Remove long & short words

In [23]:
data["tweet"] = data["tweet"].apply(lambda document: [word for word in document if len(word) < 9 and len(word) > 1])
data.head()

Unnamed: 0,class,tweet
33372,pos,"[بخاخ, فيجا, الالماني, انتصاب, وتاخير, دون, تخ..."
41159,pos,"[شهور, يوم, تعال, خذ, الزرقاء, من, فريق, التما..."
18029,neg,"[حذر, طريق, الدمام, الخبر, ابعدوا, عنه]"
37264,pos,"[احلى, صباح, يضحك]"
35690,pos,"[مع, بعض, الاصدقاء, حب, شاطئ, فلامنغو, اروبا]"


## Stemming


In [24]:
ar_stemmer = stemmer("arabic")
data["tweet"] = data["tweet"].apply(lambda doc: [ar_stemmer.stemWord(token) for token in doc])

In [25]:
data.head()

Unnamed: 0,class,tweet
33372,pos,"[خاخ, فيج, المان, انتصاب, تاخير, دون, تخدير]"
41159,pos,"[شهور, يوم, تعال, خذ, زرقاء, من, ريق, تماسيح]"
18029,neg,"[حذر, طريق, دمام, خبر, ابعد, عنه]"
37264,pos,"[احلي, صباح, يضح]"
35690,pos,"[مع, بعض, اصدقاء, حب, شاطء, امنغ, اروب]"


## Removing stop words

In [26]:
arabic_stopwords = stopwords.words('arabic')
arabic_stopwords.extend(stp.stopwords_list())
stop_words = {ar_stemmer.stemWord(entry) for entry in arabic_stopwords}
with open("arabic_stopwords.txt", "r", encoding="UTF-8") as file:
    for word in file:
        stop_words.add(ar_stemmer.stemWord(word.strip()))

In [27]:
open("./models/stopwords.pkl", "wb").write(pickle.dumps(stop_words))

19241

In [28]:
def remove_stopwords(document: str) -> str:
    words = set(document.split(" "))
    return " ".join(list(words - stop_words))

data["tweet"] = data["tweet"].apply(lambda document: " ".join([token for token in document if token not in stop_words]))
data.head()

Unnamed: 0,class,tweet
33372,pos,خاخ فيج المان انتصاب تاخير تخدير
41159,pos,شهور تعال خذ زرقاء ريق تماسيح
18029,neg,حذر طريق دمام ابعد
37264,pos,احلي يضح
35690,pos,اصدقاء حب شاطء امنغ اروب


## Save preprocessed data

In [29]:
if not os.path.exists("preprocessed_data.csv"):
    # remove empty entries
    # data.replace('', pd.NA, inplace=True)  # Replace empty strings with NA
    # data.dropna(inplace=True)  # Drop rows with NA values
    data.to_csv("preprocessed_data.csv") # inspect the resulting file to validate the preprocessing

# Text representation

# Appraisal analysis
Bow + G:AO

G:AO Appraisal Group by Attitude & Orientation — Total
frequency of appraisal groups with each possible combination of Attitude and Orientation, normalized by total number of appraisal groups in the text.

## Read lexicon

In [30]:
def excel_to_dict(file_path):
    workbook = openpyxl.load_workbook(file_path)
    data_dict = {}
    
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        values = [cell.value for cell in sheet['A'] if cell.value is not None]
        data_dict[sheet_name] = values
    
    return data_dict

# Example usage:
lexicon = excel_to_dict("./Arabic seed terms.xlsx")
print(lexicon)

{'affect_happ_neg': ['حَزِين ', 'قَانِط', 'جَزِع', 'مُحَطَّم ', 'كَئِيب ', 'مُحْبَط', 'مُؤْلِم', 'مُنْقَبِض', 'مُثِير لِلشَّفَقَة', 'مَهْمُوم', 'مُكْتَئِب', 'مُكَدَّر', 'مُمِل', 'بَائِس', 'تَعِيس', 'مُحْزِن', 'قَاتِم', 'مُتَجَهِّم', 'مُنْكَسِر الخَاطِر', 'مُنْعَزِل', 'مُتَدَنِّي', 'مُغْتَم', 'بَكَّاء', 'بَاكٍ', 'دَامع'], 'affect_happ_pos': ['مَرِح', 'نَشِط', 'مُبْتَهِج'], 'affect_inc_neg': ['حَذِر', 'خَائِف', 'مَفْزُوع'], 'affect_satis_neg': ['سَطْحِي', 'مُبْتَذَل', 'مُمِل', 'مُنْزَعِج', 'غَاضِب', 'مَغِيظ', 'مُتَضَايِق', 'سَاخِط', 'سَئِيم'], 'affect_satis_pos': ['مَعْنِي', 'مُنْشَغِل ', 'مُنْهَمِك', 'رَاضٍ', 'مَسْرُور', 'مُعْجَب ', 'سَعِيد', 'مُنْبَهِر', 'مَفْتُون', 'مُثِير', 'مُثِير لِلْإِعْجَاب'], 'affect_sec_neg': ['مُرْتَبِك', 'قَلِق ', 'شَاذّ', 'مُفَاجِئ ', 'مُنْدَهِش', 'مَشْدُوه'], 'affect_sec_pos': ['وَاثِق', 'مُؤَكَّد', 'مُرِيح', 'وَاثِق مِن نَفْسِي', 'مَوْضِع ثِقَة'], 'apprec_comp_balance_neg': ['مُخْتَل', 'مُتَعَارِض', 'مُتَقَطِّع', 'مُتَفَاوِت', 'مَعِيب', 'مُتَنَاقِض', 'فَوْ

## Preprocess lexicon

In [31]:
ar_stemmer = stemmer("arabic")
for key, val in lexicon.items():
    for idx, val in enumerate(val):
        preprocessed_val = araby.strip_tashkeel(val)
        preprocessed_val = araby.strip_diacritics(preprocessed_val)
        preprocessed_val = re.sub(r"أ|إ|آ", 'ا', preprocessed_val)
        preprocessed_val = ar_stemmer.stemWord(preprocessed_val)
        lexicon[key][idx] = preprocessed_val
print(lexicon)

{'affect_happ_neg': ['حزين ', 'قانط', 'جزع', 'محطم ', 'ييب ', 'محبط', 'مولم', 'منقبض', 'مثير للشفق', 'مهموم', 'مكتيب', 'مكدر', 'ممل', 'بايس', 'تعيس', 'محز', 'قاتم', 'متج', 'منكسر الخاطر', 'منعزل', 'متد', 'مغتم', 'كاء', 'باك', 'دامع'], 'affect_happ_pos': ['مرح', 'نشط', 'مبتهج'], 'affect_inc_neg': ['حذر', 'خايف', 'مفزوع'], 'affect_satis_neg': ['سطح', 'مبتذل', 'ممل', 'منزعج', 'غاضب', 'مغيظ', 'متضايق', 'ساخط', 'سييم'], 'affect_satis_pos': ['معن', 'منشغل ', 'منهم', 'راض', 'مسرور', 'معجب ', 'سعيد', 'منبهر', 'مفتون', 'مثير', 'مثير للاعجاب'], 'affect_sec_neg': ['مرتب', 'قلق ', 'شاذ', 'مفاجي ', 'مندهش', 'مشد'], 'affect_sec_pos': ['واثق', 'موكد', 'مريح', 'واثق من نفس', 'موضع ثق'], 'apprec_comp_balance_neg': ['مختل', 'متعارض', 'متقطع', 'متفاو', 'معيب', 'متناقض', 'وضو', 'مشو', 'بشع', 'محرف'], 'apprec_comp_balance_pos': ['متواز', 'متناغم', 'موحد', 'متماثل', 'متناسب', 'ملايم', 'محترم', 'منطقي ', 'متناسق', 'رشيق', 'مرتب'], 'apprec_comp_complex_neg': ['معقد', 'مفرط', 'يزنط', 'غامض', 'مبهم', 'عكر ', 'ع

## Appraisal features

Construct a word to appraisal group mapping

In [32]:
word_to_appraisal_grp = {}
for key, val in lexicon.items():
    for values in val:
        word_to_appraisal_grp[values] = key
word_to_appraisal_grp

{'حزين ': 'affect_happ_neg',
 'قانط': 'affect_happ_neg',
 'جزع': 'affect_happ_neg',
 'محطم ': 'affect_happ_neg',
 'ييب ': 'affect_happ_neg',
 'محبط': 'affect_happ_neg',
 'مولم': 'affect_happ_neg',
 'منقبض': 'affect_happ_neg',
 'مثير للشفق': 'affect_happ_neg',
 'مهموم': 'affect_happ_neg',
 'مكتيب': 'affect_happ_neg',
 'مكدر': 'affect_happ_neg',
 'ممل': 'judg_esteem_cap_neg',
 'بايس': 'judg_esteem_norm_neg',
 'تعيس': 'affect_happ_neg',
 'محز': 'affect_happ_neg',
 'قاتم': 'affect_happ_neg',
 'متج': 'affect_happ_neg',
 'منكسر الخاطر': 'affect_happ_neg',
 'منعزل': 'affect_happ_neg',
 'متد': 'affect_happ_neg',
 'مغتم': 'affect_happ_neg',
 'كاء': 'affect_happ_neg',
 'باك': 'affect_happ_neg',
 'دامع': 'affect_happ_neg',
 'مرح': 'affect_happ_pos',
 'نشط': 'affect_happ_pos',
 'مبتهج': 'affect_happ_pos',
 'حذر': 'judg_esteem_ten_pos',
 'خايف': 'affect_inc_neg',
 'مفزوع': 'affect_inc_neg',
 'سطح': 'apprec_reaction_impact_neg',
 'مبتذل': 'apprec_valuation_neg',
 'منزعج': 'affect_satis_neg',
 'غاضب'

In [33]:
appraisal_grp_to_idx = {}
idx = 0
for appraisal_grp in list(lexicon.keys()):
    appraisal_grp_to_idx[appraisal_grp] = idx
    idx += 1
appraisal_grp_to_idx

{'affect_happ_neg': 0,
 'affect_happ_pos': 1,
 'affect_inc_neg': 2,
 'affect_satis_neg': 3,
 'affect_satis_pos': 4,
 'affect_sec_neg': 5,
 'affect_sec_pos': 6,
 'apprec_comp_balance_neg': 7,
 'apprec_comp_balance_pos': 8,
 'apprec_comp_complex_neg': 9,
 'apprec_comp_complex_pos': 10,
 'apprec_reaction_impact_neg': 11,
 'apprec_reaction_impact_pos': 12,
 'apprec_reaction_quality_neg': 13,
 'apprec_reaction_quality_pos': 14,
 'apprec_valuation_neg': 15,
 'apprec_valuation_pos': 16,
 'judg_esteem_cap_neg': 17,
 'judg_esteem_cap_pos': 18,
 'judg_esteem_norm_neg': 19,
 'judg_esteem_norm_pos': 20,
 'judg_esteem_ten_neg': 21,
 'judg_esteem_ten_pos': 22,
 'judg_sanction_prop_neg': 23,
 'judg_sanction_prop_pos': 24,
 'judg_sanction_ver_neg': 25,
 'judg_sanction_ver_pos': 26,
 'verb_affect_happ_neg': 27,
 'verb_affect_happ_pos': 28,
 'verb_affect_inc_pos': 29}

In [34]:
def appraisal_features(document: str) -> List[float]:
    res = np.zeros(len(lexicon))
    tokens = [word for word in document.split(" ")]
    count_appraisal_grps = 0
    for token in tokens:
        if token in word_to_appraisal_grp:
            res[ appraisal_grp_to_idx[word_to_appraisal_grp[token]] ] += 1
            count_appraisal_grps += 1
    # normalize features by the count of appraisal groups if the count != 0
    res = res / count_appraisal_grps if count_appraisal_grps != 0 else res
    return res
    

Split data

In [35]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify = data["class"])
vectorizer = CountVectorizer()

X_train_BOW = vectorizer.fit_transform(X_train).toarray()
X_test_BOW = vectorizer.transform(X_test).toarray()

X_train_BOW[0].shape

(24742,)

construct training and testing appraisal feature matrix

In [36]:
X_train_appraisal = np.array(
    [appraisal_features(document) for document in X_train]
)

X_train_appraisal.shape

(23642, 30)

In [37]:
X_test_appraisal = np.array(
    [appraisal_features(document) for document in X_test]
)

X_test_appraisal.shape

(5911, 30)

### Features' union

In [38]:
X_train = np.hstack((X_train_BOW, X_train_appraisal))
X_test  = np.hstack((X_test_BOW, X_test_appraisal))

print(X_train.shape)
print(X_test.shape)

(23642, 24772)
(5911, 24772)


# Performance evaluation
## Naive bayes

In [39]:
if os.path.exists("./models/ASTC/APPRAISAL/NB.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/NB.pkl", 'rb').read()
    model: GaussianNB = pickle.loads(model)
else:
    model = GaussianNB()
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/NB.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 58.53%
Precesion : 55.61%
Recall : 75.55%
F1 score : 64.07%


#### Logistic regression

In [40]:
if os.path.exists("./models/ASTC/APPRAISAL/LR.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/LR.pkl", 'rb').read()
    model: LogisticRegression = pickle.loads(model)
else:
    model = LogisticRegression(random_state = SEED, max_iter = 1500)
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/LR.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 87.50%
Precesion : 85.21%
Recall : 90.08%
F1 score : 87.58%


#### SVM

We were unable to train this model on our machines using the initial dataset, due to the **curse of dimensionality**, so we added dimensioanlity reduction

In [41]:
if os.path.exists("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl"):
    # read pipeline from disk
    pipeline = open("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl", 'rb').read()
    pipeline = pickle.loads(pipeline)
else:
    steps = [
        ('pca', PCA(n_components = 150, random_state = SEED)),
        ('svm', SVC(random_state = SEED))
    ]
    pipeline = Pipeline(steps = steps)
    pipeline.fit(X_train, y_train)
    # write pipeline to disk
    pipeline_bytes = pickle.dumps(pipeline)
    open("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl", 'wb').write(pipeline_bytes)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label = "pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 86.84%
Precesion : 82.48%
Recall : 92.81%
F1 score : 87.34%


#### Random forest

In [42]:
if os.path.exists("./models/ASTC/APPRAISAL/RF.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/RF.pkl", 'rb').read()
    model = pickle.loads(model)
else:
    model = RandomForestClassifier(random_state = SEED)
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/RF.pkl", 'wb').write(mdl_bytes)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 86.84%
Precesion : 82.48%
Recall : 92.81%
F1 score : 87.34%


# Using appraisal features only (G:AO)

## Naive bayes

In [43]:
if os.path.exists("./models/ASTC/APPRAISAL/GAO/NB.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/GAO/NB.pkl", 'rb').read()
    model: GaussianNB = pickle.loads(model)
else:
    model = GaussianNB()
    model.fit(X_train_appraisal, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/GAO/NB.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test_appraisal)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 51.14%
Precesion : 50.04%
Recall : 95.30%
F1 score : 65.62%


## Lostic regression

In [44]:
if os.path.exists("./models/ASTC/APPRAISAL/GAO/LR.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/GAO/LR.pkl", 'rb').read()
    model: LogisticRegression = pickle.loads(model)
else:
    model = LogisticRegression(random_state = SEED, max_iter = 1500)
    model.fit(X_train_appraisal, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/GAO/LR.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test_appraisal)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 64.86%
Precesion : 80.12%
Recall : 37.48%
F1 score : 51.07%


## SVM

We were unable to train this model on our machines using the initial dataset, due to the **curse of dimensionality**, so we added dimensioanlity reduction

In [45]:
if os.path.exists("./models/ASTC/APPRAISAL/GAO/SVM_Pipeline.pkl"):
    # read pipeline from disk
    pipeline = open("./models/ASTC/APPRAISAL/GAO/SVM_Pipeline.pkl", 'rb').read()
    pipeline = pickle.loads(pipeline)
else:
    steps = [
        ('svm', SVC(random_state = SEED))
    ]
    pipeline = Pipeline(steps = steps)
    pipeline.fit(X_train_appraisal, y_train)
    # write pipeline to disk
    pipeline_bytes = pickle.dumps(pipeline)
    open("./models/ASTC/APPRAISAL/GAO/SVM_Pipeline.pkl", 'wb').write(pipeline_bytes)

y_pred = pipeline.predict(X_test_appraisal)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label = "pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 64.73%
Precesion : 80.13%
Recall : 37.10%
F1 score : 50.72%


## Random forest

In [46]:
if os.path.exists("./models/ASTC/APPRAISAL/GAO/RF.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/GAO/RF.pkl", 'rb').read()
    model = pickle.loads(model)
else:
    model = RandomForestClassifier(random_state = SEED)
    model.fit(X_train_appraisal, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/GAO/RF.pkl", 'wb').write(mdl_bytes)

y_pred = pipeline.predict(X_test_appraisal)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 64.73%
Precesion : 80.13%
Recall : 37.10%
F1 score : 50.72%


## Serialize data

In [49]:
from appraisal import AppraisalUtils

utils = AppraisalUtils(lexicon, word_to_appraisal_grp, appraisal_grp_to_idx)

In [50]:
open("./models/ASTC/APPRAISAL/vectorizer.pkl", "wb").write(pickle.dumps(utils))

6592