<a href="https://colab.research.google.com/github/OtmaneDaoudi/Arabic-sentiment-analysis/blob/main/arabic_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [None]:
# !pip install emoji
# !pip install Arabic-Stopwords
# !pip install seaborn
# !pip install matplotlib
# !pip install soyclustering

# Libs imports

In [None]:
import os
import re
import nltk
import emoji
import pickle

import arabicstopwords.arabicstopwords as stp
import pandas as pd
import pyarabic.araby as araby
import numpy as np 

from nltk.corpus import stopwords
from collections import defaultdict
from math import log
from snowballstemmer import stemmer

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation, PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline

from typing import List

nltk.download('stopwords')

SEED = 21

# Importing data

In [None]:
data = pd.read_csv("./datasets/ASTC/data.tsv", header = 0, sep='\t', names = ["class", "tweet"]).sample(frac = 1, random_state = SEED)
data.head(20)

In [None]:
data.info()

# Data preprocessing

Our preprocessing pipeline contains the following steps:

1.  Remove duplicat entries
2.  Replacing emojies & emoticons
3.  Remove mentions
4.  Remove Links
5.  Remove whitespaces
6.  Remove punctuations & Special chars
7.  Remove Consecutive characters
8.  Tokenization
9.  Remove foreign words
10. Remove stop words
11. Remove numbers
12. Stemming


## Removing duplicates

In [None]:
count = data.duplicated().sum()
print(f"{(count / data.shape[0]) * 100:.1f}% of the data are duplicats")

In [None]:
data.drop_duplicates(inplace = True)

## Replacing emojies

In [None]:
emojis = {
    "🙂":"يبتسم",
    "😂":"يضحك",
    "🤣" : "يضحك",
    "💔":"قلب حزين",
    "🙂":"يبتسم",
    "❤️":"حب",
    "🥰":"حب",
    "🤐":"سكوت",
    "🧡":"حب",
    "❤":"حب",
    "😍":"حب",
    "😭":"يبكي",
    "🤭":"يبتسم",
    "😢":"حزن",
    "😔":"حزن",
    "♥":"حب",
    "💜":"حب",
    "😅":"يضحك",
    "🙁":"حزين",
    "💕":"حب",
    "💙":"حب",
    "😞":"حزين",
    "😊":"سعادة",
    "👏":"يصفق",
    "👌":"احسنت",
    "😴":"ينام",
    "😀":"يضحك",
    "✅":"صحيح",
    "🤪":"يضحك",
    "🏡" : "بيت",
    "🤲" : "دعاء",
    "💰" : "مال",
    "😌":"حزين",
    "🎁":"هدية",
    "🌹":"وردة",
    "🥀":"وردة",
    "📿":"وردة",
    "✍":"كتابة",
    "🙈":"حب",
    "😄":"يضحك",
    "😐":"محايد",
    "✌":"منتصر",
    "✨":"نجمه",
    "🤔":"تفكير",
    "😏":"يستهزء",
    "😒":"يستهزء",
    "🙄":"ملل",
    "😕":"عصبية",
    "😃":"يضحك",
    "🌸":"وردة",
    "😓":"حزن",
    "💞":"حب",
    "💗":"حب",
    "😑":"منزعج",
    "💭":"تفكير",
    "😎":"ثقة",
    "💛":"حب",
    "😩":"حزين",
    "🥺":"حزين",
    "💪":"عضلات",
    "👍":"موافق",
    "🙏🏻":"رجاء طلب",
    "😳":"مصدوم",
    "👏🏼":"تصفيق",
    "🎶":"موسيقي",
    "🌚":"صمت",
    "💚":"حب",
    "🙏":"رجاء طلب",
    "💘":"حب",
    "🍃":"سلام",
    "☺":"يضحك",
    "🎊":"يهنئ",
    "💥":"إنفجار",
    "😝":"يسخر",
    "💯":"تمام",
    "🐸":"ضفدع",
    "🤦‍♂️":"غبي",
    "🤩":"معجب",
    "🤤":"جائع",
    "😶":"مصدوم",
    "✌️":"مرح",
    "✋🏻":"توقف",
    "😉":"غمزة",
    "🌷":"حب",
    "🙃":"مبتسم",
    "😫":"حزين",
    "😨":"مصدوم",
    "🎼 ":"موسيقي",
    "🍁":"مرح",
    "🍂":"مرح",
    "💟":"حب",
    "😪":"حزن",
    "😆":"يضحك",
    "😣":"استياء",
    "☺️":"حب",
    "😱":"كارثة",
    "😁":"يضحك",
    "😖":"استياء",
    "🏃🏼":"يجري",
    "😡":"غضب",
    "🚶":"يسير",
    "🤕":"مرض",
    "🤮" : "يتقيئ",
    "⛔": "حذر",
    "‼️":"تعجب",
    "🕊":"طائر",
    "👌🏻":"احسنت",
    "❣":"حب",
    "🙊":"مصدوم",
    "💃":"سعادة مرح",
    "💃🏼":"سعادة مرح",
    "😜":"مرح",
    "👊":"ضربة",
    "😟":"استياء",
    "💖":"حب",
    "😥":"حزن",
    "🎻":"موسيقي",
    "✒":"يكتب",
    "🚶🏻":"يسير",
    "💎":"الماظ",
    "😷":"وباء مرض",
    "☝":"واحد",
    "🚬":"تدخين",
    "💐" : "ورد",
    "🌻" : "ورد",
    "🌞" : "شمس",
    "👆" : "الاول",
    "⚠️" :"تحذير",
    "🤗" : "احتواء",
    "✖️": "غلط",
    "📍"  : "مكان",
    "👸" : "ملكه",
    "👑" : "تاج",
    "✔️" : "صح",
    "💌": "قلب",
    "😲" : "مندهش",
    "💦": "ماء",
    "🚫" : "خطا",
    "👏🏻" : "برافو",
    "🏊" :"يسبح",
    "👍🏻": "تمام",
    "⭕️" :"دائره كبيره",
    "🎷" : "ساكسفون",
    "👋": "تلويح باليد",
    "✌🏼": "علامه النصر",
    "🌝":"مبتسم",
    "➿"  : "عقده مزدوجه",
    "💪🏼" : "قوي",
    "📩":  "تواصل معي",
    "☕️": "قهوه",
    "😧" : "قلق و صدمة",
    "🗨": "رسالة",
    "❗️" :"تعجب",
    "🙆🏻": "اشاره موافقه",
    "👯" :"اخوات",
    "©" :  "رمز",
    "👵🏽" :"سيده عجوزه",
    "🐣": "كتكوت",
    "🙌": "تشجيع",
    "🙇": "شخص ينحني",
    "👐🏽":"ايدي مفتوحه",
    "👌🏽": "بالظبط",
    "⁉️" : "استنكار",
    "⚽️": "كوره",
    "🕶" :"حب",
    "🎈" :"بالون",
    "🎀":    "ورده",
    "💵":  "فلوس",
    "😋":  "جائع",
    "😛":  "يغيظ",
    "😠":  "غاضب",
    "✍🏻":  "يكتب",
    "🌾":  "ارز",
    "👣":  "اثر قدمين",
    "❌":"رفض",
    "🍟":"طعام",
    "👬":"صداقة",
    "🐰":"ارنب",
    "🦋" : "فراشة",
    "☂":"مطر",
    "⚜":"مملكة فرنسا",
    "🐑":"خروف",
    "🗣":"صوت مرتفع",
    "👌🏼":"احسنت",
    "☘":"مرح",
    "😮":"صدمة",
    "😦":"قلق",
    "⭕":"الحق",
    "✏️":"قلم",
    "ℹ":"معلومات",
    "🙍🏻":"رفض",
    "⚪️":"نضارة نقاء",
    "🐤":"حزن",
    "💫":"مرح",
    "💝":"حب",
    "🍔":"طعام",
    "❤︎":"حب",
    "✈️":"سفر",
    "🏃🏻‍♀️":"يسير",
    "🍳":"ذكر",
    "🎤":"مايك غناء",
    "🎾":"كره",
    "🐔":"دجاجة",
    "🙋":"سؤال",
    "📮":"بحر",
    "💉":"دواء",
    "🙏🏼":"رجاء طلب",
    "💂🏿 ":"حارس",
    "🎬":"سينما",
    "♦️":"مرح",
    "💡":"قكرة",
    "‼":"تعجب",
    "👼":"طفل",
    "🔑":"مفتاح",
    "♥️":"حب",
    "🌲" : "شجرة",
    "🌳" : "شجرة",
    "🚩" : "حذر",
    "🚨" : "حذر",
    "🛑" : "حذر",
    "🕋":"كعبة",
    "🐓":"دجاجة",
    "💩":"معترض",
    "👽":"فضائي",
    "☔️":"مطر",
    "🍷":"عصير",
    "🌟":"نجمة",
    "☁️":"سحب",
    "👃":"معترض",
    "🌺":"مرح",
    "🔪":"سكينة",
    "♨":"سخونية",
    "👊🏼":"ضرب",
    "✏":"قلم",
    "🚶🏾‍♀️":"يسير",
    "👊":"ضربة",
    "◾️":"وقف",
    "😚":"حب",
    "🔸":"مرح",
    "👎🏻":"لا يعجبني",
    "👊🏽":"ضربة",
    "😙":"حب",
    "🎥":"تصوير",
    "👉":"جذب انتباه",
    "👏🏽":"يصفق",
    "💪🏻":"عضلات",
    "🏴":"اسود",
    "🔥":"حريق",
    "😬":"عدم الراحة",
    "👊🏿":"يضرب",
    "📚" : "كتب",
    "📌" : "علق",
    "🌿":"ورقه شجره",
    "✋🏼":"كف ايد",
    "👐":"ايدي مفتوحه",
    "☠️":"وجه مرعب",
    "🎉":"يهنئ",
    "🔕" :"صامت",
    "😿":"وجه حزين",
    "☹️":"وجه يائس",
    "😘" :"حب",
    "😰" :"خوف و حزن",
    "🌼":"ورده",
    "💋": "بوسه",
    "👇":"لاسفل",
    "❣️":"حب",
    "🎧":"سماعات",
    "📝":"يكتب",
    "😇":"دايخ",
    "😈":"رعب",
    "🏃":"يجري",
    "✌🏻":"علامه النصر",
    "🔫":"يضرب",
    "❗️":"تعجب",
    "👎":"غير موافق",
    "🔐":"قفل",
    "👈":"لليمين",
    "™":"رمز",
    "🚶🏽":"يتمشي",
    "😯":"متفاجأ",
    "✊":"يد مغلقه",
    "😻":"اعجاب",
    "🙉" :"قرد",
    "👧":"طفله صغيره",
    "🔴":"دائره حمراء",
    "💪🏽":"قوه",
    "💤":"ينام",
    "👀":"ينظر",
    "✍🏻":"يكتب",
    "❄️":"تلج",
    "💀":"رعب",
    "😤":"وجه عابس",
    "🖋":"قلم",
    "🎩":"كاب",
    "☕️":"قهوه",
    "😹":"ضحك",
    "💓":"حب",
    "☄️":"نار",
    "👻":"رعب",
    "✋": "يد",
    "🌱": "نبتة",

    # Emoticons
    ":)" : "يبتسم",
    "(:" : "يبتسم",
    ":(" : "حزين",
    "xD" : "يضحك",
    ":=(": "يبكي",
    ":'(": "حزن",
    ":'‑(": "حزن",
    "XD" : "يضحك",
    ":D" : "يبتسم",
    "♬" : "موسيقي",
    "♡" : "حب",
    "☻"  : "يبتسم",
}

def replace_emojis(text):
    pattern = re.compile('|'.join(re.escape(key) for key in emojis.keys()))
    replaced_text = pattern.sub(lambda match: emojis[match.group(0)] + ' ', text)
    return emoji.replace_emoji(replaced_text, '')

data["tweet"] = data["tweet"].apply(lambda document: replace_emojis(document))
data.head()

## Removing mentions

In [None]:
pattern = r'@[\w]+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Removing links

In [None]:
pattern = r'https?://\S+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Remove foriegn words

The text includes english, japanese and words for other languages

In [None]:
pattern = r'[a-zA-Z]+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Remove punctuations & special chars

In [None]:
pattern = r'[^\w\s\u0600-\u06FF]+|_|ﷺ|۩|⓵|؟|؛|۞|ﷻ|،| ٰ'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Remove consecutive characters

In [None]:
pattern = r'(.)\1+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, r'\1', document))
data.head()

## Remove tatweel

In [None]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_tatweel(document))

## Remove numbers

In [None]:
pattern = r'\d+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Remove extra whitespaces
In this step we get rid of extra whitespaces as well as new lines

In [None]:
pattern = r'\s+|\n+'
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, ' ', document))
data.head()

## Remove harakat

In [None]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_tashkeel(document))
data.head()

## Remove diactrics

In [None]:
data["tweet"] = data["tweet"].apply(lambda document: araby.strip_diacritics(document))
data.head()

## Normalize hamza

In [None]:
pattern = r"أ|إ|آ"
data["tweet"] = data["tweet"].apply(lambda document: re.sub(pattern, 'ا', document))
data.head()

## Tokenization

In [None]:
data["tweet"] = data["tweet"].apply(lambda document: araby.tokenize(document))
data.head()

## Remove long & short words

In [None]:
data["tweet"] = data["tweet"].apply(lambda document: [word for word in document if len(word) < 9 and len(word) > 1])
data.head()

## Stemming


In [None]:
ar_stemmer = stemmer("arabic")
data["tweet"] = data["tweet"].apply(lambda doc: [ar_stemmer.stemWord(token) for token in doc])

In [None]:
data.head()

## Removing stop words

In [None]:
arabic_stopwords = stopwords.words('arabic')
arabic_stopwords.extend(stp.stopwords_list())
stop_words = {ar_stemmer.stemWord(entry) for entry in arabic_stopwords}
with open("arabic_stopwords.txt", "r", encoding="UTF-8") as file:
    for word in file:
        stop_words.add(ar_stemmer.stemWord(word.strip()))

In [None]:
open("./models/stopwords.pkl", "wb").write(pickle.dumps(stop_words))

In [None]:
def remove_stopwords(document: str) -> str:
    words = set(document.split(" "))
    return " ".join(list(words - stop_words))

data["tweet"] = data["tweet"].apply(lambda document: " ".join([token for token in document if token not in stop_words]))
data.head()

## Save preprocessed data

In [None]:
if not os.path.exists("preprocessed_data.csv"):
    # remove empty entries
    # data.replace('', pd.NA, inplace=True)  # Replace empty strings with NA
    # data.dropna(inplace=True)  # Drop rows with NA values
    data.to_csv("preprocessed_data.csv") # inspect the resulting file to validate the preprocessing

# Text representation

# Appraisal analysis
Bow + G:AO

G:AO Appraisal Group by Attitude & Orientation — Total
frequency of appraisal groups with each possible combination of Attitude and Orientation, normalized by total number of appraisal groups in the text.

## Read lexicon

In [None]:
import openpyxl

def excel_to_dict(file_path):
    workbook = openpyxl.load_workbook(file_path)
    data_dict = {}
    
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        values = [cell.value for cell in sheet['A'] if cell.value is not None]
        data_dict[sheet_name] = values
    
    return data_dict

# Example usage:
file_path = 'example.xlsx'  # Replace 'example.xlsx' with your file path
lexicon = excel_to_dict("./Arabic seed terms.xlsx")
print(lexicon)

## Preprocess lexicon

In [40]:
ar_stemmer = stemmer("arabic")
for key, val in lexicon.items():
    for idx, val in enumerate(val):
        preprocessed_val = araby.strip_tashkeel(val)
        preprocessed_val = araby.strip_diacritics(preprocessed_val)
        preprocessed_val = re.sub(r"أ|إ|آ", 'ا', preprocessed_val)
        preprocessed_val = ar_stemmer.stemWord(preprocessed_val)
        lexicon[key][idx] = preprocessed_val
print(lexicon)

{'affect_happ_neg': ['حزين ', 'قانط', 'جزع', 'محطم ', 'ييب ', 'محبط', 'مولم', 'منقبض', 'مثير للشفق', 'مهموم', 'مكتيب', 'مكدر', 'ممل', 'بايس', 'تعيس', 'محز', 'قاتم', 'متج', 'منكسر الخاطر', 'منعزل', 'متد', 'مغتم', 'كاء', 'باك', 'دامع'], 'affect_happ_pos': ['مرح', 'نشط', 'مبتهج'], 'affect_inc_neg': ['حذر', 'خايف', 'مفزوع'], 'affect_satis_neg': ['سطح', 'مبتذل', 'ممل', 'منزعج', 'غاضب', 'مغيظ', 'متضايق', 'ساخط', 'سييم'], 'affect_satis_pos': ['معن', 'منشغل ', 'منهم', 'راض', 'مسرور', 'معجب ', 'سعيد', 'منبهر', 'مفتون', 'مثير', 'مثير للاعجاب'], 'affect_sec_neg': ['مرتب', 'قلق ', 'شاذ', 'مفاجي ', 'مندهش', 'مشد'], 'affect_sec_pos': ['واثق', 'موكد', 'مريح', 'واثق من نفس', 'موضع ثق'], 'apprec_comp_balance_neg': ['مختل', 'متعارض', 'متقطع', 'متفا', 'معيب', 'متناقض', 'وضو', 'مشو', 'بشع', 'محرف'], 'apprec_comp_balance_pos': ['متواز', 'متناغم', 'موحد', 'متماثل', 'متناسب', 'ملايم', 'محترم', 'منطقي ', 'متناسق', 'رشيق', 'مرتب'], 'apprec_comp_complex_neg': ['معقد', 'مفرط', 'يزنط', 'غامض', 'مبهم', 'عكر ', 'عا

## Appraisal features

Construct a word to appraisal group mapping

In [None]:
word_to_appraisal_grp = {}
for key, val in lexicon.items():
    for values in val:
        word_to_appraisal_grp[values] = key
word_to_appraisal_grp

In [None]:
appraisal_grp_to_idx = {}
idx = 0
for appraisal_grp in list(lexicon.keys()):
    appraisal_grp_to_idx[appraisal_grp] = idx
    idx += 1
appraisal_grp_to_idx

In [49]:
def appraisal_features(document: str) -> List[float]:
    res = np.zeros(len(lexicon))
    tokens = [word for word in document.split(" ")]
    count_appraisal_grps = 0
    for token in tokens:
        if token in word_to_appraisal_grp:
            res[ appraisal_grp_to_idx[word_to_appraisal_grp[token]] ] += 1
            count_appraisal_grps += 1
    # normalize features by the count of appraisal groups if the count != 0
    res = res / count_appraisal_grps if count_appraisal_grps != 0 else res
    return res
    

Split data

In [47]:
X_train, X_test, y_train, y_test = train_test_split(data["tweet"], data["class"], test_size = 0.2, random_state = SEED, stratify = data["class"])
vectorizer = CountVectorizer()

X_train_BOW = vectorizer.fit_transform(X_train).toarray()
X_test_BOW = vectorizer.transform(X_test).toarray()

X_train_BOW[0].shape

(24742,)

construct training and testing appraisal feature matrix

In [60]:
X_train_appraisal = np.array(
    [appraisal_features(document) for document in X_train]
)

X_train_appraisal.shape

(23642, 30)

In [62]:
X_test_appraisal = np.array(
    [appraisal_features(document) for document in X_test]
)

X_test_appraisal.shape

(5911, 30)

### Features' union

In [65]:
X_train = np.hstack((X_train_BOW, X_train_appraisal))
X_test  = np.hstack((X_test_BOW, X_test_appraisal))

print(X_train.shape)
print(X_test.shape)

(23642, 24772)
(5911, 24772)


# Performance evaluation
## Naive bayes

In [66]:
if os.path.exists("./models/ASTC/APPRAISAL/NB.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/NB.pkl", 'rb').read()
    model: GaussianNB = pickle.loads(model)
else:
    model = GaussianNB()
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/NB.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 58.53%
Precesion : 55.61%
Recall : 75.55%
F1 score : 64.07%


#### Logistic regression

In [67]:
if os.path.exists("./models/ASTC/APPRAISAL/LR.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/LR.pkl", 'rb').read()
    model: LogisticRegression = pickle.loads(model)
else:
    model = LogisticRegression(random_state = SEED, max_iter = 1500)
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/LR.pkl", 'wb').write(mdl_bytes)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 87.50%
Precesion : 85.21%
Recall : 90.08%
F1 score : 87.58%


#### SVM

We were unable to train this model on our machines using the initial dataset, due to the **curse of dimensionality**, so we added dimensioanlity reduction

In [68]:
if os.path.exists("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl"):
    # read pipeline from disk
    pipeline = open("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl", 'rb').read()
    pipeline = pickle.loads(pipeline)
else:
    steps = [
        ('pca', PCA(n_components = 150, random_state = SEED)),
        ('svm', SVC(random_state = SEED))
    ]
    pipeline = Pipeline(steps = steps)
    pipeline.fit(X_train, y_train)
    # write pipeline to disk
    pipeline_bytes = pickle.dumps(pipeline)
    open("./models/ASTC/APPRAISAL/SVM_Pipeline.pkl", 'wb').write(pipeline_bytes)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label = "pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 86.84%
Precesion : 82.48%
Recall : 92.81%
F1 score : 87.34%


#### Random forest

In [69]:
if os.path.exists("./models/ASTC/APPRAISAL/RF.pkl"):
    # read model from disk
    model = open("./models/ASTC/APPRAISAL/RF.pkl", 'rb').read()
    model = pickle.loads(model)
else:
    model = RandomForestClassifier(random_state = SEED)
    model.fit(X_train, y_train)
    # write model to disk
    mdl_bytes = pickle.dumps(model)
    open("./models/ASTC/APPRAISAL/RF.pkl", 'wb').write(mdl_bytes)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label="pos")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precesion : {precision * 100:.2f}%")
print(f"Recall : {recall * 100:.2f}%")
print(f"F1 score : {f1_score * 100:.2f}%")

Accuracy: 86.84%
Precesion : 82.48%
Recall : 92.81%
F1 score : 87.34%
