In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Load dataset
df = pd.read_csv('C:\\Users\\Musae\\Documents\\GitHub-REPOs\\NLP-Project\\data\\ar_reviews_100k.tsv', sep='\t')
df.columns = ['label', 'text']

# Drop mixed labels and duplicates
df = df[df['label'] != 'Mixed']
df = df.drop_duplicates()

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('arabic'))

# Define punctuations
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Function to remove diacritics and emojis
def clean_text(text):
    text = re.sub('[\u0617-\u061A\u064B-\u0652]', '', text)  # Remove diacritics
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    text = re.sub(r'[\U0001F600-\U0001F64F]', '', text)  # Remove emojis
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

# Apply text cleaning
df['cleaned_text'] = df['text'].apply(clean_text)

# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.3, random_state=42)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

# Function to train and evaluate a model
def train_evaluate_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    return accuracy, conf_matrix

# Train and evaluate models
models = {
    'SVC': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
}

for model_name, model in models.items():
    accuracy, conf_matrix = train_evaluate_model(model, x_train_tfidf, y_train, x_test_tfidf, y_test)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Confusion Matrix:\n{conf_matrix}\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Musae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


SVC Accuracy: 0.8495
SVC Confusion Matrix:
[[8350 1626]
 [1385 8639]]

Logistic Regression Accuracy: 0.8464
Logistic Regression Confusion Matrix:
[[8403 1573]
 [1500 8524]]

Random Forest Accuracy: 0.8212
Random Forest Confusion Matrix:
[[8359 1617]
 [1960 8064]]

Naive Bayes Accuracy: 0.8316
Naive Bayes Confusion Matrix:
[[8626 1350]
 [2018 8006]]

