In [5]:
import pandas as pd
import string  # Import the string module
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Use the preprocessing functions from the rule-based approach
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ئ", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

def preprocess_text(text, stopwords, punctuations):
    # Normalize Arabic text
    text = normalize_arabic(text)
    # Remove punctuation and stop words
    text = re.sub(r'\W+', ' ', text)
    words = text.split()
    return ' '.join([word for word in words if word not in punctuations and word not in stopwords])

# Load the dataset
df = pd.read_csv('C:\\Users\\Musae\\Documents\\GitHub-REPOs\\NLP-Project\\data\\ar_reviews_100k.tsv', sep='\t')

# Define Arabic and English punctuation
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

In [7]:
# Arabic stopwords (make sure to import NLTK stopwords and download)
from nltk.corpus import stopwords
arabic_stopwords = set(stopwords.words('arabic'))

# Preprocess the text
df['cleaned_text'] = df['text'].apply(lambda x: preprocess_text(x, arabic_stopwords, punctuations_list))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict the sentiment labels on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the performance of the classifier
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Naive Bayes Classifier Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Naive Bayes Classifier Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

       Mixed       0.57      0.56      0.57      6722
    Negative       0.68      0.69      0.69      6603
    Positive       0.68      0.67      0.68      6675

    accuracy                           0.64     20000
   macro avg       0.64      0.64      0.64     20000
weighted avg       0.64      0.64      0.64     20000

