In [1]:
import pandas as pd
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from tashaphyne.stemming import ArabicLightStemmer

In [2]:
# Download NLTK resources if not already available
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the Arabic Light Stemmer
ArListem = ArabicLightStemmer()

# Define Arabic and English punctuation
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Normalization function for Arabic
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ئ", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

# Define Arabic stopwords
arabic_stopwords = set(stopwords.words('arabic'))

# Preprocessing function to clean and stem text data
def preprocess_text(text):
    # Normalize text
    text = normalize_arabic(text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords, convert to lowercase
    cleaned_tokens = [word.lower() for word in tokens if word not in punctuations_list and word not in arabic_stopwords]
    # Stem each token using Arabic Light Stemmer
    stemmed_tokens = [ArListem.light_stem(word) or ArListem.get_root() for word in cleaned_tokens]
    return ' '.join(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Musae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Musae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the dataset
df = pd.read_csv('C:\\Users\\Musae\\Documents\\GitHub-REPOs\\NLP-Project\\data\\ar_reviews_100k.tsv', sep='\t')

# Apply preprocessing to the text column
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Predict the sentiment labels on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the performance of the classifier
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Naive Bayes Classifier Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Naive Bayes Classifier Accuracy: 0.63
Classification Report:
              precision    recall  f1-score   support

       Mixed       0.56      0.54      0.55      6722
    Negative       0.67      0.69      0.68      6603
    Positive       0.67      0.67      0.67      6675

    accuracy                           0.63     20000
   macro avg       0.63      0.63      0.63     20000
weighted avg       0.63      0.63      0.63     20000



In [4]:
from sklearn.svm import SVC

# Initialize and train the SVC model
svc_classifier = SVC(kernel='linear', C=1.0, random_state=42)
svc_classifier.fit(X_train_tfidf, y_train)

# Predict the sentiment labels on the test set
y_pred_svc = svc_classifier.predict(X_test_tfidf)

# Evaluate the performance of the SVC classifier
accuracy_svc = metrics.accuracy_score(y_test, y_pred_svc)
report_svc = metrics.classification_report(y_test, y_pred_svc)

# Print the evaluation metrics
print(f"SVC Classifier Accuracy: {accuracy_svc:.2f}")
print("Classification Report:")
print(report_svc)


SVC Classifier Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

       Mixed       0.57      0.56      0.56      6722
    Negative       0.71      0.69      0.70      6603
    Positive       0.67      0.70      0.68      6675

    accuracy                           0.65     20000
   macro avg       0.65      0.65      0.65     20000
weighted avg       0.65      0.65      0.65     20000

