In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re



In [2]:
# Load dataset
# NOTE(review): this is an absolute, machine-specific location; point
# DATA_PATH at your own copy of the TSV when running elsewhere.
DATA_PATH = 'C:\\Users\\Musae\\Documents\\GitHub-REPOs\\NLP-Project\\data\\ar_reviews_100k.tsv'
df = pd.read_csv(DATA_PATH, sep='\t')

# Drop mixed labels and duplicates
df = df[df['label'] != 'Mixed']
df = df.drop_duplicates()

# Download stopwords
nltk.download('stopwords')
# word_tokenize, which the cleaning and stemming functions call, needs the
# Punkt tokenizer models. Recent NLTK releases look them up as 'punkt_tab',
# older ones as 'punkt', so fetch both (a no-op when already installed).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
stop_words = set(stopwords.words('arabic'))

# Define punctuations
# Every character in punctuations_list is deleted from the reviews during
# cleaning; the list is the Arabic set followed by string.punctuation.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Musae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Function to remove diacritics
# U+064B..U+0652 covers the tanwin forms, fatha, damma, kasra, shadda and
# sukun; U+0640 is the tatweel (kashida) elongation mark. The code points are
# written as escapes so the pattern stays readable and cannot be silently
# altered by an editor that mishandles combining characters.
_ARABIC_DIACRITICS_RE = re.compile("[\u064B-\u0652\u0640]+")


def remove_diacritics(text):
    """Strip Arabic short-vowel marks and tatweel from ``text``.

    Args:
        text (str): Text that may contain Arabic diacritics.

    Returns:
        str: ``text`` with every character in U+064B-U+0652 and U+0640
        removed; all other characters are left untouched.
    """
    return _ARABIC_DIACRITICS_RE.sub('', text)

# Function to remove emojis
# Code-point ranges treated as emoji. The first four are the ranges this
# notebook always stripped; the rest cover newer or BMP emoji that would
# otherwise survive cleaning (e.g. U+1F914 thinking face, U+2764 heart).
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
)
# A zero-width joiner (U+200D) is consumed only when it glues two emoji
# together, so joiners elsewhere in the text are preserved.
_EMOJI_RE = re.compile("[{0}]+(?:\u200D[{0}]+)*".format(_EMOJI_RANGES))


def remove_emoji(text):
    """Delete emoji characters from ``text``.

    Args:
        text (str): Text that may contain emoji.

    Returns:
        str: ``text`` with every run of characters from ``_EMOJI_RANGES``
        (and any zero-width joiners linking them) removed. Surrounding
        characters, including whitespace, are left exactly as they were.
    """
    return _EMOJI_RE.sub('', text)

# Function to clean text
def clean_text(text):
    """Normalise one raw review into a space-separated string of kept tokens.

    Steps, in order: delete every character listed in ``punctuations_list``,
    strip emojis, strip Arabic diacritics, tokenize with ``word_tokenize``
    and drop the tokens found in ``stop_words``.

    Args:
        text: The raw review. Missing values (``None``, ``NaN``, ``pd.NA``)
            are treated as an empty review instead of raising ``TypeError``;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces (``''`` when the
        input is missing or nothing survives).
    """
    if not isinstance(text, str):
        # Empty cells read from a CSV/TSV arrive as float NaN, not as ''.
        if pd.isna(text):
            return ''
        text = str(text)

    # NOTE(review): punctuation is deleted rather than replaced by a space,
    # so "a,b" becomes "ab" -- confirm that merging is intended.
    text = text.translate(str.maketrans('', '', punctuations_list))
    text = remove_emoji(text)
    text = remove_diacritics(text)
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Apply text cleaning
# Run clean_text over every review, one value at a time, and keep the result
# in a new column next to the raw text.
df['cleanedtext'] = df['text'].map(clean_text)

# Helper: build the ISRI stemmer once instead of once per review
def _get_isri_stemmer():
    """Return a shared ``ISRIStemmer``, creating it on the first call."""
    if not hasattr(_get_isri_stemmer, "instance"):
        _get_isri_stemmer.instance = nltk.ISRIStemmer()
    return _get_isri_stemmer.instance


# Function to process text (stemming)
def process_text(text):
    """Stem every token of ``text`` with the ISRI stemmer.

    Args:
        text (str): Text to stem; it is split into tokens with
            ``word_tokenize``.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    stem = _get_isri_stemmer().stem
    return ' '.join(stem(token) for token in word_tokenize(text))

# Apply text processing
df['cleanedtextnew'] = df['cleanedtext'].map(process_text)

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
from sklearn.model_selection import train_test_split
x, y = df['cleanedtextnew'], df['label']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Show "<number of texts> <number of labels>" for train, then for test
print(*map(len, (x_train, y_train)))
print(*map(len, (x_test, y_test)))

46666 46666
20000 20000


In [4]:
# Install LazyPredict if you haven't already
!pip install lazypredict



In [5]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the number of terms kept in the TF-IDF vocabulary
TFIDF_MAX_FEATURES = 5000

vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
# Fit on the training texts only, then reuse the fitted vectorizer to
# transform the test texts.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)