### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
import nltk
import emoji
import csv
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from qalsadi import lemmatizer

# Fetch every NLTK resource this notebook uses: 'punkt' backs word_tokenize,
# 'wordnet' backs WordNetLemmatizer, and 'stopwords' backs stopwords.words(...).
# Without the last one, building the stopword list fails on a fresh machine.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/madboly/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/madboly/nltk_data...


True

### Load Data

In [3]:
# Read the training spreadsheet into a DataFrame.
file_path = 'train.xlsx'
data = pd.read_excel(file_path)
# Cast the review column to str in a single step. The previous
# encode('utf-8').decode('utf-8') round-trip was a no-op on str values and,
# because it ran before the cast, raised on any non-string cell (e.g. NaN
# or a numeric review).
data['review_description'] = data['review_description'].astype(str)

TypeError: string indices must be integers

In [None]:
# Print column names, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first rows of the loaded data.
data.head()

### Basic preprocessing

##### Checking for null values

In [None]:
# Count missing values per column.
data.isnull().sum()

##### Lowercase 

In [None]:
# Start the working column as a lowercased copy of the raw review text;
# 'review_description' itself is left untouched.
data['preprocessed_review'] = data['review_description'].str.lower()

##### Dealing with duplicated reviews


In [None]:
# Count rows that are exact duplicates across all columns.
data.duplicated().sum()

In [None]:
# Keep only the first row for each distinct lowercased review text.
data = data.drop_duplicates(subset='preprocessed_review')
# NOTE(review): duplicated() without `subset` compares whole rows, so this
# count alone does not confirm that 'preprocessed_review' is now unique.
data.duplicated().sum()

### Cleaning text

In [None]:
def remove_punctuation(text):
    """Replace every punctuation character in ``text`` with a single space.

    The set of characters is a hand-written list of Arabic and typographic
    marks combined with ``string.punctuation``.
    """
    punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation
    # Escape the characters so each one is taken literally inside the class.
    punctuation_class = re.compile('[%s]' % re.escape(punctuations))
    return punctuation_class.sub(' ', text)

In [None]:
def remove_digits(text):
    """Return ``text`` with every run of decimal digits removed.

    The pattern is a ``str`` pattern, so ``\\d`` is Unicode-aware: Arabic-Indic
    digits are removed as well as ASCII ``0-9``.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.sub(r'\d+', '', text)

In [None]:
def remove_diacritics(text):
    """Strip Arabic diacritic marks (tashkeel) from ``text``.

    Removes the code points U+064B through U+0652: fathatan, dammatan,
    kasratan, fatha, damma, kasra, shadda and sukun. Base letters are left
    unchanged.
    """
    # An explicit code-point range replaces the former run of literal
    # combining marks, which was hard to read in an editor and listed one
    # mark twice.
    return re.sub(r"[\u064B-\u0652]", "", text)

In [None]:
def normalize_arabic(text):
    """Fold common Arabic letter variants into one canonical form.

    The substitutions are applied in order: alef variants to bare alef,
    alef maqsura to yeh, hamza carriers to a standalone hamza, taa marbuta
    to heh, tatweel removed, and gaf to kaf.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("[يى]", "ي"),
        ("[ؤئ]", "ء"),
        ("ة", "ه"),
        ("ـ", ""),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
def remove_repeating_char(text):
    """Collapse any character repeated three or more times in a row to one."""
    # (.) captures one character; \1{2,} demands at least two further copies.
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(lambda run: run.group(1), text)

In [None]:
def remove_long_words(text, threshold=15):
    """Drop space-separated tokens whose length is ``threshold`` or more.

    Splitting is on single spaces only, so runs of spaces survive as empty
    tokens and are re-joined unchanged.
    """
    kept = []
    for token in text.split(" "):
        if len(token) < threshold:
            kept.append(token)
    return ' '.join(kept)

#### Remove stopwords

In [None]:
# Combined English + Arabic stopword list from the NLTK stopwords corpus.
# NOTE(review): remove_stopwords tests membership against this list, which is
# a linear scan per token; a set would be faster if nothing relies on order.
# NOTE(review): preprocess() runs normalize_arabic before remove_stopwords, so
# Arabic stopwords spelled with hamza forms or taa marbuta may no longer match
# the normalized text - confirm whether the list should be normalized too.
stopwords_list = stopwords.words("english")+stopwords.words("arabic")
len(stopwords_list)

In [None]:
def remove_stopwords(text):
    """Tokenize ``text`` with NLTK and drop tokens found in ``stopwords_list``.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stopwords_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

#### Replace emojis

In [None]:
"""
Source:
https://github.com/a-ibrahimi/Arabic-Emojipedia
https://stackoverflow.com/a/76419165/13218954
"""
def build_emoji_dictionary(csv_file='emojis.csv'):
    """Load the emoji -> replacement-text mapping from a CSV file.

    Args:
        csv_file: Path to a UTF-8 CSV file. The first column of each row is
            the emoji and the second column is the text to substitute.

    Returns:
        A dict mapping each emoji string to its replacement text. Rows with
        fewer than two columns (for example blank lines) are skipped.

    Raises:
        OSError: If ``csv_file`` cannot be opened.
    """
    emoji_dict = {}
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for file objects.
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file):
            # A blank or single-column row carries no mapping.
            if len(row) < 2:
                continue
            symbol, replacement = row[0], row[1]
            emoji_dict[symbol] = replacement
    return emoji_dict

# Filled on first use so the CSV is read once instead of once per review.
_EMOJI_DICT_CACHE = None

def _get_emoji_dictionary():
    """Return the default emoji dictionary, loading it only on first use."""
    global _EMOJI_DICT_CACHE
    if _EMOJI_DICT_CACHE is None:
        _EMOJI_DICT_CACHE = build_emoji_dictionary()
    return _EMOJI_DICT_CACHE

def replace_emojis(text):
    """Replace each known emoji in ``text`` with its text, padded by spaces.

    Emojis that are not in the dictionary are left in place. The dictionary
    is loaded from the default CSV on the first call and reused afterwards,
    so later changes to the file are not picked up within a session.
    """
    emoji_dict = _get_emoji_dictionary()
    for emo in emoji.emoji_list(text):
        symbol = emo['emoji']
        if symbol in emoji_dict:
            # Replace the emoji with the corresponding text surrounded by spaces
            text = text.replace(symbol, ' ' + emoji_dict[symbol] + ' ')
    return text

#### Lemmatization

In [None]:
# One shared lemmatizer per language, created once and reused by
# lemmatize_word: NLTK WordNet for English, qalsadi for Arabic.
wordnet_lemmatizer = WordNetLemmatizer()
arabic_lemmatizer = lemmatizer.Lemmatizer()

In [None]:
def lemmatize_word(word):
    """Lemmatize a single word according to its detected language.

    English words go through the WordNet lemmatizer and Arabic words through
    the qalsadi lemmatizer. A word in any other detected language, or one
    whose language cannot be detected, is returned unchanged.
    """
    try:
        lang = detect(word)
    except Exception:
        # Best effort: detection can fail on tokens with no usable features.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return word

    if lang == 'en':
        return wordnet_lemmatizer.lemmatize(word)
    elif lang == 'ar':
        return arabic_lemmatizer.lemmatize(word)
    else:
        return word

def lemmatize_multilingual_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed to ``lemmatize_word`` and the results are joined
    with single spaces.
    """
    tokens = text.split()
    return ' '.join(lemmatize_word(token) for token in tokens)

In [None]:
def preprocess(text):
    """Run the full cleaning pipeline on one review and return the result.

    Steps, in order: punctuation to spaces, digit removal, diacritic removal,
    Arabic letter normalization, collapsing of repeated characters, removal
    of over-long words, stopword removal, emoji-to-text replacement, removal
    of remaining non-word characters, whitespace collapsing, lemmatization.
    """
    # NOTE(review): stopwords are removed after normalize_arabic and before
    # replace_emojis, so un-normalized stopword spellings may be missed and
    # the emoji replacement text is not filtered - confirm this is intended.
    text = remove_punctuation(text)
    text = remove_digits(text)
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    text = remove_repeating_char(text)
    text = remove_long_words(text)
    text = remove_stopwords(text)
    text = replace_emojis(text)
    # Remove unhandled emojis/invalid characters
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse any consecutive spaces to a single space.
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()
    text = lemmatize_multilingual_text(text)
    return text

In [None]:
# Run the cleaning pipeline on every (already lowercased) review in place.
data['preprocessed_review'] = data['preprocessed_review'].apply(preprocess)

In [None]:
# Extract a feature: character length of each cleaned review.
data['preprocessed_review_length'] = data['preprocessed_review'].apply(len)

In [None]:
# Check duplicates and nulls again (if a review became empty)
print(data['preprocessed_review'].isnull().sum())
# preprocess() returns '' rather than NaN when everything is stripped, so
# isnull() never flags such rows; count empty strings explicitly.
print((data['preprocessed_review'] == '').sum())
print(data.duplicated().sum())
# Whole-row comparison misses reviews that only became identical after
# cleaning; count duplicates on the cleaned text as well.
print(data.duplicated(subset='preprocessed_review').sum())

In [None]:
# Preview the data after preprocessing.
data.head()

In [None]:
# Persist the preprocessed data. With no `index` argument, pandas also writes
# the DataFrame index as the first column.
data.to_csv("preprocessed_train.csv")

### EDA

In [None]:
# Sentiment distribution: number of reviews per distinct 'rating' value.
sentiment_counts = data['rating'].value_counts()

# Review length analysis: character length of the raw (unprocessed) review.
data['review_length'] = data['review_description'].apply(len)

# Plotting the sentiment distribution
plt.figure(figsize=(10, 5))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
# NOTE(review): the hard-coded tick labels assume exactly three rating values
# (-1, 0, 1) drawn in ascending order - confirm against the plotted bars.
plt.xticks(ticks=[0, 1, 2], labels=['Negative (-1)', 'Neutral (0)', 'Positive (1)'])
plt.show()

# Plotting the distribution of review lengths
plt.figure(figsize=(10, 5))
sns.histplot(data['review_length'], bins=50)
plt.title('Review Length Distribution')
plt.xlabel('Review Length (characters)')
plt.ylabel('Number of Reviews')
plt.show()

In [None]:
# Show the raw per-rating counts behind the bar chart.
sentiment_counts

In [None]:
# Summary statistics of the raw review lengths.
data['review_length'].describe()