### Import Libraries

In [None]:
import pandas as pd
import re
import string
import nltk
import emoji
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from qalsadi import lemmatizer
from googletrans import Translator

# Fetch the NLTK resources used further down: 'punkt' backs word_tokenize
# and 'stopwords' backs stopwords.words.
# NOTE(review): 'wordnet' is not referenced by the code visible in this
# file - confirm it is still needed.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

### Load train data 

In [None]:
# Read the training spreadsheet into a DataFrame; the path is relative to
# the current working directory.
file_path = 'train.xlsx'
train = pd.read_excel(file_path)

In [None]:
# Print column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Preview the first rows of the raw training data.
train.head()

### Basic preprocessing

##### Checking for null values

In [None]:
# Count missing values per column.
train.isnull().sum()

In [None]:
def remove_nulls_and_duplicates(df):
    """Return a new frame without incomplete or repeated rows.

    Rows holding a missing value in any column are dropped first; exact
    duplicate rows are dropped afterwards (the first occurrence is kept).
    The input frame itself is left untouched.
    """
    return df.dropna().drop_duplicates()

### Cleaning text

In [None]:
def remove_punctuation(text):
    """Replace each punctuation character in ``text`` with one space.

    The set of characters is ``string.punctuation`` plus the extra Arabic
    and typographic marks listed below. Every matched character becomes its
    own space, so the result may contain consecutive spaces.
    """
    extra_marks = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # Escape everything so the characters are literal inside the class.
    char_class = '[' + re.escape(extra_marks + string.punctuation) + ']'
    return re.sub(char_class, ' ', text)

In [None]:
def remove_digits(text):
    """Delete every run of digits from ``text``.

    The digit class of a ``str`` pattern matches any Unicode decimal digit,
    so Arabic-Indic digits are removed as well as ASCII 0-9.
    """
    # Raw string: a backslash-d inside a normal literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\d+', '', text)

In [None]:
def remove_diacritics(text):
    """Strip Arabic short-vowel marks (tashkeel) from ``text``.

    Removes the combining marks U+064B through U+0652: fathatan, dammatan,
    kasratan, fatha, damma, kasra, shadda and sukun. The marks are spelled
    as code point escapes because the bare combining characters are nearly
    invisible in source and easy to corrupt when copied.
    """
    return re.sub(r"[\u064B-\u0652]", "", text)

In [None]:
def normalize_arabic(text):
    """Collapse common Arabic spelling variants onto one canonical form.

    The substitutions are applied in the order listed: alef variants become
    bare alef, alef maqsura becomes yeh, hamza carried on waw/yeh becomes a
    standalone hamza, teh marbuta becomes heh, tatweel is deleted and gaf
    becomes kaf.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("[يى]", "ي"),
        ("[ؤئ]", "ء"),
        ("ة", "ه"),
        ("ـ", ""),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
def remove_repeating_char(text):
    """Squash any run of three or more identical characters down to one.

    Runs of exactly two characters are left alone. Newlines are never part
    of a run because ``.`` does not match them.
    """
    return re.sub(r'(.)\1{2,}', lambda run: run.group(1), text)

In [None]:
def remove_long_words(text, threshold=15):
    """Drop every space-separated token whose length is ``threshold`` or more.

    The text is split on single spaces only, so empty tokens produced by
    consecutive spaces survive and the original spacing between kept words
    is preserved.
    """
    kept = []
    for token in text.split(" "):
        if len(token) >= threshold:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def remove_non_arabic_words(text):
    """Replace every run of non-Arabic characters in ``text`` with one space.

    A character counts as Arabic when it falls in one of the ranges listed
    in the pattern (the Arabic block, Arabic Supplement and the Arabic
    presentation forms). Whitespace is not in those ranges, so it is folded
    into the replaced run as well.

    Source: https://gist.github.com/mohabmes/33b724edfd4f0f3ec2e6644168db516e#file-preprocess_arabic_text-py-L22
    """
    # The original pattern listed \ufd50-\ufd8f twice; the second occurrence
    # is restored to \ufd92-\ufdc7 so those presentation-form ligatures are
    # no longer treated as non-Arabic.
    return re.sub(
        r'[^\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f'
        r'\ufd50-\ufd8f\ufd92-\ufdc7\ufe70-\ufefc\uFDF0-\uFDFD]+',
        ' ',
        text,
    )

#### Remove stopwords

In [None]:
# English + Arabic stop words exactly as shipped by NLTK.
raw_stopwords = set(stopwords.words("english") + stopwords.words("arabic"))
# Stop words are matched against text that has already gone through
# remove_diacritics and normalize_arabic, so the same normalization is
# applied to the stop words themselves; otherwise any entry spelled with
# hamza forms, teh marbuta, alef maqsura or diacritics could never match.
# The raw spellings are kept too, so nothing that matched before stops
# matching.
normalized_stopwords = {
    normalize_arabic(remove_diacritics(word)) for word in raw_stopwords
}
stopwords_list = raw_stopwords | normalized_stopwords
# Negations, intensifiers and sentiment-bearing words that must survive
# stop-word removal (written in their normalized spelling).
words_to_keep = {'not', 'لا', 'ليس', 'مش', 'لم', 'لن', 'جدا', 'اكثر', 'قليل', 'كثير', 'حب', 'بكره', 'بحب', 'عجب', 'غير', 'كره'}
stopwords_list = stopwords_list.difference(words_to_keep)
len(stopwords_list)

In [None]:
def remove_stopwords(text):
    """Tokenize ``text`` and return it without stop words.

    Tokens come from ``word_tokenize``; those present in the module-level
    ``stopwords_list`` are discarded and the rest are re-joined with single
    spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stopwords_list:
            continue
        kept.append(token)
    return ' '.join(kept)

#### Replace emojis

In [None]:
"""
Source:
https://github.com/a-ibrahimi/Arabic-Emojipedia
https://stackoverflow.com/a/76419165/13218954
"""
# Parsed emoji tables keyed by CSV path, so the file is read only once even
# though the dictionary is requested for every review.
_EMOJI_DICT_CACHE = {}


def build_emoji_dictionary(csv_file='emojis.csv'):
    """Return the emoji -> replacement-text mapping stored in ``csv_file``.

    The first column of each row is the emoji and the second column is the
    text that should replace it. Rows with fewer than two columns (for
    example blank lines) are skipped.

    The parsed mapping is cached per path and the same dict object is
    returned on later calls, so callers should treat it as read-only.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to load.

    Returns:
        dict mapping each emoji string to its replacement text.

    Raises:
        OSError: if the file cannot be opened.
    """
    if csv_file not in _EMOJI_DICT_CACHE:
        mapping = {}
        # newline='' is what the csv module expects for files it parses.
        with open(csv_file, 'r', encoding='utf-8', newline='') as file:
            for row in csv.reader(file):
                if len(row) < 2:
                    continue
                mapping[row[0]] = row[1]
        _EMOJI_DICT_CACHE[csv_file] = mapping
    return _EMOJI_DICT_CACHE[csv_file]

def replace_emojis(text):
    """Swap each known emoji in ``text`` for its textual description.

    Emojis are located with ``emoji.emoji_list``; those found in the table
    from ``build_emoji_dictionary`` are replaced by their description padded
    with one space on each side. Emojis missing from the table are left in
    place.
    """
    lookup = build_emoji_dictionary()
    for found in emoji.emoji_list(text):
        symbol = found['emoji']
        if symbol not in lookup:
            continue
        # Pad with spaces so the description never fuses with a neighbour.
        text = text.replace(symbol, ' ' + lookup[symbol] + ' ')
    return text

#### Translate non-Arabic words

In [None]:
# Shared googletrans client; lemmatize_word uses it for language detection
# and for English -> Arabic translation.
translator = Translator()
# Disabled alternative (FrancoArabicTransliterator is not imported in the
# visible part of this file):
# transliterator = FrancoArabicTransliterator()

#### Lemmatization

In [None]:
# Single qalsadi lemmatizer instance reused by lemmatize_word for every word.
arabic_lemmatizer = lemmatizer.Lemmatizer()

In [None]:
def lemmatize_word(word):
    """Return the Arabic lemma of ``word``, translating English words first.

    Steps:
      1. Detect the language with the shared ``translator``.
      2. If the word is English, translate it to Arabic. When the
         translation is a stop word the original English word is returned
         instead.
      3. Lemmatize the (possibly translated) word with ``arabic_lemmatizer``.

    The function is best-effort: if detection, translation or lemmatization
    fails, the word as it stands at that point is returned. Failures are
    swallowed silently, so a misbehaving translator degrades to returning
    words unchanged rather than raising.
    """
    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt/SystemExit still stop a long-running job.
    try:
        language = translator.detect(word).lang
    except Exception:
        return word

    if language == 'en':
        try:
            translated_word = translator.translate(word, src='en', dest='ar').text
            if translated_word in stopwords_list:
                # Keep the Latin-script word; preprocess() later strips
                # non-Arabic characters, which removes it.
                return word
            word = translated_word
        except Exception:
            return word

    try:
        lemma = arabic_lemmatizer.lemmatize(word)
        return lemma
    except Exception:
        return word

def lemmatize_multilingual_text(text):
    """Lemmatize ``text`` word by word and re-join the result.

    The text is split on any whitespace, each token is passed through
    ``lemmatize_word`` and the outputs are joined with single spaces.
    """
    return ' '.join(map(lemmatize_word, text.split()))

In [None]:
def preprocess(text):
    """Run the full cleaning pipeline on one review and return the result.

    ``text`` is converted with ``str()`` first, so any value is accepted.
    The stages run in this order: lower-casing, punctuation and digit
    removal, diacritic removal, Arabic normalization, collapsing repeated
    characters, dropping over-long words, stop-word removal, emoji
    replacement, lemmatization (with translation of English words),
    removal of remaining non-Arabic characters, and a final
    normalization + stop-word pass.
    """
    text = str(text).lower()
    text = remove_punctuation(text)
    text = remove_digits(text)
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    text = remove_repeating_char(text)
    text = remove_long_words(text)
    text = remove_stopwords(text)

    text = replace_emojis(text)
    # Remove unhandled emojis/invalid characters
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse any consecutive whitespace to a single space (raw string:
    # a backslash-s in a normal literal is an invalid escape sequence).
    text = re.sub(r'\s+', ' ', text).strip()
    text = lemmatize_multilingual_text(text)
    # Remove any word that was not translated and is still non-Arabic
    text = remove_non_arabic_words(text)
    # Normalize again, as lemmatization can return inconsistent spellings
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    text = remove_stopwords(text)

    return text

## Clean train data

In [None]:
def clean_df(df):
    """Return a cleaned copy of ``df`` with a ``preprocessed_review`` column.

    Expects a ``review_description`` column holding the raw review text.
    Rows with missing values or exact duplicates are removed, every
    remaining review is run through ``preprocess``, and rows whose review
    ends up empty (or that became duplicates) are removed as well. The
    caller's frame is not modified.

    Args:
        df: DataFrame containing at least a ``review_description`` column.

    Returns:
        A new DataFrame with the additional ``preprocessed_review`` column.
    """
    # Drop nulls/duplicates before touching the text so preprocessing never
    # sees NaN, and copy so the new column is not written into a view of
    # the caller's data.
    df = remove_nulls_and_duplicates(df).copy()
    df['preprocessed_review'] = df['review_description'].astype(str).apply(preprocess)

    # A review may become empty after removing stopwords. dropna() does not
    # treat '' as missing, so those rows are filtered out explicitly.
    df = df[df['preprocessed_review'] != '']
    df = remove_nulls_and_duplicates(df)
    return df

In [22]:
# Clean the training set: adds 'preprocessed_review' and drops null and
# duplicate rows.
train = clean_df(train)

In [None]:
# Preview the cleaned training data.
train.head()

In [None]:
# Persist the cleaned training set. With the default arguments the row
# index is written as the first (unnamed) CSV column.
train.to_csv("preprocessed_train.csv")

## Clean test data

In [None]:
# Load the test set (a CSV file, unlike the Excel training file).
test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw test data.
test.head()

In [None]:
# Apply the same cleaning pipeline to the test set.
# NOTE(review): clean_df drops rows, so the cleaned test set can be shorter
# than the input - confirm that is acceptable for test data.
test = clean_df(test)

In [None]:
# Preview the cleaned test data.
test.head()

In [None]:
# Persist the cleaned test set. With the default arguments the row index is
# written as the first (unnamed) CSV column.
test.to_csv("preprocessed_test.csv")