<a href="https://colab.research.google.com/github/WafaaMSawan/AI/blob/main/arabic_preprocessor_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile arabic_preprocessor.py
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer

# Fetch the NLTK data this module relies on: the stop-word lists and the
# Punkt tokenizer models used by ``word_tokenize``.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
# Recent NLTK releases load the tokenizer from the "punkt_tab" package
# instead of the pickled "punkt" models; without it ``word_tokenize`` raises
# LookupError. Downloading both keeps old and new NLTK versions working.
nltk.download("punkt_tab", quiet=True)

class ArabicPreprocessor:
    """Normalise, tokenise, stop-word-filter and stem Arabic text.

    Args:
        remove_stopwords: When true, ``filter_stopwords`` drops tokens found
            in the stop-word set; otherwise it returns tokens unchanged.
        apply_stemming: When true, ``stem_tokens`` stems every token with
            ``ISRIStemmer``; otherwise it returns tokens unchanged.

    Attributes:
        stop_words: Set of stop words (raw NLTK spellings plus their
            normalised spellings).
        stemmer: The ``ISRIStemmer`` instance used by ``stem_tokens``.
    """

    # Single-character letter folding: alef variants -> bare alef,
    # alef maqsura -> yeh, hamza carriers -> hamza, teh marbuta -> heh.
    _LETTER_MAP = str.maketrans({
        "إ": "ا",
        "أ": "ا",
        "آ": "ا",
        "ى": "ي",
        "ؤ": "ء",
        "ئ": "ء",
        "ة": "ه",
    })
    # Arabic combining marks (U+0610-061A, U+064B-065F, U+0670) and tatweel
    # (U+0640). These must be deleted, not replaced by a space: they sit
    # *inside* words, so a space would split a vocalised word into letters.
    _DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u0640]")
    _NON_WORD_RE = re.compile(r"[^\w\s]")
    _DIGITS_RE = re.compile(r"\d+")

    def __init__(self, remove_stopwords=True, apply_stemming=True):
        self.remove_stopwords_flag = remove_stopwords
        self.apply_stemming_flag = apply_stemming

        self.stop_words = self._build_stop_words(stopwords.words("arabic"))
        self.stemmer = ISRIStemmer()

    def _build_stop_words(self, words):
        """Return the stop-word set: raw spellings plus normalised spellings.

        ``preprocess`` normalises text before filtering, so the set must
        contain normalised forms or stop words with folded letters would
        never match. Raw forms are kept so ``filter_stopwords`` still works
        on tokens that were not normalised.
        """
        raw = set(words)
        normalised = {
            self.remove_whitespaces(self.normalize_arabic(word)) for word in raw
        }
        normalised.discard("")
        return raw | normalised

    def normalize_arabic(self, text):
        """Return ``text`` with diacritics removed and letter variants folded.

        Diacritics and tatweel are deleted, letter variants are mapped to a
        canonical letter, and every remaining character that is neither a
        word character nor whitespace is replaced by a single space.
        """
        text = self._DIACRITICS_RE.sub("", text)
        text = text.translate(self._LETTER_MAP)
        return self._NON_WORD_RE.sub(" ", text)

    def remove_numbers(self, text):
        """Return ``text`` with each run of digits replaced by one space."""
        return self._DIGITS_RE.sub(" ", text)

    def remove_whitespaces(self, text):
        """Return ``text`` stripped, with whitespace runs collapsed to one space."""
        return " ".join(text.split())

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def filter_stopwords(self, tokens):
        """Return ``tokens`` without stop words (unchanged if filtering is off)."""
        if not self.remove_stopwords_flag:
            return tokens
        return [t for t in tokens if t not in self.stop_words]

    def stem_tokens(self, tokens):
        """Return the stem of every token (unchanged if stemming is off)."""
        if not self.apply_stemming_flag:
            return tokens
        return [self.stemmer.stem(t) for t in tokens]

    def preprocess(self, text):
        """Run the full pipeline on one document and return its token list.

        Steps, in order: normalise, remove digits, collapse whitespace,
        tokenise, filter stop words, stem.
        """
        text = self.normalize_arabic(text)
        text = self.remove_numbers(text)
        text = self.remove_whitespaces(text)

        tokens = self.tokenize(text)
        tokens = self.filter_stopwords(tokens)
        tokens = self.stem_tokens(tokens)

        return tokens

    def preprocess_corpus(self, corpus):
        """Return ``preprocess(doc)`` for every document in ``corpus``."""
        return [self.preprocess(doc) for doc in corpus]


Writing arabic_preprocessor.py
