# Import needed libraries

In [69]:
pip install pyarabic



In [70]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
import re  # Regular expression library for pattern matching and text manipulation
import pandas as pd  # Library for data manipulation and analysis, using DataFrames
import xml.etree.ElementTree as ET  # Library for parsing and working with XML data
import pyarabic.araby as araby  # Library for Arabic text processing, including normalization and tokenization
import numbers  # Library for numeric operations and handling
import nltk  # Natural Language Toolkit library for natural language processing tasks
from nltk.corpus import stopwords  # NLTK's corpus module for accessing stop words in various languages

# Text Processing Resources

In [72]:
# Special characters to be handled in text processing
special = ['~','!','@','#','$','%','^','&','*','(',')','_','-','+','=','{','}','?','>','<',':']

# List of common stop words in Arabic, used for text preprocessing
stop_words = ['من',
 'في',
 'على',
 'و',
 'فى',
 'يا',
 'عن',
 'مع',
 'ان',
 'هو',
 'علي',
 'ما',
 'اللي',
 'كل',
 'بعد',
 'ده',
 'اليوم',
 'أن',
 'يوم',
 'انا',
 'إلى',
 'كان',
 'ايه',
 'اللى',
 'الى',
 'دي',
 'بين',
 'انت',
 'أنا',
 'حتى',
 'لما',
 'فيه',
 'هذا',
 'واحد',
 'احنا',
 'اي',
 'كده',
 'إن',
 'او',
 'أو',
 'عليه',
 'ف',
 'دى',
 'مين',
 'الي',
 'كانت',
 'أمام',
 'زي',
 'يكون',
 'خلال',
 'ع',
 'كنت',
 'هي',
 'فيها',
 'عند',
 'التي',
 'الذي',
 'قال',
 'هذه',
 'قد',
 'انه',
 'ريتويت',
 'بعض',
 'أول',
 'ايه',
 'الان',
 'أي',
 'منذ',
 'عليها',
 'له',
 'ال',
 'تم',
 'ب',
 'دة',
 'عليك',
 'اى',
 'كلها',
 'اللتى',
 'هى',
 'دا',
 'انك',
 'وهو',
 'ومن',
 'منك',
 'نحن',
 'زى',
 'أنت',
 'انهم',
 'معانا',
 'حتي',
 'وانا',
 'عنه',
 'إلي',
 'ونحن',
 'وانت',
 'منكم',
 'وان',
 'معاهم',
 'معايا',
 'وأنا',
 'عنها',
 'إنه',
 'اني',
 'معك',
 'اننا',
 'فيهم',
 'د',
 'انتا',
 'عنك',
 'وهى',
 'معا',
 'آن',
 'انتي',
 'وأنت',
 'وإن',
 'ومع',
 'وعن',
 'معاكم',
 'معاكو',
 'معاها',
 'وعليه',
 'وانتم',
 'وانتي',
 '¿',
 '|']

stop_words = stopwords.words("arabic")
# Stemming Part: Prefixes, Articles, and Suffixes for Arabic word stemming

# List of articles in Arabic, used in stemming
articles = ['بال','فال','وال','كال','ولل','ال','ال', 'لي',' ا',' فبال','لبال','وبال']

# Lists of common prefixes in Arabic, used in stemming
p=['مست','فلىست' ,'الاست','افاست','اتست','اىست','فاست','ءاست'
    ,'انهم','ءانى','والم','باست','گمست','والا' ,'ولت','فلى',
   'فلن','فلل','فهو','فهم','فال','ىست','تست','است','فهى',
   'سيا','فلا','ءست','بمس','لىت','ل','ب','و','ف','س','ي',
   'ت','ون','فى','فب','فت','لي','فن','لل','وب','فا','ول',
   'وو','اف','ات','وى','وت','اا','ال','ست','سى','يس','يت',
   'گت','ىى','تت', 'اى' ]

p2_dash=['لا']

# Lists of common suffixes in Arabic, used in stemming
s1=['ت','گ','ي','ه']
s2=['وا','ون','هن','ان','وك','اك','اه','ها','لل','هم','كن','ات','ىن']
s3=['تنا','نها','تان','ناگ','ونه','ناه','هما','وعا','نهم','وهم','ونى','وعن','تها','تهم','نگم','هات','هان','تان','تهن','وگم','ونه','ونگ','انگ']
s4=['موهم','موهن','ناگم','نوهن','ونهم','ناهم','ونگم','توهم','اتها','اتهم','يانه','اءهم']
s5=['گموها','ناهما','ناگمو']

# Read Data
Data can be found here: https://drive.google.com/drive/folders/1lBsOoWkbNmCB5hRUFPThAHKYniURz2it?usp=sharing


In [73]:
def read_doc(doc):
    """
    Read a text document and split it into words.

    Parameters:
    - doc (str): The path to the text document.

    Returns:
    - words (list): A list of words extracted from the document.
    """
    # Open the file and read all lines
    with open(doc, encoding="utf8") as f:
        lines = f.readlines()

    words = []  # Initialize an empty list to store words

    # Split each line into words and add them to the words list
    for line in lines:
        words.extend(line.split())

    return words

In [74]:
def read_csv(csv):
    """
    Read a CSV file and extract data from the first two columns.

    Parameters:
    - csv (str): The path to the CSV file.

    Returns:
    - column1 (pd.Series): The data from the first column.
    - column2 (pd.Series): The data from the second column.
    """
    # Read the CSV file into a DataFrame
    data = pd.read_csv(csv)

    # Extract data from the first two columns into separate Series
    column1 = data.iloc[:, 0]  # First column
    column2 = data.iloc[:, 1]  # Second column

    return column1, column2

In [75]:
def read_xml(xml):
    """
    Read an XML file and extract data from specific elements.

    Parameters:
    - xml (str): The path to the XML file.

    Returns:
    - words (list): A list of words extracted from the XML.
    - labels (list): A list of labels extracted from the XML.
    """
    words = []  # List to store words
    labels = []  # List to store labels

    # Parse the XML file into an ElementTree object
    tree = ET.parse(xml)

    # Get the root element of the XML tree
    root = tree.getroot()

    # Iterate through 'analysis' elements in the XML tree
    for child in root.iter('analysis'):
        # Check if the 'a_id' attribute of the element is "1"
        if child.get('a_id') == "1":
            # Extract 'vowled' and 'stem' attributes and add them to the lists
            words.append(child.get('vowled'))
            labels.append(child.get('stem'))

    return words, labels


In [76]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
words,labels=read_xml("/content/drive/MyDrive/Arabic_Stemmer_Data/0450AbuHasanMawardi.HawiKabir-sample.xml")

###prepare Arabic document###########
Arabic_doc=[]
Arabic_doc=Arabic_doc+read_doc("/content/drive/MyDrive/Arabic_Stemmer_Data/0450AbuHasanMawardi.HawiKabir-sample.txt")+read_doc("/content/drive/MyDrive/Arabic_Stemmer_Data/0460ShaykhTusi.Mabsut-sample.txt")+read_doc("/content/drive/MyDrive/Arabic_Stemmer_Data/0483IbnAhmadSarakhsi.Mabsut-sample.txt")

#####prepare output#####
output=[]
true=0
for i in range(len(words)):
 output.append("")


# Stemmer

In [78]:
def normalize(word):
    """
    Normalize an Arabic word by replacing certain characters with their standard forms.

    Parameters:
    - word (str): The word to be normalized.

    Returns:
    - str: The normalized word.
    """
    for x in range(3):
        word = re.sub("أ", "ا", word)  # Replace "أ" with "ا"
        word = re.sub("إ", "ا", word)  # Replace "إ" with "ا"
        word = re.sub("ٱ", "ا", word)  # Replace "ٱ" with "ا"
        word = re.sub("آ", "ا", word)  # Replace "آ" with "ا"
        word = re.sub("ء", "ا", word)  # Replace "ء" with "ا"
        word = re.sub("چ", "ج", word)  # Replace "چ" with "ج"
        word = re.sub("ڤ", "ف", word)  # Replace "ڤ" with "ف"
        word = re.sub("ڥ", "ف", word)  # Replace "ڥ" with "ف"
        word = re.sub("ڢ", "ف", word)  # Replace "ڢ" with "ف"
        #word = re.sub("ة", "ه", word)  # Replace "ة" with "ه" (you might want to uncomment this line)
        word = re.sub("گ", "ك", word)  # Replace "گ" with "ك"
        word = re.sub("پ", "ب", word)  # Replace "پ" with "ب"
        word = re.sub("ڜ", "ش", word)  # Replace "ڜ" with "ش"
        word = re.sub("ژ", "ز", word)  # Replace "ژ" with "ز"
        word = re.sub("ؤ", "و", word)  # Replace "ؤ" with "و"
        word = re.sub("ژ", "ز", word)  # Replace "ژ" with "ز"
    return word

In [79]:
def Def_articles_removal(word):
    """
    Remove articles from the beginning of an Arabic word.

    Parameters:
    - word (str): The word from which articles need to be removed.

    Returns:
    - str: The word with articles removed.
    """
    # Check if the first character of the word is an article
    if word[0:1] in articles:
        word = word.replace(word[0:1], "")  # Remove the article from the word

    # Check if the first two characters of the word form an article
    if word[0:2] in articles:
        word = word.replace(word[0:2], "")  # Remove the article from the word

    # Check if the first three characters of the word form an article
    if word[0:3] in articles:
        word = word.replace(word[0:3], "")  # Remove the article from the word

    return word


In [80]:
def suffix_removal(word):
    """
    Remove common suffixes from an Arabic word.

    Parameters:
    - word (str): The word from which suffixes need to be removed.

    Returns:
    - str: The word with suffixes removed.
    """
    for x in range(len(word)):
        # Check for longer suffixes first
        if len(word) >= 8:
            if word[-5:] in s5:
                word = word.replace(word[-5:], "")
                return word
        if len(word) >= 7:
            if word[-4:] in s4:
                word = word.replace(word[-4:], "")
                return word
        if len(word) >= 6:
            if word[-3:] in s3:
                word = word.replace(word[-3:], "")
                return word
        if len(word) >= 5:
            if word[-2:] in s2:
                word = word.replace(word[-2:], "")
                return word
        if len(word) >= 4:
            if word[-1:] in s1:
                word = word.replace(word[-1:], "")
                return word
    return word


In [81]:
def remove_prefixes(word):
    """
    Remove common prefixes from an Arabic word.

    Parameters:
    - word (str): The word from which prefixes need to be removed.

    Returns:
    - str: The word with prefixes removed.
    """
    if len(word) >= 5:
        # Check for longer prefixes first
        if word[0:3] in p:
            word = word.replace(word[0:3], "")
    if len(word) >= 4:
        if word[0:2] in p:
            word = word.replace(word[0:2], "")
        if word[0:2] in p2_dash:
            word = word.replace(word[0:1], "")
        # Handle special cases for prefixes like 'و', 'ب', 'ل', 'ا'
        if word[0] == 'و':
            word = word.replace(word[0:1], "")
        if word[0] == 'ب' or word[0] == 'ل' or word[0] == 'ا':
            if word[1:] in Arabic_doc:
                word = word.replace(word[0:1], "")
    if len(word) >= 3:
        # Handle prefixes like 'ب', 'ل', 'ا', 'و'
        if word[0] == 'ب' or word[0] == 'ل' or word[0] == 'ا' or word[0] == 'و':
            if word[1:] in Arabic_doc:
                word = word.replace(word[0:1], "")
    return word


In [82]:
#apply normalization on labels, stop words and Arabic_doc
for i in range(len(labels)):
 labels[i] = araby.strip_diacritics(labels[i])
 labels[i] = araby.strip_tatweel(labels[i])
 labels[i]=normalize(labels[i])
for i in range(len(Arabic_doc)):
 Arabic_doc[i] = araby.strip_diacritics(Arabic_doc[i])
 Arabic_doc[i] = araby.strip_tatweel(Arabic_doc[i])
 Arabic_doc[i]=normalize(Arabic_doc[i])

####on words
for i in range(len(words)):
 # remove eltashkeel###
 output[i]=araby.strip_diacritics(words[i])
 # remove tatweel
 output[i] = araby.strip_tatweel(output[i])
 # Normalization
 output[i] = normalize(output[i])


#######stemmer######
for i in range(len(words)):
 # keep numbers, special charachters, stop words and the word “الله“ in output as in input
 if (isinstance(output[i], numbers.Number)) or (output[i] in special) or (output[i] in stop_words) or (
         output[i] == "الله"):
  output[i] = output[i]
 else:
  #Remove Definition articles
  output[i] = Def_articles_removal(output[i])
  # apply Remove Prefixes
  output[i] = remove_prefixes(output[i])
  # apply Remove suffixes
  output[i] = suffix_removal(output[i])
  print("word: ",words[i],"    ","output: ",output[i],"    ","stem: ",labels[i])
  if output[i]==labels[i]:
    true=true+1

word:  بِسُمِّ      output:  بسم      stem:  سم
word:  الرَّحِيمُ      output:  رحيم      stem:  رحيم
word:  اللَّهْمُ      output:  لهم      stem:  لهم
word:  يُسْرُ      output:  يسر      stem:  يسر
word:  وَأَعَنَّ      output:  عن      stem:  اعن
word:  كَرِيمِ      output:  كريم      stem:  ريم
word:  الْحَمْدُ      output:  حمد      stem:  حمد
word:  لله      output:  ه      stem:  الله
word:  أَوَضَحُ      output:  اوضح      stem:  وضح
word:  دَيْنُهُ      output:  دين      stem:  دين
word:  عَلِيُّنَا      output:  علينا      stem:  علي
word:  بِتَنْزِيلِ      output:  بتنزيل      stem:  تنزيل
word:  كُتَّابُهُ      output:  كتاب      stem:  كتاب
word:  وَأَمَدُنَا      output:  امدنا      stem:  امد
word:  بِسُنَّةِ      output:  بسنة      stem:  سنة
word:  رَسُولُهُ      output:  رسول      stem:  رسول
word:  تَمَهُّدُ      output:  تمهد      stem:  تمهد
word:  الْأَمَةُ      output:  امة      stem:  امة
word:  أُصُول      output:  اصول      stem:  اصول
word:  بِنَصِّ      out

In [83]:
print("Accuracy",true/len(words),"on ",len(words),"words.")

Accuracy 0.5240128068303095 on  937 words.
