# Toward an ARABIC Stop-Words List Generation
One of the most important preprocessing techniques is the removal of functional words which affects the performance of text mining tasks. In this [paper](https://www.researchgate.net/publication/306364790_Toward_an_ARABIC_Stop-Words_List_Generation), a statistical approach is presented to extract Arabic stop-words list. The extracted list was compared to a general list. The comparison yield an improvement in an ANN based classifier using the generated stop-words list over the general list.


stop words are words which are filtered out before or after processing of natural language data.

In [566]:
import pandas as pd
import glob

import unicodedata as ud

import numpy as np
import pickle
import matplotlib.pyplot as plt
from collections import Counter

from sklearn import feature_extraction, model_selection

from scipy.stats import entropy
from math import log, e

### Explore Data

In [141]:
#get all csv files and concatenate them into single dataframe
path = r'../data/'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)
data.head(n=-1)

Unnamed: 0,article
0,بمجرد أن تكون لك عملية مالية أو غيرها داخل أغل...
1,قبل فترة ليست بالقصيرة صدرت قرارات بخصوص فتح ا...
2,هناك مثل شعبي عقلاني يقول (راحت السكرة وجت الف...
3,انتهت قبل عدة أيام مناسك الحج التي أداها الحجا...
4,إن لانتشار الأشجار في المدن مزايا عديدة، منها ...
...,...
23792,تعيد شبكات التواصل فكرة الساحة العامة الرئيسية...
23793,كتب داليبور روهاك، باحث زميل في معهد اميركان ا...
23794,مضى الآن على مقاطعة الدول الأربع لقطر ما يزيد ...
23795,ما بين البَصْرة ونجد، في عشرينيات القرن الماضي...


In [142]:
#remove NaN
data = data[data['article'].notna()]

In [143]:
#remove punctuation
for i in range(data.shape[0]):
    data.iloc[i][0] = ''.join(c for c in data.iloc[i][0] if not ud.category(c).startswith('P'))
data

Unnamed: 0,article
0,بمجرد أن تكون لك عملية مالية أو غيرها داخل أغل...
1,قبل فترة ليست بالقصيرة صدرت قرارات بخصوص فتح ا...
2,هناك مثل شعبي عقلاني يقول راحت السكرة وجت الفك...
3,انتهت قبل عدة أيام مناسك الحج التي أداها الحجا...
4,إن لانتشار الأشجار في المدن مزايا عديدة منها ز...
...,...
23793,كتب داليبور روهاك باحث زميل في معهد اميركان ان...
23794,مضى الآن على مقاطعة الدول الأربع لقطر ما يزيد ...
23795,ما بين البَصْرة ونجد في عشرينيات القرن الماضي ...
23796,في محاضرته واجب الشباب المسلم اليوم التي ألقاه...


In [144]:
#split sentences to words
words = pd.DataFrame()
for i in range(data.shape[0]):
    words = words.append(data.iloc[i][0].split())
words

Unnamed: 0,0
0,بمجرد
1,أن
2,تكون
3,لك
4,عملية
...,...
339,الفساد
340,جعجعة
341,ولا
342,يرى


### Step 1: Word Frequency Calculation

In [409]:
wordsFrequency = pd.DataFrame(Counter(words[0]).most_common(50), columns=['word', 'frequency'])
wordsFrequency

Unnamed: 0,word,frequency
0,في,66855
1,من,49154
2,على,31158
3,أن,26341
4,إلى,20036
5,التي,14635
6,لا,11904
7,ما,11833
8,عن,11731
9,الذي,10604


#### Cleaning

In [410]:
updated_wordsFrequency = wordsFrequency.drop(16)

#في - وفي - فى
updated_wordsFrequency.at[0, 'frequency'] = wordsFrequency.at[21,'frequency'] + wordsFrequency.at[0,'frequency']

updated_wordsFrequency.at[0, 'frequency'] = wordsFrequency.at[49,'frequency'] + wordsFrequency.at[0,'frequency']

#لا - ولا
updated_wordsFrequency.at[7, 'frequency'] = wordsFrequency.at[23,'frequency'] + wordsFrequency.at[7,'frequency']

#من - ومن
updated_wordsFrequency.at[1, 'frequency'] = wordsFrequency.at[46,'frequency'] + wordsFrequency.at[1,'frequency']

#هو - وهو
updated_wordsFrequency.at[18, 'frequency'] = wordsFrequency.at[33,'frequency'] + wordsFrequency.at[18,'frequency']

updated_wordsFrequency = updated_wordsFrequency.drop(list(range(21, 50)))

updated_wordsFrequency

Unnamed: 0,word,frequency
0,في,69382
1,من,51729
2,على,31158
3,أن,26341
4,إلى,20036
5,التي,14635
6,لا,11904
7,ما,15759
8,عن,11731
9,الذي,10604


In [422]:
#reset indeces
updated_wordsFrequency.reset_index(drop=True, inplace=True)
wordsFrequency = updated_wordsFrequency
wordsFrequency

Unnamed: 0,word,frequency
0,في,69382
1,من,51729
2,على,31158
3,أن,26341
4,إلى,20036
5,التي,14635
6,لا,11904
7,ما,15759
8,عن,11731
9,الذي,10604


### Step 2: Mean and variance Calculation

In [423]:
FreqOfWordInDocs = []
numberOfDistinctWordsInDocs = []
propOfWordInDocs = []

In [424]:
path = r'../data/'
all_files = glob.glob(path + "/*.csv")

li = []
file_number = 0

for filename in all_files:
    data = pd.read_csv(filename, index_col=None, header=0)
    
    #remove NaN
    data = data[data['article'].notna()]

    #remove punctuation
    for i in range(data.shape[0]):
        data.iloc[i][0] = ''.join(c for c in data.iloc[i][0] if not ud.category(c).startswith('P'))

    #split sentences to words
    words = pd.DataFrame()
    for i in range(data.shape[0]):
        words = words.append(data.iloc[i][0].split())

    #add number of distinct words in each doc to a list
    numberOfDistinctWordsInDocs.append(words[0].unique().shape[0])

    #word frequency in each Doc
    FreqOfWord = pd.DataFrame(Counter(words[0]).most_common(30), columns=['word', 'frequency'])
    wordsFrequencyDict = dict()
    for i in range(FreqOfWord.shape[0]):
             wordsFrequencyDict[FreqOfWord.word[i]] = FreqOfWord.frequency[i]

    #add number of frequency of most common words in each doc to a list
    FreqOfWordInDocs.append(wordsFrequencyDict) 

    #probability of each 30 common word in each doc
    wordsProbabilityDict = dict()
    for i in wordsFrequencyDict :
             wordsProbabilityDict[i] = wordsFrequencyDict[i]/numberOfDistinctWordsInDocs[file_number]

    #add probability of most common words in each doc to a list
    propOfWordInDocs.append(dict(sorted(wordsProbabilityDict.items(), key=lambda item: item[1], reverse=True)))
    
    file_number+=1

In [425]:
FreqOfWordInDocs

[{'في': 35032,
  'من': 29276,
  'على': 16404,
  'أن': 14138,
  'التي': 9000,
  'إلى': 8552,
  'أو': 7259,
  'لا': 6940,
  'ما': 6604,
  'عن': 6220,
  'مع': 4755,
  'الذي': 4708,
  'هذا': 4697,
  'هذه': 4329,
  'كل': 4221,
  'ذلك': 3281,
  'المملكة': 3108,
  'هو': 3059,
  'كان': 2732,
  'بين': 2703,
  'لم': 2701,
  'الله': 2483,
  'السعودية': 2413,
  'ولا': 2258,
  'كما': 2126,
  'العالم': 2112,
  'تلك': 2105,
  'حتى': 2087,
  'هي': 1913,
  'كانت': 1884},
 {'في': 38804,
  'من': 32983,
  'على': 18712,
  'أن': 14729,
  'التي': 9843,
  'إلى': 9059,
  'أو': 7359,
  'ما': 7328,
  'عن': 7058,
  'لا': 6843,
  'هذا': 5412,
  'مع': 5362,
  'الذي': 5121,
  'هذه': 4959,
  'كل': 4285,
  'المملكة': 3627,
  'ذلك': 3492,
  'هو': 3219,
  'العالم': 3112,
  'كان': 3105,
  'بين': 2922,
  'الله': 2867,
  'لم': 2835,
  'بعد': 2474,
  'كما': 2471,
  'خلال': 2394,
  'ولا': 2368,
  'حتى': 2328,
  'السعودية': 2235,
  'كانت': 2105},
 {'في': 36550,
  'من': 31194,
  'على': 19068,
  'أن': 17890,
  'إلى': 9978,
  'ا

In [426]:
numberOfDistinctWordsInDocs

[139729, 144065, 135548, 109931, 194022, 87428, 194432, 188122]

In [427]:
propOfWordInDocs

[{'في': 0.25071388187133664,
  'من': 0.20951985629325337,
  'على': 0.11739867887124363,
  'أن': 0.1011815729018314,
  'التي': 0.06441039440631507,
  'إلى': 0.06120418810697851,
  'أو': 0.05195056144393791,
  'لا': 0.04966757079775852,
  'ما': 0.04726291607325609,
  'عن': 0.04451473924525331,
  'مع': 0.034030158378003134,
  'الذي': 0.03369379298499238,
  'هذا': 0.03361506916960688,
  'هذه': 0.030981399709437554,
  'كل': 0.03020847497656177,
  'ذلك': 0.02348116711634664,
  'المملكة': 0.022243056201647474,
  'هو': 0.021892377387657536,
  'كان': 0.019552133057561424,
  'بين': 0.019344588453363298,
  'لم': 0.019330275032384114,
  'الله': 0.017770112145653372,
  'السعودية': 0.017269142411382034,
  'ولا': 0.016159852285495494,
  'كما': 0.01521516650086954,
  'العالم': 0.015114972554015272,
  'تلك': 0.015064875580588138,
  'حتى': 0.014936054791775509,
  'هي': 0.01369078716658675,
  'كانت': 0.013483242562388623},
 {'في': 0.26935064033595946,
  'من': 0.22894526776108007,
  'على': 0.1298858154305

In [532]:
df_propOfWordInDocs = pd.DataFrame.from_dict(propOfWordInDocs)
df_propOfWordInDocs.keys()

Index(['في', 'من', 'على', 'أن', 'التي', 'إلى', 'أو', 'لا', 'ما', 'عن', 'مع',
       'الذي', 'هذا', 'هذه', 'كل', 'ذلك', 'المملكة', 'هو', 'كان', 'بين', 'لم',
       'الله', 'السعودية', 'ولا', 'كما', 'العالم', 'تلك', 'حتى', 'هي', 'كانت',
       'بعد', 'خلال', 'فى', 'القدم', 'الأندية', 'قبل', 'الكرة', 'كرة',
       'الفريق', 'الرئيس', 'غير', 'إيران', 'أي', 'إن', 'قد', 'كورونا', 'عام',
       'بي', 'لكن', 'المتحدة', 'لبنان'],
      dtype='object')

In [533]:
key_words = wordsFrequency.word.tolist()
df_propOfWordInDocs = df_propOfWordInDocs[key_words]
df_propOfWordInDocs

Unnamed: 0,في,من,على,أن,إلى,التي,لا,ما,عن,الذي,مع,أو,هذا,كان,هذه,لم,ذلك,هو,بين,بعد
0,0.250714,0.20952,0.117399,0.101182,0.061204,0.06441,0.049668,0.047263,0.044515,0.033694,0.03403,0.051951,0.033615,0.019552,0.030981,0.01933,0.023481,0.021892,0.019345,
1,0.269351,0.228945,0.129886,0.102239,0.062881,0.068323,0.047499,0.050866,0.048992,0.035546,0.037219,0.051081,0.037566,0.021553,0.034422,0.019679,0.024239,0.022344,0.020283,0.017173
2,0.269646,0.230132,0.140673,0.131983,0.073612,0.060827,0.05871,0.054674,0.059691,0.044265,0.047607,0.041159,0.037182,0.033022,0.026765,0.028698,0.023778,0.024663,0.023357,0.024619
3,0.237331,0.228925,0.123077,0.133466,0.059046,0.053852,0.056026,0.050086,0.058628,0.039861,0.048712,0.039543,0.038524,0.033721,0.025289,0.028918,0.020386,0.025279,0.022978,0.026498
4,0.330555,0.265284,0.16259,0.132892,0.101504,0.075095,0.060462,0.063214,0.061972,0.048067,0.0527,0.050999,0.040289,0.036233,0.035991,0.031935,0.029507,0.029646,0.031667,0.023966
5,0.288626,0.194755,0.121334,0.089045,0.069314,0.052969,0.03102,0.033925,0.04812,0.032713,0.032713,0.025232,0.026296,0.020703,0.023482,0.018815,0.01933,0.014618,0.02084,0.023333
6,0.330331,0.259119,0.160431,0.127263,0.103121,0.071146,0.059872,0.060684,0.062567,0.049853,0.049812,0.047312,0.041104,0.034506,0.035617,0.03252,0.031265,0.029681,0.029712,0.02495
7,0.355381,0.261288,0.165627,0.140021,0.106505,0.077795,0.063278,0.062901,0.062358,0.056368,0.052285,0.04866,0.041601,0.040346,0.035716,0.034988,0.032043,0.031559,0.031145,0.025016


In [534]:
df_propOfWordInDocs['بعد'] = df_propOfWordInDocs['بعد'].fillna(df_propOfWordInDocs['بعد'].mean())
df_propOfWordInDocs

Unnamed: 0,في,من,على,أن,إلى,التي,لا,ما,عن,الذي,مع,أو,هذا,كان,هذه,لم,ذلك,هو,بين,بعد
0,0.250714,0.20952,0.117399,0.101182,0.061204,0.06441,0.049668,0.047263,0.044515,0.033694,0.03403,0.051951,0.033615,0.019552,0.030981,0.01933,0.023481,0.021892,0.019345,0.023651
1,0.269351,0.228945,0.129886,0.102239,0.062881,0.068323,0.047499,0.050866,0.048992,0.035546,0.037219,0.051081,0.037566,0.021553,0.034422,0.019679,0.024239,0.022344,0.020283,0.017173
2,0.269646,0.230132,0.140673,0.131983,0.073612,0.060827,0.05871,0.054674,0.059691,0.044265,0.047607,0.041159,0.037182,0.033022,0.026765,0.028698,0.023778,0.024663,0.023357,0.024619
3,0.237331,0.228925,0.123077,0.133466,0.059046,0.053852,0.056026,0.050086,0.058628,0.039861,0.048712,0.039543,0.038524,0.033721,0.025289,0.028918,0.020386,0.025279,0.022978,0.026498
4,0.330555,0.265284,0.16259,0.132892,0.101504,0.075095,0.060462,0.063214,0.061972,0.048067,0.0527,0.050999,0.040289,0.036233,0.035991,0.031935,0.029507,0.029646,0.031667,0.023966
5,0.288626,0.194755,0.121334,0.089045,0.069314,0.052969,0.03102,0.033925,0.04812,0.032713,0.032713,0.025232,0.026296,0.020703,0.023482,0.018815,0.01933,0.014618,0.02084,0.023333
6,0.330331,0.259119,0.160431,0.127263,0.103121,0.071146,0.059872,0.060684,0.062567,0.049853,0.049812,0.047312,0.041104,0.034506,0.035617,0.03252,0.031265,0.029681,0.029712,0.02495
7,0.355381,0.261288,0.165627,0.140021,0.106505,0.077795,0.063278,0.062901,0.062358,0.056368,0.052285,0.04866,0.041601,0.040346,0.035716,0.034988,0.032043,0.031559,0.031145,0.025016


#### Calculate Mean of Probability (MP)

In [535]:
#calculate mean the automatic way
series_MP = df_propOfWordInDocs.mean()
series_MP

في      0.291492
من      0.234746
على     0.140127
أن      0.119761
إلى     0.079649
التي    0.065552
لا      0.053317
ما      0.052952
عن      0.055855
الذي    0.042546
مع      0.044385
أو      0.044492
هذا     0.037022
كان     0.029954
هذه     0.031033
لم      0.026860
ذلك     0.025504
هو      0.024960
بين     0.024916
بعد     0.023651
dtype: float64

In [536]:
MP = dict()

In [537]:
#calculate mean the manual way
for i in range(wordsFrequency.shape[0]):
    key = wordsFrequency.word[i]
    MP[key] = 0
    for j in range(file_number):
        if key in propOfWordInDocs[j]:
            MP[key] += propOfWordInDocs[j][key]
    MP[key] /= file_number

In [538]:
MP

{'في': 0.2914919081470928,
 'من': 0.23474608589267099,
 'على': 0.14012712744779,
 'أن': 0.11976114186261148,
 'إلى': 0.07964854063417309,
 'التي': 0.06555220966082144,
 'لا': 0.05331684867175042,
 'ما': 0.052951728974971676,
 'عن': 0.055855313814814914,
 'الذي': 0.04254578811913884,
 'مع': 0.04438480187423934,
 'أو': 0.04449214418543106,
 'هذا': 0.03702227354525748,
 'كان': 0.02995438986022642,
 'هذه': 0.031032883914035213,
 'لم': 0.026860457312116844,
 'ذلك': 0.025503614441094835,
 'هو': 0.024960412798031177,
 'بين': 0.024915697099442014,
 'بعد': 0.02069436842868455}

#### Calculate Variance of Probability (VP)

In [539]:
#calculate variance the automatic way
series_VP = df_propOfWordInDocs.var().sort_values(ascending=False)
series_VP

في      0.001813
من      0.000651
إلى     0.000420
على     0.000405
أن      0.000367
لا      0.000111
ما      0.000096
التي    0.000086
أو      0.000082
الذي    0.000073
مع      0.000069
كان     0.000065
عن      0.000055
لم      0.000043
هو      0.000030
هذه     0.000027
بين     0.000026
هذا     0.000025
ذلك     0.000024
بعد     0.000008
dtype: float64

In [540]:
totalWordsInAllDocs = 0
for i in range(wordsFrequency.shape[0]):
    totalWordsInAllDocs += wordsFrequency.frequency[i]
totalWordsInAllDocs

336577

### Step 3: Entropy Calculation

In [541]:
len(key_words)

20

In [542]:
ENTRPOY = dict()

In [567]:
for key in key_words:
    p = df_propOfWordInDocs[key].values
    ENTRPOY[key] = entropy(p)

#sorting
ENTRPOY = dict(sorted(ENTRPOY.items(), key=lambda item: item[1], reverse=True))
ENTRPOY

{'من': 2.074227530652407,
 'بعد': 2.0728673198153986,
 'عن': 2.071579462039245,
 'هذا': 2.0708117491769467,
 'التي': 2.070628939732978,
 'على': 2.07047974729388,
 'في': 2.0701720685200233,
 'أن': 2.06784827442543,
 'هذه': 2.0669769157759066,
 'ذلك': 2.0636482119406425,
 'ما': 2.0635220528874534,
 'مع': 2.0635181529496878,
 'الذي': 2.0620127776264354,
 'بين': 2.0613156975526845,
 'لا': 2.0606656213290266,
 'أو': 2.059347829636974,
 'هو': 2.056887436960875,
 'لم': 2.052230344417828,
 'إلى': 2.051033937258374,
 'كان': 2.046384039994275}

### Step 4: Aggregation

#### apply Borda ranking

In [577]:
n = 20
RANK = dict()

In [578]:
for key in series_MP.keys():
    RANK[key]=n
    n-=1
RANK

{'في': 20,
 'من': 19,
 'على': 18,
 'أن': 17,
 'إلى': 16,
 'التي': 15,
 'لا': 14,
 'ما': 13,
 'عن': 12,
 'الذي': 11,
 'مع': 10,
 'أو': 9,
 'هذا': 8,
 'كان': 7,
 'هذه': 6,
 'لم': 5,
 'ذلك': 4,
 'هو': 3,
 'بين': 2,
 'بعد': 1}

In [579]:
n = 20
for key in series_VP.keys():
    RANK[key]+=n
    n-=1
RANK

{'في': 40,
 'من': 38,
 'على': 35,
 'أن': 33,
 'إلى': 34,
 'التي': 28,
 'لا': 29,
 'ما': 27,
 'عن': 20,
 'الذي': 22,
 'مع': 20,
 'أو': 21,
 'هذا': 11,
 'كان': 16,
 'هذه': 11,
 'لم': 12,
 'ذلك': 6,
 'هو': 9,
 'بين': 6,
 'بعد': 2}

In [580]:
n = 20
for key in ENTRPOY.keys():
    RANK[key]+=n
    n-=1
RANK

{'في': 54,
 'من': 58,
 'على': 50,
 'أن': 46,
 'إلى': 36,
 'التي': 44,
 'لا': 35,
 'ما': 37,
 'عن': 38,
 'الذي': 30,
 'مع': 29,
 'أو': 26,
 'هذا': 28,
 'كان': 17,
 'هذه': 23,
 'لم': 15,
 'ذلك': 17,
 'هو': 13,
 'بين': 13,
 'بعد': 21}

In [581]:
#sorting
RANK = dict(sorted(RANK.items(), key=lambda item: item[1], reverse=True))
RANK

{'من': 58,
 'في': 54,
 'على': 50,
 'أن': 46,
 'التي': 44,
 'عن': 38,
 'ما': 37,
 'إلى': 36,
 'لا': 35,
 'الذي': 30,
 'مع': 29,
 'هذا': 28,
 'أو': 26,
 'هذه': 23,
 'بعد': 21,
 'كان': 17,
 'ذلك': 17,
 'لم': 15,
 'هو': 13,
 'بين': 13}

In [591]:
RANK_keys = list(RANK.keys())

In [597]:
print("top 20 Arabic words after applying \"Borda\" ranking:")
RANK_keys

top 20 Arabic words after applying "Borda" ranking:


['من',
 'في',
 'على',
 'أن',
 'التي',
 'عن',
 'ما',
 'إلى',
 'لا',
 'الذي',
 'مع',
 'هذا',
 'أو',
 'هذه',
 'بعد',
 'كان',
 'ذلك',
 'لم',
 'هو',
 'بين']