# Data Cleaning 

In [30]:
import pandas as pd
import re


# Arabic Quran

In [6]:
# Parse the pipe-delimited Arabic text. Column names are supplied explicitly,
# so pandas treats every line of the file as data rather than as a header.
quran_A = pd.read_csv(
    "../data/quran-simple.txt",
    sep="|",
    names=["chapter_num", "verse_num", "verse"],
)
quran_A.head()

Unnamed: 0,chapter_num,verse_num,verse
0,1,1.0,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,2.0,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,3.0,الرَّحْمَٰنِ الرَّحِيمِ
3,1,4.0,مَالِكِ يَوْمِ الدِّينِ
4,1,5.0,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ


In [13]:
# Print per-column dtypes and non-null counts to sanity-check the parse.
quran_A.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6264 entries, 0 to 6263
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   chapter_num  6264 non-null   object 
 1   verse_num    6236 non-null   float64
 2   verse        6236 non-null   object 
dtypes: float64(1), object(2)
memory usage: 146.9+ KB


In [15]:
# Show the rows whose verse text is missing.
quran_A.loc[quran_A["verse"].isna()]

Unnamed: 0,chapter_num,verse_num,verse
6236,# PLEASE DO NOT REMOVE OR CHANGE THIS COPYRIGH...,,
6237,#=============================================...,,
6238,#,,
6239,"# Tanzil Quran Text (Simple, Version 1.1)",,
6240,# Copyright (C) 2007-2022 Tanzil Project,,
6241,# License: Creative Commons Attribution 3.0,,
6242,#,,
6243,# This copy of the Quran text is carefully pr...,,
6244,# verified and continuously monitored by a gr...,,
6245,# at Tanzil Project.,,


In [17]:
# Discard, in place, every row that has at least one missing value.
quran_A.dropna(axis=0, how="any", inplace=True)

In [19]:
# Cast both numbering columns to integers, then re-check the dtypes.
for column in ("chapter_num", "verse_num"):
    quran_A[column] = quran_A[column].astype(int)
quran_A.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6236 entries, 0 to 6235
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   chapter_num  6236 non-null   int64 
 1   verse_num    6236 non-null   int64 
 2   verse        6236 non-null   object
dtypes: int64(2), object(1)
memory usage: 194.9+ KB


In [74]:
# Look at one raw verse (row label 1265).
quran_A.loc[1265, "verse"]

'اتَّخَذُوا أَحْبَارَهُمْ وَرُهْبَانَهُمْ أَرْبَابًا مِّن دُونِ اللَّهِ وَالْمَسِيحَ ابْنَ مَرْيَمَ وَمَا أُمِرُوا إِلَّا لِيَعْبُدُوا إِلَٰهًا وَاحِدًا لَّا إِلَٰهَ إِلَّا هُوَ سُبْحَانَهُ عَمَّا يُشْرِكُونَ'

In [77]:
# Diacritics removed by A_cleaning, written as code points so the combining
# marks stay visible in the source and cannot be silently altered by an editor:
#   U+064B-U+0652  fathatan, dammatan, kasratan, fatha, damma, kasra,
#                  shadda, sukun
#   U+0653         maddah above
#   U+0670         superscript alef
_A_DIACRITICS = re.compile("[\u064B-\u0653\u0670]")


def A_cleaning(aya):
    """Return an Arabic verse with its diacritics removed.

    Parameters
    ----------
    aya : str
        Verse text, possibly fully vocalised.

    Returns
    -------
    str
        The verse with every mark matched by ``_A_DIACRITICS`` deleted and
        alef-with-madda (U+0622) replaced by a bare alef (U+0627). All other
        characters are left unchanged.
    """
    aya = _A_DIACRITICS.sub("", aya)
    # Normalise "آ" to "ا" so both spellings compare equal.
    return aya.replace("\u0622", "\u0627")

# Try the cleaner on the verse stored under row label 1265.
A_cleaning(quran_A.loc[1265, "verse"])

'اتخذوا أحبارهم ورهبانهم أربابا من دون الله والمسيح ابن مريم وما أمروا إلا ليعبدوا إلها واحدا لا إله إلا هو سبحانه عما يشركون'

In [89]:
# Rows holding the first verse of each chapter.
quran_A.loc[quran_A["verse_num"].eq(1)]

Unnamed: 0,chapter_num,verse_num,verse
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
7,2,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ الم
293,3,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ الم
493,4,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ يَا أَي...
669,5,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ يَا أَي...
...,...,...,...
6213,110,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ إِذَا ج...
6216,111,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ تَبَّتْ...
6221,112,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ قُلْ هُ...
6225,113,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ قُلْ أَ...


- Some scholars say that "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" is part of each chapter, so it should be numbered.
- Other scholars say that it is not part of each chapter, so it should not be counted (this is why this verse does not appear at the start of each chapter in the translated version).
- All scholars agree that it is part of the first chapter (الفاتحة) and that it is not part of the 9th chapter (التوبة).


# English Quran

In [32]:
# Load the English translation CSV; its first row supplies the column names.
quran_E = pd.read_csv("../data/english_saheeh_v1.1.0-csv.1.csv")
# Preview the first rows.
quran_E.head()

Unnamed: 0,id,sura,aya,translation,footnotes
0,1,1,1,"(1) In the name of Allāh,[2] the Entirely Merc...",[2]- Allāh is a proper name belonging only to ...
1,2,1,2,"(2) [All] praise is [due] to Allāh, Lord[4] of...",[4]- When referring to Allāh (subḥānahu wa taʿ...
2,3,1,3,"(3) The Entirely Merciful, the Especially Merc...",
3,4,1,4,(4) Sovereign of the Day of Recompense.[5],"[5]- i.e., repayment and compensation for what..."
4,5,1,5,(5) It is You we worship and You we ask for help.,


In [33]:
# Print per-column dtypes and non-null counts for the translation table.
quran_E.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6236 entries, 0 to 6235
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           6236 non-null   int64 
 1   sura         6236 non-null   int64 
 2   aya          6236 non-null   int64 
 3   translation  6236 non-null   object
 4   footnotes    1612 non-null   object
dtypes: int64(3), object(2)
memory usage: 243.7+ KB


In [34]:
# We are not interested in the footnotes, and id is just an index.
# errors="ignore" keeps this cell re-runnable: once the columns are gone,
# a second run is a no-op instead of raising KeyError.
quran_E = quran_E.drop(columns=["footnotes", "id"], errors="ignore")

In [83]:
# Look at one raw translation (row label 1265).
quran_E.loc[1265, "translation"]

'(31) They have taken their scholars and monks as lords besides Allāh,[467] and [also] the Messiah, the son of Mary.[468] And they were not commanded except to worship one God; there is no deity except Him. Exalted is He above whatever they associate with Him.'

In [88]:
def E_clean(verse):
    """Normalise one English translation string.

    Steps, in order:
      1. lower-case the text;
      2. delete every ``[...]`` group (footnote markers such as ``[467]``
         and bracketed words such as ``[also]``);
      3. delete every digit and every round parenthesis, which removes the
         leading ``(31)``-style verse number;
      4. collapse whitespace runs to a single space and trim both ends, so
         the removals above do not leave doubled spaces behind.

    Parameters
    ----------
    verse : str
        Raw translation text.

    Returns
    -------
    str
        The cleaned text.
    """
    verse = verse.lower()

    # Raw strings: "\[" in a plain literal is an invalid escape sequence.
    verse = re.sub(r"\[.*?\]", "", verse)
    # NOTE(review): this strips digits and parentheses anywhere in the text,
    # not only the leading verse number - confirm that is intended.
    verse = re.sub(r"[(0-9)]", "", verse)
    # split()/join trims the ends and squeezes inner whitespace in one pass.
    return " ".join(verse.split())

# Try the cleaner on the translation stored under row label 1265.
E_clean(quran_E.loc[1265, "translation"])

'they have taken their scholars and monks as lords besides allāh, and  the messiah, the son of mary. and they were not commanded except to worship one god; there is no deity except him. exalted is he above whatever they associate with him.'