Now let's read the tafsir json and make some transformations for readability.

Remove extra characters (e.g., dots) between braces so no new lines are inserted there later:

In [2]:
def remove_chars_between_braces(text):
    inside_braces = False
    result = ''
    for char in text:
        if char == '{':
            inside_braces = True
        elif char == '}':
            inside_braces = False
        elif inside_braces and (char in ['.', ".", "_"]) or (char.isspace() and char !=" "):
            continue
        result += char
    return result

# Example usage:
arabic_string = "مرحبا. هذا هو النص العربي {مع. النقاط. ولكن. لا نريدها. هنا    }."
cleaned_string = remove_chars_between_braces(arabic_string)
print(cleaned_string)

مرحبا. هذا هو النص العربي {مع النقاط ولكن لا نريدها هنا    }.


Make the 12 (as in the if conditions) words before the colon italic and bold if they are at the start of the sentence. What italic will do is change the color (has a custom style in the code) but not make actual text italic

In [3]:
import re

def make_before_colons_italic(input_string):
    lines = input_string.split('<br><br>')
    processed_lines = []

    for line in lines:
        parts = line.split(':')
        if len(parts) > 1:
            before_colon = parts[0]
            words_before_colon = len(before_colon.split())
            if words_before_colon <= 12:
                line = '<b><i>' + before_colon.strip() + ':</i></b>' + ':'.join(parts[1:])
        processed_lines.append(line)

    return '<br><br>'.join(processed_lines)

Extra `<br><br>` tags (white space) will appear at the end of the tafsir after processing so this fixes it

In [4]:
def remove_final_br_tags(string):
    if string.endswith('<br><br>'):
        return string[:-8]  # Removing the last 8 characters ("<br><br>")
    else:
        return string

Can remove tashkeel for stemming (not really used here)

In [5]:
def remove_tashkeel(text):
    return re.sub(u'[\u064e\u064f\u0650\u0651\u0652\u064c\u064b\u064d\u0640\ufc62]','',text)

Special words that should be colored by wrapping in an italic tag

In [12]:
special_list = [
"سبحانه",
"المراد",
"الله",
"تعالى",
"سبحانه",
"صلى",
"قال",
"قوله",
"عليه",
"الآية",
"ابن",
"القرآن",
"والمراد",
"الرسول",
"وقوله",
"فقال",
"الكريم",
"موسى",
"النبى",
"وسلم",
"القيامة",
"الإِمام",
"بالله",
"سيدنا",
"محمد",
"آله",
"وعلى",
"وصحبه",
"وسلم",
"وصلى",
"ﷺ",
"السلام"
]

In [16]:
import json
import re
from tqdm.notebook import tqdm

# read tafsirs.json
with open('tafsirs.json') as f:
    tafsirs = json.load(f)
    
with open ('../suras.json') as f:
    suras = json.load(f) 

# Iterate through each list with tqdm
for i, surah in tqdm(enumerate(tafsirs), total=len(tafsirs)):
    for j, tafsir in enumerate(surah):
        tafsirWords = tafsir['text'].split(" ")
        
        # Any word just before : should be italic (i.e., be colored as we mentioned above)
        for k, tafsirWord in enumerate(tafsirWords[:-1]):
            if tafsirWords[k+1] == ":" :
                tafsir['text'] = re.sub(r'\b{}\b'.format(re.escape(tafsirWords[k])), f'<i>{tafsirWords[k]}</i>', tafsir['text'])
        
        


        # if you ever find {...} surround it with <b></b> to make it bold
        tafsir['text'] = tafsir['text'].replace('{', '{ <b>')
        tafsir['text'] = tafsir['text'].replace('}', '</b> }')
        tafsir['text'] = tafsir['text'].replace('(', '{ <b>')
        tafsir['text'] = tafsir['text'].replace(')', '</b> }')
        tafsir['text'] =  tafsir['text'].replace('<b> ', '<b>')
        tafsir['text'] =  tafsir['text'].replace(' </b>', '</b>')

        # remove any special characters within {.*} for purpose mentioned above
        tafsir['text'] = remove_chars_between_braces(tafsir['text'])

        # Add new lines after '. ' or '...' unless it's followed by a quote
        pattern = r'\.(?!")(?! ")(?!\.)(?! \.)'
        tafsir['text'] = re.sub(pattern, '.<br><br>', tafsir['text'])
        pattern = r'\.\.\.(?!")(?!\.)'
        tafsir['text'] = re.sub(pattern, '...<br><br>', tafsir['text'])

        # Remove spaces preceding a dot in the 'text' field
        tafsir['text'] = tafsir['text'].replace(' .', '.')
        
        # if * is not preceded by \n then add \n
        pattern = r'(?<!<br>)\*'
        tafsir['text'] = re.sub(pattern, r'<br>*', tafsir['text'])
        
        # if tafsir['text'] starts with <br> then remove it
        tafsir['text'] = tafsir['text'].lstrip('<br>')
        
        # replace * with nothing
        tafsir['text'] = tafsir['text'].replace('*', '')

        # if • is not  preceded by \n then add \n
        pattern = r'(?<!<br>)•'
        tafsir['text'] = re.sub(pattern, r'<br>•', tafsir['text'])
        
        # Remove floating dots
        tafsir['text'] = tafsir['text'].replace('<br>.<br>', '')

        # Make words before colon colored
        tafsir['text'] = make_before_colons_italic(tafsir['text'])

        # Remove extra new lines at the end
        tafsir['text'] =  remove_final_br_tags(tafsir['text'])
        
        # special words should be italic (i.e., be colored as we mentioned above)
        for word in special_list:
            pattern = f'(?<![^\W\d_])({word})(?![^\W\d_])'  # Match the word only if not preceded or followed by a non-alphanumeric character
            tafsir['text'] = re.sub(pattern, r'<i>\1</i>', tafsir['text'])
        
# save as surahTafsir
with open('surahTafsirs.json', 'w') as f:
    json.dump(tafsirs, f, ensure_ascii=False, indent=4) 

  0%|          | 0/114 [00:00<?, ?it/s]

Test parsing and reformatting:

In [8]:
from IPython.display import display_html

display_html(
    f"""<div style="direction:rtl">
    {tafsirs[17][19]['text']}
    </div>
    """
    , raw=True)

In [9]:
tafsirs[14][0]['text']

'<b><i>* سورة الحجر مكية * مِن مَّقاصِدِ السُّورَةِ:</i></b>إنذار المكذبين بالعقاب من خلال عرض مشاهد المهلكين، تحذيرًا للمخاطبين وتثبيتًا للمؤمنين.<br><br><b><i>* التَّفْسِيرُ:</i></b>﴿الر﴾ تقدم الكلام على نظائرها في بداية سورة البقرة.<br><br> هذه الآيات رفيعة الشأن الدالة على أنها منزلة من عند <i>الله</i> هي آيات قرآن مُوضِّح للتوحيد والشرائع.'

Get most frequest words to then add in `special_words` list above.

In [10]:
from collections import Counter
from tqdm import tqdm

# Concatenate all text
all_text = ' '.join([tafsir['text'] for surah in tafsirs for tafsir in surah])

# Tokenize the text into words
words = all_text.split()

# Count the frequency of each word
word_counts = Counter(words)

# Sort the words based on their frequencies
sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Select the top 200 words
top_200_words = sorted_words[:200]

# Print the top 200 words
for i, (word, freq) in tqdm(enumerate(top_200_words), total=200):
    print(f"{i+1}. {word}")

100%|██████████| 200/200 [00:00<00:00, 364880.73it/s]

1. من
2. <i>الله</i>
3. في
4. -
5. على
6. لا
7. ما
8. أن
9. إلى
10. أيها
11. عن
12. ولا
13. الذي
14. إن
15. الذين
16. به
17. <i>عليه</i>
18. بما
19. <i>الرسول</i>
20. إلا
21. أو
22. ذلك
23. لهم
24. يوم
25. <i>الله</i>،
26. هو
27. التي
28. وما
29. فوائد
30. له
31. كان
32. الآيات:</i></b>•
33. يا
34. كل
35. هذا
36. فلا
37. بعد
38. الناس
39. <i>بالله</i>
40. لكم
41. ومن
42. عند
43. وهو
44. فيه
45. الدنيا
46. لم
47. إذا
48. فيها
49. حتى
50. ثم
51. عليهم
52. هم
53. هؤلاء
54. مع
55. الأرض
56. هذه
57. أهل
58. <i>القيامة</i>
59. <i><i>سبحانه</i></i>
60. <i>القرآن</i>
61. حين
62. بل
63. به،
64. عذاب
65. مما
66. دون
67. وإن
68. والله
69. لما
70. كانوا
71. <i>موسى</i>
72. الحق
73. فإن
74. عليكم
75. شيء
76. بسبب
77. بها
78. غير
79. كما
80. المؤمنين
81. العذاب
82. لله
83. <i>تعالى</i>
84. مثل
85. <b><i><i>قال</i>
86. ﷺ
87. بين
88. الإيمان
89. الكفر
90. فيما
91. منه
92. وهم
93. وحده
94. لمن
95. منهم
96. فهو
97. إليه
98. قبل
99. أنه
100. فقد
101. كنتم
102. ليس
103. آمنوا
104. يخفى
105. الكفار
106. سب


