#### Read all the SurahTopics as manually extracted from Shamela

In [5]:
import requests
from bs4 import BeautifulSoup
import json

import re

# Read surahTopics.md once (explicitly as UTF-8, since it holds Arabic text).
# Every non-blank line is a topic title; lines containing '#' are supertopics.
with open('surahTopics.md', encoding='utf-8') as f:
    topic_lines = [line.strip() for line in f if line.strip()]

# All topic titles with the '#' markers removed. The extra strip() drops the
# space left behind by a "# Title" line: these titles are compared against
# scraped text that is already whitespace-stripped, so a padded title could
# never match.
keep_list = [re.sub(r'#', '', line).strip() for line in topic_lines]

# If it has # then it's a supertopic (has subtopics under it)
supertopic_list = [re.sub(r'#', '', line).strip() for line in topic_lines if "#" in line]

#### Define function to scrape all \<li\> elements and keep only those found in surahTopics.md

In [3]:
def _pick_topic_anchor(li):
    """Return the <a> tag that carries the topic title of ``li``, or None."""
    # An <a href="javascript:;"> inside the <li>, when present, is followed by
    # a sibling <a> that is used for the title/link; fall back to the
    # "javascript:;" anchor itself if there is no such sibling.
    toggle = li.find('a', href="javascript:;")
    if toggle is None:
        # No such anchor: use the first <a> inside the <li>, if any.
        return li.find('a')
    sibling_link = toggle.find_next_sibling('a')
    return sibling_link if sibling_link is not None else toggle


def extract_li_elements(url):
    """Scrape the <li> links at ``url`` and save the wanted ones as JSON.

    Each <li> element of the page contributes the text and href of its topic
    anchor (see ``_pick_topic_anchor``). Only entries whose text appears in the
    module-level ``keep_list`` are kept. The result is written to
    'surahTopicsFlat.json' as a list of {"text": ..., "link": ...} objects.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. On a request failure an error message is printed and no file is
        written.
    """
    try:
        # A timeout keeps the notebook from hanging forever on a stalled
        # connection; timeouts are reported like any other request error.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception if the request fails
    except requests.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Set membership instead of scanning the list once per <li>.
    wanted_titles = set(keep_list)

    data = []  # List to store extracted data
    for li in soup.find_all('li'):
        link = _pick_topic_anchor(li)
        if link is None:
            continue
        li_text = link.get_text(strip=True)
        if li_text not in wanted_titles:
            continue
        data.append({"text": li_text, "link": link.get('href')})

    # ensure_ascii=False writes the Arabic text verbatim, so the file must be
    # opened as UTF-8 rather than with the platform's default encoding.
    with open('surahTopicsFlat.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print("Data has been saved as JSON.")
        
# Shamela book page whose <li> links are scraped.
# Running this call rewrites surahTopicsFlat.json from scratch.
target_url = "https://shamela.ws/book/22915"
extract_li_elements(target_url)

Data has been saved as JSON.


#### Scraping Assumption for Ayah Range for Topic: Numbers within () in the longest {...}

In [None]:
import requests
import re

def _parenthesized_items_in_longest_braces(html_text):
    """Return the contents of every (...) inside the longest {...} span of ``html_text``."""
    # Brace spans without nested braces.
    brace_spans = re.findall(r'\{([^{}]+)\}', html_text)
    if not brace_spans:
        # No {...} on the page: report "no ayahs" instead of letting max()
        # raise ValueError on an empty sequence.
        return []

    longest_match = max(brace_spans, key=len)

    # Non-greedy, so each (...) pair is captured separately.
    return re.findall(r'\((.*?)\)', longest_match)


def get_longest_text_in_braces_with_parenthesis(link):
    """Fetch ``link`` and return the parenthesized items of its longest {...} span.

    The scraping assumption is that the longest brace-delimited span of the
    page holds the quoted ayahs and that the ayah numbers are the items written
    in parentheses inside it.

    Args:
        link: URL of the page to fetch.

    Returns:
        A list of strings, one per (...) found in the longest {...} span, in
        order of appearance. Empty if the page contains no {...} span.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            request times out.
    """
    # The timeout prevents one stalled request from hanging a long scraping loop.
    response = requests.get(link, timeout=30)
    return _parenthesized_items_in_longest_braces(response.text)

#### Get Ayah Ranges by Calling Function Above

In [4]:
from tqdm.notebook import tqdm

# Load the flat topic list from surahTopicsFlat.json (UTF-8: it holds Arabic text).
with open('surahTopicsFlat.json', encoding='utf-8') as f:
    extracted_data = json.load(f)

# Define the total number of iterations for tqdm
total_iterations = len(extracted_data)

# Attach the Ayah numbers to every topic. A page is fetched only when its
# result is actually used; supertopics and the hard-coded special case do not
# need a request.
for item in tqdm(extracted_data, total=total_iterations, desc="Processing"):
    if item["text"] in supertopic_list:
        # Supertopics replicate Ayahs for the first subtopic (we don't want
        # that, they cover all subsequent topics starting with a number)
        item["Ayahs"] = []
        item["supertopic"] = True
    elif "نعم الله العظمى على عباده" in item["text"]:
        # Special case where the scraping assumption fails
        item["Ayahs"] = ["٧٨", "٧٩", "٨٠"]
    else:
        item["Ayahs"] = get_longest_text_in_braces_with_parenthesis(item["link"])

# Save the annotated list back to surahTopicsFlat.json. ensure_ascii=False
# emits raw Arabic characters, so the file is written explicitly as UTF-8.
with open('surahTopicsFlat.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, indent=4, ensure_ascii=False)

Processing:   0%|          | 0/20 [00:00<?, ?it/s]

#### Regroup Topics: Those Belonging to Same Surah → Same Object in JSON

In [31]:
import json
# Load the annotated flat topic list (UTF-8: it holds Arabic text).
with open('surahTopicsFlat.json', encoding='utf-8') as f:
    extracted_data = json.load(f)

# Find the indices of the entries that are Surah headings.
inds = []
for index, obj in enumerate(extracted_data):
    text = obj['text']
    # Want to extract only real 'سورة' headings while handling that the word
    # can also appear inside a topic title: accept it only in short titles
    # (fewer than 6 space-separated words) or in the very first entry. Titles
    # containing 'الانشراح' are accepted as headings regardless.
    is_heading = 'سورة' in text and (len(text.split(' ')) < 6 or index == 0)
    if is_heading or 'الانشراح' in text:
        inds.append(index)

# Each heading owns the entries from its own index up to (not including) the
# next heading; the last heading owns everything up to the end of the list.
grouped_data = {}
group_ends = inds[1:] + [len(extracted_data)]
last_start = inds[-1] if inds else None  # no headings at all -> nothing to group

for start_index, end_index in zip(inds, group_ends):
    # The heading text is the key of the group
    surah_name = extracted_data[start_index]['text']
    # Entries titled "ميزة سورة ..." pass the heading test but are skipped.
    # The final heading is always kept, even if it matches.
    if "ميزة سورة" in surah_name and start_index != last_start:
        continue
    grouped_data[surah_name] = extracted_data[start_index:end_index]

# Save the grouped data as surahTopics.json (raw Arabic, hence UTF-8)
with open('surahTopics.json', 'w', encoding='utf-8') as f:
    json.dump(grouped_data, f, indent=4, ensure_ascii=False)

In [32]:
# List every Surah key of grouped_data next to its 1-based position
for position, surah_name in enumerate(grouped_data, start=1):
    print(f"{position}: {surah_name}")

1: سورة الفاتحة مكية وآياتها سبع نزلت بعد المدثر
2: سورة البقرة
3: سورة آل عمران
4: سورة النساء
5: سورة المائدة
6: سورة الأنعام
7: سورة الأعراف
8: سورة الأنفال
9: سورة التوبة
10: سورة يونس عليه السلام
11: سورة هود عليه السلام
12: سورة يوسف عليه السلام
13: سورة الرعد
14: سورة إبراهيم عليه السلام
15: سورة الحجر
16: سورة النحل
17: سورة الإسراء
18: سورة الكهف
19: سورة مريم
20: سورة طه
21: سورة الأنبياء
22: سورة الحج
23: سورة المؤمنون
24: سورة النور
25: سورة الفرقان
26: سورة الشعراء
27: سورة النمل
28: سورة القصص
29: سورة العنكبوت
30: سورة الروم
31: سورة لقمان
32: سورة السجدة
33: سورة الأحزاب
34: سورة سبأ
35: سورة فاطر
36: سورة يس
37: سورة الصافات
38: سورة ص
39: سورة الزمر
40: سورة غافر أو: المؤمن
41: سورة فصلت أو: السجدة
42: سورة الشورى
43: سورة الزخرف
44: سورة الدخان
45: سورة الجاثية
46: سورة الأحقاف
47: سورة محمد عليه الصلاة والسلام
48: سورة الفتح
49: سورة الحجرات
50: سورة ق
51: سورة الذاريات
52: سورة الطور
53: سورة النجم
54: سورة القمر
55: سورة الرحمن جل ذكره
56: سورة الواقعة
57: سورة ال

#### Ensure Scraping is Correct:

- The topics of each Surah together cover all of its Ayahs

- No gaps or repetitions of Ayahs across topics

In [6]:
def find_discontinuity_indices(arr):
    """Return the indices at which ``arr`` stops being consecutive.

    Elements are converted with ``int()`` before comparison, so numeric strings
    (including Arabic-Indic digits) are accepted. Index ``i`` is reported when
    element ``i`` is not exactly one more than element ``i - 1``.

    Example: 1, 2, 3, 4, 6, 7, 8 -> [4] (the 6 at index 4 breaks the run).
    """
    return [
        pos
        for pos in range(1, len(arr))
        if int(arr[pos]) != int(arr[pos - 1]) + 1
    ]

In [34]:
# For each Surah in grouped_data, concatenate the Ayahs of all its topics and
# check them against the reference list in surasList.json.

# Entries of surasList.json are matched to the keys of grouped_data by
# position, so both are assumed to be in the same Surah order -- TODO confirm.
# Read explicitly as UTF-8 rather than with the platform's default encoding.
with open('surasList.json', encoding='utf-8') as f:
    suras_list = json.load(f)


for i, (key, value) in enumerate(grouped_data.items()):
    # Flatten the per-topic Ayah lists into one sequence for the whole Surah
    ayah_seq = [ayah for topic in value for ayah in topic['Ayahs']]

    # First assumption mentioned above
    if int(suras_list[i]['numAyas']) != len(ayah_seq):
        print(f"Expected {suras_list[i]['numAyas']} ayahs, but got {len(ayah_seq)} for {key} at {i}")

    # Second assumption mentioned above
    discs = find_discontinuity_indices(ayah_seq)
    if discs:
        for disc in discs:
            print(f"There is a gap in ayahs for {key} at index {disc} where {ayah_seq[disc-1]} is followed by {ayah_seq[disc]}")
        print("\n")

Expected 200 ayahs, but got 197 for سورة آل عمران at 2
There is a gap in ayahs for سورة آل عمران at index 41 where ٤١ is followed by ٤٥


Expected 98 ayahs, but got 95 for سورة مريم at 18
There is a gap in ayahs for سورة مريم at index 50 where ٥٠ is followed by ٥٤


Expected 64 ayahs, but got 0 for سورة النور at 23
Expected 83 ayahs, but got 81 for سورة يس at 35
There is a gap in ayahs for سورة يس at index 66 where ٦٦ is followed by ٦٩


Expected 37 ayahs, but got 34 for سورة الجاثية at 44
There is a gap in ayahs for سورة الجاثية at index 17 where ١٧ is followed by ٢١


Expected 8 ayahs, but got 0 for سورة التين at 94
Expected 3 ayahs, but got 0 for سورة الكوثر at 107


#### Manually handling those errors:

- No need to worry about the last two (they are short Suras for which the book provides no topics)

- One of the Surah Gatheya topics was missing Ayahs because its Ayahs spanned two pages: the missing Ayahs were added manually.

- Surah Al-Noor was regenerated manually by making a surahTopicNoor.md (the cause is likely in the scraping code above, but repeating the scraping is tedious)

- One topic in Surah Yassin had the same problem as in Surah Gatheya (fixed)

- Surah Maryem had a false positive for the supertopic check because one of its topic names was identical to a supertopic in another Surah (fixed)

- Surah Al-Omran had the same problem (fixed)

Check fixes are correct:

In [7]:
import json

# Reference list; entries are matched to the Surah keys below by position.
# Read explicitly as UTF-8 rather than with the platform's default encoding.
with open('surasList.json', encoding='utf-8') as f:
    suras_list = json.load(f)

# The manually fixed data is kept in surahSections.json so that re-running the
# scraping/grouping cells (which rewrite surahTopics.json) cannot overwrite it.
with open('surahSections.json', encoding='utf-8') as f:
    grouped_data = json.load(f)

for i, (key, value) in enumerate(grouped_data.items()):
    # Flatten the per-topic Ayah lists into one sequence for the whole Surah
    ayah_seq = [ayah for topic in value for ayah in topic['Ayahs']]

    # First assumption mentioned above
    if int(suras_list[i]['numAyas']) != len(ayah_seq):
        print(f"Expected {suras_list[i]['numAyas']} ayahs, but got {len(ayah_seq)} for {key} at {i}")

    # Second assumption mentioned above
    discs = find_discontinuity_indices(ayah_seq)
    if discs:
        for disc in discs:
            print(f"There is a gap in ayahs for {key} at index {disc} where {ayah_seq[disc-1]} is followed by {ayah_seq[disc]}")
        print("\n")

Expected 8 ayahs, but got 0 for سورة التين at 94
Expected 3 ayahs, but got 0 for سورة الكوثر at 107


Perfect

الحمدلله