In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_hadith(collection, hadith_count):
    hadith_texts = []

    for hadith_number in range(1, hadith_count + 1):
        hadith_url = f"https://sunnah.com/{collection}/{hadith_number}"
        response = requests.get(hadith_url)
        print("Scraping from:", hadith_url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            hadith_text = soup.find("div", class_="text_details").get_text(strip=True)
            narrated_by = soup.find("div", class_="hadith_narrated").get_text(strip=True)
            hadith_texts.append((collection, hadith_number, narrated_by, hadith_text))
        else:
            print(f"Error fetching Hadith from {hadith_url}")

    return hadith_texts

base_urls = [
    ("bukhari", 97),
    ("muslim", 56),
    ("nasai", 51),
    ("abudawud", 43),
    ("tirmidhi", 49),
    ("ibnmajah", 37)
]

all_hadith_texts = []

for collection, hadith_count in base_urls:
    print(f"Scraping Hadith from {collection} collection...")
    hadith_texts = scrape_hadith(collection, hadith_count)
    all_hadith_texts.extend(hadith_texts)

df = pd.DataFrame(all_hadith_texts, columns=["Collection", "Hadith Number", "Narrated By", "Hadith Text"])


Scraping Hadith from bukhari collection...
Scraping from: https://sunnah.com/bukhari/1
Scraping from: https://sunnah.com/bukhari/2
Scraping from: https://sunnah.com/bukhari/3
Scraping from: https://sunnah.com/bukhari/4
Scraping from: https://sunnah.com/bukhari/5
Scraping from: https://sunnah.com/bukhari/6
Scraping from: https://sunnah.com/bukhari/7
Scraping from: https://sunnah.com/bukhari/8
Scraping from: https://sunnah.com/bukhari/9
Scraping from: https://sunnah.com/bukhari/10
Scraping from: https://sunnah.com/bukhari/11
Scraping from: https://sunnah.com/bukhari/12
Scraping from: https://sunnah.com/bukhari/13
Scraping from: https://sunnah.com/bukhari/14
Scraping from: https://sunnah.com/bukhari/15
Scraping from: https://sunnah.com/bukhari/16
Scraping from: https://sunnah.com/bukhari/17
Scraping from: https://sunnah.com/bukhari/18
Scraping from: https://sunnah.com/bukhari/19
Scraping from: https://sunnah.com/bukhari/20
Scraping from: https://sunnah.com/bukhari/21
Scraping from: https:

Scraping from: https://sunnah.com/nasai/31
Scraping from: https://sunnah.com/nasai/32
Scraping from: https://sunnah.com/nasai/33
Scraping from: https://sunnah.com/nasai/34
Scraping from: https://sunnah.com/nasai/35
Scraping from: https://sunnah.com/nasai/36
Scraping from: https://sunnah.com/nasai/37
Scraping from: https://sunnah.com/nasai/38
Scraping from: https://sunnah.com/nasai/39
Scraping from: https://sunnah.com/nasai/40
Scraping from: https://sunnah.com/nasai/41
Scraping from: https://sunnah.com/nasai/42
Scraping from: https://sunnah.com/nasai/43
Scraping from: https://sunnah.com/nasai/44
Scraping from: https://sunnah.com/nasai/45
Scraping from: https://sunnah.com/nasai/46
Scraping from: https://sunnah.com/nasai/47
Scraping from: https://sunnah.com/nasai/48
Scraping from: https://sunnah.com/nasai/49
Scraping from: https://sunnah.com/nasai/50
Scraping from: https://sunnah.com/nasai/51
Scraping Hadith from abudawud collection...
Scraping from: https://sunnah.com/abudawud/1
Scraping

In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_hadith_data(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')

    arabic_Hadith = []
    english_Hadith = []
    references = []

    Arabic = soup.find_all("div", class_="arabic_hadith_full arabic")
    for arabic in Arabic:
        arabic_Hadith.append(arabic.text)
        raw_arabic = soup.find("div", class_="crumbs").text
        collection = re.sub(r'Home » ', '', str(raw_arabic))

    english = soup.find_all("div", class_="text_details")
    for hadith in english:
        english_Hadith.append(hadith.text)

    References = soup.find_all("tr")
    for reference in References:
        if reference.find("a") is not None:
            references.append((reference.find("a").text))

    person, book = collection.split(" » ")
    
    return {
        'Book Name': book,
        'Collected By': person,
        'Arabic': arabic_Hadith,
        'English': english_Hadith,
        'References': references
    }

urls = [
    "https://sunnah.com/bukhari/1",
    "https://sunnah.com/muslim/1",
    "https://sunnah.com/nasai/1",
    
]

data_list = []

for url in urls:
    hadith_data = scrape_hadith_data(url)
    data_list.append(hadith_data)

df = pd.DataFrame(data_list)

# Clean the dataframe if necessary
df = df.replace(to_replace=r'\\r|\r\r|\\n|\\t|Narrated|:|\u200f.\u200f|\u200f"\u200f|\n', value='', regex=True)

print(df)


                  Book Name      Collected By  \
0                Revelation  Sahih al-Bukhari   
1         The Book of Faith      Sahih Muslim   
2  The Book of Purification   Sunan an-Nasa'i   

                                              Arabic  \
0  [حَدَّثَنَا الْحُمَيْدِيُّ عَبْدُ اللَّهِ بْنُ...   
1  [\nحَدَّثَنِي أَبُو خَيْثَمَةَ، زُهَيْرُ بْنُ ...   
2  [أَخْبَرَنَا قُتَيْبَةُ بْنُ سَعِيدٍ، قَالَ حَ...   

                                             English  \
0  [\n\n     I heard Allah's Messenger (ﷺ) saying...   
1  [Should it so happen that we come into contact...   
2  ["When any one of you wakes from sleep, let hi...   

                                          References  
0  [Sahih al-Bukhari 1, Sahih al-Bukhari 2, Sahih...  
1  [Sahih Muslim 8a, Sahih Muslim 8b, Sahih Musli...  
2  [Sunan an-Nasa'i 1, Sunan an-Nasa'i 2, Sunan a...  
