In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import time
import json
import re


In [2]:
# Start a Chrome browser session driven by Selenium. `driver` is a module-level
# name that process_chapter and process_book read directly.
driver = webdriver.Chrome()

# Load the page that the rest of the notebook scrapes.
driver.get("http://www.sanskrit-linguistics.org/dcs/index.php?contents=texte")  

# Shared explicit-wait helper with a 20-second timeout. Constructing it does
# not block; waiting only happens when wait.until(...) is called.
# process_chapter uses this module-level `wait`.
wait = WebDriverWait(driver, 20)

In [13]:
def process_chapter(book_info, chapter_container, chapter_name):
    """Select one chapter in the chapter dropdown and scrape its sentences.

    Relies on the module-level ``driver`` and ``wait`` objects.

    Args:
        book_info: dict for the current book. Must contain ``'name'`` (used in
            messages) and ``'lines'`` (a list that scraped rows are added to).
        chapter_container: element whose descendant ``<option>`` elements are
            searched for the chapter.
        chapter_name: text to look for inside an ``<option>``; the first
            option whose text contains it is clicked.

    Returns:
        1 when the chapter was scraped (or deliberately skipped), 0 when the
        option could not be found or every retry hit a stale element.

    Each appended row is ``{'text': <sentence>, 'root_words': [<lemma>, ...]}``.
    Rows are committed to ``book_info['lines']`` only after the whole chapter
    has been read, so a retry never leaves partial or duplicated rows behind.
    """
    chapter_option = chapter_container.find_elements(By.XPATH, f".//option[contains(text(), '{chapter_name}')]")
    if not chapter_option:
        # No matching option: report failure to the caller instead of dying
        # with an IndexError on chapter_option[0].
        print(f"No option found for chapter {chapter_name} of {book_info['name']}.")
        return 0
    chapter_option[0].click()
    # Hard-coded skip for this specific chapter label: it is selected but not scraped.
    if chapter_name == 'ĀVDīp, 0':
        return 1

    # Direct children of #sentences that are either a sentence or its analysis.
    sentence_xpath = '//*[@id="sentences"]/div[contains(@class, "sentence_div") or contains(@class, "sentence_analysis_div")]'

    attempts_remaining = 5
    while attempts_remaining:
        try:
            # Fixed pause before reading the page after the click.
            time.sleep(5)
            print(chapter_name)
            wait.until(EC.presence_of_element_located((By.ID, "sentences")))
            wait.until(lambda d: len(d.find_elements(By.XPATH, sentence_xpath)) > 0)
            combined_list = driver.find_elements(By.XPATH, sentence_xpath)

            # Group the flat element list: every element whose class is exactly
            # "sentence_div" starts a new group; the following elements belong to it.
            working_pair_list = []
            pair = []
            for element in combined_list:
                if element.get_attribute("class") == "sentence_div" and len(pair) > 0:
                    working_pair_list.append(pair)
                    pair = []
                pair.append(element)
            if len(pair) > 0:
                working_pair_list.append(pair)

            # Collect into a local list first. If a stale-element error occurs
            # midway, nothing has been written to book_info yet and the retry
            # starts from a clean slate.
            chapter_lines = []
            for _pair in working_pair_list:
                sentenceHTML = _pair[0].get_attribute("innerHTML")
                # Slice from two characters past the first backslash up to the
                # first '/'. NOTE(review): presumably tuned to the site's
                # sentence markup -- confirm against the live HTML.
                sentence = sentenceHTML[sentenceHTML.find('\\') + 2 : sentenceHTML.find('/')]
                # A sentence with no analysis element after it gets an empty
                # lemma list rather than raising IndexError on _pair[1].
                a_lists = _pair[1].find_elements(By.CLASS_NAME, 'text-lemma-link') if len(_pair) > 1 else []
                a_texts = [a.get_attribute("innerHTML") for a in a_lists]
                chapter_lines.append({'text': sentence, 'root_words': a_texts})

            book_info['lines'].extend(chapter_lines)
            return 1
        except StaleElementReferenceException:
            attempts_remaining -= 1
            print(f"Stale element reference while loading {chapter_name}. Retrying...")
            time.sleep(1)

    # Only reached when every attempt raised StaleElementReferenceException.
    print(f"{chapter_name} of {book_info['name']} could not be loaded.")
    return 0


    

In [14]:
def process_book(book_container, book_name):
    """Select a book, scrape all of its chapters and save them as JSON.

    Relies on the module-level ``driver`` and on ``process_chapter``.

    Args:
        book_container: element whose descendant ``<option>`` elements are
            searched for the book.
        book_name: text to look for inside an ``<option>``; the first option
            whose text contains it is clicked.

    Returns:
        1 when every chapter was scraped and ``<book_name>.json`` (with any
        ``?`` removed from the name) was written to the working directory,
        0 when the book could not be fully loaded within ``max_attempts`` tries.
    """
    book_option = book_container.find_element(By.XPATH, f".//option[contains(text(), '{book_name}')]")
    book_option.click()

    max_attempts = 5
    attempt = 0
    # Local explicit-wait helper (20-second timeout) for the chapter dropdown.
    wait = WebDriverWait(driver, 20)

    book_info = {}
    book_info['name'] = book_name
    book_info['time'] = ""
    unloaded_chapter_exists = False

    while attempt < max_attempts:
        # Reset per attempt. Without this, one failed chapter would make every
        # later attempt count as failed, even if all chapters loaded.
        unloaded_chapter_exists = False
        try:
            chapter_container = wait.until(EC.presence_of_element_located((By.ID, "chapter_id")))
            wait.until(lambda d: len(d.find_elements(By.XPATH, '//*[@id="chapter_id"]/option')) > 0)
            chapter_list = [chapter.text for chapter in driver.find_elements(By.XPATH, '//*[@id="chapter_id"]/option')]  # all chapters of this book
            # Start from an empty list on every attempt so a retry does not
            # keep rows from a previous, incomplete pass.
            book_info['lines'] = []
            print(chapter_list)
            for chapter in chapter_list:
                status = process_chapter(book_info, chapter_container=chapter_container, chapter_name=chapter)
                if status == 0:
                    unloaded_chapter_exists = True
                    break
            if unloaded_chapter_exists:
                attempt += 1
                print("A chapter couldn't be loaded after multiple retries. Reloading the whole book...")
                continue
            else:
                # '?' is stripped from the name before it is used as a file name.
                cleaned_book_name = re.sub(r'\?', '', book_info['name'])
                with open(f"{cleaned_book_name}.json", 'w', encoding="utf-8") as json_file:
                    json.dump(book_info, json_file, indent=4)
                return 1
        except StaleElementReferenceException:
            attempt += 1
            print(attempt)
            print(f"Stale element reference while loading {book_name}. Retrying...")
            time.sleep(1)

    if attempt == max_attempts:
        print("Failed to locate the element after multiple attempts.")
    if unloaded_chapter_exists:
        print(f"book {book_info['name']} couldn't be fully loaded after multiple attempts.")
    return 0


In [5]:
# Find the text box or dropdown that contains the book names
# Wait until the element with id "text_id" is present; it is later passed to
# process_book, which searches it for <option> children.
book_container = wait.until(EC.presence_of_element_located((By.ID, "text_id")))


# Text of every <option> that is a direct child of #text_id, i.e. the list of all books.
book_list = [book.text for book in driver.find_elements(By.XPATH, '//*[@id="text_id"]/option')]

In [15]:
# Scrape the selected slice of books and remember every title whose
# process_book call reported failure (return value 0).
incomplete_loads = []
for book_name in book_list[256:257]:
    book_status = process_book(book_container=book_container, book_name=book_name)
    if book_status != 0:
        continue
    incomplete_loads.append(book_name)

# Summarise the failures, one title per line; stay silent when all succeeded.
if incomplete_loads:
    print("Books that didn't load completely:")
    for book in incomplete_loads:
        print(book)


['ĀVDīp, 0', 'ĀVDīp, 0', 'ĀVDīp, 0', 'ĀVDīp zu Ca, Sū., 1, 1', 'ĀVDīp zu Ca, Sū., 1, 2', 'ĀVDīp zu Ca, Sū., 1, 15.1', 'ĀVDīp zu Ca, Sū., 1, 18.1', 'ĀVDīp zu Ca, Sū., 1, 23.2', 'ĀVDīp zu Ca, Sū., 1, 24.2', 'ĀVDīp zu Ca, Sū., 1, 26.2', 'ĀVDīp zu Ca, Sū., 1, 29.2', 'ĀVDīp zu Ca, Sū., 1, 31.2', 'ĀVDīp zu Ca, Sū., 1, 43.2', 'ĀVDīp zu Ca, Sū., 1, 44.2', 'ĀVDīp zu Ca, Sū., 6, 2', 'ĀVDīp zu Ca, Sū., 6, 3.2', 'ĀVDīp zu Ca, Sū., 6, 4.2', 'ĀVDīp zu Ca, Sū., 6, 5.2', 'ĀVDīp zu Ca, Sū., 6, 6', 'ĀVDīp zu Ca, Sū., 6, 7', 'ĀVDīp zu Ca, Sū., 6, 8.3', 'ĀVDīp zu Ca, Sū., 11, 43', 'ĀVDīp zu Ca, Sū., 12, 2', 'ĀVDīp zu Ca, Sū., 12, 3', 'ĀVDīp zu Ca, Sū., 12, 4', 'ĀVDīp zu Ca, Sū., 12, 5', 'ĀVDīp zu Ca, Sū., 12, 7.2', 'ĀVDīp zu Ca, Sū., 12, 8.5', 'ĀVDīp zu Ca, Sū., 12, 11', 'ĀVDīp zu Ca, Sū., 12, 12', 'ĀVDīp zu Ca, Sū., 20, 2', 'ĀVDīp zu Ca, Sū., 20, 3', 'ĀVDīp zu Ca, Sū., 20, 4', 'ĀVDīp zu Ca, Sū., 20, 5', 'ĀVDīp zu Ca, Sū., 20, 6', 'ĀVDīp zu Ca, Sū., 20, 7', 'ĀVDīp zu Ca, Sū., 20, 8', 'ĀVDīp zu Ca, Sū., 20

In [12]:
# Position of one title within book_list; presumably used to choose slice
# bounds for the scraping loop -- confirm.
book_list.index('Haribhaktivilāsa')

76

In [None]:
# Display every book title collected in book_list.
book_list

['AMTest',
 'Abhidharmakośa',
 'Abhidharmakośabhāṣya',
 'Abhidhānacintāmaṇi',
 'Abhinavacintāmaṇi',
 'Acintyastava',
 'Agastīyaratnaparīkṣā',
 'Agnipurāṇa',
 'Aitareya-Āraṇyaka',
 'Aitareyabrāhmaṇa',
 'Aitareyopaniṣad',
 'Amarakośa',
 'Amaraughaśāsana',
 'Amaruśataka',
 'Amṛtabindūpaniṣat',
 'Antagaḍadasāo',
 'Arthaśāstra',
 'Atharvaprāyaścittāni',
 'Atharvaveda (Paippalāda)',
 'Atharvaveda (Śaunaka)',
 'Atharvavedapariśiṣṭa',
 'Avadānaśataka',
 'Ayurvedarasāyana',
 'Aṣṭasāhasrikā',
 'Aṣṭādhyāyī',
 'Aṣṭāvakragīta',
 'Aṣṭāṅgahṛdayasaṃhitā',
 'Aṣṭāṅganighaṇṭu',
 'Aṣṭāṅgasaṃgraha',
 'Baudhāyanadharmasūtra',
 'Baudhāyanagṛhyasūtra',
 'Baudhāyanaśrautasūtra',
 'Bhadrabāhucarita',
 'Bhairavastava',
 'Bhallaṭaśataka',
 'Bhramarāṣṭaka',
 'Bhāgavatapurāṇa',
 'Bhāradvājagṛhyasūtra',
 'Bhāradvājaśrautasūtra',
 'Bhāratamañjarī',
 'Bhāvaprakāśa',
 'Bodhicaryāvatāra',
 'Brahmabindūpaniṣat',
 'Buddhacarita',
 'Bījanighaṇṭu',
 'Bṛhadāraṇyakopaniṣad',
 'Bṛhatkathāślokasaṃgraha',
 'Cakra (?) on Suśr',
 

: 