In [1]:
import os
import logging
import shutil
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.common.exceptions import WebDriverException
from pathlib import Path
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time
import os



# Scraper that saves pages of the USPTO RDMS website (MPEP) as local HTML files.

# Each chapter is saved as one file.

# A working Selenium WebDriver setup is required to run this code.



In [3]:
import os
from dotenv import load_dotenv
import logging

def configure_environment():
    """Prepare process environment variables for the driver manager and load ``.env``.

    Creates ``~/.wdm`` if it does not exist, sets ``WDM_LOCAL`` and
    ``WDM_CACHE_PATH`` in ``os.environ``, then loads additional variables from
    ``../.env`` (relative to the current working directory) via ``load_dotenv``.

    Returns:
        str: Absolute path of the driver cache directory (``~/.wdm``).
    """
    # Use absolute path for driver cache
    cache_path = os.path.join(os.path.expanduser("~"), ".wdm")
    os.makedirs(cache_path, exist_ok=True)

    # Set environment variables
    # NOTE(review): presumably these are read by webdriver_manager — confirm
    # against the installed version's documentation.
    os.environ["WDM_LOCAL"] = "1"
    os.environ["WDM_CACHE_PATH"] = cache_path

    # Load other environment variables
    env_path = os.path.join(os.getcwd(), "..", ".env")
    load_dotenv(env_path)

    # Log through the root logger *object* rather than the module-level
    # logging.debug(): the module-level helpers implicitly call basicConfig()
    # when no handler is installed, which would turn a later setup_logging()
    # call into a no-op and silently ignore LOG_LEVEL.
    root_logger = logging.getLogger()
    root_logger.debug(f"WDM_LOCAL: {os.environ['WDM_LOCAL']}")
    root_logger.debug(f"WDM_CACHE_PATH: {os.environ['WDM_CACHE_PATH']}")

    return cache_path


def setup_logging():
    """Configure root logging from the ``LOG_LEVEL`` environment variable.

    ``LOG_LEVEL`` is matched case-insensitively against the level constants of
    the ``logging`` module (``DEBUG``, ``INFO``, ``WARNING``, ...). When it is
    unset, ``INFO`` is used. When it is empty or does not name a level, the
    function falls back to ``INFO`` and logs a warning instead of raising.
    """
    requested = os.getenv("LOG_LEVEL", "INFO")
    log_level = requested.strip().upper()

    # Only accept attributes that really are numeric level constants; a plain
    # getattr() would raise on unknown names and would happily return
    # non-level attributes such as logging.BASIC_FORMAT.
    level = getattr(logging, log_level, None) if log_level else None
    fell_back = not isinstance(level, int)
    if fell_back:
        log_level = "INFO"
        level = logging.INFO

    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    if fell_back:
        logging.warning(f"Unknown LOG_LEVEL {requested!r}; falling back to INFO")
    logging.info(f"Logging is set to {log_level} level")


def get_driver_options():
    """Build the Edge options object used when launching the browser.

    The window is always started maximized. Headless mode is added only when
    the ``HEADLESS_MODE`` environment variable equals ``"true"`` (compared
    case-insensitively); it defaults to off.

    Returns:
        The populated ``webdriver.EdgeOptions`` instance.
    """
    headless_requested = os.getenv("HEADLESS_MODE", "False").lower() == "true"

    browser_args = ["--start-maximized"]
    if headless_requested:
        browser_args.append("--headless")

    edge_options = webdriver.EdgeOptions()
    for browser_arg in browser_args:
        edge_options.add_argument(browser_arg)
    return edge_options


def setup_edge_driver():
    """Create and return a ready-to-use Microsoft Edge WebDriver.

    Configures logging and the environment, resolves the Edge driver binary
    through ``EdgeChromiumDriverManager``, starts Edge with the options from
    ``get_driver_options()`` and sets a 30 second page-load timeout.

    Returns:
        The initialized ``webdriver.Edge`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        WebDriverException: If Selenium fails to start or configure the driver.
        Exception: Any other error raised during setup. Both kinds are logged
            with a traceback and then re-raised unchanged.
    """
    # Import the Edge service under an explicit alias. The notebook's import
    # cell binds the bare name `Service` twice (Edge first, Chrome last), so
    # the module-level `Service` is the Chrome class, not the Edge one.
    from selenium.webdriver.edge.service import Service as EdgeService

    setup_logging()
    configure_environment()

    try:
        # Create WebDriver Manager instance
        driver_path = EdgeChromiumDriverManager().install()

        # Set up the service and driver
        service = EdgeService(driver_path)
        options = get_driver_options()
        driver = webdriver.Edge(service=service, options=options)
        driver.set_page_load_timeout(30)

        logging.info("EdgeDriver successfully initialized")
        return driver

    except WebDriverException as we:
        logging.error(
            f"WebDriverException occurred while setting up EdgeDriver: {str(we)}",
            exc_info=True,
        )
        raise
    except Exception as e:
        logging.error(
            f"Unexpected error while setting up EdgeDriver: {str(e)}", exc_info=True
        )
        raise



In [17]:
# Location of a locally installed chromedriver binary. Set the
# CHROMEDRIVER_PATH environment variable to override it; the default below is
# the original author's Windows install location.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:\\chrome-testing\\chromedriver-win64\\chromedriver.exe',
)

# `Service` resolves to selenium.webdriver.chrome.service.Service here because
# that import is the last one to bind the name in the import cell.
service = Service(chromedriver_path)

# NOTE(review): in the visible cells this Chrome `driver` is never used by
# download_chapter_htmls (which starts its own Edge driver) and is never
# quit — call driver.quit() when done, or drop this cell if it is obsolete.
driver = webdriver.Chrome(service=service)

In [5]:
# Populate os.environ first (WDM_* settings plus anything found in ../.env) so
# that setup_logging() can read LOG_LEVEL from the environment.
configure_environment()
setup_logging()

# Resolve the Edge driver executable through webdriver_manager; the returned
# path is handed to download_chapter_htmls() below.
driver_path = EdgeChromiumDriverManager().install()

def save_html(driver, filename):
    """Write the HTML of the page currently loaded in ``driver`` to ``filename``.

    Args:
        driver: Object exposing a ``page_source`` attribute with the page HTML.
        filename: Destination path; the file is created or overwritten and
            encoded as UTF-8.
    """
    # page_source is evaluated before the file is opened, so a failure while
    # reading it never leaves a truncated file behind.
    Path(filename).write_text(driver.page_source, encoding='utf-8')

def _chapter_filename(name, index, used_names):
    """Turn a chapter link text into a unique, filesystem-safe base file name."""
    cleaned = ''.join(c for c in name if c.isalnum() or c.isspace()).strip()
    if not cleaned:
        # Link had no usable text; without a fallback the file would be ".html".
        cleaned = f'chapter_{index}'

    candidate = cleaned
    # Compare case-insensitively so names stay distinct on case-insensitive
    # filesystems as well.
    if candidate.casefold() in used_names:
        candidate = f'{cleaned}_{index}'
    used_names.add(candidate.casefold())
    return candidate


def download_chapter_htmls(base_url, driver_path, output_dir='MPEP_HTML'):
    """Save every chapter linked from the manual's landing page as an HTML file.

    Opens ``base_url`` in a fresh Edge driver, waits up to 10 seconds for the
    element with id ``manual``, collects all ``#manual li a`` links that have an
    ``href``, then visits each link and writes its page source to
    ``<output_dir>/<sanitized link text>.html``.

    Args:
        base_url: URL of the page that lists the chapters.
        driver_path: Unused; kept so existing callers keep working. The driver
            binary is located by ``setup_edge_driver()`` itself.
        output_dir: Directory that receives the HTML files (created if
            missing). Defaults to ``'MPEP_HTML'``.

    Notes:
        Links whose text sanitizes to an empty string are saved as
        ``chapter_<n>.html``; a link whose sanitized text repeats an earlier
        one gets ``_<n>`` appended instead of overwriting the earlier file
        (``n`` is the 1-based position of the link).
    """
    driver = setup_edge_driver()

    try:
        driver.get(base_url)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "manual"))
        )

        # Read href/text for every link up front, before navigating away from
        # the landing page the elements belong to.
        links_and_names = []
        for chapter in driver.find_elements(By.CSS_SELECTOR, "#manual li a"):
            href = chapter.get_attribute('href')
            if href:
                links_and_names.append((href, chapter.text.strip()))

        os.makedirs(output_dir, exist_ok=True)

        used_names = set()
        for index, (link, name) in enumerate(links_and_names, start=1):
            driver.get(link)
            # Fixed pause after navigation; presumably gives client-side
            # rendering time to finish — TODO confirm / replace with a wait.
            time.sleep(3)

            valid_name = _chapter_filename(name, index, used_names)
            save_html(driver, os.path.join(output_dir, f'{valid_name}.html'))

    finally:
        driver.quit()

# Run the scrape: visit the current MPEP landing page and save each linked
# chapter page into the MPEP_HTML directory.
download_chapter_htmls('https://mpep.uspto.gov/RDMS/MPEP/current', driver_path)