In [None]:
def compare_to_existing(data: dict, filepath: str) -> bool:
    """Refresh the JSON file at ``filepath`` against the current data.

    The stored ``accessed`` field is ignored for the comparison. If the
    remaining content equals ``data``, only ``accessed`` is refreshed;
    otherwise the file content is replaced by ``data`` plus a fresh
    ``accessed`` stamp. A file that does not exist is not created here.

    Args:
        data (dict): The current data to compare. It is not modified.
        filepath (str): The path to the JSON file to compare with.

    Returns:
        bool: True if the file does not exist or was rewritten, False if
        reading, comparing or writing failed (the error is printed).
    """

    def without_accessed(record: dict) -> dict:
        # `accessed` changes on every run and must not affect equality.
        return {key: value for key, value in record.items() if key != "accessed"}

    try:
        if not os.path.exists(filepath):
            # Nothing to compare against; this function never creates the file.
            return True

        with open(filepath, "r", encoding="utf-8") as file:
            existing_data = json.load(file)

        # Store an ISO string: `date` objects are not JSON serializable.
        today = date.today().isoformat()

        unchanged = isinstance(existing_data, dict) and (
            without_accessed(existing_data) == without_accessed(data)
        )
        # Build a new dict instead of mutating the caller's `data`.
        base = existing_data if unchanged else data
        updated = {**base, "accessed": today}

        # Serialize before opening for writing so a serialization error
        # cannot leave a truncated file behind.
        serialized = json.dumps(updated, indent=4, ensure_ascii=False)
        with open(filepath, "w", encoding="utf-8") as file:
            file.write(serialized)
        return True
    except Exception as e:
        # Boundary handler: callers get False instead of an exception.
        print(f"Error comparing or updating file at {filepath}: {e}")
        return False


In [None]:
def extract_relevant_content(soup: BeautifulSoup) -> str:
    """Return the text of the page's main content container.

    Several known container elements are tried in a fixed order; the first
    one found is used. Navigation, sidebar, script and style elements inside
    it are removed from the parsed tree before the text is extracted.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the page content.

    Returns:
        str: The container text with one stripped fragment per line, or an
        empty string if no container matched or an error occurred.
    """
    # (tag name, attribute filter, warning logged when the lookup misses),
    # listed in lookup order.
    candidates = (
        ("main", {"id": "content_wrapper"}, "Main content not found."),
        ("main", {"id": "site-content"}, "Site content not found."),
        ("div", {"id": "main-body"}, "Main-body not found."),
        ("section", {"id": "sp-main-body"}, "Sp-main-body not found."),
        ("div", {"id": "main"}, "Main not found."),
        ("div", {"class": "sc-gsTCUz bhdLno"}, "sc-gsTCUz bhdLno not found."),
    )
    try:
        for tag_name, attrs, miss_message in candidates:
            container = soup.find(tag_name, attrs)
            if container:
                break
            log.warning(miss_message)
        else:
            # Every lookup missed.
            return ""

        # Drop elements that carry no page content.
        for unwanted in container.find_all(["nav", "aside", "script", "style"]):
            unwanted.decompose()

        return container.get_text(separator="\n", strip=True)
    except Exception as e:
        log.error(f"Error extracting relevant content: {e}")
        return ""

In [None]:
def save_to_file(content: dict, filename: str) -> None:
    """Save content to a file in JSON format.

    The parent directory of ``filename`` is created if it does not exist.

    Args:
        content (dict): The JSON-serializable data to save.
        filename (str): The filename, including the path, to save the data.

    Raises:
        TypeError: If ``content`` is not JSON serializable.
        OSError: If the directory or file cannot be created or written.
    """
    # Derive the directory from the target path; an empty result means the
    # file goes into the current working directory and nothing is created.
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        log.warning(f"Directory '{directory}' does not exist. Creating it.")
        os.makedirs(directory, exist_ok=True)

    log.info(f"Saving into {filename}.")
    # Serialize first so a failure cannot truncate an existing file.
    serialized = json.dumps(content, indent=4, ensure_ascii=False)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(serialized)

In [None]:
# headings = [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])]
# paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]

In [None]:
def extract_content(url: str, *, timeout: float = 30.0) -> None:
    """Fetch a page and save its text lines and its "hdm" links to files.

    The page text is split into lines and saved via ``save_to_file`` as
    ``<name>.txt``; the de-duplicated absolute links containing ``"hdm"``
    are saved as ``<name>_links.txt``, where ``<name>`` comes from
    ``extract_domain_part(url)``. Errors are logged and never raised.

    Args:
        url (str): Absolute ``http://`` or ``https://`` URL of the page.
        timeout (float): Seconds to wait for the server before giving up.
    """
    log.info(f"Starting to extract content from {url}")
    try:
        # Validate the URL
        if not url.startswith(("http://", "https://")):
            raise ValueError(f"Invalid URL format: {url}")

        # Fetch the page; without a timeout an unresponsive server can
        # block the request indefinitely.
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()  # Raise HTTPError for bad responses

        # Parse the page content
        soup = BeautifulSoup(page.content, "html.parser")
        text_content = soup.get_text(separator="\n", strip=True).split("\n")

        # Collect the distinct relevant links. They are stored as a sorted
        # list because a set is not JSON serializable.
        links = {urljoin(url, a["href"]) for a in soup.find_all("a", href=True)}
        relevant_links = sorted(link for link in links if "hdm" in link)

        # Sanitize filename
        sanitized_url = extract_domain_part(url)
        if not sanitized_url:
            raise ValueError(f"Unable to extract domain part from URL: {url}")
        filename = f"{sanitized_url}.txt"
        filename_links = f"{sanitized_url}_links.txt"

        # Save to files
        save_to_file(content=text_content, filename=filename)
        save_to_file(content=relevant_links, filename=filename_links)

        log.info("Finished extracting content")

    except (requests.RequestException, ValueError) as e:
        # Catch network-related errors or invalid URL issues
        log.error(f"Skipping URL due to error: {e}")
    except Exception as e:
        # Catch all other unexpected exceptions
        log.error(f"An unexpected error occurred for URL {url}: {e}")


In [None]:
# Seed pages for the scraper: two study-programme profile pages, the site
# root, and the Media Night page.
urls = [
    "https://www.hdm-stuttgart.de/studieninteressierte/studium/bachelor/steckbrief?sgang_ID=550045&sgang_cluster_ID=18",
    "https://www.hdm-stuttgart.de/studieninteressierte/studium/bachelor/steckbrief?sgang_ID=550045&sgang_cluster_ID=19",
    "https://www.hdm-stuttgart.de",
    "https://www.hdm-stuttgart.de/medianight",
]

# Scraping is disabled by default; uncomment to process every seed URL.
# for url in urls:
#     extract_content(url)
