In [None]:
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [None]:
# --- Scraper configuration -------------------------------------------------

# Site root: only links that start with this prefix are kept.
general_base_url = "http://hrlibrary.umn.edu"
# Index page of the Human Rights Library whose anchors are harvested.
base_url = "http://hrlibrary.umn.edu/instree/ainstls1.htm"

# Text file that receives the accepted URLs.
output_valid_urls = "URL.txt"

# A URL containing any of these substrings is skipped (language sections).
excluded_languages = [
    '/arab/',
    '/spanish/',
    '/russian/',
    '/japanese/',
    '/K-',  # NOTE(review): presumably a Korean file-name prefix - confirm
    '/chinese/',
    '/Chinese/',
    '/french/',
    '/farsi/',
]
# A URL ending in any of these suffixes is skipped (non-HTML documents).
excluded_file_types = [
    '.pdf',
    '.doc',
]


In [None]:
# Function to check if a URL is valid
def is_url_valid(url, timeout=5):
    """Return True if ``url`` answers with HTTP 200, following redirects.

    A HEAD request is tried first because it avoids downloading the body.
    Some servers reject HEAD (405 Method Not Allowed / 501 Not Implemented)
    while serving the same resource over GET, so in that case the check is
    retried with a streamed GET instead of reporting the URL as invalid.

    Args:
        url: Absolute URL to probe.
        timeout: Seconds to wait for each request (default 5).

    Returns:
        bool: True when the final status code is 200; False for any other
        status or when the request raises ``requests.RequestException``.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        if response.status_code in (405, 501):
            # HEAD is not supported here; fall back to GET. stream=True defers
            # the body download, and close() releases the connection unread.
            response = requests.get(
                url, allow_redirects=True, timeout=timeout, stream=True
            )
            response.close()
        return response.status_code == 200
    except requests.RequestException:
        # Connection errors, timeouts, invalid URLs, too many redirects, ...
        return False
# Function to check if a URL should be excluded
def should_exclude_url(url, languages=None, file_types=None):
    """Return True if ``url`` matches an excluded language or file type.

    The language test is a case-sensitive substring match on the whole URL.
    The file-type test looks only at the URL's path component and ignores
    case, so ``.PDF`` and ``report.pdf#page=2`` / ``report.pdf?dl=1`` are
    recognised as PDFs (a plain ``url.endswith('.pdf')`` misses all three).

    Args:
        url: Absolute or relative URL string.
        languages: Substrings that mark excluded language sections.
            Defaults to the module-level ``excluded_languages``.
        file_types: File suffixes (e.g. ``'.pdf'``) to exclude.
            Defaults to the module-level ``excluded_file_types``.

    Returns:
        bool: True if the URL should be skipped.
    """
    if languages is None:
        languages = excluded_languages
    if file_types is None:
        file_types = excluded_file_types

    if any(marker in url for marker in languages):
        return True

    # Compare against the path only, so a query string or fragment after
    # the file name cannot hide the extension.
    path = urlsplit(url).path.lower()
    # str.endswith accepts a tuple; an empty tuple yields False.
    return path.endswith(tuple(suffix.lower() for suffix in file_types))


In [None]:
# Function to scrape links from the main page
def scrape_links(timeout=10):
    """Collect reachable links from the index page and save them to a file.

    Fetches ``base_url``, resolves every ``<a href>`` against it, keeps the
    links that start with ``general_base_url`` and are not rejected by
    ``should_exclude_url``, probes each distinct link once with
    ``is_url_valid``, and writes the surviving URLs, sorted, one per line,
    to ``output_valid_urls``. Progress and failures are reported with
    ``print``; nothing is returned.

    Args:
        timeout: Seconds to wait for the index page (default 10). Without a
            timeout ``requests.get`` can block indefinitely.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failure: report it instead of crashing the cell.
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Filter first and de-duplicate with a set, so every distinct URL costs
    # at most one validation request no matter how often the page links it.
    candidate_urls = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        full_url = urljoin(base_url, href)  # relative -> absolute
        if full_url.startswith(general_base_url) and not should_exclude_url(full_url):
            candidate_urls.add(full_url)

    unique_valid_urls = sorted(url for url in candidate_urls if is_url_valid(url))

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_valid_urls, 'w', encoding='utf-8') as valid_file:
        valid_file.writelines(url + '\n' for url in unique_valid_urls)

    print(f"Saved {len(unique_valid_urls)} unique valid URLs to {output_valid_urls}.")


In [None]:
# Run the scraping function.
# The guard limits execution to when this code runs as the main program
# (``__name__`` is "__main__"); importing it as a module does not start
# the scrape.
if __name__ == "__main__":
    scrape_links()