In [1]:
import requests
from bs4 import BeautifulSoup
import sys

In [2]:
def get_sitemap_urls(sitemap_index_url):
    """
    Fetches the main sitemap index and returns a list
    of all individual sitemap URLs.
    """
    sitemap_urls = []
    print(f"Fetching sitemap index from {sitemap_index_url}...", file=sys.stderr)
    try:
        response = requests.get(sitemap_index_url)
        response.raise_for_status()
        
        # Use 'xml' parser for BeautifulSoup
        soup = BeautifulSoup(response.content, 'xml')
        
        for loc in soup.find_all('loc'):
            sitemap_urls.append(loc.text)
            
    except requests.RequestException as e:
        print(f"Error fetching sitemap index: {e}", file=sys.stderr)
    
    print(f"Found {len(sitemap_urls)} sitemaps to parse.", file=sys.stderr)
    return sitemap_urls

In [3]:
def get_pdf_links_from_sitemap(sitemap_url):
    """
    Fetches an individual sitemap and yields all URLs
    that end in '.pdf'.
    """
    try:
        response = requests.get(sitemap_url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'xml')
        
        for loc in soup.find_all('loc'):
            url = loc.text
            if url.endswith('.pdf'):
                yield url
                
    except requests.RequestException as e:
        print(f"Warning: Could not parse {sitemap_url}: {e}", file=sys.stderr)

In [4]:
def find_all_aws_pdf_links():
    """
    Main function to find all AWS doc PDF links.
    It prints the links to standard output, one per line.
    """
    sitemap_index = "https://docs.aws.amazon.com/sitemap.xml"
    all_pdf_links = set()
    
    sitemaps = get_sitemap_urls(sitemap_index)
    
    if not sitemaps:
        print("No sitemaps found. Exiting.", file=sys.stderr)
        return

    for i, sitemap in enumerate(sitemaps):
        print(f"Parsing sitemap {i+1}/{len(sitemaps)}: {sitemap}", file=sys.stderr)
        
        for pdf_link in get_pdf_links_from_sitemap(sitemap):
            if pdf_link not in all_pdf_links:
                all_pdf_links.add(pdf_link)
                print(pdf_link) # Print to stdout as soon as it's found
    
    print(f"\nTotal unique PDF links found: {len(all_pdf_links)}", file=sys.stderr)

In [5]:
find_all_aws_pdf_links()

Fetching sitemap index from https://docs.aws.amazon.com/sitemap.xml...
Error fetching sitemap index: 404 Client Error: Not Found for url: https://docs.aws.amazon.com/sitemap.xml
Found 0 sitemaps to parse.
No sitemaps found. Exiting.
