In [14]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit

# AWS documentation base URL and landing XML
BASE_URL = "https://docs.aws.amazon.com/"
LANDING_XML = "https://docs.aws.amazon.com/en_us/main-landing-page.xml"

In [15]:
# Define the 30â€“40 AWS core services you care about
CORE_SERVICES = {
    "lambda", "ec2", "s3", "iam", "rds", "dynamodb", "glue", "athena",
    "emr", "redshift", "sns", "sqs", "cloudformation", "cloudwatch",
    "kinesis", "ecs", "eks", "api-gateway", "sagemaker", "vpc",
    "route53", "cloudtrail", "kms", "ssm", "secrets-manager", "opensearch",
    "elasticsearch", "athena", "lakeformation", "batch", "elasticbeanstalk",
    "transfer", "backup", "cloudfront", "appsync", "step-functions"
}

In [16]:
def fetch(url):
    """Download and return text content from a URL."""
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text

In [17]:
xml = fetch(LANDING_XML)

In [18]:
xml

'<?xml version="1.0" encoding="UTF-8"?>\n<main-landing-page version="2.1">\n   <title>Welcome to AWS Documentation</title>\n   <abstract>Find user guides, code samples, SDKs &amp; toolkits, tutorials, API &amp; CLI references, and more.</abstract>\n   <feedbackCTI>Landing Pages</feedbackCTI>\n   <sections>\n      <section id="welcome">\n         <cards>\n            <list-card id="featured-content">\n               <title>Featured content</title>\n               <list-card-items>\n                  <list-card-item href="/ec2/?icmpid=docs_homepage_featuredsvcs" id="ec2">\n                     <title>\n                                Amazon EC2\n                            </title>\n                     <abstract>Create and run virtual servers in the cloud</abstract>\n                     <image src="images/service-icons/Arch_Amazon-EC2_32.svg">\n                        <alt>Amazon EC2 icon</alt>\n                     </image>\n                  </list-card-item>\n                  <list

In [19]:
soup = BeautifulSoup(xml, "xml")

In [20]:
soup

<?xml version="1.0" encoding="utf-8"?>
<main-landing-page version="2.1">
<title>Welcome to AWS Documentation</title>
<abstract>Find user guides, code samples, SDKs &amp; toolkits, tutorials, API &amp; CLI references, and more.</abstract>
<feedbackCTI>Landing Pages</feedbackCTI>
<sections>
<section id="welcome">
<cards>
<list-card id="featured-content">
<title>Featured content</title>
<list-card-items>
<list-card-item href="/ec2/?icmpid=docs_homepage_featuredsvcs" id="ec2">
<title>
                                Amazon EC2
                            </title>
<abstract>Create and run virtual servers in the cloud</abstract>
<image src="images/service-icons/Arch_Amazon-EC2_32.svg">
<alt>Amazon EC2 icon</alt>
</image>
</list-card-item>
<list-card-item href="/s3/?icmpid=docs_homepage_featuredsvcs" id="S3">
<title>
                                Amazon S3
                            </title>
<abstract>Object storage built to retrieve any amount of data from anywhere</abstract>
<image src="

In [21]:
service_links = []

In [35]:
containers = soup.find_all("list-card-items")
len(containers), containers[0]  # shows the first container so you can inspect it

(33,
 <list-card-items>
 <list-card-item href="/ec2/?icmpid=docs_homepage_featuredsvcs" id="ec2">
 <title>
                                 Amazon EC2
                             </title>
 <abstract>Create and run virtual servers in the cloud</abstract>
 <image src="images/service-icons/Arch_Amazon-EC2_32.svg">
 <alt>Amazon EC2 icon</alt>
 </image>
 </list-card-item>
 <list-card-item href="/s3/?icmpid=docs_homepage_featuredsvcs" id="S3">
 <title>
                                 Amazon S3
                             </title>
 <abstract>Object storage built to retrieve any amount of data from anywhere</abstract>
 <image src="images/service-icons/Arch_Amazon-Simple-Storage-Service_32.svg">
 <alt>Amazon S3 icon</alt>
 </image>
 </list-card-item>
 <list-card-item href="/dynamodb/?icmpid=docs_homepage_featuredsvcs" id="dynamodb">
 <title>
                                 Amazon DynamoDB
                             </title>
 <abstract>Managed NoSQL database service</abstract>
 <image src=

In [36]:
cards = soup.find_all("list-card-item")   # <- directly find the child elements
len(cards)

392

In [38]:
cards[0].get("href")

'/ec2/?icmpid=docs_homepage_featuredsvcs'

In [39]:
from urllib.parse import urljoin

service_entries = []
for card in cards:
    href = card.get("href")  # href is usually something like "/ec2/?icmpid=..."
    title_tag = card.find("title")
    title = title_tag.get_text(strip=True) if title_tag else None

    if not href:
        # sometimes href is missing (rare). skip those.
        continue

    # remove query params (we only need the path)
    href_path = href.split("?")[0]

    # normalize to absolute URL
    full_url = urljoin(BASE_URL, href_path)

    service_entries.append({"title": title, "href_raw": href, "href_path": href_path, "url": full_url})

# quick preview
service_entries[:10]

[{'title': 'Amazon EC2',
  'href_raw': '/ec2/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/ec2/',
  'url': 'https://docs.aws.amazon.com/ec2/'},
 {'title': 'Amazon S3',
  'href_raw': '/s3/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/s3/',
  'url': 'https://docs.aws.amazon.com/s3/'},
 {'title': 'Amazon DynamoDB',
  'href_raw': '/dynamodb/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/dynamodb/',
  'url': 'https://docs.aws.amazon.com/dynamodb/'},
 {'title': 'Amazon Relational Database Service',
  'href_raw': '/rds/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/rds/',
  'url': 'https://docs.aws.amazon.com/rds/'},
 {'title': 'AWS Lambda',
  'href_raw': '/lambda/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/lambda/',
  'url': 'https://docs.aws.amazon.com/lambda/'},
 {'title': 'Amazon VPC',
  'href_raw': '/vpc/?icmpid=docs_homepage_featuredsvcs',
  'href_path': '/vpc/',
  'url': 'https://docs.aws.amazon.com/vpc/'},
 {'title': 'Amazon SageMaker',
  'href_

In [42]:
import pandas as pd
df = pd.DataFrame(service_entries)
df

Unnamed: 0,title,href_raw,href_path,url
0,Amazon EC2,/ec2/?icmpid=docs_homepage_featuredsvcs,/ec2/,https://docs.aws.amazon.com/ec2/
1,Amazon S3,/s3/?icmpid=docs_homepage_featuredsvcs,/s3/,https://docs.aws.amazon.com/s3/
2,Amazon DynamoDB,/dynamodb/?icmpid=docs_homepage_featuredsvcs,/dynamodb/,https://docs.aws.amazon.com/dynamodb/
3,Amazon Relational Database Service,/rds/?icmpid=docs_homepage_featuredsvcs,/rds/,https://docs.aws.amazon.com/rds/
4,AWS Lambda,/lambda/?icmpid=docs_homepage_featuredsvcs,/lambda/,https://docs.aws.amazon.com/lambda/
...,...,...,...,...
387,JetBrains,/toolkit-for-jetbrains/?icmpid=docs_homepage_s...,/toolkit-for-jetbrains/,https://docs.aws.amazon.com/toolkit-for-jetbra...
388,PowerShell,/powershell/?icmpid=docs_homepage_sdktoolkits,/powershell/,https://docs.aws.amazon.com/powershell/
389,Amazon Q Developer,/amazonq/latest/qdeveloper-ug/what-is.html?icm...,/amazonq/latest/qdeveloper-ug/what-is.html,https://docs.aws.amazon.com/amazonq/latest/qde...
390,Visual Studio,/aws-toolkit-visual-studio/?icmpid=docs_homepa...,/aws-toolkit-visual-studio/,https://docs.aws.amazon.com/aws-toolkit-visual...


In [43]:
CORE_SERVICES = {
    "lambda", "ec2", "s3", "iam", "rds", "dynamodb", "glue", "athena",
    "emr", "redshift", "sns", "sqs", "cloudformation", "cloudwatch",
    "kinesis", "ecs", "eks", "api-gateway", "sagemaker", "vpc",
    "route53", "cloudtrail", "kms", "ssm", "secrets-manager", "opensearch",
    "elasticsearch", "lakeformation", "batch", "elasticbeanstalk",
    "transfer", "backup", "cloudfront", "appsync", "step-functions"
}

def path_contains_core(path):
    # split path and check first segment(s)
    parts = [p for p in urlsplit(path).path.split("/") if p]
    if not parts:
        return False
    first = parts[0].lower()
    # some services have multi-word names in path (e.g., next-generation-sagemaker -> sagemaker)
    return any(first == core or core in first for core in CORE_SERVICES)

filtered = [r for r in service_entries if path_contains_core(r["href_path"])]
len(filtered), filtered[:8]

(50,
 [{'title': 'Amazon EC2',
   'href_raw': '/ec2/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/ec2/',
   'url': 'https://docs.aws.amazon.com/ec2/'},
  {'title': 'Amazon S3',
   'href_raw': '/s3/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/s3/',
   'url': 'https://docs.aws.amazon.com/s3/'},
  {'title': 'Amazon DynamoDB',
   'href_raw': '/dynamodb/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/dynamodb/',
   'url': 'https://docs.aws.amazon.com/dynamodb/'},
  {'title': 'Amazon Relational Database Service',
   'href_raw': '/rds/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/rds/',
   'url': 'https://docs.aws.amazon.com/rds/'},
  {'title': 'AWS Lambda',
   'href_raw': '/lambda/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/lambda/',
   'url': 'https://docs.aws.amazon.com/lambda/'},
  {'title': 'Amazon VPC',
   'href_raw': '/vpc/?icmpid=docs_homepage_featuredsvcs',
   'href_path': '/vpc/',
   'url': 'https://docs.aws.amazon.com/vpc/'},
  {'title'

In [27]:
services = soup.find_all("list-card-items")

In [44]:
import requests
from urllib.parse import urljoin, urlsplit

def find_pdf_url(service_url):
    parts = urlsplit(service_url)
    base = f"{parts.scheme}://{parts.netloc}{parts.path}"
    if not base.endswith("/"):
        base += "/"
    meta_url = urljoin(base, "meta-inf/guide-info.json")
    try:
        r = requests.get(meta_url, timeout=10)
        if r.status_code == 200:
            data = r.json()
            if "pdf" in data and data["pdf"]:
                return urljoin(base, data["pdf"])
    except Exception:
        return None
    return None

results = []
for entry in filtered:
    pdf = find_pdf_url(entry["url"])
    results.append({"title": entry["title"], "service_url": entry["url"], "pdf_url": pdf})

# show found pdfs
import pandas as pd
pd.DataFrame(results).dropna(subset=["pdf_url"])

Unnamed: 0,title,service_url,pdf_url
6,Amazon SageMaker,https://docs.aws.amazon.com/next-generation-sa...,https://docs.aws.amazon.com/pdfs/next-generati...
7,Amazon SageMaker,https://docs.aws.amazon.com/next-generation-sa...,https://docs.aws.amazon.com/pdfs/next-generati...


In [46]:
import requests
from urllib.parse import urljoin, urlsplit

def find_pdf_url(service_url):
    """
    Try multiple known subpaths (dg, userguide, developerguide, APIReference)
    to find a valid meta-inf/guide-info.json and return the PDF URL.
    """
    parts = urlsplit(service_url)
    base = f"{parts.scheme}://{parts.netloc}{parts.path}"
    if not base.endswith("/"):
        base += "/"

    # Common AWS documentation folder structures
    candidate_subpaths = [
        "",  # direct root (some like SageMaker)
        "latest/dg/",
        "latest/userguide/",
        "latest/developerguide/",
        "latest/APIReference/",
    ]

    for subpath in candidate_subpaths:
        meta_url = urljoin(base, f"{subpath}meta-inf/guide-info.json")
        try:
            r = requests.get(meta_url, timeout=10)
            if r.status_code == 200:
                data = r.json()
                if "pdf" in data and data["pdf"]:
                    pdf_url = urljoin(urljoin(base, subpath), data["pdf"])
                    return pdf_url
        except Exception:
            continue
    return None

In [47]:
results = []
for entry in filtered:
    pdf = find_pdf_url(entry["url"])
    results.append({
        "title": entry["title"],
        "service_url": entry["url"],
        "pdf_url": pdf
    })

import pandas as pd
df_pdfs = pd.DataFrame(results).dropna(subset=["pdf_url"])
df_pdfs

Unnamed: 0,title,service_url,pdf_url
4,AWS Lambda,https://docs.aws.amazon.com/lambda/,https://docs.aws.amazon.com/pdfs/lambda/latest...
5,Amazon VPC,https://docs.aws.amazon.com/vpc/,https://docs.aws.amazon.com/pdfs/vpc/latest/us...
6,Amazon SageMaker,https://docs.aws.amazon.com/next-generation-sa...,https://docs.aws.amazon.com/pdfs/next-generati...
7,Amazon SageMaker,https://docs.aws.amazon.com/next-generation-sa...,https://docs.aws.amazon.com/pdfs/next-generati...
8,Amazon Athena,https://docs.aws.amazon.com/athena/,https://docs.aws.amazon.com/pdfs/athena/latest...
9,Amazon EMR,https://docs.aws.amazon.com/emr/,https://docs.aws.amazon.com/pdfs/emr/latest/AP...
10,AWS Glue,https://docs.aws.amazon.com/glue/,https://docs.aws.amazon.com/pdfs/glue/latest/d...
11,Amazon Kinesis,https://docs.aws.amazon.com/kinesis/,https://docs.aws.amazon.com/pdfs/kinesis/lates...
12,Amazon OpenSearch Service,https://docs.aws.amazon.com/opensearch-service/,https://docs.aws.amazon.com/pdfs/opensearch-se...
13,Amazon Redshift,https://docs.aws.amazon.com/redshift/,https://docs.aws.amazon.com/pdfs/redshift/late...


In [53]:
df_pdfs.iloc[0,2]

'https://docs.aws.amazon.com/pdfs/lambda/latest/dg/lambda-dg.pdf'

In [33]:
service_links = []

for item in services:
    href = item.get("href")
    title_tag = item.find("title")
    title = title_tag.get_text(strip=True) if title_tag else None
    if not href or not title:
        continue
    
    # Normalize
    href = href.split("?")[0]  # remove query params like ?icmpid=...
    full_url = urljoin(BASE_URL, href)
    
    service_links.append({
        "title": title,
        "url": full_url
    })

len(service_links)

0

In [34]:
services[0].get("href")


In [26]:
for service in soup.find_all("list-card-items"):
    print(service.get("href"))


None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [25]:
service_links

[]

In [13]:
service_links

[]

In [None]:



def get_service_links():
    """Parse AWS main landing XML and return service doc root URLs for selected services."""
    xml = fetch(LANDING_XML)
    soup = BeautifulSoup(xml, "xml")
    service_links = []
    for service in soup.find_all("service"):
        href = service.get("href")
        if not href:
            continue
        parts = href.strip("/").split("/")
        if not parts:
            continue
        name = parts[0].lower()
        if name in CORE_SERVICES:
            service_links.append(urljoin(BASE_URL, href))
    return service_links


def find_pdf_url(service_url):
    """Given a service doc root, return the PDF URL from guide-info.json."""
    parts = urlsplit(service_url)
    base = f"{parts.scheme}://{parts.netloc}{parts.path}"
    if not base.endswith("/"):
        base += "/"
    meta_url = urljoin(base, "meta-inf/guide-info.json")
    try:
        r = requests.get(meta_url, timeout=10)
        if r.status_code == 200:
            data = r.json()
            if "pdf" in data:
                return urljoin(base, data["pdf"])
    except Exception:
        pass
    return None
