In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit
import pandas as pd
import json

BASE_URL = "https://docs.aws.amazon.com/"
LANDING_XML = "https://docs.aws.amazon.com/en_us/main-landing-page.xml"

In [2]:
CORE_SERVICES = {
    "lambda", "ec2", "s3", "iam", "rds", "dynamodb", "glue", "athena",
    "emr", "redshift", "sns", "sqs", "cloudformation", "cloudwatch",
    "kinesis", "ecs", "eks", "api-gateway", "sagemaker", "vpc",
    "route53", "cloudtrail", "kms", "ssm", "secrets-manager", "opensearch",
    "elasticsearch", "lakeformation", "batch", "elasticbeanstalk",
    "transfer", "backup", "cloudfront", "appsync", "step-functions"
}

In [3]:
def fetch(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to fetch {url}: {e}")
        return ""

In [4]:
def get_service_links():
    xml = fetch(LANDING_XML)
    soup = BeautifulSoup(xml, "xml")

    service_links = []
    for item in soup.find_all("list-card-item"):
        href = item.get("href")
        if not href:
            continue
        href_path = href.split("?")[0]
        parts = href_path.strip("/").split("/")
        if not parts:
            continue
        name = parts[0].lower()
        if name in CORE_SERVICES:
            title_tag = item.find("title")
            title = title_tag.get_text(strip=True) if title_tag else name
            full_url = urljoin(BASE_URL, href_path)
            service_links.append({"title": title, "url": full_url})
    return service_links

service_links = get_service_links()
print(f"‚úÖ Found {len(service_links)} candidate services.")

‚úÖ Found 43 candidate services.


In [5]:

def normalize_service_name(title: str) -> str:
    t = title.lower()
    mapping = {
        "amazon elastic compute cloud": "ec2",
        "aws lambda": "lambda",
        "amazon simple storage service": "s3",
        "amazon dynamodb": "dynamodb",
        "amazon relational database service": "rds",
        "amazon redshift": "redshift",
        "aws glue": "glue",
        "amazon cloudwatch": "cloudwatch",
        "amazon sns": "sns",
        "amazon sqs": "sqs",
        "aws step functions": "step-functions",
        "aws key management service": "kms",
    }
    for key, val in mapping.items():
        if key in t:
            return val
    if "aws " in t:
        t = t.split("aws ")[1]
    elif "amazon " in t:
        t = t.split("amazon ")[1]
    return t.split()[0].replace(" ", "-").replace("(", "").replace(")", "")

In [6]:
def find_pdf_url(service_url):
    """
    Try multiple common subpaths, follow redirects, and tolerate errors.
    """
    try:
        r = requests.get(service_url, allow_redirects=True, timeout=10)
        final_url = r.url
    except Exception:
        final_url = service_url

    try:
        parts = urlsplit(final_url)
        base = f"{parts.scheme}://{parts.netloc}{parts.path}"
        if not base.endswith("/"):
            base += "/"

        subpaths = [
            "", "latest/dg/", "latest/userguide/", "latest/developerguide/",
            "latest/APIReference/", "userguide/", "dg/", "APIReference/"
        ]

        for sub in subpaths:
            meta_url = urljoin(base, f"{sub}meta-inf/guide-info.json")
            try:
                resp = requests.get(meta_url, timeout=8)
                if resp.status_code == 200:
                    data = resp.json()
                    if "pdf" in data and data["pdf"]:
                        return urljoin(urljoin(base, sub), data["pdf"])
            except Exception:
                continue
    except Exception:
        pass
    return None

In [7]:
results = []
seen = set()

for entry in service_links:
    title = entry["title"].strip()
    service = normalize_service_name(title)
    if service in seen:
        continue

    pdf_url = None
    try:
        pdf_url = find_pdf_url(entry["url"])
    except Exception as e:
        print(f"‚ö†Ô∏è Failed for {service}: {e}")

    # if not pdf_url and service in SERVICE_PDF_FALLBACKS:
    #     pdf_url = SERVICE_PDF_FALLBACKS[service][0]

    if pdf_url:
        results.append({
            "service": service,
            "title": title,
            "pdf_url": pdf_url
        })
        seen.add(service)

df_pdfs = pd.DataFrame(results).drop_duplicates("service").sort_values("service")
df_pdfs.reset_index(drop=True, inplace=True)
df_pdfs


Unnamed: 0,service,title,pdf_url
0,appsync,AWS AppSync,https://docs.aws.amazon.com/pdfs/appsync/lates...
1,athena,Amazon Athena,https://docs.aws.amazon.com/pdfs/athena/latest...
2,batch,AWS Batch,https://docs.aws.amazon.com/pdfs/batch/latest/...
3,cloudfront,Amazon CloudFront,https://docs.aws.amazon.com/pdfs/cloudfront/la...
4,cloudwatch,Amazon CloudWatch,https://docs.aws.amazon.com/pdfs/cloudwatch/la...
5,eks,Amazon EKS,https://docs.aws.amazon.com/pdfs/eks/latest/us...
6,emr,Amazon EMR,https://docs.aws.amazon.com/pdfs/emr/latest/AP...
7,glue,AWS Glue,https://docs.aws.amazon.com/pdfs/glue/latest/d...
8,kinesis,Amazon Kinesis,https://docs.aws.amazon.com/pdfs/kinesis/lates...
9,kms,AWS KMS,https://docs.aws.amazon.com/pdfs/kms/latest/de...


In [8]:
print(f"‚úÖ Found {len(df_pdfs)} AWS core service PDFs.")

# Preview
display(df_pdfs.head(20))

# Optional JSON export
mapping = dict(zip(df_pdfs["service"], df_pdfs["pdf_url"]))
with open("aws_core_docs.json", "w") as f:
    json.dump(mapping, f, indent=2)
print("üíæ Saved to aws_core_docs.json")

‚úÖ Found 17 AWS core service PDFs.


Unnamed: 0,service,title,pdf_url
0,appsync,AWS AppSync,https://docs.aws.amazon.com/pdfs/appsync/lates...
1,athena,Amazon Athena,https://docs.aws.amazon.com/pdfs/athena/latest...
2,batch,AWS Batch,https://docs.aws.amazon.com/pdfs/batch/latest/...
3,cloudfront,Amazon CloudFront,https://docs.aws.amazon.com/pdfs/cloudfront/la...
4,cloudwatch,Amazon CloudWatch,https://docs.aws.amazon.com/pdfs/cloudwatch/la...
5,eks,Amazon EKS,https://docs.aws.amazon.com/pdfs/eks/latest/us...
6,emr,Amazon EMR,https://docs.aws.amazon.com/pdfs/emr/latest/AP...
7,glue,AWS Glue,https://docs.aws.amazon.com/pdfs/glue/latest/d...
8,kinesis,Amazon Kinesis,https://docs.aws.amazon.com/pdfs/kinesis/lates...
9,kms,AWS KMS,https://docs.aws.amazon.com/pdfs/kms/latest/de...


üíæ Saved to aws_core_docs.json


In [9]:
CORE_SERVICES = [
    "ec2", "s3", "lambda", "iam", "rds", "dynamodb", "glue", "athena", "emr",
    "redshift", "sns", "sqs", "cloudformation", "cloudwatch", "kinesis",
    "ecs", "eks", "api-gateway", "sagemaker", "opensearch", "vpc",
    "route53", "cloudtrail", "kms", "ssm", "secrets-manager", "batch",
    "lakeformation", "elasticbeanstalk", "cloudfront", "appsync", "transfer",
    "backup", "datapipeline", "athena", "logs", "quicksight", "codepipeline",
    "codecommit", "codebuild", "codeartifact"
]

In [10]:
from urllib.parse import urljoin

BASE_URL = "https://docs.aws.amazon.com/"

def service_doc_root(service):
    return urljoin(BASE_URL, f"{service}/")

service_roots = [service_doc_root(s) for s in CORE_SERVICES]
len(service_roots)

41

In [11]:
import requests
from urllib.parse import urljoin, urlsplit

def find_pdf_url(service_url):
    """
    Try multiple possible AWS doc subpaths and case variations for guide-info.json
    """
    parts = urlsplit(service_url)
    base = f"{parts.scheme}://{parts.netloc}{parts.path}"
    if not base.endswith("/"):
        base += "/"

    candidate_subpaths = [
        "",
        "latest/dg/",
        "latest/userguide/",
        "latest/UserGuide/",
        "latest/developerguide/",
        "latest/DeveloperGuide/",
        "APIReference/",
        "latest/APIReference/",
        "developer-guide/",
    ]

    for subpath in candidate_subpaths:
        meta_url = urljoin(base, f"{subpath}meta-inf/guide-info.json")
        try:
            r = requests.get(meta_url, timeout=8)
            if r.status_code == 200:
                data = r.json()
                if "pdf" in data and data["pdf"]:
                    pdf_url = urljoin(urljoin(base, subpath), data["pdf"])
                    return pdf_url
        except Exception:
            continue
    return None


In [12]:
results = []

for service in CORE_SERVICES:
    base_url = f"https://docs.aws.amazon.com/{service}/"
    pdf_url = find_pdf_url(base_url)
    if pdf_url:
        results.append({"service": service, "pdf_url": pdf_url})

import pandas as pd
df_pdfs = pd.DataFrame(results)
print(f"‚úÖ Found {len(df_pdfs)} of {len(CORE_SERVICES)} core service PDFs")
df_pdfs


‚úÖ Found 24 of 41 core service PDFs


Unnamed: 0,service,pdf_url
0,lambda,https://docs.aws.amazon.com/pdfs/lambda/latest...
1,glue,https://docs.aws.amazon.com/pdfs/glue/latest/d...
2,athena,https://docs.aws.amazon.com/pdfs/athena/latest...
3,emr,https://docs.aws.amazon.com/pdfs/emr/latest/AP...
4,redshift,https://docs.aws.amazon.com/pdfs/redshift/late...
5,sns,https://docs.aws.amazon.com/pdfs/sns/latest/dg...
6,cloudwatch,https://docs.aws.amazon.com/pdfs/cloudwatch/la...
7,kinesis,https://docs.aws.amazon.com/pdfs/kinesis/lates...
8,eks,https://docs.aws.amazon.com/pdfs/eks/latest/us...
9,sagemaker,https://docs.aws.amazon.com/pdfs/sagemaker/lat...


In [None]:
Domain, Service, PDF_URL
Compute, ec2,  https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf
Compute, lambda, https://docs.aws.amazon.com/pdfs/lambda/latest/dg/lambda-dg.
Compute, ecs, https://docs.aws.amazon.com/pdfs/AmazonECS/latest/developerguide/ecs-dg.pdf
Compute, eks, https://docs.aws.amazon.com/pdfs/eks/latest/userguide/eks-ug.pdf
Compute, elastic-beanstalk, https://docs.aws.amazon.com/pdfs/elasticbeanstalk/latest/dg/awseb-dg.pdf
Compute, batch, https://docs.aws.amazon.com/pdfs/batch/latest/userguide/batch_user.pdf
Storage, s3, https://docs.aws.amazon.com/pdfs/AmazonS3/latest/userguide/s3-userguide.pdf
Storage, ebs, https://docs.aws.amazon.com/pdfs/ebs/latest/userguide/ebs-ug.pdf
Storage, efs, https://docs.aws.amazon.com/pdfs/efs/latest/ug/efs-ug.pdf
Storage, glacier, https://docs.aws.amazon.com/pdfs/amazonglacier/latest/dev/glacier-dg.pdf
Networking, vpc, https://docs.aws.amazon.com/pdfs/vpc/latest/userguide/vpc-ug.pdf
Networking, route53, https://docs.aws.amazon.com/pdfs/Route53/latest/DeveloperGuide/route53-dg.pdf
Networking, cloudfront, https://docs.aws.amazon.com/pdfs/AmazonCloudFront/latest/DeveloperGuide/AmazonCloudFront_DevGuide.pdf
Networking, api-gateway, https://docs.aws.amazon.com/pdfs/apigateway/latest/developerguide/apigateway-dg.pdf
Networking, elasticloadbalancing, https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/userguide/elb-ug.pdf
Networking, application-load-balancer, https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/application/elb-ag.pdf
Networking, network-load-balancer, https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/network/elb-ng.pdf
Networking, gateway-load-balancer, https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/gateway/elb-gateway.pdf
Security, iam, https://docs.aws.amazon.com/pdfs/IAM/latest/UserGuide/iam-ug.pdf
Security, kms, https://docs.aws.amazon.com/pdfs/kms/latest/developerguide/kms-dg.pdf
Security, secrets-manager, https://docs.aws.amazon.com/pdfs/secretsmanager/latest/userguide/secretsmanager-userguide.pdf
Security, cognito, https://docs.aws.amazon.com/pdfs/cognito/latest/developerguide/cognito-dg.pdf
Security, cloudtrail, https://docs.aws.amazon.com/pdfs/awscloudtrail/latest/userguide/awscloudtrail-ug.pdf
Database, rds, https://docs.aws.amazon.com/pdfs/AmazonRDS/latest/UserGuide/rds-ug.pdf
Database, dynamodb, https://docs.aws.amazon.com/pdfs/amazondynamodb/latest/developerguide/dynamodb-dg.pdf
Database, redshift, https://docs.aws.amazon.com/pdfs/redshift/latest/dg/redshift-dg.pdf
Database, elasticache, https://docs.aws.amazon.com/pdfs/AmazonElastiCache/latest/dg/redis-ug.pdf
Management, cloudwatch, https://docs.aws.amazon.com/pdfs/AmazonCloudWatch/latest/monitoring/acw-ug.pdf
Management, cloudformation, https://docs.aws.amazon.com/pdfs/AWSCloudFormation/latest/UserGuide/cfn-ug.pdf
Management, ssm, https://docs.aws.amazon.com/pdfs/systems-manager/latest/userguide/systems-manager-ug.pdf
Management, codepipeline, https://docs.aws.amazon.com/pdfs/codepipeline/latest/userguide/codepipeline-user.pdf
Management, codebuild, https://docs.aws.amazon.com/pdfs/codebuild/latest/userguide/codebuild-user.pdf
Management, codeartifact, https://docs.aws.amazon.com/pdfs/codeartifact/latest/ug/codeartifact-user.pdf
ApplicationIntegration, sqs, https://docs.aws.amazon.com/pdfs/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-dg.pdf
ApplicationIntegration, sns, https://docs.aws.amazon.com/pdfs/sns/latest/dg/sns-dg.pdf
ApplicationIntegration, step-functions, https://docs.aws.amazon.com/pdfs/step-functions/latest/dg/step-functions-dg.pdf
ApplicationIntegration, eventbridge, https://docs.aws.amazon.com/pdfs/eventbridge/latest/userguide/user-guide.pdf
Analytics, quicksight, https://docs.aws.amazon.com/pdfs/quicksuite/latest/userguide/amazon-quicksuite-user.pdf
Analytics, athena, https://docs.aws.amazon.com/pdfs/athena/latest/ug/athena-ug.pdf
Analytics, glue, https://docs.aws.amazon.com/pdfs/glue/latest/dg/glue-dg.pdf
Analytics, emr, https://docs.aws.amazon.com/pdfs/emr/latest/ManagementGuide/emr-mgmt.pdf
Analytics, kinesis, https://docs.aws.amazon.com/pdfs/streams/latest/dev/kinesis-dg.pdf
Analytics, opensearch, https://docs.aws.amazon.com/pdfs/opensearch-service/latest/developerguide/opensearch-service-dg.pdf
Analytics, sagemaker, https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
Analytics, lakeformation, https://docs.aws.amazon.com/pdfs/lake-formation/latest/dg/lake-formation-dg.pdf
Analytics, datapipeline, https://docs.aws.amazon.com/pdfs/datapipeline/latest/DeveloperGuide/datapipeline-dg.pdf

In [None]:
https://docs.aws.amazon.com/databases-on-aws-how-to-choose
https://docs.aws.amazon.com/compute-on-aws-how-to-choose

In [None]:
https://docs.aws.amazon.com/pdfs/decision-guides/latest/analytics-on-aws-how-to-choose/analytics-on-aws-how-to-choose.pdf