# AWS Docs Processor

This notebook processes AWS documentation PDFs by:
1. Loading each PDF from its URL using `DoclingLoader` with the `MARKDOWN` export type
2. Converting it to markdown (`docs_as_markdown = loader.load()`)
3. Saving the processed documents to pickle files

**Output:**
- Individual service files: `{service_name}_docs.pkl` 
- Combined file: `all_aws_docs.pkl`
- Summary file: `processing_summary.json`

All files are saved in the `./aws_docs_processed/` directory.


In [None]:
import csv
import io
import json
import os
import pickle
import time

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType


In [None]:
# 1. Define your data - AWS Services CSV
# In-memory CSV text: a header row (Domain, Service, PDF_URL) followed by one
# row per AWS service. The rows are read with csv.DictReader in parse_csv_data,
# and the Service value is also used to build each per-service output filename.
CSV_DATA = """Domain,Service,PDF_URL
Compute,ec2,https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf
Compute,lambda,https://docs.aws.amazon.com/pdfs/lambda/latest/dg/lambda-dg.pdf
Compute,ecs,https://docs.aws.amazon.com/pdfs/AmazonECS/latest/developerguide/ecs-dg.pdf
Compute,eks,https://docs.aws.amazon.com/pdfs/eks/latest/userguide/eks-ug.pdf
Compute,elastic-beanstalk,https://docs.aws.amazon.com/pdfs/elasticbeanstalk/latest/dg/awseb-dg.pdf
Compute,batch,https://docs.aws.amazon.com/pdfs/batch/latest/userguide/batch_user.pdf
Storage,s3,https://docs.aws.amazon.com/pdfs/AmazonS3/latest/userguide/s3-userguide.pdf
Storage,ebs,https://docs.aws.amazon.com/pdfs/ebs/latest/userguide/ebs-ug.pdf
Storage,efs,https://docs.aws.amazon.com/pdfs/efs/latest/ug/efs-ug.pdf
Storage,glacier,https://docs.aws.amazon.com/pdfs/amazonglacier/latest/dev/glacier-dg.pdf
Networking,vpc,https://docs.aws.amazon.com/pdfs/vpc/latest/userguide/vpc-ug.pdf
Networking,route53,https://docs.aws.amazon.com/pdfs/Route53/latest/DeveloperGuide/route53-dg.pdf
Networking,cloudfront,https://docs.aws.amazon.com/pdfs/AmazonCloudFront/latest/DeveloperGuide/AmazonCloudFront_DevGuide.pdf
Networking,api-gateway,https://docs.aws.amazon.com/pdfs/apigateway/latest/developerguide/apigateway-dg.pdf
Networking,elasticloadbalancing,https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/userguide/elb-ug.pdf
Networking,application-load-balancer,https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/application/elb-ag.pdf
Networking,network-load-balancer,https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/network/elb-ng.pdf
Networking,gateway-load-balancer,https://docs.aws.amazon.com/pdfs/elasticloadbalancing/latest/gateway/elb-gateway.pdf
Security,iam,https://docs.aws.amazon.com/pdfs/IAM/latest/UserGuide/iam-ug.pdf
Security,kms,https://docs.aws.amazon.com/pdfs/kms/latest/developerguide/kms-dg.pdf
Security,secrets-manager,https://docs.aws.amazon.com/pdfs/secretsmanager/latest/userguide/secretsmanager-userguide.pdf
Security,cognito,https://docs.aws.amazon.com/pdfs/cognito/latest/developerguide/cognito-dg.pdf
Security,cloudtrail,https://docs.aws.amazon.com/pdfs/awscloudtrail/latest/userguide/awscloudtrail-ug.pdf
Database,rds,https://docs.aws.amazon.com/pdfs/AmazonRDS/latest/UserGuide/rds-ug.pdf
Database,dynamodb,https://docs.aws.amazon.com/pdfs/amazondynamodb/latest/developerguide/dynamodb-dg.pdf
Database,redshift,https://docs.aws.amazon.com/pdfs/redshift/latest/dg/redshift-dg.pdf
Database,elasticache,https://docs.aws.amazon.com/pdfs/AmazonElastiCache/latest/dg/redis-ug.pdf
Management,cloudwatch,https://docs.aws.amazon.com/pdfs/AmazonCloudWatch/latest/monitoring/acw-ug.pdf
Management,cloudformation,https://docs.aws.amazon.com/pdfs/AWSCloudFormation/latest/UserGuide/cfn-ug.pdf
Management,ssm,https://docs.aws.amazon.com/pdfs/systems-manager/latest/userguide/systems-manager-ug.pdf
Management,codepipeline,https://docs.aws.amazon.com/pdfs/codepipeline/latest/userguide/codepipeline-user.pdf
Management,codebuild,https://docs.aws.amazon.com/pdfs/codebuild/latest/userguide/codebuild-user.pdf
Management,codeartifact,https://docs.aws.amazon.com/pdfs/codeartifact/latest/ug/codeartifact-user.pdf
ApplicationIntegration,sqs,https://docs.aws.amazon.com/pdfs/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-dg.pdf
ApplicationIntegration,sns,https://docs.aws.amazon.com/pdfs/sns/latest/dg/sns-dg.pdf
ApplicationIntegration,step-functions,https://docs.aws.amazon.com/pdfs/step-functions/latest/dg/step-functions-dg.pdf
ApplicationIntegration,eventbridge,https://docs.aws.amazon.com/pdfs/eventbridge/latest/userguide/user-guide.pdf
Analytics,quicksight,https://docs.aws.amazon.com/pdfs/quicksuite/latest/userguide/amazon-quicksuite-user.pdf
Analytics,athena,https://docs.aws.amazon.com/pdfs/athena/latest/ug/athena-ug.pdf
Analytics,glue,https://docs.aws.amazon.com/pdfs/glue/latest/dg/glue-dg.pdf
Analytics,emr,https://docs.aws.amazon.com/pdfs/emr/latest/ManagementGuide/emr-mgmt.pdf
Analytics,kinesis,https://docs.aws.amazon.com/pdfs/streams/latest/dev/kinesis-dg.pdf
Analytics,opensearch,https://docs.aws.amazon.com/pdfs/opensearch-service/latest/developerguide/opensearch-service-dg.pdf
Analytics,sagemaker,https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
Analytics,lakeformation,https://docs.aws.amazon.com/pdfs/lake-formation/latest/dg/lake-formation-dg.pdf
Analytics,datapipeline,https://docs.aws.amazon.com/pdfs/datapipeline/latest/DeveloperGuide/datapipeline-dg.pdf
"""


In [None]:
# 2. Define output directory for saved documents
# The directory is created here (exist_ok=True makes re-running this cell
# harmless) so the cells that write pickle/JSON files can assume it exists.
OUTPUT_DIR = "./aws_docs_processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")


In [None]:
def parse_csv_data(csv_data):
    """Turn a CSV string with a header row into a list of row dictionaries.

    Args:
        csv_data: CSV text whose first line names the columns.

    Returns:
        One dictionary per data row, mapping each column name from the
        header to that row's value. Empty input yields an empty list.
    """
    return list(csv.DictReader(io.StringIO(csv_data)))

# Parse the CSV data
# service_docs holds one dict per CSV row (keys: Domain, Service, PDF_URL);
# the processing cell iterates over it.
service_docs = parse_csv_data(CSV_DATA)
print(f"‚úÖ Found {len(service_docs)} AWS services to process")


In [None]:
# 3. Process each AWS service document
# For every row in service_docs: load the PDF from its URL with DoclingLoader,
# export it as markdown, pickle the result into OUTPUT_DIR, and only then
# record it in all_processed_docs. A failure in one service is reported and
# recorded in failed_services, and the loop moves on to the next service.

all_processed_docs = {}  # service name -> metadata plus the loaded documents
failed_services = []  # names of services that could not be loaded or saved

total_services = len(service_docs)

for i, service in enumerate(service_docs, start=1):
    domain = service['Domain']
    service_name = service['Service']
    pdf_url = service['PDF_URL']

    print(f"\n--- Processing {i}/{total_services}: {domain} - {service_name} ---")
    print(f"URL: {pdf_url}")

    try:
        # perf_counter is monotonic, so the measured duration is not affected
        # by system clock adjustments during a long-running conversion.
        start_time = time.perf_counter()

        # Initialize DoclingLoader with MARKDOWN export type
        print("Initializing DoclingLoader...")
        loader = DoclingLoader(
            file_path=pdf_url,
            export_type=ExportType.MARKDOWN,
        )

        # Load and convert PDF to markdown
        print("Loading document (this may take a while)...")
        docs_as_markdown = loader.load()

        if not docs_as_markdown:
            print(f"‚ö†Ô∏è No content extracted from {service_name}. Skipping.")
            failed_services.append(service_name)
            continue

        # Save the individual service file first. If writing fails, the
        # service must end up only in failed_services, never in both lists.
        service_output_file = os.path.join(OUTPUT_DIR, f"{service_name}_docs.pkl")
        with open(service_output_file, 'wb') as f:
            pickle.dump(docs_as_markdown, f)
        print(f"üíæ Saved {service_name} to {service_output_file}")

        # Register the service as processed only after it is safely on disk
        all_processed_docs[service_name] = {
            'domain': domain,
            'service': service_name,
            'url': pdf_url,
            'docs': docs_as_markdown,
            'num_docs': len(docs_as_markdown)
        }

        processing_time = time.perf_counter() - start_time
        print(f"‚úÖ Successfully processed {service_name} in {processing_time:.2f} seconds ({processing_time/60:.2f} minutes)")
        print(f"   Extracted {len(docs_as_markdown)} document(s)")

    except Exception as e:
        # Deliberate best-effort: one bad PDF must not abort the whole batch.
        print(f"‚ùå Error processing {service_name}: {e}")
        failed_services.append(service_name)

print("\n--- Processing Complete ---")
print(f"‚úÖ Successfully processed: {len(all_processed_docs)}/{total_services} services")
if failed_services:
    print(f"‚ùå Failed services: {failed_services}")


In [None]:
# 4. Save all processed documents to a single file
# all_processed_docs maps each service name to its metadata and documents.
all_docs_output_file = os.path.join(OUTPUT_DIR, "all_aws_docs.pkl")
with open(all_docs_output_file, 'wb') as f:
    pickle.dump(all_processed_docs, f)

print(f"üíæ Saved all processed documents to {all_docs_output_file}")
print(f"   Total services processed: {len(all_processed_docs)}")

# Also save a summary/metadata file
# The summary carries only the JSON-friendly metadata, not the documents.
# NOTE: 'total_services' is the number of successfully processed services,
# not the number attempted; failures are listed under 'failed_services'.
summary = {
    'total_services': len(all_processed_docs),
    'failed_services': failed_services,
    'services': {
        name: {key: info[key] for key in ('domain', 'service', 'url', 'num_docs')}
        for name, info in all_processed_docs.items()
    },
}

summary_file = os.path.join(OUTPUT_DIR, "processing_summary.json")
# Explicit encoding keeps the file identical regardless of the platform default.
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"üíæ Saved processing summary to {summary_file}")


## How to Load Saved Documents

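To load the saved documents in another notebook, use the snippet below. The pickle files contain LangChain `Document` objects, so the loading environment needs the same LangChain packages installed, and you should only unpickle files you created yourself or otherwise trust: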

```python
import pickle
import os

# Load a specific service
service_name = "ec2"
with open(f"./aws_docs_processed/{service_name}_docs.pkl", 'rb') as f:
    docs_as_markdown = pickle.load(f)

# Or load all documents
with open("./aws_docs_processed/all_aws_docs.pkl", 'rb') as f:
    all_aws_docs = pickle.load(f)

# Access a specific service
ec2_docs = all_aws_docs['ec2']['docs']
```


In [None]:
# Example: Load saved documents (uncomment to use)
# import pickle
# 
# # Load a specific service
# service_name = "ec2"
# with open(f"./aws_docs_processed/{service_name}_docs.pkl", 'rb') as f:
#     docs_as_markdown = pickle.load(f)
# 
# print(f"Loaded {len(docs_as_markdown)} documents for {service_name}")
# print(f"First document preview: {docs_as_markdown[0].page_content[:200]}...")


In [None]:
# 5. Verify saved data - Load and check one example
# Shows a preview of the first processed service, then reloads its pickle file
# and checks that it contains as many documents as the in-memory result.
if all_processed_docs:
    # Use the first successfully processed service as the example
    first_service = next(iter(all_processed_docs))
    example_docs = all_processed_docs[first_service]['docs']

    print(f"Example: {first_service}")
    print(f"  Number of documents: {len(example_docs)}")
    if example_docs:
        first_doc = example_docs[0]
        print(f"  First document type: {type(first_doc)}")
        metadata_keys = list(first_doc.metadata.keys()) if hasattr(first_doc, 'metadata') else 'N/A'
        print(f"  First document metadata keys: {metadata_keys}")
        print("  First document content preview (first 200 chars):")
        print(f"  {first_doc.page_content[:200]}...")

    # Verify we can load from pickle file
    print("\n‚úÖ Verification: Loading from saved file...")
    with open(os.path.join(OUTPUT_DIR, f"{first_service}_docs.pkl"), 'rb') as f:
        loaded_docs = pickle.load(f)

    # Being loadable is not enough: the file must hold what was produced in memory.
    assert len(loaded_docs) == len(example_docs), (
        f"{first_service}: pickle file has {len(loaded_docs)} documents, "
        f"expected {len(example_docs)}"
    )
    print(f"   Successfully loaded {len(loaded_docs)} documents from pickle file")
else:
    # Make the empty case visible instead of silently doing nothing.
    print("No processed documents available to verify.")
