# AWS Docs Processor

This notebook processes the AWS documentation PDFs listed in the inline `CSV_DATA` table by:
1. Creating a `DoclingLoader` for each PDF URL with the `MARKDOWN` export type
2. Loading and converting the PDF to markdown (`docs_as_markdown = loader.load()`)
3. Saving the converted documents to pickle files, plus a JSON summary of the run

**Output:**
- Individual service files: `{service_name}_docs.pkl` 
- Combined file: `all_aws_docs.pkl`
- Summary file: `processing_summary.json`

All files are saved in the `./aws_docs_processed/` directory.


In [4]:
import csv
import io
import json
import os
import pickle
import time

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType


In [5]:
# 1. Define your data - AWS Services CSV
# One row per documentation PDF. The header names the three columns the
# processing loop reads: Domain, Service and PDF_URL.
# Service is used both as the dictionary key for the in-memory results and as
# the stem of the per-service pickle file name, so keep it unique per row.
CSV_DATA = """Domain,Service,PDF_URL
Compute,ec2,https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf
"""


In [6]:
# 2. Output location: every file this notebook saves is written under this folder
OUTPUT_DIR = "./aws_docs_processed"
# exist_ok=True keeps the cell re-runnable when the folder is already present
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Output directory:", OUTPUT_DIR)


Output directory: ./aws_docs_processed


In [7]:
def parse_csv_data(csv_data):
    """Turn CSV text held in memory into one dictionary per data row.

    Args:
        csv_data: Complete CSV content as a single string. Its first line
            supplies the column names.

    Returns:
        A list with one dict per data row, keyed by the header names, in the
        order the rows appear in the text. Empty when there are no data rows.
    """
    rows = csv.DictReader(io.StringIO(csv_data))
    return list(rows)

# Parse the CSV data into one dict per service row (keys: Domain, Service, PDF_URL)
service_docs = parse_csv_data(CSV_DATA)
# The status marker is a real check-mark emoji; the previous literal was mis-encoded.
print(f"✅ Found {len(service_docs)} AWS services to process")


✅ Found 1 AWS services to process


In [None]:
# 3. Process each AWS service document
# For every row in `service_docs`: load the PDF from its URL with DoclingLoader,
# export it as markdown, pickle the result into OUTPUT_DIR, and keep it in memory.

# service name -> metadata plus documents; holds only services that were loaded AND saved
all_processed_docs = {}
# names of services that yielded no content or raised an error
failed_services = []

total_services = len(service_docs)

for i, service in enumerate(service_docs, start=1):
    domain = service['Domain']
    service_name = service['Service']
    pdf_url = service['PDF_URL']

    print(f"\n--- Processing {i}/{total_services}: {domain} - {service_name} ---")
    print(f"URL: {pdf_url}")

    try:
        # perf_counter() is monotonic, so the reported duration cannot be skewed
        # by wall-clock adjustments during a long conversion.
        start_time = time.perf_counter()

        # Initialize DoclingLoader with MARKDOWN export type
        print("Initializing DoclingLoader...")
        loader = DoclingLoader(
            file_path=pdf_url,
            export_type=ExportType.MARKDOWN,
        )

        # Load and convert PDF to markdown
        print("Loading document (this may take a while)...")
        docs_as_markdown = loader.load()

        if not docs_as_markdown:
            print(f"⚠️ No content extracted from {service_name}. Skipping.")
            failed_services.append(service_name)
            continue

        # Write the per-service pickle first ...
        service_output_file = os.path.join(OUTPUT_DIR, f"{service_name}_docs.pkl")
        with open(service_output_file, 'wb') as f:
            pickle.dump(docs_as_markdown, f)
        print(f"💾 Saved {service_name} to {service_output_file}")

        # ... and register the service only once that write succeeded, so a failed
        # save can never leave it in both all_processed_docs and failed_services.
        all_processed_docs[service_name] = {
            'domain': domain,
            'service': service_name,
            'url': pdf_url,
            'docs': docs_as_markdown,
            'num_docs': len(docs_as_markdown)
        }

        processing_time = time.perf_counter() - start_time
        print(f"✅ Successfully processed {service_name} in {processing_time:.2f} seconds ({processing_time/60:.2f} minutes)")
        print(f"   Extracted {len(docs_as_markdown)} document(s)")

    except Exception as e:
        # Deliberate best-effort: one failing service is reported and recorded,
        # then the loop moves on to the next row.
        print(f"❌ Error processing {service_name}: {e}")
        failed_services.append(service_name)

print("\n--- Processing Complete ---")
print(f"✅ Successfully processed: {len(all_processed_docs)}/{total_services} services")
if failed_services:
    print(f"❌ Failed services: {failed_services}")



--- Processing 1/1: Compute - ec2 ---
URL: https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf
Initializing DoclingLoader...
Loading document (this may take a while)...


2025-11-13 22:23:51,795 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-13 22:23:54,599 - INFO - Going to convert document batch...
2025-11-13 22:23:54,599 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-13 22:23:54,753 - INFO - Loading plugin 'docling_defaults'
2025-11-13 22:23:54,765 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-13 22:23:54,787 - INFO - Loading plugin 'docling_defaults'
2025-11-13 22:23:54,813 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-13 22:23:54,979 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-11-13 22:23:54,982 - INFO - easyocr cannot be used because it is not installed.
2025-11-13 22:23:55,882 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-13 22:23:55,924 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-13 22:23:56,061 [RapidOCR] download_f

In [None]:
# 4. Save all processed documents to a single file
# The combined pickle holds the whole `all_processed_docs` dictionary
# (service name -> metadata plus documents).
all_docs_output_file = os.path.join(OUTPUT_DIR, "all_aws_docs.pkl")
with open(all_docs_output_file, 'wb') as f:
    pickle.dump(all_processed_docs, f)

print(f"💾 Saved all processed documents to {all_docs_output_file}")
print(f"   Total services processed: {len(all_processed_docs)}")

# Also save a summary/metadata file.
# It repeats each service's metadata without the documents themselves, so it
# stays small and human-readable. Note that 'total_services' is the number of
# successfully processed services, not the number of rows in the input CSV.
summary = {
    'total_services': len(all_processed_docs),
    'failed_services': failed_services,
    'services': {name: {
        'domain': info['domain'],
        'service': info['service'],
        'url': info['url'],
        'num_docs': info['num_docs']
    } for name, info in all_processed_docs.items()}
}

# `json` is imported in the notebook's import cell rather than mid-cell.
summary_file = os.path.join(OUTPUT_DIR, "processing_summary.json")
# Explicit encoding so the file does not depend on the platform default.
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"💾 Saved processing summary to {summary_file}")


## How to Load Saved Documents

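To load the saved documents in another notebook, use the snippet below. Only unpickle files that this notebook produced: `pickle.load` can execute arbitrary code, and the packages that define the pickled document objects must be installed in the loading environment.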

```python
import pickle
import os

# Load a specific service
service_name = "ec2"
with open(f"./aws_docs_processed/{service_name}_docs.pkl", 'rb') as f:
    docs_as_markdown = pickle.load(f)

# Or load all documents
with open("./aws_docs_processed/all_aws_docs.pkl", 'rb') as f:
    all_aws_docs = pickle.load(f)

# Access a specific service
ec2_docs = all_aws_docs['ec2']['docs']
```


In [None]:
# Example: Load saved documents (uncomment to use)
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# .pkl files that this notebook wrote itself.
# import pickle
# 
# # Load a specific service
# service_name = "ec2"
# with open(f"./aws_docs_processed/{service_name}_docs.pkl", 'rb') as f:
#     docs_as_markdown = pickle.load(f)
# 
# print(f"Loaded {len(docs_as_markdown)} documents for {service_name}")
# print(f"First document preview: {docs_as_markdown[0].page_content[:200]}...")


In [None]:
# 5. Verify saved data - Load and check one example
# Inspects the first successfully processed service in memory, then reloads its
# pickle file from OUTPUT_DIR and checks that the document count round-trips.
if all_processed_docs:
    # Dicts keep insertion order, so this is the first service that was processed
    first_service = next(iter(all_processed_docs))
    example_docs = all_processed_docs[first_service]['docs']

    print(f"Example: {first_service}")
    print(f"  Number of documents: {len(example_docs)}")
    if example_docs:
        first_doc = example_docs[0]
        print(f"  First document type: {type(first_doc)}")
        print(f"  First document metadata keys: {list(first_doc.metadata.keys()) if hasattr(first_doc, 'metadata') else 'N/A'}")
        print("  First document content preview (first 200 chars):")
        print(f"  {first_doc.page_content[:200]}...")

    # Verify we can load from pickle file.
    # The file was written earlier by this notebook, so unpickling it is trusted.
    print("\n✅ Verification: Loading from saved file...")
    with open(os.path.join(OUTPUT_DIR, f"{first_service}_docs.pkl"), 'rb') as f:
        loaded_docs = pickle.load(f)
    print(f"   Successfully loaded {len(loaded_docs)} documents from pickle file")

    # Make the check real: the reloaded file must match what is held in memory.
    assert len(loaded_docs) == len(example_docs), (
        f"Pickle for {first_service} holds {len(loaded_docs)} documents, "
        f"expected {len(example_docs)}"
    )
else:
    # Say so explicitly instead of finishing silently when nothing was processed.
    print("No processed documents available to verify.")
