# ***Install required external modules***

In [1]:
# Install PyMuPDF: Used to read PDF files and convert each page into images
# Install Boto3: AWS SDK for Python, used to interact with AWS services like S3
!pip install pymupdf boto3

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting boto3
  Downloading boto3-1.42.31-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.43.0,>=1.42.31 (from boto3)
  Downloading botocore-1.42.31-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.42.31-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.42.31-py3-none-any.whl (14.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pymupdf  # PyMuPDF: Used to open and process PDF files (read pages, render them as images)
from PIL import Image  # Pillow: Handles image creation, manipulation, and format conversion
from typing import List  # Provides type hints (e.g., List[str]) for better code readability and maintenance
import boto3  # AWS SDK for Python: Used to interact with AWS services like S3
from io import BytesIO  # Creates in-memory binary streams, useful for uploading images to S3 without saving to disk


In [4]:
def pdf_to_images(
    pdf_path: str,
    zoom: float = 1.0
) -> List[Image.Image]:
    """
    Convert a local PDF file (Colab filesystem) into page-wise images.

    Args:
        pdf_path: Path to the PDF file (e.g. '/content/sample.pdf').
                  Path-like objects are accepted; they are converted with str().
        zoom: Zoom factor for rendering pages
              (1.0 = default resolution, >1.0 = higher quality images).
              Must be greater than zero.

    Returns:
        List of PIL Image objects, one per PDF page, in page order.

    Raises:
        ValueError: If the path does not end in '.pdf' (case-insensitive),
                    or if zoom is not a positive number.
    """

    # Normalise to text so pathlib.Path inputs work with the suffix check below
    path_text = str(pdf_path)

    # Validate that the input file is a PDF
    if not path_text.lower().endswith(".pdf"):
        raise ValueError("Input file must be a PDF")

    # Reject zoom factors that cannot produce a usable render
    # (validated before the file is opened, so nothing needs cleaning up)
    if zoom <= 0:
        raise ValueError("zoom must be a positive number")

    # Transformation matrix controlling the render resolution:
    # the same factor is applied horizontally and vertically
    matrix = pymupdf.Matrix(zoom, zoom)

    # Open the PDF document using PyMuPDF
    doc = pymupdf.open(path_text)

    try:
        # List to store rendered page images
        images = []

        # Iterate through each page in the PDF
        for page in doc:
            # Render the page into a pixel map;
            # alpha=False disables transparency, so the samples are plain RGB
            pix = page.get_pixmap(matrix=matrix, alpha=False)

            # Convert raw pixel data into a PIL Image
            images.append(
                Image.frombytes(
                    "RGB",                    # Color mode
                    (pix.width, pix.height),  # Image dimensions
                    pix.samples               # Raw pixel data
                )
            )

        # Return the list of page-wise images
        return images
    finally:
        # Always release the document, even if rendering a page failed
        doc.close()


In [5]:
def upload_images_to_s3(
    images: List[Image.Image],
    bucket_name: str,
    s3_prefix: str,
    s3_client=None,
) -> None:
    """
    Upload PIL Images to an S3 folder as PNG files.

    Objects are named 'page_<n>.png' with n starting at 1, following the
    order of `images`. One confirmation line is printed per uploaded object.

    Args:
        images: List of PIL Image objects (one per PDF page)
        bucket_name: Target S3 bucket name
        s3_prefix: Folder path in S3 (e.g. 'pdf_pages/output/').
                   Trailing slashes are ignored; an empty prefix places the
                   objects at the bucket root.
        s3_client: Optional, already-configured S3 client to use. When omitted,
                   a client is created with boto3.client("s3").

    Returns:
        None
    """

    # Create an S3 client unless the caller supplied one.
    # No access keys are passed in code: boto3 resolves credentials itself, e.g. from
    # - IAM Role (recommended)
    # - or environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    # Never paste secrets into the notebook.
    s3 = s3_client if s3_client is not None else boto3.client("s3")

    # Normalise the prefix once; skip the separator entirely for an empty
    # prefix so keys never start with a stray '/'
    folder = s3_prefix.rstrip("/")
    key_base = f"{folder}/" if folder else ""

    # Iterate over images and upload them one by one
    for idx, img in enumerate(images, start=1):

        # Create an in-memory binary buffer
        # This avoids saving files locally on disk
        buffer = BytesIO()

        # Save the PIL image into the buffer as PNG
        img.save(buffer, format="PNG")

        # Move cursor back to the beginning of the buffer so the upload
        # reads the whole image
        buffer.seek(0)

        # Construct the S3 object key (file path in bucket)
        s3_key = f"{key_base}page_{idx}.png"

        # Upload the image buffer directly to S3
        s3.upload_fileobj(
            buffer,                  # In-memory file object
            bucket_name,             # Target S3 bucket
            s3_key,                  # Object key (path)
            ExtraArgs={
                "ContentType": "image/png"  # Correct MIME type
            }
        )

        # Log upload confirmation
        print(f"Uploaded: s3://{bucket_name}/{s3_key}")


In [7]:
# Step 1: render every page of the source PDF into an in-memory image
page_images = pdf_to_images("/content/Deloitte AWS banking.pdf")

# Step 2: push the rendered pages to the target bucket/folder as PNG objects
upload_images_to_s3(
    images=page_images,
    bucket_name="landinglayertest",
    s3_prefix="mmragtest/",
)

Uploaded: s3://landinglayertest/mmragtest/page_1.png
Uploaded: s3://landinglayertest/mmragtest/page_2.png
Uploaded: s3://landinglayertest/mmragtest/page_3.png
Uploaded: s3://landinglayertest/mmragtest/page_4.png
Uploaded: s3://landinglayertest/mmragtest/page_5.png
Uploaded: s3://landinglayertest/mmragtest/page_6.png
Uploaded: s3://landinglayertest/mmragtest/page_7.png
Uploaded: s3://landinglayertest/mmragtest/page_8.png
Uploaded: s3://landinglayertest/mmragtest/page_9.png
Uploaded: s3://landinglayertest/mmragtest/page_10.png
Uploaded: s3://landinglayertest/mmragtest/page_11.png
Uploaded: s3://landinglayertest/mmragtest/page_12.png
Uploaded: s3://landinglayertest/mmragtest/page_13.png
Uploaded: s3://landinglayertest/mmragtest/page_14.png
Uploaded: s3://landinglayertest/mmragtest/page_15.png
Uploaded: s3://landinglayertest/mmragtest/page_16.png
Uploaded: s3://landinglayertest/mmragtest/page_17.png
Uploaded: s3://landinglayertest/mmragtest/page_18.png
Uploaded: s3://landinglayertest/mmrag