# Preprocessing

This notebook preprocesses PDFs. The goals are:
- converting each PDF to image files
- getting one image per page
- hashing the input
- making a table

In [0]:
%pip install pymupdf
%restart_python

## Ingest Files
Download process and instrumentation diagrams. We use the MD5 hash of the source URL as the file name, so each source document maps to a unique, stable name.

In [0]:
import requests
import hashlib

# Source document to ingest.
url = "https://open.alberta.ca/dataset/46ddba1a-7b86-4d7c-b8b6-8fe33a60fada/resource/a82b9bc3-37a9-4447-8d2f-f5b55a5c3353/download/facilitydrawings.pdf"
# The MD5 of the URL is used purely as a stable, filesystem-safe identifier (not for security).
hashed_url = hashlib.md5(url.encode()).hexdigest()
raw_vol_path = "/Volumes/shm/pid/raw_pdfs/"
image_vol_path = "/Volumes/shm/pid/pdf_images/"

raw_pdf_file_path = raw_vol_path + hashed_url + '.pdf'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being written to disk as a ".pdf".
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(raw_pdf_file_path, 'wb') as file:
    file.write(response.content)

We currently use PyMuPDF to split the PDF into pages. Its license (AGPL) is restrictive, but many alternative libraries exist; PyMuPDF is simply the easiest option for now.

In [0]:
import fitz  # PyMuPDF
from pathlib import Path

# One output folder per source document, named after the URL hash.
doc_dir = Path(image_vol_path) / hashed_url
# parents=True so a missing intermediate folder does not abort the run.
doc_dir.mkdir(parents=True, exist_ok=True)

# The context manager closes the PDF handle even if rendering fails part-way through.
with fitz.open(raw_pdf_file_path) as doc:
    for page_num, page in enumerate(doc):
        # Render the page to a raster image at 200 DPI.
        pix = page.get_pixmap(dpi=200)
        # File names are 1-based: page_1.jpeg, page_2.jpeg, ...
        pix.save(doc_dir / f"page_{page_num+1}.jpeg")

In [0]:
from pathlib import Path
from PIL import Image
import io

def convert_jpeg_to_webp(directory, max_size_kb=500, min_quality=60, min_width=300):
    """
    Convert JPEG images in a directory to WEBP, trying to keep each file under a size limit.

    Each image is first encoded at quality 85. While the result is over the limit,
    quality is lowered in steps of 5 down to `min_quality`; after that the image is
    shrunk by 10% per step (keeping the aspect ratio) down to `min_width`. Once both
    floors are reached the image is saved even if it is still over the limit.
    The WEBP file is written next to the source file with a `.webp` suffix and a
    summary line is printed for every saved image.

    Parameters:
    directory (str or Path): The directory containing JPEG images to convert
        (files matching `*.jp*g`; sub-directories are not searched).
    max_size_kb (int, optional): The maximum file size for the WEBP images in kilobytes. Default is 500 KB.
    min_quality (int, optional): The minimum quality for the WEBP images. Default is 60.
    min_width (int, optional): The minimum width for the WEBP images. Default is 300 pixels.

    Returns:
    None
    """
    max_size = max_size_kb * 1024  # size budget in bytes
    # Honour the caller's min_width (it used to be overwritten with 300), but never
    # let the floor drop below one pixel, otherwise the shrink loop could not terminate.
    min_width = max(1, min_width)

    for jpg_path in Path(directory).glob('*.jp*g'):
        webp_path = jpg_path.with_suffix('.webp')

        # convert() returns a fully loaded copy, so the source file can be closed right away.
        with Image.open(jpg_path) as source:
            img = source.convert('RGB')
        quality = 85
        width, height = img.size

        while True:
            # Encode into memory first so the size can be checked before touching disk.
            buffer = io.BytesIO()
            img.save(buffer, 'webp', quality=quality)
            size = buffer.tell()

            if size <= max_size or (quality <= min_quality and width <= min_width):
                # Write the bytes that were just measured instead of encoding a second time.
                webp_path.write_bytes(buffer.getvalue())
                print(f"Saved {webp_path} at {size // 1024} KB, quality={quality}, width={width}")
                break

            if quality > min_quality:
                quality -= 5
            else:
                # Reduce width by 10% (not below min_width) and maintain aspect ratio.
                new_width = max(min_width, int(width * 0.9))
                # Keep at least one pixel of height so resize() cannot fail on very thin images.
                height = max(1, int(height * (new_width / width)))
                width = new_width
                img = img.resize((width, height), Image.LANCZOS)

In [0]:
# Write a size-limited WEBP copy next to every page JPEG in doc_dir, using the function's default limits.
convert_jpeg_to_webp(doc_dir)

In [0]:
from pathlib import Path
from PIL import Image

def tile_image_with_overlap(image_path, output_dir, overlap_percent=10, cols=4, rows=2):
    """
    Cut an image into a grid of overlapping tiles and save each tile as a PNG.

    The base tile size is the image size divided by the grid dimensions. Each tile
    starts `overlap_percent` of a base tile before the previous one ends, and the
    last column/row is stretched to reach the image edge, so those tiles are larger
    than the others. Tiles are numbered row by row starting at 1 and written to
    `output_dir` as `tile_<n>.png`; one line is printed per saved tile.

    Parameters:
    image_path (str or Path): The image to tile.
    output_dir (str or Path): Directory for the tiles; created if it does not exist.
    overlap_percent (int or float, optional): Overlap between neighbouring tiles as a
        percentage of the base tile size. Must be in [0, 100). Default is 10.
    cols (int, optional): Number of tile columns. Default is 4.
    rows (int, optional): Number of tile rows. Default is 2.

    Raises:
    ValueError: If `cols` or `rows` is less than 1, or `overlap_percent` is outside [0, 100).
    """
    if cols < 1 or rows < 1:
        raise ValueError(f"cols and rows must be at least 1, got cols={cols}, rows={rows}")
    # At 100% or more the step between tiles would be zero or negative.
    if not 0 <= overlap_percent < 100:
        raise ValueError(f"overlap_percent must be in [0, 100), got {overlap_percent}")

    # The context manager releases the image file handle once all tiles are written.
    with Image.open(image_path) as img:
        width, height = img.size

        # Compute base tile size (without overlap)
        base_tile_width = width // cols
        base_tile_height = height // rows

        # Compute overlap in pixels
        overlap_x = int(base_tile_width * overlap_percent / 100)
        overlap_y = int(base_tile_height * overlap_percent / 100)

        # Distance between the starting edges of neighbouring tiles.
        step_x = base_tile_width - overlap_x
        step_y = base_tile_height - overlap_y

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        tile_num = 1
        for row in range(rows):
            upper = row * step_y
            # For the last row, ensure we reach the bottom edge of the image.
            lower = height if row == rows - 1 else upper + base_tile_height

            for col in range(cols):
                left = col * step_x
                # For the last column, ensure we reach the right edge of the image.
                right = width if col == cols - 1 else left + base_tile_width

                # Clamp to image boundaries
                box_left = max(0, left)
                box_upper = max(0, upper)
                box_right = min(width, right)
                box_lower = min(height, lower)

                tile = img.crop((box_left, box_upper, box_right, box_lower))
                tile_path = output_dir / f"tile_{tile_num}.png"
                tile.save(tile_path)
                print(f"Saved {tile_path} (left={box_left}, upper={box_upper}, right={box_right}, lower={box_lower})")
                tile_num += 1

    print(f"Tiling complete with {overlap_percent}% overlap.")


In [0]:
# Tile page 13 of one rendered document into overlapping PNG crops, written to a sibling "_tiled" folder.
# NOTE(review): the document hash is hard-coded in both paths; presumably it equals `hashed_url`
# from the ingest cell. Confirm, and consider deriving these paths from `doc_dir` instead.
path = '/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee/page_13.jpeg'
output_dir = '/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee_tiled/'
tile_image_with_overlap(path,output_dir)