In [3]:
import os
from PyPDF2 import PdfReader, PdfWriter
from multiprocessing import Pool

def is_blank_page(page, black_threshold=0.5):
    """Heuristically decide whether a PDF page is blank.

    The page is reported as non-blank as soon as ``extract_text()`` yields
    any non-whitespace character. Otherwise its image XObjects are
    inspected: every byte of each image's ``get_data()`` stream whose value
    is below 128 is counted as "black", and that count is divided by the
    media-box area (width * height).

    NOTE(review): the numerator counts stream bytes while the denominator is
    the media-box area, so the ratio is only a rough heuristic -- confirm
    the threshold against real scans.

    Args:
        page: Page object providing ``extract_text()``, ``mediabox`` and
            dict-style access to ``/Resources``.
        black_threshold: Largest black ratio for which the page still
            counts as blank.

    Returns:
        True if the page is considered blank, False otherwise.
    """
    # Any extractable text means the page is not blank.
    text = page.extract_text() or ""
    if text.strip():
        return False

    # A page without resources or without XObjects carries no image, so it
    # is blank; check membership first instead of raising KeyError.
    if '/Resources' not in page:
        return True
    resources = page['/Resources']
    if '/XObject' not in resources:
        return True

    # Keep only image XObjects; entries lacking a /Subtype are skipped
    # rather than aborting the whole page.
    xobjects = resources['/XObject']
    image_objects = []
    for name in xobjects:
        xobject = xobjects[name]
        if '/Subtype' in xobject and xobject['/Subtype'] == '/Image':
            image_objects.append(xobject)
    if not image_objects:
        return True

    # Page area from the media-box corner coordinates (only needed once we
    # know there are images to measure).
    x0, y0 = page.mediabox.lower_left
    x1, y1 = page.mediabox.upper_right
    total_pixels = (x1 - x0) * (y1 - y0)
    if total_pixels <= 0:
        # A zero-area page cannot display anything; treating it as blank
        # also avoids a ZeroDivisionError below.
        return True

    black_pixels = 0
    for image_object in image_objects:
        image_data = image_object.get_data()
        black_pixels += sum(1 for value in image_data if value < 128)

    black_percentage = black_pixels / total_pixels

    return black_percentage <= black_threshold



def process_pdf_file(file_path, output_folder, black_threshold=0.5):
    """Write every non-blank page of one PDF as its own single-page PDF.

    Output files are named ``<input stem>_page_<n>.pdf`` inside
    ``output_folder``, where ``n`` is the 0-based position of the page among
    the non-blank pages (not its page number in the source document).

    Args:
        file_path: Path of the PDF to split.
        output_folder: Existing directory that receives the page files.
        black_threshold: Forwarded to ``is_blank_page``.

    Returns:
        None. The only effect is the files written to ``output_folder``.
    """
    # splitext removes exactly the extension, whatever its length, instead
    # of blindly dropping the last four characters of the name.
    filename = os.path.splitext(os.path.basename(file_path))[0]

    # The source file stays open while writing: the page objects are backed
    # by the reader's stream.
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)

        # Classify every page before writing anything.
        non_blank_pages = [
            page for page in reader.pages
            if not is_blank_page(page, black_threshold)
        ]

        for index, page in enumerate(non_blank_pages):
            # Write non-blank pages as separate files to output folder
            output_file_path = os.path.join(output_folder, filename + '_page_{}.pdf'.format(index))
            writer = PdfWriter()
            writer.add_page(page)
            with open(output_file_path, 'wb') as output_file:
                writer.write(output_file)

def sort_pdf_files(input_folder, output_folder, batch_size=100, black_threshold=0.5, max_files=20):
    """Split the PDFs found under ``input_folder`` into non-blank pages.

    The input tree is walked recursively, the PDF paths are collected and
    handed to ``process_pdf_file`` through a multiprocessing pool, one batch
    of ``batch_size`` files at a time.

    Args:
        input_folder: Root directory searched recursively for PDF files.
        output_folder: Directory for the page files; created if missing.
        batch_size: Number of files submitted to the pool per ``starmap``
            call.
        black_threshold: Forwarded to ``process_pdf_file``.
        max_files: Upper bound on the number of PDFs processed. Defaults to
            20, the limit that used to be hard-coded; pass ``None`` to
            process every file found.

    Returns:
        None.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Gather input PDF files; the extension match is case-insensitive so
    # files named ``*.PDF`` are not silently skipped.
    pdf_files = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(input_folder)
        for name in files
        if name.lower().endswith('.pdf')
    ]

    # Directory listing order depends on the filesystem; sorting makes the
    # subset selected by max_files reproducible between runs.
    pdf_files.sort()
    if max_files is not None:
        pdf_files = pdf_files[:max_files]

    # Process PDF files in parallel using multiprocessing Pool
    with Pool() as pool:
        print("Pool stuff happening")
        for start in range(0, len(pdf_files), batch_size):
            batch_files = pdf_files[start:start + batch_size]
            pool.starmap(
                process_pdf_file,
                [(path, output_folder, black_threshold) for path in batch_files],
            )

    print("PDF sorting completed.")

# Example usage
input_folder = "/project/arcc-students/csloan5/OilWellCards_project/test_set/"  # Replace with the path to the input folder containing subfolders with PDFs
output_folder = "/project/arcc-students/csloan5/OilWellCards_project/test_output/"  # Replace with the path to the output folder
# test_file = "/project/arcc-students/csloan5/OilWellCards_project/test_set/cluster1/144-0037.pdf"

# Guard the entry point: multiprocessing start methods that re-import the
# main module would otherwise re-run this call inside every worker process.
if __name__ == "__main__":
    sort_pdf_files(input_folder, output_folder, batch_size=100, black_threshold=0.5)
# not_blank, blank = process_pdf_file(test_file, "hi", 0.5)


Pool stuff happening
PDF sorting completed.
