In [3]:
pip install PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Collecting typing_extensions>=3.10.0.0
  Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Installing collected packages: typing-extensions, PyPDF2
Successfully installed PyPDF2-3.0.1 typing-extensions-4.7.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from PyPDF2 import PdfReader, PdfWriter

def is_blank_page(page, black_threshold=0.4):
    """Decide whether a PDF page should be treated as blank.

    A page with any extractable text is never blank. A page without text is
    blank when it references no image XObjects, or when the fraction of
    "dark" samples (byte value below 128) across its image streams is at
    most ``black_threshold``.

    Args:
        page: Page-like mapping that offers ``extract_text()`` and
            ``'/Resources'`` lookups (e.g. an entry of ``PdfReader.pages``).
        black_threshold: Largest dark fraction (0.0-1.0) still counted as
            blank.

    Returns:
        bool: True when the page is considered blank, False otherwise.
    """
    # Treat a missing (None) text result like an empty string.
    content = (page.extract_text() or "").strip()
    if content:
        return False

    # A page without a resource dictionary cannot reference any image.
    if '/Resources' not in page:
        return True
    resources = page['/Resources']
    if '/XObject' not in resources:
        return True

    xobjects = resources['/XObject']
    image_objects = []
    for name in xobjects:
        xobject = xobjects[name]
        # Entries without a /Subtype are skipped instead of raising KeyError.
        if '/Subtype' in xobject and xobject['/Subtype'] == '/Image':
            image_objects.append(xobject)
    if not image_objects:
        return True

    # Normalise by the number of image samples actually inspected. Dividing
    # by the page area in PDF points (the MediaBox) would mix units and could
    # yield ratios far above 1 for any reasonably sized scan.
    # NOTE(review): every byte returned by get_data() is treated as one 8-bit
    # sample; streams that are not decoded to raw pixels (e.g. JPEG data) are
    # compared byte-wise too - confirm this suits the input scans.
    dark_samples = 0
    total_samples = 0
    for image_object in image_objects:
        image_data = image_object.get_data()
        total_samples += len(image_data)
        dark_samples += sum(1 for sample in image_data if sample < 128)

    # Images without any data carry no visible content.
    if total_samples == 0:
        return True

    return dark_samples / total_samples <= black_threshold


def split_pdf_by_blank_pages(input_folder, blank_output_folder, non_blank_output_folder, batch_size=1000, black_threshold=0.4):
    """Split every PDF under ``input_folder`` into one-page PDFs sorted by blankness.

    The folder tree is walked recursively. Each page is classified with
    ``is_blank_page`` and written as its own PDF named
    ``<original file name>_page_<1-based page number>.pdf`` into either
    ``blank_output_folder`` or ``non_blank_output_folder``.

    Output names are built from the file's base name only, so equally named
    PDFs in different subfolders write to the same output paths.

    Args:
        input_folder: Root folder searched for PDF files.
        blank_output_folder: Destination for pages classified as blank.
        non_blank_output_folder: Destination for all other pages.
        batch_size: Number of pages classified before that group is written.
        black_threshold: Forwarded to ``is_blank_page``.

    Returns:
        None. Prints a completion message when done.
    """
    # Make the function usable on its own: the destinations may not exist yet.
    os.makedirs(blank_output_folder, exist_ok=True)
    os.makedirs(non_blank_output_folder, exist_ok=True)

    for root, _, files in os.walk(input_folder):
        for filename in files:
            # Match the extension case-insensitively so "SCAN.PDF" is not skipped.
            if not filename.lower().endswith('.pdf'):
                continue

            input_pdf_path = os.path.join(root, filename)

            # The source file stays open until its pages have been written.
            with open(input_pdf_path, 'rb') as file:
                reader = PdfReader(file)
                total_pages = len(reader.pages)

                for start_page in range(0, total_pages, batch_size):
                    end_page = min(start_page + batch_size, total_pages)

                    # Classify the whole batch first, then write it out.
                    output_pages = []
                    for page_number in range(start_page, end_page):
                        page = reader.pages[page_number]
                        if is_blank_page(page, black_threshold):
                            output_folder = blank_output_folder
                        else:
                            output_folder = non_blank_output_folder
                        # Page numbers in output names are 1-based.
                        output_pages.append((page, output_folder, page_number + 1))

                    for page, output_folder, page_number in output_pages:
                        output_path = os.path.join(output_folder, f'{filename}_page_{page_number}.pdf')

                        writer = PdfWriter()
                        writer.add_page(page)

                        with open(output_path, 'wb') as output_file:
                            writer.write(output_file)

    print("PDF processing completed.")

# Example usage
# Root of the tree that is searched for PDFs; replace with your own input location.
input_folder = '/project/arcc-students/csloan5/OilWellCards/vertical_cards'
# Destination for pages classified as blank; replace as needed.
blank_output_folder = 'BlankPages/VertBlank'
# Destination for pages classified as non-blank; replace as needed.
non_blank_output_folder = 'BlankPages/VertNonBlank'

# Both destinations must exist before any page is written.
os.makedirs(blank_output_folder, exist_ok=True)
os.makedirs(non_blank_output_folder, exist_ok=True)

split_pdf_by_blank_pages(
    input_folder=input_folder,
    blank_output_folder=blank_output_folder,
    non_blank_output_folder=non_blank_output_folder,
    batch_size=1000,
    black_threshold=0.4,
)

PDF processing completed.


In [2]:
import os

# Folder whose files are counted below (a path relative to the working directory)
folder_path = 'CloudVisionText'

# Function to count files in a folder
def count_files(folder_path):
    """Return the number of files found under ``folder_path``, subfolders included."""
    # os.walk yields one (dirpath, dirnames, filenames) triple per directory.
    return sum(len(names) for _, _, names in os.walk(folder_path))

# Total the files under folder_path, including those in nested subfolders
num_files = count_files(folder_path)

# Report the total together with the folder it was counted for
print(f"The folder '{folder_path}' contains {num_files} files.")

The folder 'CloudVisionText' contains 13661 files.


In [3]:
import os
from PyPDF2 import PdfReader, PdfWriter
from multiprocessing import Pool

def is_blank_page(page, black_threshold=0.4):
    """Decide whether a PDF page should be treated as blank.

    A page with any extractable text is never blank. A page without text is
    blank when it references no image XObjects, or when the fraction of
    "dark" samples (byte value below 128) across its image streams is at
    most ``black_threshold``.

    Args:
        page: Page-like mapping that offers ``extract_text()`` and
            ``'/Resources'`` lookups (e.g. an entry of ``PdfReader.pages``).
        black_threshold: Largest dark fraction (0.0-1.0) still counted as
            blank.

    Returns:
        bool: True when the page is considered blank, False otherwise.
    """
    # Treat a missing (None) text result like an empty string.
    content = (page.extract_text() or "").strip()
    if content:
        return False

    # A page without a resource dictionary cannot reference any image.
    if '/Resources' not in page:
        return True
    resources = page['/Resources']
    if '/XObject' not in resources:
        return True

    xobjects = resources['/XObject']
    image_objects = []
    for name in xobjects:
        xobject = xobjects[name]
        # Entries without a /Subtype are skipped instead of raising KeyError.
        if '/Subtype' in xobject and xobject['/Subtype'] == '/Image':
            image_objects.append(xobject)
    if not image_objects:
        return True

    # Normalise by the number of image samples actually inspected. Dividing
    # by the page area in PDF points (the MediaBox) would mix units and could
    # yield ratios far above 1 for any reasonably sized scan.
    # NOTE(review): every byte returned by get_data() is treated as one 8-bit
    # sample; streams that are not decoded to raw pixels (e.g. JPEG data) are
    # compared byte-wise too - confirm this suits the input scans.
    dark_samples = 0
    total_samples = 0
    for image_object in image_objects:
        image_data = image_object.get_data()
        total_samples += len(image_data)
        dark_samples += sum(1 for sample in image_data if sample < 128)

    # Images without any data carry no visible content.
    if total_samples == 0:
        return True

    return dark_samples / total_samples <= black_threshold



def process_pdf_file(file_path, output_folder, black_threshold=0.4):
    """Copy the non-blank pages of one PDF into ``output_folder``.

    Every page is classified with ``is_blank_page``. When at least one page
    is non-blank, those pages are written, in their original order, to a PDF
    in ``output_folder`` that keeps the input's base name. When every page is
    blank, no output file is written.

    Args:
        file_path: Path of the PDF to read.
        output_folder: Folder that receives the filtered PDF.
        black_threshold: Forwarded to ``is_blank_page``.
    """
    # The source stays open until the filtered copy has been written.
    with open(file_path, 'rb') as source:
        pdf = PdfReader(source)
        page_count = len(pdf.pages)

        candidates = (pdf.pages[index] for index in range(page_count))
        kept_pages = [
            candidate
            for candidate in candidates
            if not is_blank_page(candidate, black_threshold)
        ]

        # Nothing worth keeping: leave the output folder untouched.
        if not kept_pages:
            return

        writer = PdfWriter()
        for kept in kept_pages:
            writer.add_page(kept)

        destination = os.path.join(output_folder, os.path.basename(file_path))
        with open(destination, 'wb') as target:
            writer.write(target)

def sort_pdf_files(input_folder, output_folder, batch_size=1000, black_threshold=0.4):
    """Filter blank pages out of every PDF under ``input_folder`` in parallel.

    The folder tree is walked recursively and each PDF found is handed to
    ``process_pdf_file`` through a ``multiprocessing.Pool``.

    Args:
        input_folder: Root folder searched for PDF files.
        output_folder: Folder that receives the filtered PDFs; created when
            missing.
        batch_size: Number of files submitted to the pool per ``starmap``
            call.
        black_threshold: Forwarded to ``process_pdf_file``.

    Returns:
        None. Prints a completion message when done.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Gather input PDF files; the extension is matched case-insensitively so
    # files named like "SCAN.PDF" are picked up as well.
    pdf_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(input_folder)
        for name in names
        if name.lower().endswith('.pdf')
    ]

    # Only start worker processes when there is something to hand out.
    if pdf_files:
        with Pool() as pool:
            for i in range(0, len(pdf_files), batch_size):
                batch_files = pdf_files[i:i + batch_size]
                pool.starmap(
                    process_pdf_file,
                    [(path, output_folder, black_threshold) for path in batch_files],
                )

    print("PDF sorting completed.")

# Example usage
# Root of the tree that is searched for PDFs; replace with your own input location.
input_folder = '/project/arcc-students/enhanced_oil_recovery_cards/BOX 12 (353-373)/'
# Folder that receives the filtered PDFs; replace as needed.
output_folder = 'Box12Text'

sort_pdf_files(
    input_folder=input_folder,
    output_folder=output_folder,
    batch_size=1000,
    black_threshold=0.4,
)


PDF sorting completed.
