In [5]:
import sys
import os
import platform
from typing import List

def find_pdfs_based_on_os(*start_directories: str) -> List[str]:
    """
    Recursively collect the paths of PDF files under one or more directories.

    Args:
        *start_directories: Directories to scan. When none are given, the
            start directories are picked from ``platform.system()``: the
            Documents folder under %USERPROFILE% on Windows, ``/Users`` on
            macOS, and ``/`` on every other system.

    Returns:
        The path of every file whose name ends in ``.pdf`` (compared
        case-insensitively), in the order ``os.walk`` visits them. Each start
        directory is walked independently, so a file reachable from two of
        them is listed twice.

    Directories that cannot be listed are reported on stderr and skipped
    instead of aborting the scan.
    """
    def handle_os_error(err: OSError) -> None:
        # os.walk hands listing failures to this callback instead of raising.
        print(f"OS error: {err}", file=sys.stderr)

    def get_start_directories() -> List[str]:
        """ Return appropriate start directories based on the operating system. """
        os_type = platform.system()
        if os_type == 'Windows':
            # NOTE(review): with USERPROFILE unset this resolves to
            # C:\Users\Documents, which may not exist; the walk then reports
            # the error and finds nothing.
            base_dir = os.environ.get('USERPROFILE', 'C:\\Users')
            return [os.path.join(base_dir, 'Documents')]
        if os_type == 'Darwin':  # macOS
            return ['/Users']
        return ['/']  # Fallback for other OS types

    def find_pdfs(directory: str) -> List[str]:
        """ Recursively find all PDF files below ``directory``. """
        return [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory, onerror=handle_os_error)
            for name in names
            if name.lower().endswith('.pdf')
        ]

    # Explicit directories win; otherwise fall back to the OS-specific defaults.
    directories = list(start_directories) or get_start_directories()

    pdfs: List[str] = []
    for directory in directories:
        print(f"Scanning {directory} for PDF files...")
        pdfs.extend(find_pdfs(directory))

    return pdfs

# Usage: scan the OS-appropriate locations, then summarise what was found.
pdf_files = find_pdfs_based_on_os()
print(f"Found {len(pdf_files)} PDFs.")
preview = pdf_files[:10]  # list only the first 10 paths to keep the output short
for pdf in preview:
    print(pdf)

Scanning /Users for PDF files...


OS error: [Errno 13] Permission denied: '/Users/yecao/.config/truffle'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Pictures/Photos Library.photoslibrary'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CallHistoryTransactions'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CloudDocs/session/db'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.sharedfilelist'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/Knowledge'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.TCC'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FileProvider'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/AddressBook'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FaceTime'

Found 773 PDFs.
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/135/Certifier Training Handouts.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/307/Position Update Process.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/61/Check I-9 Status.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/95/Finance Dashboard View Fund Balances.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/300/Cómo ingresar horas en Web Clock.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/132/SPFF Labor Distribution.pdf


In [2]:
import sys
import os
import platform
from typing import List
import hashlib

def find_pdfs_based_on_os(*start_directories: str) -> List[str]:
    """
    Recursively collect PDF files with distinct contents.

    Files are compared by the SHA-256 digest of their bytes; for each distinct
    digest only the first path encountered is kept.

    Args:
        *start_directories: Directories to scan, in order. When none are
            given, the start directories are picked from
            ``platform.system()``: the Documents folder under %USERPROFILE%
            on Windows, ``/Users`` on macOS, and ``/`` on every other system.

    Returns:
        Paths of the content-unique files whose name ends in ``.pdf``
        (compared case-insensitively), in the order ``os.walk`` visits them.
        Digests are shared across all start directories, so a duplicate found
        under a later directory is dropped.

    Directories that cannot be listed and files that cannot be read are
    reported on stderr and skipped instead of aborting the scan.
    """
    chunk_size = 65536  # hash in 64 kB chunks so large files are never fully loaded

    def handle_os_error(err: OSError) -> None:
        # os.walk hands listing failures to this callback instead of raising.
        print(f"OS error: {err}", file=sys.stderr)

    def get_start_directories() -> List[str]:
        """ Return appropriate start directories based on the operating system. """
        os_type = platform.system()
        if os_type == 'Windows':
            # NOTE(review): with USERPROFILE unset this resolves to
            # C:\Users\Documents, which may not exist; the walk then reports
            # the error and finds nothing.
            base_dir = os.environ.get('USERPROFILE', 'C:\\Users')
            return [os.path.join(base_dir, 'Documents')]
        if os_type == 'Darwin':  # macOS
            return ['/Users']
        return ['/']

    def hash_file(filepath: str) -> str:
        """ Return the SHA-256 hex digest of a file's contents; raises OSError if unreadable. """
        hasher = hashlib.sha256()
        with open(filepath, 'rb') as f:
            # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
            for chunk in iter(lambda: f.read(chunk_size), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def find_pdfs(directory: str, seen_hashes: set) -> List[str]:
        """ Recursively find PDF files below ``directory`` whose digest is not yet in ``seen_hashes``. """
        pdf_files = []
        for root, _dirs, names in os.walk(directory, onerror=handle_os_error):
            for name in names:
                if not name.lower().endswith('.pdf'):
                    continue
                full_path = os.path.join(root, name)
                try:
                    file_hash = hash_file(full_path)
                except OSError as e:
                    # Unreadable files are reported and left out of the result.
                    print(f"Error reading file {full_path}: {e}", file=sys.stderr)
                    continue
                if file_hash not in seen_hashes:
                    seen_hashes.add(file_hash)
                    pdf_files.append(full_path)
        return pdf_files

    # Explicit directories win; otherwise fall back to the OS-specific defaults.
    directories = list(start_directories) or get_start_directories()

    seen_hashes = set()  # shared across directories so duplicates are dropped globally
    pdfs: List[str] = []
    for directory in directories:
        print(f"Scanning {directory} for PDF files...")
        pdfs.extend(find_pdfs(directory, seen_hashes))

    return pdfs

# Usage: scan the OS-appropriate locations, then summarise the unique files.
pdf_files = find_pdfs_based_on_os()
print(f"Found {len(pdf_files)} unique PDFs based on content.")
preview = pdf_files[:10]  # list only the first 10 paths to keep the output short
for pdf in preview:
    print(pdf)

Scanning /Users for PDF files...


OS error: [Errno 13] Permission denied: '/Users/yecao/.config/truffle'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Pictures/Photos Library.photoslibrary'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CallHistoryTransactions'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CloudDocs/session/db'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.sharedfilelist'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/Knowledge'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.TCC'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FileProvider'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/AddressBook'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FaceTime'

Found 605 unique PDFs based on content.
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/135/Certifier Training Handouts.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/307/Position Update Process.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/61/Check I-9 Status.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/95/Finance Dashboard View Fund Balances.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/300/Cómo ingresar horas en Web Clock.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/132/SPFF