In [2]:
import hashlib
import os
import platform
import sys
from typing import List, Optional

def find_pdfs_based_on_os(start_directories: Optional[List[str]] = None) -> List[str]:
    """
    Find all unique PDF files based on content, considering the operating system.

    Args:
        start_directories: Directories to scan recursively. When omitted
            (None), OS-specific defaults are used: the ``Documents`` folder
            under ``%USERPROFILE%`` on Windows, ``/Users`` on macOS, and ``/``
            on every other platform.

    Returns:
        A list of paths to the unique PDF files. For each distinct SHA-256
        content digest only the first path encountered is kept. Files that
        cannot be read are reported on stderr and skipped.
    """
    def handle_os_error(err):
        """Report a directory that os.walk could not list; the walk continues."""
        print(f"OS error: {err}", file=sys.stderr)

    def get_start_directories() -> List[str]:
        """ Return appropriate start directories based on the operating system. """
        os_type = platform.system()
        if os_type == 'Windows':
            base_dir = os.environ.get('USERPROFILE', 'C:\\Users')
            return [os.path.join(base_dir, 'Documents')]
        elif os_type == 'Darwin':  # macOS
            return ['/Users']
        return ['/']

    def hash_file(filepath: str) -> Optional[str]:
        """ Generate a SHA-256 hash of a file's contents, or None if it cannot be read. """
        hasher = hashlib.sha256()
        try:
            with open(filepath, 'rb') as f:
                # Read in 64 kB chunks so large PDFs are never held in memory whole.
                for chunk in iter(lambda: f.read(65536), b''):
                    hasher.update(chunk)
        except OSError as e:  # IOError is an alias of OSError on Python 3
            print(f"Error reading file {filepath}: {str(e)}", file=sys.stderr)
            return None
        return hasher.hexdigest()

    def find_pdfs(directory: str, seen_hashes: set) -> List[str]:
        """ Recursively find all unique PDF files in the specified directory, handling permissions gracefully. """
        pdf_files = []
        for root, dirs, files in os.walk(directory, onerror=handle_os_error):
            for file in files:
                if not file.lower().endswith('.pdf'):
                    continue
                full_path = os.path.join(root, file)
                file_hash = hash_file(full_path)
                # None means the file was unreadable; it was already reported.
                if file_hash is not None and file_hash not in seen_hashes:
                    seen_hashes.add(file_hash)
                    pdf_files.append(full_path)
        return pdf_files

    if start_directories is None:
        start_directories = get_start_directories()

    # Shared across all start directories so duplicates are detected globally.
    seen_hashes = set()
    pdfs = []
    for directory in start_directories:
        print(f"Scanning {directory} for PDF files...")
        pdfs.extend(find_pdfs(directory, seen_hashes))

    return pdfs

# Usage: run the scan, report the total, then preview a handful of results.
pdf_files = find_pdfs_based_on_os()
print("Found {} unique PDFs based on content.".format(len(pdf_files)))
for pdf in pdf_files[0:10]:  # only the first 10 paths are shown
    print(pdf)

Scanning /Users for PDF files...


OS error: [Errno 13] Permission denied: '/Users/yecao/.config/truffle'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Pictures/Photos Library.photoslibrary'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CallHistoryTransactions'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/CloudDocs/session/db'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.sharedfilelist'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/Knowledge'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/com.apple.TCC'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FileProvider'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/AddressBook'
OS error: [Errno 1] Operation not permitted: '/Users/yecao/Library/Application Support/FaceTime'

Found 605 unique PDFs based on content.
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/135/Certifier Training Handouts.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/307/Position Update Process.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/61/Check I-9 Status.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/95/Finance Dashboard View Fund Balances.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/300/Cómo ingresar horas en Web Clock.pdf
/Users/yecao/Library/Application Support/pocketllm/user_workspace_cache/27fcc3e5-7bef-4a16-989e-eb38b3c9e5e6/model.ndb/documents/132/SPFF

In [7]:
import hashlib
import os
from typing import Dict, Set, List

class FileIdentity:
    """Record describing one tracked file: its path, content hash and combined hash."""

    def __init__(self, file_path: str, file_content_hash: str, combined_hash: str):
        """Store the three identifying values unchanged on the instance."""
        (
            self.file_path,
            self.file_content_hash,
            self.combined_hash,
        ) = (file_path, file_content_hash, combined_hash)

class FileManager:
    """Track the PDF files under a root directory and report changes between scans.

    Every file is keyed by a "combined hash" derived from its path and the
    SHA-256 of its contents. Consequently a file whose contents change is
    reported as its old identity removed and a new identity added, i.e. the
    same path shows up in both the ``added`` and ``removed`` lists.
    """

    def __init__(self, root_directory: str):
        """Remember the root directory and perform the initial scan."""
        self.root_directory = root_directory
        # combined hash -> FileIdentity for every file seen in the latest scan
        self.file_identities: Dict[str, FileIdentity] = {}

        # Initialize the file system
        self.update_manager()

    def update_manager(self):
        """Update the stored file identities based on the latest scan.

        Returns:
            A dict ``{"added": [...], "removed": [...]}`` of file paths, each
            list sorted so the result is deterministic.
        """
        new_files = self.__scan_files()
        added_hashes = new_files.keys() - self.file_identities.keys()
        removed_hashes = self.file_identities.keys() - new_files.keys()

        added = {new_files[hash_key].file_path for hash_key in added_hashes}
        removed = {self.file_identities[hash_key].file_path for hash_key in removed_hashes}

        # Update the stored data in place so existing references to the dict stay valid.
        for hash_key in removed_hashes:
            del self.file_identities[hash_key]
        self.file_identities.update(new_files)

        return {"added": sorted(added), "removed": sorted(removed)}

    def get_file_paths(self) -> List[str]:
        """Return a list of file paths currently tracked by the FileManager."""
        return [identity.file_path for identity in self.file_identities.values()]

    def __scan_files(self) -> Dict[str, FileIdentity]:
        """Scan the directory for PDF files and return a dictionary of new file identities.

        The extension match is case-insensitive. A file that cannot be read is
        reported and left out of the result, which means it is treated as
        absent (and reported as removed if it was tracked before).
        """
        new_files = {}
        for root, _, files in os.walk(self.root_directory):
            for file in files:
                if not file.lower().endswith(".pdf"):
                    continue
                file_path = os.path.join(root, file)
                try:
                    content_hash = FileManager.hash_file_content(file_path)
                except OSError as e:
                    # One unreadable file must not abort the whole scan.
                    print(f"Error reading file {file_path}: {str(e)}")
                    continue
                combined_hash = FileManager.hash_combined(file_path, content_hash)
                new_files[combined_hash] = FileIdentity(file_path, content_hash, combined_hash)
        return new_files

    @staticmethod
    def hash_file_content(file_path: str) -> str:
        """Generate a SHA-256 hex digest for the contents of a file.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        hasher = hashlib.sha256()
        with open(file_path, 'rb') as file:
            # Stream in small chunks so the whole file is never held in memory.
            while chunk := file.read(4096):
                hasher.update(chunk)
        return hasher.hexdigest()

    @staticmethod
    def hash_combined(file_path: str, content_hash: str) -> str:
        """Generate a combined hash of the file path and content hash."""
        return hashlib.sha256(f"{file_path}{content_hash}".encode()).hexdigest()

In [8]:
# Scan the current user's Downloads folder; the home directory is resolved at
# run time instead of hard-coding one user's absolute path.
file_manager = FileManager(root_directory=os.path.join(os.path.expanduser("~"), "Downloads"))

In [5]:
# Try again after deleting a file: rescan the directory and show which
# tracked paths were added or removed since the previous scan.
changes = file_manager.update_manager()
print(changes)

{'added': ['/Users/yecao/Downloads/COMP140 copy.pdf'], 'removed': []}


In [16]:
from thirdai import licensing, neural_db as ndb

# The license key is read from the environment so it is never stored in the
# notebook (and therefore never committed or shared along with it).
_thirdai_key = os.environ.get("THIRDAI_KEY")
if not _thirdai_key:
    raise RuntimeError(
        "Set the THIRDAI_KEY environment variable to your ThirdAI license key "
        "before running this cell."
    )
licensing.activate(_thirdai_key)

class FileIndexerManager:
    """Keep a NeuralDB index in step with the PDF files tracked by a FileManager."""

    def __init__(self, file_manager: FileManager, db=None):
        """
        Args:
            file_manager: Supplies the tracked file paths and the added/removed
                change reports.
            db: Optional existing NeuralDB instance to index into. When omitted,
                a new ``ndb.NeuralDB(low_memory=True)`` is created.
        """
        self.file_manager = file_manager
        # Maps an indexed file path to the source id returned by the database,
        # so a file can still be removed from the index once it is gone from disk.
        self._source_ids = {}
        self.db = self.__init_db(db)

    def __init_db(self, db=None):
        """Initialize the database with existing PDF files."""
        if db is None:
            db = ndb.NeuralDB(low_memory=True)
        self.__insert_paths(db, self.file_manager.get_file_paths())
        return db

    def __insert_paths(self, db, paths):
        """Load each path as a PDF document and insert the loadable ones into ``db``."""
        documents = []
        indexed_paths = []
        for path in paths:
            try:
                # Attempt to create a PDF document object
                document = ndb.PDF(path)
            except Exception as e:
                # Log the error and continue with the next file
                print(f"Error processing PDF {path}: {str(e)}")
                continue
            documents.append(document)
            indexed_paths.append(path)

        if not documents:
            # Nothing could be loaded, so there is nothing to insert.
            return

        source_ids = db.insert(sources=documents)
        # NOTE(review): assumes insert() returns one source id per document, in
        # input order -- confirm against the installed thirdai version.
        if source_ids is not None and len(source_ids) == len(indexed_paths):
            self._source_ids.update(zip(indexed_paths, source_ids))
        else:
            print("Warning: could not record source ids for inserted PDFs; "
                  "later removals of these files will not be synced.")

    def sync_file_index(self):
        """Synchronize the file system changes with the database index."""
        changes = self.file_manager.update_manager()

        # Handle removals first: a modified file is reported under both keys
        # (same path, different combined hash), and its stale entry has to go
        # before the fresh contents are inserted. Deleted files no longer exist
        # on disk, so they are removed by recorded source id rather than by
        # re-parsing the file. Paths that were never indexed are skipped.
        stale_ids = [
            self._source_ids.pop(path)
            for path in changes['removed']
            if path in self._source_ids
        ]
        if stale_ids:
            # NOTE(review): assumes delete() accepts a list of source ids -- confirm.
            self.db.delete(stale_ids)

        if changes['added']:
            self.__insert_paths(self.db, changes['added'])



# Index the current user's Downloads folder; the home directory is resolved at
# run time instead of hard-coding one user's absolute path.
file_manager = FileManager(root_directory=os.path.join(os.path.expanduser("~"), "Downloads"))
file_indexer_manager = FileIndexerManager(file_manager)

# To perform synchronization:
# file_indexer_manager.sync_file_index()

In [None]:
# Example usage:
# The home directory is resolved at run time instead of hard-coding one user's path.
file_manager = FileManager(root_directory=os.path.join(os.path.expanduser("~"), "Downloads"))
# FileIndexerManager builds its own NeuralDB from the tracked files, so it is
# constructed from the file manager alone and the database is read back from it.
file_indexer_manager = FileIndexerManager(file_manager)
db = file_indexer_manager.db

# To perform synchronization:
file_indexer_manager.sync_file_index()

In [None]:
# In your FastAPI application
from fastapi import FastAPI

app = FastAPI()
# Placeholder root: replace with the directory that should be scanned.
file_manager = FileManager(root_directory="/path/to/search")

@app.post("/update-files/")
def update_manager():
    """Rescan the tracked directory and return the resulting change report."""
    return file_manager.update_manager()