In [12]:
from pathlib import Path
from abc import ABC, abstractmethod
from typing import List
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Root directory that relative document paths are resolved against.
# Set the MURAG_PROJECT_ROOT environment variable to point at a different
# checkout; when it is unset or empty, the original hard-coded location is
# used so existing setups keep working unchanged.
PROJECT_ROOT = Path(
    os.environ.get("MURAG_PROJECT_ROOT")
    or "/home/troligen/development/hobby/ML/MuRAG/"
)


class BaseDocumentProcessor(ABC):
    """Shared pipeline: load a file's text, split it, wrap chunks as Documents.

    Subclasses provide the format-specific text extraction by implementing
    ``load_document``; splitting and wrapping are handled here.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter used by ``process_document``.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
                ``chunk_size``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
                ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    @abstractmethod
    def load_document(self, file_path: Path) -> str:
        """Return the text content of the file at ``file_path``."""

    def process_document(self, file_path: Path) -> List[Document]:
        """Load ``file_path``, split its text, and return one Document per chunk.

        Each returned Document carries the chunk as ``page_content`` and
        ``{"source": str(file_path)}`` as its metadata.
        """
        raw_text = self.load_document(file_path)
        pieces = self.text_splitter.split_text(raw_text)

        documents = []
        for piece in pieces:
            # A fresh metadata dict per chunk, so documents never share one.
            documents.append(
                Document(page_content=piece, metadata={"source": str(file_path)})
            )
        return documents

class TextDocumentProcessor(BaseDocumentProcessor):
    """Processor for files that are read directly as UTF-8 text."""

    def load_document(self, file_path: Path) -> str:
        """Read the whole file as UTF-8 and return its contents."""
        return file_path.read_text(encoding='utf-8')

class PDFDocumentProcessor(BaseDocumentProcessor):
    """Processor that extracts text from PDF files using ``pypdf``."""

    def load_document(self, file_path: Path) -> str:
        """Extract the text of every page and join the pages with single spaces."""
        # Imported inside the method, so pypdf is only required once a PDF
        # is actually loaded.
        from pypdf import PdfReader

        pdf = PdfReader(str(file_path))
        page_texts = [page.extract_text() for page in pdf.pages]
        return " ".join(page_texts)

class DocxDocumentProcessor(BaseDocumentProcessor):
    """Processor that extracts text from .docx files using ``docx2txt``."""

    def load_document(self, file_path: Path) -> str:
        """Return whatever ``docx2txt.process`` yields for the given file."""
        # Imported inside the method, so docx2txt is only required once a
        # .docx file is actually loaded.
        import docx2txt

        extracted = docx2txt.process(str(file_path))
        return extracted

class DocumentProcessorFactory:
    """Chooses a document processor based on a file's extension."""

    @staticmethod
    def get_processor(file_path: Path) -> BaseDocumentProcessor:
        """Return a new processor instance suited to ``file_path``.

        The extension is compared case-insensitively: ``.pdf`` maps to
        ``PDFDocumentProcessor``, ``.docx`` to ``DocxDocumentProcessor``,
        and every other extension falls back to ``TextDocumentProcessor``.
        """
        extension = file_path.suffix.lower()
        # Built per call so the processor classes are looked up at call time,
        # exactly as a plain if/elif chain would.
        by_extension = {
            '.pdf': PDFDocumentProcessor,
            '.docx': DocxDocumentProcessor,
        }
        processor_cls = by_extension.get(extension, TextDocumentProcessor)
        return processor_cls()

def process_document(relative_path: str) -> List[Document]:
    """Chunk the document at ``relative_path`` (joined onto ``PROJECT_ROOT``).

    The processor is picked by ``DocumentProcessorFactory`` from the file's
    extension, and its ``process_document`` result is returned as-is.
    """
    resolved = PROJECT_ROOT / relative_path
    handler = DocumentProcessorFactory.get_processor(resolved)
    documents = handler.process_document(resolved)
    return documents



In [13]:
# Smoke test: chunk one sample PDF and preview the result.
test_file = "test_data/science_pdf/1-s2.0-S2212096323000189-main.pdf"
chunks = process_document(test_file)
print(f"Number of chunks: {len(chunks)}")
if not chunks:
    print("No chunks were generated. Check if the file exists and is readable.")
else:
    # Show only the first 100 characters of the first chunk.
    print(f"First chunk: {chunks[0].page_content[:100]}...")

Number of chunks: 90
First chunk: Climate Risk Management 40 (2023) 100492
Available online 25 February 2023
2212-0963/© 2023 The Auth...
