In [None]:
from PyPDF2 import PdfReader

In [None]:
# Document Schema

class Document():
    def __init__(self, pageContent, metadata, id):
        self.pageContent = pageContent # raw text from pdf
        self.metadata: dict = metadata # pdf metadata (author, date ...)
        self.id = id # pdf id

In [None]:
import re

def clean_document(document: str):
    document = re.sub(r"[-–—]\s*\d+\s*of\s*\d+\s*[-–—]", "", document) # Remove page numbers "- 1 of 10 -"
    document = re.sub(r"[ \t]+", " ", document) # Collapse multiple spaces and tabs
    document = re.sub(r"\n{3,}", "\n\n", document)
    document = document.strip() # Remove leading and trailing whitespaces
    print("Document cleaned")
    return document

In [None]:
# Constructing documents with metadata

def construct_document(pageContent, source, info, metadata, numPages, pageNumber):
    doc = Document(
        pageContent=pageContent,
        metadata={
            "source": source,
            "pdf": {
                "info": info,
                "metadata": metadata,
                "numPages": numPages,
            },
            "loc": {
                "pageNumber": pageNumber
            }
        },
        id=None
    )
    return doc

In [None]:
import requests
from io import BytesIO

def extract_text(url, splitpages=False):

    response = requests.get(url) # Download Pdf
    response.raise_for_status() # error checking

    parser = PdfReader(BytesIO(response.content))
    meta = parser.metadata if parser.metadata else {}
    num_pages = len(parser.pages)
    doc = []

    if splitpages: # split pages 
        print("Splitting Pages")
        for i in range(num_pages):
            raw_text = parser.pages[i].extract_text() or ""
            cleaned_text = clean_document(raw_text)

            doc.append(construct_document(
                cleaned_text,
                url,
                None,
                meta,
                num_pages,
                i+1
            ))
    else:
        print("Split Pages False")
        pages = [page.extract_text() or "" for page in parser.pages]
        raw_text = "\n\n".join(pages)
        cleaned_text = clean_document(raw_text)

        doc.append(construct_document(
            cleaned_text,
            url,
            None,
            meta,
            num_pages,
            None
        ))

    return doc


In [15]:
url = "https://arxiv.org/pdf/2402.19473"
text = extract_text(url)
print(repr(text))

Document cleaned
[<__main__.Document object at 0x000002AD04B9DF90>]
