### Lower case document content

In [116]:
def decapitalize_content(pages: list) -> None:
    """Lower-case the text of every page, in place.

    Args:
        pages: Page objects exposing a writable string attribute
            ``page_content`` (not plain strings). Every object in the
            list is mutated.

    Returns:
        None. The result is written back to each ``page_content``.
    """
    for p in pages:
        p.page_content = p.page_content.lower()

### Remove non-ASCII characters

In [117]:
import re

def remove_non_ASCII(pages: list) -> None:
    """Delete every non-ASCII character from each page's text, in place.

    Not suitable for many non-English languages, whose text relies on
    characters outside the ASCII range: those characters are dropped, not
    transliterated.

    Args:
        pages: Page objects exposing a writable string attribute
            ``page_content`` (not plain strings). Every object in the
            list is mutated.

    Returns:
        None. The result is written back to each ``page_content``.
    """
    # Compiled once, outside the loop; matches runs of code points > 0x7F.
    non_ascii = re.compile(r"[^\x00-\x7F]+")
    for p in pages:
        p.page_content = non_ascii.sub("", p.page_content)

### Remove bullet and numbered-list markers

In [118]:
import re

def remove_bullets(pages: list) -> None:
    """Remove bullet and numbered-list markers from each page, in place.

    Two kinds of marker are stripped, both only at the start of a line:

    * a single bullet character (``→ • - * ✔ ● ✗``) together with the
      whitespace that follows it;
    * a number followed by a dot (``1.``, ``12.``) when the next
      non-blank character is an ASCII letter. Indentation in front of the
      number is kept.

    Args:
        pages: Page objects exposing a writable string attribute
            ``page_content`` (not plain strings). Every object in the
            list is mutated.

    Returns:
        None. The result is written back to each ``page_content``.
    """
    bullet_marker = re.compile(r"^[→•\-*✔●✗]\s*", flags=re.MULTILINE)
    # Anchored to the line start so that a number ending a sentence
    # ("... in 2021. The ...") is not mistaken for a list marker and deleted.
    number_marker = re.compile(
        r"^([ \t]*)\d+\.(?=\s*[a-zA-Z])", flags=re.MULTILINE
    )

    for p in pages:
        without_bullets = bullet_marker.sub("", p.page_content)
        p.page_content = number_marker.sub(r"\1", without_bullets)

### Collapse consecutive whitespace characters

In [120]:
def remove_escape(pages: list) -> None:
    """Collapse every run of whitespace in each page into one space, in place.

    Newlines, tabs and repeated spaces all count as whitespace. Leading and
    trailing whitespace is removed entirely.

    Args:
        pages: Page objects exposing a writable string attribute
            ``page_content`` (not plain strings). Every object in the
            list is mutated.

    Returns:
        None. The result is written back to each ``page_content``.
    """
    for p in pages:
        # str.split() with no argument splits on any whitespace run and
        # discards empty pieces, so re-joining yields single spaces only.
        p.page_content = " ".join(p.page_content.split())

### Load processed chunks (raw, cleaned)

In [None]:
def load_chunks(type: str = "r"):
    """Load previously processed chunks (raw or cleaned) -- not implemented.

    The definition had no body, which is a syntax error; it now fails
    loudly when called instead of preventing the cell from running.

    Args:
        type: Selector for the chunk set to load; defaults to ``"r"``.
            NOTE(review): presumably "r" stands for raw -- the accepted
            values are not defined anywhere yet; confirm when implementing.

    Raises:
        NotImplementedError: Always; the function is still a stub.
    """
    raise NotImplementedError("load_chunks is not implemented yet")

### Get all files from the source folder to be processed

In [171]:
import os

# Switch for incremental runs: only when it is True are the files named in
# ``processed_files`` left out of the list built below.
ENABLE_LOAD = False
# processed_files = [
#     'BEQ_2301_OVERALL_multi.pdf', 
#     'CADCAM_BRA_22_Eng.pdf', 
#     'IOS_Report_FR-IT-ES_rev17.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf', 
#     'OMNI_DIGITAL_EU_15_LAB_.pdf', 
#     'OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI-DIGITAL_ITA_17_CLI_.pdf', 
#     'OMNI-DIGITAL_ITA_23_CLI_.pdf', 
#     'OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf', 
#     'OMNI_DIGITAL_SPA_19_LAB_.pdf'
# ]
processed_files = []

folder = "../sources"
files = []

# Collect the path of every regular file in ``folder`` that still has to be
# parsed.
for fname in os.listdir(folder):
    complete_path = os.path.join(folder, fname)
    if not os.path.isfile(complete_path):
        continue  # sub-directories and other non-file entries are ignored

    # A file is skipped only when loading is enabled AND it is already listed.
    already_processed = ENABLE_LOAD and fname in processed_files
    if not already_processed:
        files.append(complete_path)

### Parse documents (and the tables within them) into pages

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
import json
from langchain_core.load import load

pages = []
cleaned_pages = []


def _chunk_sort_key(fname: str) -> tuple:
    """Order chunk files by their trailing number (chunk_2 before chunk_10)."""
    suffix = fname.rpartition("_")[2]
    return (int(suffix) if suffix.isdecimal() else -1, fname)


def _load_serialized_pages(directory: str) -> list:
    """Deserialize every chunk file found in *directory*, in chunk order."""
    loaded = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the raw
    # and cleaned lists aligned index by index.
    for fname in sorted(os.listdir(directory), key=_chunk_sort_key):
        with open(os.path.join(directory, fname), "r", encoding="utf-8") as fh:
            loaded.append(load(json.load(fh)))
    return loaded


if ENABLE_LOAD:

    path_raw = "parsed_documents/PyMuPDFLoader - No OCR/raw"
    path_cleaned = "parsed_documents/PyMuPDFLoader - No OCR/cleaned"

    # Retrieve processed raw chunks
    pages.extend(_load_serialized_pages(path_raw))

    # and cleaned chunks -- these belong in ``cleaned_pages``; appending them
    # to ``pages`` would mix cleaned text into the raw page list.
    cleaned_pages.extend(_load_serialized_pages(path_cleaned))


# Index new documents
for file in files:
    loader = PyMuPDFLoader(file, extract_images=False)
    # The synchronous iterator yields the same pages as ``alazy_load`` and
    # does not require top-level ``async for`` support.
    pages.extend(loader.lazy_load())

print(len(pages))

# Update processed files list. basename() strips the folder part whatever
# path separator os.path.join() used when the paths were built.
processed_files.extend(os.path.basename(file) for file in files)

### Pre-process content (text cleaning)

In [106]:
import copy

# Work on a deep copy so the raw ``pages`` keep their original text.
cleaned_pages = copy.deepcopy(pages)

# Every step mutates the copied pages in place; the steps run in this order.
for cleaning_step in (
    remove_non_ASCII,
    decapitalize_content,
    remove_bullets,
    remove_escape,
):
    cleaning_step(cleaned_pages)

### Serialize pages

In [114]:
from langchain_core.load import dumpd
import json

import os

# File-name prefixes: the 1-based chunk number is appended to each of them.
path_raw = "parsed_documents/PyMuPDFLoader - No OCR/raw/chunk_"
path_cleaned = "parsed_documents/PyMuPDFLoader - No OCR/cleaned/chunk_"

# Create the output folders up front; open(..., "w") does not create
# missing parent directories.
os.makedirs(os.path.dirname(path_raw), exist_ok=True)
os.makedirs(os.path.dirname(path_cleaned), exist_ok=True)

# Write every raw page and its cleaned counterpart (same index in
# ``cleaned_pages``) to a pair of JSON files sharing one chunk number.
for index, raw_page in enumerate(pages):

    chunk_number = str(index + 1)
    current_path_raw = path_raw + chunk_number
    current_path_cleaned = path_cleaned + chunk_number

    with open(current_path_raw, "w", encoding="utf-8") as ser_file:
        json.dump(dumpd(raw_page), ser_file)

    with open(current_path_cleaned, "w", encoding="utf-8") as ser_file:
        json.dump(dumpd(cleaned_pages[index]), ser_file)

### Parse content with Unstructured

In [None]:
from langchain_unstructured import UnstructuredLoader

# Parse one source PDF with Unstructured using the "hi_res" strategy.
loader = UnstructuredLoader(
    file_path="../sources/BEQ_2301_OVERALL_multi.pdf",
    strategy="hi_res",
)

# Consume the synchronous iterator: it yields the same documents as
# ``alazy_load`` but does not depend on top-level ``async for``, which is
# only valid inside an IPython/Jupyter cell.
docs = list(loader.lazy_load())