### Lowercase document content

In [9]:
def decapitalize_content(pages: list) -> None:

    """Lowercase the text of every page, in place.

    Args:
        pages: Document-like objects exposing a string ``page_content``
            attribute (not plain strings). Each object is mutated; nothing
            is returned.
    """

    for page in pages:
        page.page_content = page.page_content.lower()

### Remove non-ASCII characters

In [10]:
import re

def remove_non_ASCII(pages: list) -> None:

    """Delete every non-ASCII character from each page, in place.

    Not suitable for many non-English languages, whose text relies on
    non-ASCII characters (accents, umlauts, ...): those characters are
    dropped, not transliterated.

    Args:
        pages: Document-like objects exposing a string ``page_content``
            attribute (not plain strings). Each object is mutated; nothing
            is returned.
    """

    for page in pages:
        # Anything outside the 7-bit range \x00-\x7F is removed.
        page.page_content = re.sub(r"[^\x00-\x7F]+", "", page.page_content)

### Remove bullet and numbered-list markers

In [11]:
import re

def remove_bullets(pages: list) -> None:

    """Strip bullet and numbered-list markers from each page, in place.

    Two kinds of markers are removed, both only at the start of a line:

    * a bullet symbol (``→ • - * ✔ ● ✗``) together with the whitespace
      that follows it;
    * a list number such as ``12.`` when it is followed (after optional
      whitespace) by a letter. Leading indentation is kept.

    The number pattern is anchored to the line start so that ordinary
    sentences ending in a number ("... in 2019. The market ...") keep
    their digits.

    Args:
        pages: Document-like objects exposing a string ``page_content``
            attribute (not plain strings). Each object is mutated; nothing
            is returned.
    """

    bullet_marker = r"^[→•\-*✔●✗]\s*"
    # Group 1 captures the indentation so it can be put back unchanged.
    numbered_marker = r"^([ \t]*)\d+\.(?=\s*[a-zA-Z])"

    for page in pages:
        text = re.sub(bullet_marker, "", page.page_content, flags=re.MULTILINE)
        # Runs on the bullet-stripped text, so "- 1. item" loses both markers.
        page.page_content = re.sub(numbered_marker, r"\1", text, flags=re.MULTILINE)

### Collapse consecutive whitespace characters

In [12]:
def remove_escape(pages: list) -> None:

    """Collapse every run of whitespace into a single space, in place.

    Newlines, tabs and repeated spaces all count as whitespace; leading
    and trailing whitespace is removed entirely.

    Args:
        pages: Document-like objects exposing a string ``page_content``
            attribute (not plain strings). Each object is mutated; nothing
            is returned.
    """

    for page in pages:
        # str.split() with no argument splits on any whitespace run and
        # discards empty leading/trailing pieces.
        page.page_content = " ".join(page.page_content.split())

### Save processed chunks in JSON format

In [13]:
from langchain_core.load import dumpd
import json

def save_chunks(pages: list, type: str) -> None:

    """Serialize each processed chunk (raw or cleaned) to disk as JSON.

    Chunks are written as ``chunk_1``, ``chunk_2``, ... (1-based, in list
    order) inside the ``raw`` or ``cleaned`` sub-directory of
    ``parsed_documents/PyMuPDFLoader - No OCR``. The directory is created
    if it does not exist yet.

    Args:
        pages: Serializable LangChain objects, one per chunk.
        type: Flag selecting the destination:
            * ``"r"`` -> raw chunks
            * ``"c"`` -> cleaned chunks
            Any other value makes the call a silent no-op.
    """

    # Local import: the cell defining this function does not import os.
    import os

    path = "parsed_documents/PyMuPDFLoader - No OCR"
    subdir_by_flag = {"r": "raw", "c": "cleaned"}

    subdir = subdir_by_flag.get(type)
    if subdir is None:
        # Unknown flag: keep the existing best-effort behaviour, write nothing.
        return

    target_dir = os.path.join(path, subdir)
    # Without this, open() fails on a fresh checkout where the folder is missing.
    os.makedirs(target_dir, exist_ok=True)

    for number, page in enumerate(pages, start=1):
        chunk_path = os.path.join(target_dir, f"chunk_{number}")
        with open(chunk_path, "w", encoding="utf-8") as ser_file:
            json.dump(dumpd(page), ser_file)

### Load processed chunks (raw, cleaned)

In [2]:
import os
import json
from langchain_core.load import load

def load_chunks(type: str = "r") -> list | None:

    """Load all previously serialized (raw or cleaned) chunks from disk.

    Files are read from the ``raw`` or ``cleaned`` sub-directory of
    ``parsed_documents/PyMuPDFLoader - No OCR`` and returned in ascending
    chunk-number order (``chunk_2`` before ``chunk_10``), so the result
    does not depend on the arbitrary order of ``os.listdir``.

    Args:
        type: Flag selecting the source:
            * ``"r"`` -> raw chunks
            * ``"c"`` -> cleaned chunks

    Returns:
        The list of deserialized chunks, or ``None`` for an unknown flag.

    Raises:
        FileNotFoundError: If the selected directory does not exist.
    """

    path = "parsed_documents/PyMuPDFLoader - No OCR"
    subdir_by_flag = {"r": "raw", "c": "cleaned"}

    subdir = subdir_by_flag.get(type)
    if subdir is None:
        return None

    full_path = os.path.join(path, subdir)

    def chunk_order(fname: str):
        # "chunk_12" -> (0, 12, ...); names without a numeric suffix sort last.
        suffix = fname.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), fname)
        return (1, 0, fname)

    pages = []
    for fname in sorted(os.listdir(full_path), key=chunk_order):
        f = os.path.join(full_path, fname)
        # Skip sub-directories (e.g. notebook checkpoint folders); open()
        # would fail on them.
        if not os.path.isfile(f):
            continue
        with open(f, "r", encoding="utf-8") as file:
            pages.append(load(json.load(file)))

    return pages

### Get all files from the source folder that still need to be processed

In [3]:
import os

# File names (not paths) of the source documents that were already parsed
# and serialized by earlier runs.
processed_files = [
    'BEQ_2301_OVERALL_multi.pdf',
    'CADCAM_BRA_22_Eng.pdf',
    'IOS_Report_FR-IT-ES_rev17.pdf',
    'OMNI_DIGITAL_EU_15_CLI_.pdf',
    'OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf',
    'OMNI_DIGITAL_EU_15_LAB_.pdf',
    'OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf',
    'OMNI-DIGITAL_ITA_17_CLI_.pdf',
    'OMNI-DIGITAL_ITA_23_CLI_.pdf',
    'OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf',
    'OMNI_DIGITAL_SPA_19_CLI_.pdf',
    'OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf',
    'OMNI_DIGITAL_SPA_19_LAB_.pdf',
]

folder = "../sources"
files = []

# Collect the path of every regular file in `folder` whose name is not in
# `processed_files` yet; sub-directories are ignored.
for fname in os.listdir(folder):
    complete_path = os.path.join(folder, fname)
    if fname in processed_files or not os.path.isfile(complete_path):
        continue
    files.append(complete_path)

### Parse documents (and the tables within them) into pages

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

# Reload the chunks serialized by earlier runs: "r" selects the raw chunks,
# "c" the cleaned ones (see load_chunks).
pages = load_chunks("r")
cleaned_pages = load_chunks("c")

# Index new documents: parse every file collected in `files` and gather the
# documents yielded by its loader.
new_pages = []
for file in files:
    # NOTE(review): extract_images = False presumably turns off image
    # extraction/OCR -- confirm against the PyMuPDFLoader documentation.
    loader = PyMuPDFLoader(file, extract_images = False)
    # A module-level `async for` is only accepted where top-level await is
    # enabled (e.g. IPython/Jupyter); a plain Python script rejects it.
    async for page in loader.alazy_load():
        new_pages.append(page)

### Pre-process content (text cleaning)

In [21]:
import copy

# Clean a deep copy so the raw pages in `new_pages` stay untouched.
new_cleaned_pages = copy.deepcopy(new_pages)

# Each step mutates the pages in place; the steps run in the listed order.
cleaning_steps = (
    remove_non_ASCII,
    decapitalize_content,
    remove_bullets,
    remove_escape,
)
for apply_step in cleaning_steps:
    apply_step(new_cleaned_pages)

### Merge loaded pages with new indexed ones

In [22]:
# Append the newly parsed pages to the ones reloaded from disk (in place).
# Re-running this cell appends the same pages again.
pages.extend(new_pages)
cleaned_pages.extend(new_cleaned_pages)

### Serialize pages

In [23]:
# Write the merged raw and cleaned pages back to disk.
save_chunks(pages, "r")
save_chunks(cleaned_pages, "c")

# Update processed files list (in memory only; the literal list in the
# source is not rewritten). basename() is used instead of stripping
# `folder + "/"` because the paths were built with os.path.join, whose
# separator is not "/" on every platform.
processed_files += [os.path.basename(file) for file in files]

### Create several embedding models

In [9]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

# Instantiate each embedding model exactly once. Building a second instance
# per model for the dictionary below is redundant (and, for the HuggingFace
# models, presumably loads the weights a second time).
llama1b_embeddings = OllamaEmbeddings(model="llama3.2:1b")
llama3b_embeddings = OllamaEmbeddings(model="llama3.2:3b")
gemma2b_embeddings = OllamaEmbeddings(model="gemma2:2b")
mpnetbase_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
minilm_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")

# Lookup table: short model key -> the embedding instance created above.
all_embeddings = {
    "llama1b": llama1b_embeddings,
    "llama3b": llama3b_embeddings,
    "gemma2b": gemma2b_embeddings,
    "mpnetbase": mpnetbase_embeddings,
    "minilm": minilm_embeddings,
}

### Create a vector store

In [17]:
from langchain_core.vectorstores import InMemoryVectorStore
import os

# Embedding model backing the vector store, and the file the store is
# cached in.
embeddings = all_embeddings["llama3b"]
vector_store_path = "embeddings/Llama3.2:3b/raw/Llama3.2:3b_raw"
vector_store = None

if not os.path.exists(vector_store_path):
    # No cached store yet: embed the raw pages and save the result.
    vector_store = InMemoryVectorStore.from_documents(documents=pages, embedding=embeddings)
    vector_store.dump(vector_store_path)
else:
    # Reuse the cached store; pages added after it was dumped are not in it.
    vector_store = InMemoryVectorStore.load(path=vector_store_path, embedding=embeddings)