### Lower case document content

In [9]:
def decapitalize_content(pages: list[str]):
    """Lower-case the text of every page, in place.

    Each element of ``pages`` must expose a writable ``page_content`` string
    attribute (the body reads and reassigns it). Nothing is returned.

    NOTE(review): the ``list[str]`` annotation does not match this usage;
    the elements are presumably LangChain documents -- confirm.
    """
    for page in pages:
        lowered = page.page_content.lower()
        page.page_content = lowered

### Remove non-ASCII characters

In [10]:
import re

def remove_non_ASCII(pages: list[str]):
    """Strip every non-ASCII character from each page, in place.

    Any run of characters outside the ``\\x00``-``\\x7F`` range is deleted
    from ``page_content``; nothing is inserted in its place. This makes the
    function unsuitable for most non-English languages, which rely on many
    non-ASCII characters. Nothing is returned.
    """
    non_ascii_run = re.compile(r"[^\x00-\x7F]+")

    for page in pages:
        page.page_content = non_ascii_run.sub("", page.page_content)

### Remove bullet and numbering markers from lists

In [11]:
import re

def remove_bullets(pages: list[str]):
    """Strip list markers (bullets and ``N.`` numbering) from each page, in place.

    Two substitutions are applied to ``page_content``:

    * a bullet glyph at the start of a line, together with the whitespace
      that follows it, is removed;
    * any ``<digits>.`` that is followed (after optional whitespace) by an
      ASCII letter is removed. This pattern is not anchored to the start of
      a line, so it also matches inside running text (e.g. ``"2019. The"``).

    Nothing is returned.
    """
    leading_bullet = re.compile(r"^[→•\-*✔●✗]\s*", flags = re.MULTILINE)
    item_number = re.compile(r"\d+\.(?=\s*[a-zA-Z])")

    for page in pages:
        without_bullets = leading_bullet.sub("", page.page_content)
        page.page_content = item_number.sub("", without_bullets)

### Collapse consecutive whitespace characters

In [12]:
def remove_escape(pages: list[str]):
    """Collapse every run of whitespace into a single space, in place.

    ``page_content`` is split on whitespace (newlines, tabs, spaces, ...) and
    the resulting words are re-joined with one space, which also drops any
    leading or trailing whitespace. Nothing is returned.
    """
    for page in pages:
        words = page.page_content.split()
        page.page_content = ' '.join(words)

### Save processed chunks in JSON format

In [13]:
from langchain_core.load import dumpd
import json

def save_chunks(pages: list, type: str):
    """Write every chunk to disk as JSON, one file per chunk.

    ``type`` selects the destination sub-folder:
    * type = "r" -> raw chunks     (``.../raw/chunk_<n>``)
    * type = "c" -> cleaned chunks (``.../cleaned/chunk_<n>``)

    Files are numbered from 1 in the order of ``pages``. Any other ``type``
    value makes the function return without writing anything.
    """
    base_dir = "parsed_documents/PyMuPDFLoader - No OCR"

    if type == "r":
        file_prefix = base_dir + "/raw/chunk_"
    elif type == "c":
        file_prefix = base_dir + "/cleaned/chunk_"
    else:
        # Unknown flag: nothing to do.
        return

    for number, page in enumerate(pages, start=1):
        with open(file_prefix + str(number), "w") as out_file:
            # Turn the document into a JSON-compatible dict, then dump it.
            json.dump(dumpd(page), out_file)

### Load processed chunks (raw, cleaned)

In [2]:
import os
import json
from langchain_core.load import load

def load_chunks(type: str = "r"):

    """Retrieves all processed (raw or cleaned) chunks, given a flag
    * type = "r" -> raw chunks
    * type = "c" -> cleaned chunks """

    path = "parsed_documents/PyMuPDFLoader - No OCR"
    full_path = ""
    pages = []

    match type:
        case "r": 
            full_path = path + "/raw"
        case "c":
            full_path = path + "/cleaned"
        case _:
            return None
            
    for fname in os.listdir(full_path):
        f = os.path.join(full_path, fname)
        with open(f, "r") as file:
            page = load(json.load(file))
            pages.append(page)
    
    return pages

### Get all files from the source folder that still have to be processed

In [3]:
import os

# Source PDFs that have already been parsed and saved as chunks;
# they are excluded from the list of files to process below.
processed_files = [
    'BEQ_2301_OVERALL_multi.pdf',
    'CADCAM_BRA_22_Eng.pdf',
    'IOS_Report_FR-IT-ES_rev17.pdf',
    'OMNI_DIGITAL_EU_15_CLI_.pdf',
    'OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf',
    'OMNI_DIGITAL_EU_15_LAB_.pdf',
    'OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf',
    'OMNI-DIGITAL_ITA_17_CLI_.pdf',
    'OMNI-DIGITAL_ITA_23_CLI_.pdf',
    'OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf',
    'OMNI_DIGITAL_SPA_19_CLI_.pdf',
    'OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf',
    'OMNI_DIGITAL_SPA_19_LAB_.pdf',
]

folder = "../sources"

# Full paths of the regular files in `folder` whose name is not yet listed
# in `processed_files` (sub-directories are ignored).
files = [
    os.path.join(folder, entry)
    for entry in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, entry)) and entry not in processed_files
]

### Parse documents (and the tables within them) into pages

In [54]:
from langchain_community.document_loaders import PyMuPDFLoader

# Reload the chunks saved by previous runs: raw text and its cleaned counterpart.
pages = load_chunks("r")
cleaned_pages = load_chunks("c")

# Parse the not-yet-processed source files and collect the documents the
# loader yields for each of them.
# NOTE: the top-level `async for` below only works in an environment with
# top-level await (e.g. a Jupyter/IPython cell), not in a plain Python script.
new_pages = []
for file in files:
    # NOTE(review): extract_images = False presumably disables image/OCR
    # extraction (the output folder is named "No OCR") -- confirm.
    loader = PyMuPDFLoader(file, extract_images = False)
    async for page in loader.alazy_load():
        new_pages.append(page)

### Pre-process content (text cleaning)

In [21]:
import copy

# Clean a deep copy so the raw pages in `new_pages` stay untouched.
new_cleaned_pages = copy.deepcopy(new_pages)

# Each step mutates the pages in place; the order is significant
# (whitespace is collapsed last, after the line-anchored bullet removal).
for clean_step in (remove_non_ASCII, decapitalize_content, remove_bullets, remove_escape):
    clean_step(new_cleaned_pages)

### Merge loaded pages with new indexed ones

In [22]:
# Append the freshly parsed pages to the ones reloaded from disk (in place),
# keeping the raw and cleaned lists in the same order.
pages.extend(new_pages)
cleaned_pages.extend(new_cleaned_pages)

### Serialize pages

In [23]:
import os

# Persist the merged raw and cleaned chunks.
save_chunks(pages, "r")
save_chunks(cleaned_pages, "c")

# Update processed files list.
# The paths in `files` were built with os.path.join, so take the base name
# instead of stripping a hard-coded "<folder>/" prefix: this also works when
# the platform's path separator is not "/".
# NOTE: this only updates the in-memory list for the current session.
processed_files += [os.path.basename(file) for file in files]

### Create several embedding models

In [109]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Registry of the embedding models under comparison. Every entry holds the
# model instance plus the paths where the vector stores built from the raw
# and from the cleaned pages are cached.
all_embeddings = {}

all_embeddings["llama1b"] = {
    "model": OllamaEmbeddings(model = "llama3.2:1b"),
    "raw-path": "embeddings/llama3.2:1b/raw/llama3.2:1b_raw",
    "cleaned-path": "embeddings/llama3.2:1b/cleaned/llama3.2:1b_cleaned",
}

all_embeddings["llama3b"] = {
    "model": OllamaEmbeddings(model = "llama3.2:3b"),
    "raw-path": "embeddings/llama3.2:3b/raw/llama3.2:3b_raw",
    "cleaned-path": "embeddings/llama3.2:3b/cleaned/llama3.2:3b_cleaned",
}

all_embeddings["gemma2b"] = {
    "model": OllamaEmbeddings(model = "gemma2:2b"),
    "raw-path": "embeddings/gemma2b/raw/gemma2b_raw",
    "cleaned-path": "embeddings/gemma2b/cleaned/gemma2b_cleaned",
}

all_embeddings["mpnetbase"] = {
    "model": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),
    "raw-path": "embeddings/mpnet_base_v2/raw/mpnet_base_v2_raw",
    "cleaned-path": "embeddings/mpnet_base_v2/cleaned/mpnet_base_v2_cleaned",
}

all_embeddings["minilm"] = {
    "model": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-l6-v2"),
    "raw-path": "embeddings/minilm/raw/minilm_raw",
    "cleaned-path": "embeddings/minilm/cleaned/minilm_cleaned",
}

### Create a vector store

In [122]:
from langchain_core.vectorstores import InMemoryVectorStore
import os

# Select the embedding model and the page variant to index. A single
# `variant` switch drives BOTH the cache path and the documents that get
# embedded, so a store cached under ".../raw/..." can never be built from the
# cleaned pages (or vice versa).
model_key = "minilm"
variant = "raw"  # "raw" -> pages, "cleaned" -> cleaned_pages

embeddings = all_embeddings[model_key]["model"]
vector_store_path = all_embeddings[model_key][f"{variant}-path"]
documents = pages if variant == "raw" else cleaned_pages
vector_store = None

if os.path.exists(vector_store_path):
    # Reuse the store cached on disk by a previous run.
    # NOTE: a cached store is not refreshed when new pages have been added;
    # delete the file to force re-embedding.
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)

else:
    # Embed the selected documents and cache the resulting store.
    vector_store = InMemoryVectorStore.from_documents(documents = documents, embedding = embeddings)
    vector_store.dump(vector_store_path)

### Define queries

In [117]:
# Building blocks used to fill in the templated queries below.
products = [
    "Intra oral scanner",
    "3D printer",
]
producers = [
    "Dentsply Sirona",
    "Kavo",
    "3M",
    "GC",
    "Ivoclar",
    "Straumann",
    "Kulzer",
    "Voco",
]
intervals = [1, 2, 3, 4]
countries = [
    "Italy",
    "Germany",
    "Spain",
    "UK",
    "United Kingdom",
    "Brazil",
]

# Fixed-text queries (indices 0-4).
all_queries = [
    "Trend of inflation in the dental sector between 2021, 2022, and the first half of 2023",
    "Dental product brands that offer the best value for money according to dentists",
    "Which are the most relevant dental brands?",
    "Which are the most recommended products?",
    "What are the preferred purchasing channels in different countries?",
]

# Templated queries (indices 5-9), filled from the lists above.
all_queries.extend([
    f"Evolution of {products[0]} adoption",
    f"Which is the country where {products[0]} is most successful?",
    f"Evolution of {producers[0]}'s loyalitization capability",
    f"Evolution of {products[0]}'s market in the last {intervals[2]} years",
    f"Difference in {products[1]} adoption between {countries[0]} and {countries[4]}",
])

### Retrieve documents

In [132]:
# Query to run against the vector store.
query = all_queries[9]

# (document, score) pairs for the 10 entries most similar to the query.
docs_relevances = vector_store.similarity_search_with_score(query, k = 10)

print("Query: " + query + "\n")
# Print all scores first, then the matching page contents in the same order.
# The pairs are unpacked into `doc` / `score` rather than bound to a variable
# called `tuple`, which would shadow the builtin for the rest of the session.
for doc, score in docs_relevances:
    print(f"{score:.4f}")
for doc, score in docs_relevances:
    print(doc.page_content)

Query: Difference in 3D printer adoption between Italy and United Kingdom

0.6310
0.6115
0.5996
0.5537
0.5432
0.5398
0.5349
0.5334
0.5305
0.5304
3D Printers penetration
Do you have a 3D printer in your dental practice?
29
Past surveys carried out by Key-Stone on 3D printing suggest a quick development of the adoption of this 
technology in dental practices.
As regards the analysis on the year of purchase, especially in more recent years a high procurement activity 
has been made.
It is worth to underline the higher-than-average penetration in Germany and UK, while a lower-than-average 
intention to buy can be noted in Italy and France.
Sales of 3D printers have shown a steep increase in recent years (75% of purchases have been made since 
2019), mainly due to improvements in precision, production times and function of use. 
CLINIC
Year of purchase
AVERAGE: 2019
Base: 1.061 cases
Until a few years ago 3D printers were used exclusively for printing individual 
impression trays and surgic