### Lower case document content

In [9]:
def decapitalize_content(pages: list):
    """Lower-case the text of every page, in place.

    Args:
        pages: Objects exposing a writable string attribute
            ``page_content``. Plain strings are not accepted: the function
            reads and reassigns ``page_content`` on each element.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    for page in pages:
        page.page_content = page.page_content.lower()

### Removes non-ASCII characters (TODO: apply only if the document language is English)

In [10]:
import re

def remove_non_ASCII(pages: list):
    """Delete every non-ASCII character from each page, in place.

    Not suitable for most non-English languages, which rely on characters
    outside the ASCII range (accented letters, for example, are dropped).

    Args:
        pages: Objects exposing a writable string attribute
            ``page_content``.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    # Compiled once instead of being looked up for every page.
    non_ascii = re.compile(r"[^\x00-\x7F]+")

    for page in pages:
        page.page_content = non_ascii.sub("", page.page_content)

### Removes bullet and number markers from lists

In [11]:
import re

def remove_bullets(pages: list):
    """Strip bullet glyphs and list numbering from every page, in place.

    Two passes are applied to each page:

    1. A bullet glyph at the start of a line, together with the whitespace
       that follows it, is removed.
    2. A number followed by a dot (``1.``, ``12.``) at the start of a line
       is removed when the next non-blank character is an ASCII letter.
       Leading indentation is kept.

    The numbering pass is anchored to the start of a line so that numbers
    closing a sentence ("... in 2023. The market ...") are left alone.
    A sentence-ending number that happens to sit at the start of a line is
    still treated as list numbering.

    Args:
        pages: Objects exposing a writable string attribute
            ``page_content``.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    bullet_marker = re.compile(r"^[→•\-*✔●✗]\s*", flags=re.MULTILINE)
    number_marker = re.compile(r"^([ \t]*)\d+\.(?=\s*[a-zA-Z])", flags=re.MULTILINE)

    for page in pages:
        without_bullets = bullet_marker.sub("", page.page_content)
        # \1 puts the captured indentation back; only the "N." marker is dropped.
        page.page_content = number_marker.sub(r"\1", without_bullets)

### Collapses consecutive whitespace characters (spaces, tabs, newlines) into a single space

In [12]:
def remove_escape(pages: list):
    """Collapse every run of whitespace into a single space, in place.

    Newlines, tabs and repeated spaces all count as whitespace. Leading and
    trailing whitespace is removed entirely, because the text is split on
    whitespace and re-joined with single spaces.

    Args:
        pages: Objects exposing a writable string attribute
            ``page_content``.

    Returns:
        None. Each element's ``page_content`` is overwritten.
    """
    for page in pages:
        words = page.page_content.split()
        page.page_content = " ".join(words)

### Save processed chunks in JSON format

In [11]:
from langchain_core.load import dumpd
import json
import os

def save_chunks(pages: list, path: str):
    """Serialize each page to its own JSON file inside ``path``.

    Files are named ``chunk_1``, ``chunk_2``, ... following the order of
    ``pages``. Existing files with the same names are overwritten.

    Args:
        pages: Objects accepted by ``langchain_core.load.dumpd``.
        path: Destination directory. It is created, together with any
            missing parent directories, when it does not exist yet.

    Returns:
        None.
    """
    # makedirs rather than mkdir: the notebook passes nested paths such as
    # "chunkings/No OCR/384_100", and mkdir fails when a parent is missing.
    os.makedirs(path, exist_ok=True)

    for number, page in enumerate(pages, start=1):
        full_path = path + "/" + "chunk_" + str(number)
        with open(full_path, "w", encoding="utf-8") as ser_file:
            json.dump(dumpd(page), ser_file)

### Load processed chunks (raw, cleaned)

In [10]:
import os
import json
from langchain_core.load import load

def load_chunks(path: str):
    """Load every serialized chunk stored in the directory ``path``.

    Files are read in the order of their numeric suffix (``chunk_1``,
    ``chunk_2``, ..., ``chunk_10``), so the result does not depend on the
    arbitrary order returned by ``os.listdir``. Entries without a numeric
    suffix come last, sorted by name. Sub-directories are skipped.

    Args:
        path: Directory containing one JSON file per chunk, as written by
            ``save_chunks``.

    Returns:
        A list with one deserialized object per file, or an empty list when
        ``path`` (or a file inside it) cannot be found.
    """

    def chunk_order(fname):
        # "chunk_12" -> (0, 12, ...); anything else sorts after the numbered files.
        suffix = fname.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), fname)
        return (1, 0, fname)

    pages = []

    try:
        for fname in sorted(os.listdir(path), key=chunk_order):
            f = os.path.join(path, fname)
            if not os.path.isfile(f):
                # Opening a directory would raise; only chunk files are of interest.
                continue
            with open(f, "r") as file:
                pages.append(load(json.load(file)))

    except FileNotFoundError:
        return []

    return pages

### Get all files from the source folder to be processed

In [105]:
import os

# processed_files = [
#     'BEQ_2301_OVERALL_multi.pdf', 
#     'CADCAM_BRA_22_Eng.pdf', 
#     'IOS_Report_FR-IT-ES_rev17.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf', 
#     'OMNI_DIGITAL_EU_15_LAB_.pdf', 
#     'OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI-DIGITAL_ITA_17_CLI_.pdf', 
#     'OMNI-DIGITAL_ITA_23_CLI_.pdf', 
#     'OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf', 
#     'OMNI_DIGITAL_SPA_19_LAB_.pdf'
# ]
# Names of files that must be skipped because they were already processed.
processed_files = []

folder = "../sources"

# Full paths of the regular files in `folder` that have not been processed yet.
files = [
    os.path.join(folder, fname)
    for fname in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, fname)) and fname not in processed_files
]

### Create text splitters

In [106]:
from langchain_text_splitters import TokenTextSplitter

# Chunking strategy used by the following cells; must be a key of `all_chunkings`.
chunking_type = "fixed_number"
# Settings passed to TokenTextSplitter for the "fixed_number" strategy.
chunk_size = 384
chunk_overlap = 100

# Maps each chunking strategy to its text splitter.
# "page_chunking" needs no splitter (None): the parsing cell keeps the pages as loaded.
all_chunkings = {
    "page_chunking": None,
    "fixed_number": TokenTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
}

### Parse documents (and the tables within them) into pages

In [107]:
from langchain_community.document_loaders import PyMuPDFLoader

# Folder holding the chunks already serialized for the selected chunking strategy.
path = "chunkings/No OCR/"

if chunking_type == "page_chunking":
    path = path + chunking_type + "/raw"
elif chunking_type == "fixed_number":
    path = path + str(chunk_size) + "_" + str(chunk_overlap)
# Any other value leaves `path` untouched; the splitter lookup below then
# raises KeyError for an unknown strategy.

# Chunks produced by previous runs (empty list when the folder does not exist).
pages = load_chunks(path)

text_splitter = all_chunkings[chunking_type]

# Index new documents
new_pages = []

for file in files:
    loader = PyMuPDFLoader(file, extract_images = False)

    if chunking_type == "page_chunking":
        # Synchronous iteration: a top-level `async for` only works inside
        # IPython and is a SyntaxError in a plain Python module.
        new_pages.extend(loader.lazy_load())

    else:
        new_pages.extend(loader.load_and_split(text_splitter = text_splitter))

In [108]:
# Number of entries collected in `new_pages` by the parsing cell.
print(len(new_pages))

1282


In [109]:
from statistics import mean

# Average length, in characters, of the content of the newly parsed entries.
print(mean(len(entry.page_content) for entry in new_pages))

661.2316692667707


### Pre-process content (text cleaning)

In [21]:
import copy

# Clean a deep copy so the entries in `new_pages` keep their original text.
new_cleaned_pages = copy.deepcopy(new_pages)

# The steps mutate the copy in place and run in this order. Non-ASCII
# stripping runs first, so the non-ASCII bullet glyphs are already gone by
# the time remove_bullets is applied.
for _cleaning_step in (remove_non_ASCII, decapitalize_content, remove_bullets, remove_escape):
    _cleaning_step(new_cleaned_pages)

### Merge new pages with existing ones

In [110]:
# Append the newly parsed entries to the ones loaded from disk (in place).
pages.extend(new_pages)
# cleaned_pages += new_cleaned_pages

### Serialize pages

In [111]:
# Save into the same folder the chunks were loaded from, so that the
# "page_chunking" strategy does not end up in the fixed-size folder.
if chunking_type == "page_chunking":
    path = "chunkings/No OCR/" + chunking_type + "/raw"
else:
    path = "chunkings/No OCR/" + str(chunk_size) + "_" + str(chunk_overlap)
save_chunks(pages, path)

# Update processed files list.
# basename works whatever separator os.path.join used to build the paths.
processed_files.extend(os.path.basename(file_path) for file_path in files)

### Create embedding models

In [18]:
# from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Sub-folder name used in every vector store path below.
# NOTE(review): built from the chunking type and the overlap only; chunk_size
# is not part of it, so runs with different chunk sizes map to the same
# store paths - confirm this is intended.
chunking = chunking_type + "_" + str(chunk_overlap)

# Registry of embedding models. Each entry holds:
#   "model":        the embeddings object,
#   "raw-path":     vector store location for the raw text,
#   "cleaned-path": vector store location for the cleaned text.
# The commented-out entries are Ollama models that are currently disabled.
all_embeddings = {
    # "llama1b": 
    # {
    #     "model": OllamaEmbeddings(model = "llama3.2:1b"),
    #     "raw-path": "models/No OCR/llama3.2:1b/raw/llama3.2:1b_raw",
    #     "cleaned-path": "models/No OCR/llama3.2:1b/cleaned/llama3.2:1b_cleaned"
    # }, 
    # "llama3b": 
    # {
    #     "model": OllamaEmbeddings(model = "llama3.2:3b"),
    #     "raw-path": "models/No OCR/llama3.2:3b/raw/llama3.2:3b_raw",
    #     "cleaned-path": "models/No OCR/llama3.2:3b/cleaned/llama3.2:3b_cleaned"
    # },
    # "gemma2b": 
    # {
    #     "model": OllamaEmbeddings(model = "gemma2:2b"),
    #     "raw-path": "models/No OCR/gemma2b/raw/gemma2b_raw",
    #     "cleaned-path": "models/No OCR/gemma2b/cleaned/gemma2b_cleaned"
    # },
    "mpnetbase": 
    {
        "model": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2"),
        # NOTE(review): unlike the other models, this raw path has no "_raw" suffix.
        "raw-path": "models/No OCR/mpnet_base_v2/" + chunking + "/mpnet_base_v2",
        "cleaned-path": "models/No OCR/mpnet_base_v2/" + chunking + "/mpnet_base_v2_cleaned"
    },
    "minilm-l6": 
    {
        "model": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-l6-v2"),
        "raw-path": "models/No OCR/minilm-l6/" + chunking + "/minilm_l6_raw",
        "cleaned-path": "models/No OCR/minilm-l6/" + chunking + "/minilm_l6_cleaned"
    },
    "minilm-l12":
    {
        "model": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2"),
        "raw-path": "models/No OCR/minilm-l12/" + chunking + "/minilm_l12_raw",
        "cleaned-path": "models/No OCR/minilm-l12/" + chunking + "/minilm_l12_cleaned",
    }
}

### Create a vector store

In [34]:
from langchain_core.vectorstores import InMemoryVectorStore
import os

# Embedding model to use, and the location where its vector store is persisted.
model_name = "mpnetbase"
embeddings = all_embeddings[model_name]["model"]
vector_store_path = all_embeddings[model_name]["cleaned-path"]

# NOTE(review): the store is kept under "cleaned-path" but is built from
# `pages`, which the earlier cells fill with the uncleaned entries - confirm
# which of the two is intended.
if not os.path.exists(vector_store_path):
    # First run for this model: embed the pages, then persist the store.
    vector_store = InMemoryVectorStore.from_documents(documents = pages, embedding = embeddings)
    vector_store.dump(vector_store_path)

else:
    # A persisted store already exists: reuse it instead of embedding again.
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)

### Define queries

In [11]:
# Values interpolated into the parametric queries below.
products = ["Intra oral scanner", "3D printer"]
producers = ["Dentsply Sirona", "Kavo", "3M", "GC", "Ivoclar", "Straumann", "Kulzer", "Voco"]
intervals = [1, 2, 3, 4]
countries = ["Italy", "Germany", "Spain", "UK", "United Kingdom", "Brazil"]

# Benchmark queries: the first five are fixed, the last five are built from
# the lists above by index.
all_queries = [
    "Trend of inflation in the dental sector between 2021, 2022, and the first half of 2023",
    "Dental product brands that offer the best value for money according to dentists",
    "Which are the most relevant dental brands?",
    "Which are the most recommended products?",
    "What are the preferred purchasing channels in different countries?",
    f"Evolution of {products[1]} adoption",
    f"Which is the country where {products[0]} is most successful?",
    f"Evolution of {producers[0]}'s loyalitization capability",
    f"Evolution of {products[0]}'s market in the last {intervals[2]} years",
    f"Difference in {products[1]} adoption between {countries[0]} and {countries[4]}",
]

### Retrieve documents

In [None]:
# Run one benchmark query and show the 4 best matches with their scores.
query = all_queries[5]
docs = vector_store.similarity_search_with_score(query, k = 4)

print("Query: " + query)
for doc in docs:
    # Each result is a (document, score) pair.
    document, score = doc
    print(score)
    print("Sorgente: " + document.metadata["source"] + ", pagina: " + str(document.metadata["page"]))
    print(document.page_content + "\n")

### Evaluate model on a list of queries

In [None]:
from statistics import mean, variance

# Score of the best match for each benchmark query.
all_scores = []

for query in all_queries:
    docs_relevances = vector_store.similarity_search_with_score(query, k = 10)
    best_match = docs_relevances[0]
    all_scores.append(best_match[1])

# Mean and sample variance of the best-match scores.
avg = mean(all_scores)
var = variance(all_scores, xbar=avg)

print(f"{avg:.3f}, {var:.3f}")

### Compute number of different retrieved chunks between two models

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# The two embedding models to compare.
model_name_1 = "minilm-l6"
model_name_2 = "minilm-l12"
embeddings_1 = all_embeddings[model_name_1]["model"]
embeddings_2 = all_embeddings[model_name_2]["model"]

vector_store_path_1 = all_embeddings[model_name_1]["cleaned-path"]
vector_store_path_2 = all_embeddings[model_name_2]["cleaned-path"]

# Both stores must already have been persisted at these paths.
vector_store_1 = InMemoryVectorStore.load(path = vector_store_path_1, embedding = embeddings_1)
vector_store_2 = InMemoryVectorStore.load(path = vector_store_path_2, embedding = embeddings_2)

# Over all queries: how many of the top 4 results of model 1 are missing
# from the top 4 results of model 2.
diff = 0

for query in all_queries:

    chunks_1 = vector_store_1.similarity_search(query, k = 4)
    chunks_2 = vector_store_2.similarity_search(query, k = 4)

    # A result is identified by its (source, page) pair.
    # NOTE(review): chunks that share a source and page are indistinguishable
    # here - confirm this is acceptable for fixed-size chunking.
    chunks_mapped_1 = [(chunk.metadata["source"], chunk.metadata["page"]) for chunk in chunks_1]
    chunks_mapped_2 = [(chunk.metadata["source"], chunk.metadata["page"]) for chunk in chunks_2]

    diff += sum(1 for chunk_key in chunks_mapped_1 if chunk_key not in chunks_mapped_2)

print(diff)