### Save processed chunks in JSON format

In [2]:
from langchain_core.load import dumpd
import json
import os

def save_chunks(pages: list, path: str):
    """Serialize every item of ``pages`` to its own JSON file under ``path``.

    Each item is converted with ``dumpd`` and written to a file named
    ``chunk_<i>``, where ``i`` is the 1-based position of the item in
    ``pages``. A file that already has that name is overwritten.

    Args:
        pages: Objects accepted by ``dumpd``.
        path: Target directory. It is created, together with any missing
            parent directories, when it does not exist yet.
    """
    # makedirs rather than mkdir: the targets used in this notebook are
    # nested (e.g. "chunkings/No OCR/384_100") and mkdir raises
    # FileNotFoundError when a parent directory is missing.
    os.makedirs(path, exist_ok=True)

    for number, page in enumerate(pages, start=1):
        full_path = os.path.join(path, f"chunk_{number}")
        with open(full_path, "w") as ser_file:
            json.dump(dumpd(page), ser_file)

### Load processed chunks (raw, cleaned)

In [1]:
import os
import json
from langchain_core.load import load

def load_chunks(path: str):
    """Load every serialized chunk stored in the directory ``path``.

    Each regular file in ``path`` is parsed as JSON and revived with
    ``load``. Files are read in a deterministic order: names ending in
    ``_<number>`` (the ``chunk_<i>`` files written by ``save_chunks``) come
    first, sorted numerically, followed by any other name alphabetically.

    Args:
        path: Directory that contains the serialized chunks.

    Returns:
        The list of revived objects, or an empty list when ``path`` does not
        exist.
    """
    # Only a missing directory means "nothing saved yet"; any other problem
    # (corrupt JSON, permission error, ...) is allowed to surface.
    try:
        names = os.listdir(path)
    except FileNotFoundError:
        return []

    def chunk_order(fname):
        # Numeric ordering so that "chunk_10" comes after "chunk_2".
        suffix = fname.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), fname)
        return (1, 0, fname)

    pages = []
    for fname in sorted(names, key=chunk_order):
        f = os.path.join(path, fname)
        # Skip sub-directories (e.g. ".ipynb_checkpoints"); open() would
        # raise on them.
        if not os.path.isfile(f):
            continue
        with open(f, "r") as file:
            # NOTE(review): `load` revives objects from serialized data;
            # presumably it should only be fed files written by this
            # notebook - confirm against the langchain_core documentation.
            pages.append(load(json.load(file)))

    return pages

### Alter metadata (this step is document-specific and may change)

In [None]:
import pikepdf

# This document is written in Spanish, so it is flagged as non-English in its
# metadata keywords.
# NOTE(review): pikepdf metadata keys are usually namespaced (for example
# "pdf:Keywords") - confirm that the bare "keywords" key resolves as intended.
spanish_pdf_path = "../sources/OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf"
pdf = pikepdf.Pdf.open(spanish_pdf_path, allow_overwriting_input=True)

with pdf.open_metadata() as meta:
    # Tag the file only when its keywords entry is still empty.
    edited = meta["keywords"] == ""
    if edited:
        meta["keywords"] = "non-en"

# The file is rewritten in place, outside the metadata block, and only when
# the keywords were actually changed.
if edited:
    pdf.save(spanish_pdf_path)

### Get all files from the source folder to be processed

In [3]:
import os


# processed_files = [
#     'BEQ_2301_OVERALL_multi.pdf', 
#     'CADCAM_BRA_22_Eng.pdf', 
#     'IOS_Report_FR-IT-ES_rev17.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_.pdf', 
#     'OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf', 
#     'OMNI_DIGITAL_EU_15_LAB_.pdf', 
#     'OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI-DIGITAL_ITA_17_CLI_.pdf', 
#     'OMNI-DIGITAL_ITA_23_CLI_.pdf', 
#     'OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_.pdf', 
#     'OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf', 
#     'OMNI_DIGITAL_SPA_19_LAB_.pdf'
# ]
# Names of files that must be skipped because they were already processed.
processed_files = []

folder = "../sources"

# Full paths of every regular file in `folder` whose name is not listed in
# `processed_files`.
files = [
    os.path.join(folder, fname)
    for fname in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, fname)) and fname not in processed_files
]

### Create text splitters

In [40]:
from langchain_text_splitters import TokenTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings


# Chunking strategy selected for this run; must be a key of `all_chunkings`.
chunking_type = "fixed_number"

# Parameters of the token-based splitter.
chunk_size, chunk_overlap = 384, 100

# Parameters of the semantic splitter.
semantic_chunking_type = "gradient"
semantic_chunking_model = "sentence-transformers/all-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=semantic_chunking_model)

token_splitter = TokenTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type=semantic_chunking_type,
)

# Strategy name -> splitter. "page_chunking" has no splitter (None).
all_chunkings = {
    "page_chunking": None,
    "fixed_number": token_splitter,
    "semantic": semantic_splitter,
}

### Parse documents (and the tables within them) into pages

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
from tqdm.auto import tqdm


async def split_page_content(files):
    """Load every file with ``PyMuPDFLoader`` and collect what it yields.

    Args:
        files: Iterable of file paths handed to ``PyMuPDFLoader``.

    Returns:
        One flat list with the documents yielded by ``alazy_load`` for each
        file, in the order the files were given.
    """
    collected = []
    for source in files:
        collected.extend(
            [page async for page in PyMuPDFLoader(source).alazy_load()]
        )
    return collected


async def merge_page_content_and_split(files, splitter):
    """Merge each file's pages into one text and split it into chunks.

    For every file, the ``page_content`` of all documents yielded by
    ``PyMuPDFLoader`` is concatenated (no separator is inserted), the result
    is passed to ``splitter.split_text``, and each returned piece becomes a
    ``Document``.

    Args:
        files: Iterable of file paths handed to ``PyMuPDFLoader``.
        splitter: Object exposing ``split_text(text)`` that returns an
            iterable of strings.

    Returns:
        A list of ``Document`` objects, grouped by file in input order. Each
        one carries a copy of the first page's metadata without the ``page``
        key.
    """
    merged = []  # one (metadata, full_text) pair per file

    for file in files:
        loader = PyMuPDFLoader(file)
        # An empty dict (not an empty string) is the fallback for a file that
        # yields no pages, so the metadata always has the expected type.
        metadata = {}
        page_texts = []
        async for doc in loader.alazy_load():
            if not page_texts:
                # First page: keep its metadata, dropping the page number,
                # which no longer applies to the merged text.
                metadata = {k: v for k, v in doc.metadata.items() if k != "page"}
            page_texts.append(doc.page_content)
        # Join once instead of growing a string with += on every page.
        merged.append((metadata, "".join(page_texts)))

    docs = []
    for metadata, text in tqdm(merged):
        # Each chunk gets its own metadata dict so that editing one chunk's
        # metadata later cannot affect its siblings.
        docs.extend(
            Document(metadata=dict(metadata), page_content=piece)
            for piece in splitter.split_text(text)
        )

    return docs


path = "chunkings/No OCR/"

if chunking_type == "page_chunking":
    path = path + chunking_type + "/cleaned" 
elif chunking_type == "semantic":
    path = f"{path}{chunking_type}_{semantic_chunking_type}_minilm_l12"
else:
    path = f"{path}{chunk_size}_{chunk_overlap}"
    
pages = load_chunks(path)

text_splitter = all_chunkings[chunking_type]

# Index new documents  
if chunking_type == "page_chunking":
    new_pages = await split_page_content(files)

else:
    new_pages = await merge_page_content_and_split(files, text_splitter)


In [None]:
# Number of previously saved chunks, then number of newly parsed chunks,
# one per line; the cell finally displays the chunk directory.
print(len(pages), len(new_pages), sep="\n")
path

### Pre-process content (text cleaning)

In [60]:
import re


def decapitalize_content(pages: "list[Document]"):
    """Lower-case the ``page_content`` of every page, in place.

    Args:
        pages: Objects exposing a string ``page_content`` attribute. (The
            parameter is not a list of plain strings: the function reads and
            assigns ``page_content`` on each item.)

    Returns:
        None. The items of ``pages`` are modified directly.
    """
    for page in pages:
        page.page_content = page.page_content.lower()


def remove_non_ASCII(pages: "list[Document]"):
    """Strip non-ASCII characters from pages not flagged as non-English.

    A page is left untouched when its ``keywords`` metadata contains
    ``"non-en"``, because many non-English languages rely on non-ASCII
    characters. Pages whose ``keywords`` entry is missing or ``None`` are
    treated as not flagged and are cleaned.

    Args:
        pages: Objects exposing ``page_content`` (str) and ``metadata``
            (dict).

    Returns:
        None. The items of ``pages`` are modified directly.
    """
    for page in pages:
        # .get(...) or "" avoids a KeyError when the key is absent and a
        # TypeError when its value is None.
        keywords = page.metadata.get("keywords") or ""
        if "non-en" not in keywords:
            page.page_content = re.sub(r"[^\x00-\x7F]+", "", page.page_content)


def remove_bullets(pages: "list[Document]"):
    """Remove list markers from the start of lines, in place.

    Two kinds of markers are removed:

    * a bullet symbol at the very start of a line, together with the
      whitespace that follows it;
    * a numbered-item marker (``"1."``, ``"12."``, ...) at the start of a
      line, optionally preceded by spaces or tabs, when it is followed by a
      letter.

    The numbered-item pattern is anchored to the start of the line so that a
    number ending a sentence in running text (``"... in 2023. then ..."``) is
    kept.

    Args:
        pages: Objects exposing a string ``page_content`` attribute.

    Returns:
        None. The items of ``pages`` are modified directly.
    """
    for page in pages:
        without_symbols = re.sub(
            r"^[→•▪\-*✔➢●✗]\s*", "", page.page_content, flags=re.MULTILINE
        )
        # The lookahead keeps the whitespace after the marker, so only the
        # digits and the dot are dropped.
        page.page_content = re.sub(
            r"^[ \t]*\d+\.(?=\s*[a-zA-Z])", "", without_symbols, flags=re.MULTILINE
        )


def remove_escape(pages: "list[Document]"):
    """Collapse every run of whitespace into a single space, in place.

    Newlines, tabs and repeated spaces all count as whitespace; leading and
    trailing whitespace is removed entirely.

    Args:
        pages: Objects exposing a string ``page_content`` attribute.

    Returns:
        None. The items of ``pages`` are modified directly.
    """
    for page in pages:
        # str.split() with no argument splits on any whitespace run and drops
        # empty pieces, so joining with " " normalizes the spacing.
        page.page_content = " ".join(page.page_content.split())


# Apply the cleaning steps to the newly parsed pages, in this exact order:
# non-ASCII removal, lower-casing, bullet removal, whitespace collapsing.
for clean_step in (remove_non_ASCII, decapitalize_content, remove_bullets, remove_escape):
    clean_step(new_pages)

### Merge new pages with existing ones and serialize

In [61]:
# Append the cleaned new pages to the ones loaded earlier, then write the
# whole collection back to the chunk directory.
pages.extend(new_pages)
save_chunks(pages, path)

# Update processed files list
# processed_files += [file.replace(folder + "/", "") for file in files]

### Create embedding models

In [42]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import AzureOpenAIEmbeddings
import getpass


# Ask for the API key interactively and expose it through the environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass()

# NOTE(review): `azure_endpoint` below is a full request URL (deployment path
# and api-version included); presumably the client expects the resource base
# URL with the deployment passed separately - confirm against the
# AzureOpenAIEmbeddings documentation.
azure_text_embedding_3_large = AzureOpenAIEmbeddings(
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/text-embedding-3-large-2/embeddings?api-version=2023-05-15",
    api_key=os.environ["OPENAI_API_KEY"],
    model="TextEmbedding3LargeDeployment",
    api_version="2023-05-15",
    show_progress_bar=True,
    chunk_size=384,
)

# Model name -> embedding model. Alternatives that are currently disabled:
#   "llama3.2:1b":   OllamaEmbeddings(model = "llama3.2:1b")
#   "llama3.2:3b":   OllamaEmbeddings(model = "llama3.2:3b")
#   "gemma2b":       OllamaEmbeddings(model = "llama3.2:1b")
#       NOTE(review): the "gemma2b" entry points at a llama model - confirm.
#   "mpnet_base_v2": HuggingFaceEmbeddings(model_name = "sentence-transformers/all-mpnet-base-v2")
#   "minilm_l6":     HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
#   "minilm_l12":    HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L12-v2")
#   "multilingual":  HuggingFaceEmbeddings(model_name = "intfloat/multilingual-e5-large")
all_embeddings = {
    "text_embedding_3_large": azure_text_embedding_3_large,
}

### Create a vector store

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
import os


path = "chunkings/No OCR/"
model_name = "text_embedding_3_large"

# `chunking` names the vector-store sub-folder; `path` is the directory the
# serialized chunks are read from.
if chunking_type == "page_chunking":
    chunking = chunking_type
    path = f"{path}{chunking_type}/cleaned"

elif chunking_type == "semantic":
    chunking = f"{chunking_type}_{semantic_chunking_type}_{model_name}"
    # The chunk directory must match the one the chunks were saved to by the
    # parsing cell, which tags semantic chunkings with "minilm_l12" (the
    # chunking model), not with the embedding model used here.
    path = f"{path}{chunking_type}_{semantic_chunking_type}_minilm_l12"

else:
    chunking = f"{chunk_size}_{chunk_overlap}"
    path = f"{path}{chunk_size}_{chunk_overlap}"

pages = load_chunks(path)

embeddings = all_embeddings[model_name]
vector_store_path = f"models/No OCR/{model_name}/{chunking}/{model_name}"
vector_store = None

if os.path.exists(vector_store_path):
    # Reuse the store persisted by a previous run.
    vector_store = InMemoryVectorStore.load(path = vector_store_path, embedding = embeddings)

else:
    # Refuse to build (and persist) a store from nothing: an empty store
    # written to disk would be silently reused by every later run.
    if not pages:
        raise FileNotFoundError(f"No chunks found in {path!r}; run the chunking cells first.")
    vector_store = InMemoryVectorStore.from_documents(documents = pages, embedding = embeddings)
    vector_store.dump(vector_store_path)

retriever = vector_store.as_retriever(search_type = "similarity",
                                      search_kwargs = {"k": 4})


### Define queries

In [7]:
# Building blocks used to fill in the templated queries below.
products = ["Intra oral scanner", "3D printer"]
producers = ["Dentsply Sirona", "Kavo", "3M", "GC", "Ivoclar", "Straumann", "Kulzer", "Voco"]
intervals = [1, 2, 3, 4]
countries = ["Italy", "Germany", "Spain", "UK", "United Kingdom", "Brazil"]

scanner, printer = products

# Queries with fixed wording.
fixed_queries = [
    "Trend of inflation in the dental sector between 2021, 2022, and the first half of 2023",
    "Dental product brands that offer the best value for money according to dentists",
    "Which are the most relevant dental brands?",
    "Which are the most recommended products?",
    "What are the preferred purchasing channels in different countries?",
]

# Queries assembled from the lists above.
templated_queries = [
    f"Evolution of {printer} adoption",
    f"Which is the country where {scanner} is most successful?",
    f"Evolution of {producers[0]}'s loyalitization capability",
    f"Evolution of {scanner}'s market in the last {intervals[2]} years",
    f"Difference in {printer} adoption between {countries[0]} and {countries[4]}",
]

all_queries = fixed_queries + templated_queries

### Retrieve documents

In [None]:
query = all_queries[6]

# Each hit is a pair: index 0 is the document, index 1 is its score.
docs = vector_store.similarity_search_with_score(query, k=4)

print("Query: " + query)
for doc in docs:
    document, score = doc
    print(score)
    print("Sorgente: " + document.metadata["source"])
    print(document.page_content + "\n")

### Generate response from context

In [None]:
from langchain_ollama import OllamaLLM
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEndpoint


import os

prompt = hub.pull("rlm/rag-prompt")

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# A token that has been stored in a notebook or repository must be treated as
# leaked and revoked. The token is now read from the HF_TOKEN environment
# variable (empty string when unset).
HF_TOKEN = os.environ.get("HF_TOKEN", "")

llm = OllamaLLM(model = "llama3.2:3b")
# Alternative backend, currently disabled:
# llm = HuggingFaceEndpoint(huggingfacehub_api_token = HF_TOKEN,
#                           repo_id = "meta-llama/Llama-3.2-3B",
#                           task = "text-generation")

query = "Which brand has the highest ratio between the total digital awareness and the unaided awareness?"

def format_docs(docs):
    """Join the ``page_content`` of the given documents with blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)

# The retrieved documents, formatted as one string, fill the "context" slot;
# the query itself is passed through unchanged as "question".
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Print the answer piece by piece as the chain streams it.
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

### Evaluate model on a list of queries

In [None]:
from statistics import mean, variance


# Score of the first hit returned for every query.
all_scores = []

for query in all_queries:
    docs_relevances = vector_store.similarity_search_with_score(query, k=4)
    _first_doc, first_score = docs_relevances[0]
    all_scores.append(first_score)

# Mean and sample variance of those scores; the mean is handed to
# `variance` so that it is not computed a second time.
avg = mean(all_scores)
var = variance(all_scores, xbar=avg)

print(f"{avg:.3f}, {var:.3f}")

### Compute number of different retrieved chunks between two models

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore


# NOTE(review): this cell expects `all_embeddings[name]` to be a dict with
# "model" and "cleaned-path" entries and uses hyphenated names; the
# `all_embeddings` defined earlier in this notebook maps names directly to
# embedding objects and uses underscores - confirm which schema is current.
model_name_1 = "minilm-l6"
model_name_2 = "minilm-l12"
embeddings_1 = all_embeddings[model_name_1]["model"]
embeddings_2 = all_embeddings[model_name_2]["model"]

vector_store_path_1 = all_embeddings[model_name_1]["cleaned-path"]
vector_store_path_2 = all_embeddings[model_name_2]["cleaned-path"]

vector_store_1 = InMemoryVectorStore.load(path=vector_store_path_1, embedding=embeddings_1)
vector_store_2 = InMemoryVectorStore.load(path=vector_store_path_2, embedding=embeddings_2)


def _chunk_key(chunk):
    """Identify a retrieved chunk by its (source, page) metadata pair."""
    return (chunk.metadata["source"], chunk.metadata["page"])


# Total number of chunks retrieved by the first model that the second model
# did not retrieve for the same query, summed over all queries.
diff = 0

for query in all_queries:

    chunks_1 = vector_store_1.similarity_search(query, k=4)
    chunks_2 = vector_store_2.similarity_search(query, k=4)

    chunks_mapped_1 = [_chunk_key(c) for c in chunks_1]
    chunks_mapped_2 = [_chunk_key(c) for c in chunks_2]

    diff += sum(1 for key in chunks_mapped_1 if key not in chunks_mapped_2)

print(diff)