### Split documents into pages, with text only

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
import os


folder = "../sources"
files = []

for fname in os.listdir(folder):
    complete_path = os.path.join(folder, fname)
    if os.path.isfile(complete_path):
        files.append(complete_path)

docs = []
for file in files:
    loader = PyMuPDFLoader(file)
    async for doc in loader.alazy_load():
        docs.append(doc)

### Remove first pages and index pages

In [6]:
# Keep a page only when it is not page 0 of its file and its lower-cased text
# does not open with an index / table-of-contents heading.
docs = [
    page
    for page in docs
    if page.metadata["page"] != 0
    and not page.page_content.lower().startswith(("index", "table of contents", "índice"))
]

### Concatenate text with image descriptions

In [7]:
import json


with open("../indexing/image_descriptions/image_descriptions.json", "r") as f:
    image_descriptions = json.load(f)

# Index the pages once by (source path, page number) instead of scanning the
# whole list for every description. ``setdefault`` keeps the first page seen
# for a key, which is the page a linear first-match search would return.
docs_by_location = {}
for doc in docs:
    docs_by_location.setdefault((doc.metadata["source"], doc.metadata["page"]), doc)

for imd in image_descriptions:
    # Single quotes inside the f-string: reusing the outer quote character in
    # a replacement field is only accepted from Python 3.12 (PEP 701).
    file = f"../sources/{imd['file']}.pdf"
    page = imd["page"]
    doc = docs_by_location.get((file, page))
    # Descriptions that match no remaining page are silently ignored.
    if doc is not None:
        doc.page_content += f"\n{imd['image_description']}"


### Clean text

In [8]:
import re


def decapitalize_content(pages: list[str]):
    """Lower-case the text of every page, in place.

    Each element of ``pages`` must expose a writable ``page_content``
    string attribute. Nothing is returned.
    """
    for page in pages:
        lowered = page.page_content.lower()
        page.page_content = lowered


def remove_non_ASCII(pages: list[str]):
    """Delete every run of non-ASCII characters from each page, in place.

    Pages whose ``metadata["keywords"]`` contains ``"non-en"`` are skipped,
    since stripping non-ASCII characters is unsuitable for many non-English
    languages. Nothing is returned.
    """
    for page in pages:
        if "non-en" in page.metadata["keywords"]:
            continue
        stripped = re.sub(r"[^\x00-\x7F]+", "", page.page_content)
        page.page_content = stripped


def remove_bullets(pages: list[str]):
    """Remove list markers from the start of lines, in place.

    Two kinds of marker are removed from each page's ``page_content``:

    * a symbol bullet (one of ``→•▪-*✔➢●✗``) at the very start of a line,
      together with the whitespace that follows it;
    * a numbered marker such as ``1.`` at the start of a line (optionally
      preceded by spaces/tabs) when it is followed by a letter.

    The numbered marker is only matched at the start of a line. Matching it
    anywhere in the text would also delete ordinary sentence-ending numbers
    (e.g. the year in "in 2018. The market ...").

    Nothing is returned.
    """
    for p in pages:
        p.page_content = re.sub(r"^[→•▪\-*✔➢●✗]\s*", "", p.page_content, flags=re.MULTILINE)
        # Group 1 re-inserts the leading indentation so only the "N." token
        # itself is dropped.
        p.page_content = re.sub(
            r"^([ \t]*)\d+\.(?=\s*[a-zA-Z])", r"\1", p.page_content, flags=re.MULTILINE
        )


def remove_escape(pages: list[str]):
    """Collapse every run of whitespace in each page into one space, in place.

    Newlines, tabs and repeated spaces all count as whitespace; leading and
    trailing whitespace is dropped entirely. Nothing is returned.
    """
    for page in pages:
        words = page.page_content.split()
        page.page_content = " ".join(words)


# Apply the cleaning steps in a fixed order. remove_bullets has to run before
# remove_escape: it matches markers at line starts, and remove_escape
# collapses the newlines that delimit those lines.
for cleaning_step in (remove_non_ASCII, decapitalize_content, remove_bullets, remove_escape):
    cleaning_step(docs)

### Chunking

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import os


def merge_and_split(docs: list[Document], splitter, page_separator: str = " ") -> list[Document]:
    """Merge the pages of each source file and split the result into chunks.

    Pages are grouped by ``metadata["source"]`` (groups and pages keep their
    order of appearance). The pages of a group are joined into one text,
    which is then cut with ``splitter.split_text``.

    Args:
        docs: Page-level documents; each needs ``metadata["source"]``.
        splitter: Any object exposing ``split_text(str) -> list[str]``.
        page_separator: Text inserted between consecutive pages. It defaults
            to a single space so the last word of one page is not fused with
            the first word of the next when pages carry no trailing
            whitespace. Pass ``""`` for plain concatenation.

    Returns:
        One ``Document`` per chunk. Its metadata is a copy of the first
        page's metadata of that source, minus the ``"page"`` key.
    """
    from collections import defaultdict

    pages_by_source = defaultdict(list)
    for doc in docs:
        pages_by_source[doc.metadata["source"]].append(doc)

    chunks = []
    for pages in pages_by_source.values():
        # The page number is meaningless once pages are merged.
        metadata = {k: v for k, v in pages[0].metadata.items() if k != "page"}
        full_text = page_separator.join(page.page_content for page in pages)
        for piece in splitter.split_text(full_text):
            # Pass a fresh dict per chunk so chunks are not handed the very
            # same metadata object.
            chunks.append(Document(metadata=dict(metadata), page_content=piece))

    return chunks


def save_chunks(pages: list, path: str):
    """Serialize every chunk to its own JSON file inside ``path``.

    The directory is created when missing, including any missing parents.
    Chunk number ``i`` (1-based) is converted with ``langchain_core.load.dumpd``
    and written as JSON to ``<path>/chunk_<i>``, overwriting an existing file.

    Args:
        pages: Chunks to save, in output order.
        path: Target directory.
    """
    from langchain_core.load import dumpd
    import json
    import os

    # exist_ok avoids the check-then-create race of exists() + mkdir(), and
    # makedirs (unlike mkdir) also works for nested target directories.
    os.makedirs(path, exist_ok=True)
    for number, page in enumerate(pages, start=1):
        full_path = os.path.join(path, f"chunk_{number}")
        with open(full_path, "w") as ser_file:
            json.dump(dumpd(page), ser_file)


# Chunk length in characters; consecutive chunks overlap by 10% of it.
chunk_size = 2_000
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    # Integer division keeps the overlap an int character count
    # (chunk_size/10 produced the float 200.0).
    chunk_overlap = chunk_size // 10,
    # NOTE(review): merge_and_split calls splitter.split_text, which returns
    # plain strings - confirm whether add_start_index has any effect here.
    add_start_index = True,
)

# Replace the page-level documents with per-source chunks.
docs = merge_and_split(docs, text_splitter)


### Set up the agent for QA generation

In [1]:
from langchain_openai import AzureChatOpenAI
import os
import getpass


# Ask for the API key interactively so it is never stored in the notebook,
# and expose it through the environment for the rest of the session.
os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Chat model used to generate the question/answer pairs.
# NOTE(review): azure_endpoint holds a full chat-completions URL (deployment
# path and api-version included) while azure_deployment and api_version are
# also passed separately - confirm whether the client expects only the
# resource base URL here.
llm = AzureChatOpenAI(
    azure_endpoint = "https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    api_key = os.environ["OPENAI_API_KEY"],
    api_version = "2024-08-01-preview",
    azure_deployment = "gpt-4o",
    # Upper bound on the length of each generated completion.
    max_tokens = 256
)


In [38]:
# Prompt template for generating one question/answer pair from a chunk.
# Filled with str.format; placeholders: {context}, {country}, {year}, {target}.
# The model is asked to reply with "Question: ..." and "Answer: ..." lines,
# which is the format the generation loop splits on.
# The string is not raw, so each "\n" below is a real newline in addition to
# the line break that follows it.
QA_generation_prompt = """
Your task is to write a question and an answer given a context, country, year and, optionally, a target, which can be clinic (dental practise) and/or lab (laboratory).
Your question should be answerable with a one-sentenced answer from the context.
Your question should be formulated in the same style as questions users could ask in a search engine.
This means that your question MUST NOT mention something like "according to the passage" or "context".
Your question should be enriched with the country and year provided, just like "What is the most quoted brand in France in 2018?". 
Additionally, if the target is either clinic or lab (but not both) you should mention it in your question.
All words of both the answer and the question must be in english.
When your question asks for the best item out of an ensemble, instead of picking just the best, try to include a leaderboard of at least three items in your answer.

Provide your answer as follows:

Output:::
Question: (your question)
Answer: (your answer to the question)

Here is the context, country, year and target.

Context: {context}\n
Country: {country}\n
Year: {year}\n
Target: {target}\n
Output:::"""

### Generate QA pairs

In [None]:
import json
import random
from tqdm.auto import tqdm
from time import sleep


# Maximum number of question/answer pairs to generate (one per sampled chunk).
N_GENERATIONS = 100
outputs = []

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of available chunks.
sample_size = min(N_GENERATIONS, len(docs))

for doc in tqdm(random.sample(docs, sample_size)):
    context = doc.page_content
    # metadata["keywords"] holds the body of a JSON object without the
    # surrounding braces, e.g. '"country": "...", "year": ...'.
    keywords = json.loads("{" + doc.metadata["keywords"] + "}")
    country = keywords["country"]
    year = keywords["year"]
    target = keywords.get("target")  # optional keyword; None when absent

    prompt = QA_generation_prompt.format(context = context,
                                         country = country,
                                         year = year,
                                         target = target)
    output_QA = llm.invoke(prompt).content
    # The reply is expected to look like "Question: ...\nAnswer: ...".
    # Strip the newline/whitespace left around each part by the split.
    question = output_QA.split("Question: ")[-1].split("Answer: ")[0].strip()
    answer = output_QA.split("Answer: ")[-1].strip()
    outputs.append(
        {
            "context": context,
            "question": question,
            "answer": answer,
            "source_doc": doc.metadata["source"],
        }
    )
    # Pause between requests to space out calls to the model endpoint.
    sleep(5)

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open("../evaluation/dataset/all_QA_with_images_2.json", "w", encoding = "utf-8") as f:
    json.dump(outputs, f, indent = 4, ensure_ascii = False)


In [1]:
# Maps a source PDF path to the '"target": ...' JSON fragment that is appended
# to the "keywords" metadata string of every stored entry from that source.
# Values used here: "clinic", "lab", or "all".
edits = {
    "../sources/BEQ_2301_OVERALL_multi.pdf": "\"target\": \"clinic\"",
    "../sources/OMNI_DIGITAL_EU_15_CLI_.pdf": "\"target\": \"clinic\"",
    "../sources/OMNI_DIGITAL_EU_15_CLI_LAB_Executive_Summary_.pdf": "\"target\": \"all\"",
    "../sources/OMNI_DIGITAL_EU_15_LAB_.pdf": "\"target\": \"lab\"",
    "../sources/OMNI_DIGITAL_EU_21_CLI_LAB_INTEGRATED_.pdf": "\"target\": \"all\"",
    "../sources/OMNI_DIGITAL_ITA_19_CLI_LAB_INTEGRATED_.pdf": "\"target\": \"all\"",
    "../sources/OMNI_DIGITAL_SPA_19_CLI_.pdf": "\"target\": \"clinic\"",
    "../sources/OMNI_DIGITAL_SPA_19_CLI_LAB_INTEGRATED_spagnolo.pdf": "\"target\": \"all\"",
    "../sources/OMNI_DIGITAL_SPA_19_LAB_.pdf": "\"target\": \"lab\"",
    "../sources/OMNI-DIGITAL_ITA_17_CLI_.pdf": "\"target\": \"clinic\"",
    "../sources/OMNI-OMNI-DIGITAL_ITA_23_CLI_.pdf": "\"target\": \"clinic\"",
}

import json

# Every distinct chunking configuration exactly once. "page_chunking" does not
# depend on chunk size or overlap, so looping it over the size/overlap grid
# would rewrite the same file repeatedly and append the fragment each time.
chunkings = ["page_chunking"] + [
    f"{size}_{overlap}"
    for size in [256, 384]
    for overlap in [0, 20, 50, 100]
]

for model in ["minilm_l6", "minilm_l12", "mpnet_base_v2", "multilingual", "text_embedding_3_large"]:
    for chunking in chunkings:
        path = f"../indexing/models/Text+Images/{model}/{chunking}/{chunking}_{model}"
        with open(path, "r") as fr:
            vstore = json.load(fr)

        for entry in vstore.values():
            metadata = entry["metadata"]
            edit = edits.get(metadata["source"])
            # Skip sources without an edit, and entries that already carry a
            # "target" keyword so re-running this cell does not duplicate it.
            if edit is None or '"target"' in metadata["keywords"]:
                continue
            # Only put a comma in front when there is something to separate
            # from; a leading comma would make the keywords unparsable.
            if metadata["keywords"]:
                metadata["keywords"] += f",{edit}"
            else:
                metadata["keywords"] = edit

        with open(path, "w") as fw:
            json.dump(vstore, fw)


In [None]:
# Print the keywords string of every entry in one vector store file, as a
# visual check of the stored metadata.
path = "../indexing/models/Text+Images/minilm_l12/384_100/384_100_minilm_l12"
with open(path, "r") as f:
    vstore = json.load(f)

for entry in vstore.values():
    print(entry["metadata"]["keywords"])
