### Split documents into pages, with text only

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
import os


folder = "../sources"
files = []

for fname in os.listdir(folder):
    complete_path = os.path.join(folder, fname)
    if os.path.isfile(complete_path):
        files.append(complete_path)

docs = []
for file in files:
    loader = PyMuPDFLoader(file)
    async for doc in loader.alazy_load():
        docs.append(doc)

### Remove first pages and index pages

In [23]:
# Keep a page only if it is not page 0 of its source and its text does not
# open with an index / table-of-contents heading (case-insensitive).
docs = [
    page
    for page in docs
    if page.metadata["page"] != 0
    and not page.page_content.lower().startswith(("index", "table of contents", "índice"))
]

### Concatenate text with image descriptions

In [24]:
import json


# Read as UTF-8 explicitly instead of relying on the platform's locale encoding.
with open("../indexing/image_descriptions/image_descriptions.json", "r", encoding="utf-8") as f:
    image_descriptions = json.load(f)

# Index the pages once by (source path, page number) instead of scanning `docs`
# for every description; setdefault keeps the first page found for a key.
docs_by_location = {}
for doc in docs:
    docs_by_location.setdefault((doc.metadata["source"], doc.metadata["page"]), doc)

for imd in image_descriptions:
    # Single quotes inside the f-strings keep this valid on Python < 3.12.
    file = f"../sources/{imd['file']}.pdf"
    page = imd["page"]
    doc = docs_by_location.get((file, page))
    # Descriptions whose page is no longer in `docs` are skipped.
    if doc is not None:
        doc.page_content += f"\n{imd['image_description']}"


### Clean text

In [25]:
import re


def decapitalize_content(pages: list[str]):
    """Lower-case the text of every page, in place.

    Despite the annotation, each item must expose a writable ``page_content``
    string attribute. Nothing is returned.
    """
    for page in pages:
        lowered = page.page_content.lower()
        page.page_content = lowered


def remove_non_ASCII(pages: list[str]):
    """Strip non-ASCII characters from every page, in place.

    Pages whose ``metadata["keywords"]`` contains the marker ``"non-en"`` are
    left untouched, since many non-English languages use non-ASCII characters.
    A page with no ``keywords`` entry (or a ``None`` value) is treated as not
    marked, and is therefore stripped.

    Despite the annotation, each item must expose ``page_content`` (str) and
    ``metadata`` (dict). Nothing is returned.
    """
    non_ascii = re.compile(r"[^\x00-\x7F]+")

    for p in pages:
        # Missing or None keywords must not raise; treat them as "no marker".
        keywords = p.metadata.get("keywords") or ""
        if "non-en" not in keywords:
            p.page_content = non_ascii.sub("", p.page_content)


def remove_bullets(pages: list[str]):
    """Remove list markers from every page, in place.

    Two kinds of marker are removed, both only at the start of a line:

    * a bullet glyph (``→ • ▪ - * ✔ ➢ ● ✗``) together with the whitespace
      that follows it;
    * a numbering marker such as ``1.`` (optionally indented) when a letter
      follows.

    The numbering pattern is anchored to the line start so that a number
    ending a sentence in running text (``"... in 2023. the market ..."``) is
    kept rather than deleted.

    Despite the annotation, each item must expose a writable ``page_content``
    string attribute. Nothing is returned.
    """
    bullet = re.compile(r"^[→•▪\-*✔➢●✗]\s*", flags=re.MULTILINE)
    numbering = re.compile(r"^[ \t]*\d+\.(?=\s*[a-zA-Z])", flags=re.MULTILINE)

    for p in pages:
        without_bullets = bullet.sub("", p.page_content)
        p.page_content = numbering.sub("", without_bullets)


def remove_escape(pages: list[str]):
    """Collapse every run of whitespace into a single space, in place.

    Newlines and tabs count as whitespace; leading and trailing whitespace is
    dropped. Despite the annotation, each item must expose a writable
    ``page_content`` string attribute. Nothing is returned.
    """
    for page in pages:
        tokens = page.page_content.split()
        page.page_content = " ".join(tokens)


# Cleaning pipeline: each step rewrites `page_content` of the items in `docs`
# in place.
# Order matters: remove_bullets matches markers at line starts (re.MULTILINE),
# while remove_escape collapses all whitespace, newlines included, into single
# spaces, so remove_escape has to stay after remove_bullets.
remove_non_ASCII(docs)
decapitalize_content(docs)
remove_bullets(docs)
remove_escape(docs)

### Chunking

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import os


def merge_and_split(docs: list[Document], splitter):
    """Merge the pages of each source into one text and split it into chunks.

    Pages are grouped by ``metadata["source"]`` in order of appearance. The
    page texts of a group are joined with a single space, so the last word of
    one page is not fused with the first word of the next. The merged text is
    then passed to ``splitter.split_text``.

    Args:
        docs: Page-level documents; each needs ``metadata["source"]``.
        splitter: Any object with a ``split_text(str) -> list[str]`` method.

    Returns:
        A list of ``Document`` chunks. Each chunk carries a copy of the
        metadata of the first page of its source, minus the ``"page"`` key.
    """
    from collections import defaultdict

    pages_by_source = defaultdict(list)
    for doc in docs:
        pages_by_source[doc.metadata["source"]].append(doc)

    chunks = []
    for pages in pages_by_source.values():
        # Chunks span page boundaries, so a page number no longer applies.
        metadata = {k: v for k, v in pages[0].metadata.items() if k != "page"}
        merged_text = " ".join(page.page_content for page in pages)
        chunks.extend(
            # Each chunk gets its own metadata dict, not one shared object.
            Document(metadata=dict(metadata), page_content=piece)
            for piece in splitter.split_text(merged_text)
        )

    return chunks


def save_chunks(pages: list, path: str):
    """Serialize every chunk to its own JSON file inside ``path``.

    The directory, including missing parents, is created when needed. Files
    are named ``chunk_1``, ``chunk_2``, ... in list order, and existing files
    with those names are overwritten.

    Args:
        pages: Objects that ``langchain_core.load.dumpd`` can serialize.
        path: Target directory.
    """
    from langchain_core.load import dumpd
    import json
    import os

    # makedirs handles nested paths and avoids the exists-then-mkdir race.
    os.makedirs(path, exist_ok=True)
    for number, page in enumerate(pages, start=1):
        full_path = os.path.join(path, f"chunk_{number}")
        with open(full_path, "w", encoding="utf-8") as ser_file:
            json.dump(dumpd(page), ser_file)


chunk_size = 2_000
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    # Integer division: `chunk_size / 10` would pass the float 200.0 for a
    # parameter that is a character count.
    chunk_overlap=chunk_size // 10,
    # NOTE(review): merge_and_split only calls split_text and builds the
    # Documents itself, so this flag may have no visible effect - confirm.
    add_start_index=True,
)

# Replace the page-level documents with the chunks built per source file.
docs = merge_and_split(docs, text_splitter)


### Setup agent for QA generation

In [4]:
import os
import getpass
from langchain_openai import AzureChatOpenAI
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings


# Ask for the API key interactively so it is never written into the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass()

# NOTE(review): azure_endpoint below is a full chat-completions URL (deployment
# path plus api-version query) while azure_deployment and api_version are also
# passed separately - confirm this combination resolves to the intended route.
llm = AzureChatOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    azure_endpoint="https://keystone1.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview",
    azure_deployment="gpt-4o",
    api_version="2024-08-01-preview",
    max_tokens=256,
)

# Load the previously saved in-memory index, passing the embedding model used
# for the queries issued against it.
path = "../indexing/models/Text+Images/multilingual/256_20/256_20_multilingual"
vector_store = InMemoryVectorStore.load(
    path=path,
    embedding=HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large"),
)


In [1]:
# Prompt template for single-chunk QA generation. It is filled with str.format
# and takes the placeholders {context}, {country}, {year} and {target}.
# The generation loop splits the model reply on the literal markers
# "Question: " and "Answer: ", so the output-format section must keep them.
# Each "\n" escape adds an extra newline after its field.
# NOTE(review): "should be and answerable" looks like a typo for "should be
# answerable"; left as is because the text is sent to the model verbatim.
QA_generation_prompt_1 = """
Your task is to write a question and an answer given a context, country, year and, optionally, a target, which can be clinic (dental practise) and/or lab (laboratory).
Your question should be and answerable with a one-sentenced factual answer from the context.
This means that your question MUST NOT mention something like "according to the passage/map/picture/graph/chart/line" or "context".
Identify, if any, a country from the context and refer to it when you formulate the question, otherwise use the country provided.
Alongside the country, enrich the question with the year provided, for example: "Which is the top brand in France in 2023?"
Additionally, if the target is either clinic or lab (but not both) you should mention it in your question.
All words of both the answer and the question must be in english.

Provide your answer as follows:

Output:::
Question: (your question)
Answer: (your answer to the question)

Here is the context, country, year and target.

Context: {context}\n
Country: {country}\n
Year: {year}\n
Target: {target}\n
Output:::"""

In [8]:
# Prompt template for comparison questions. It is filled with str.format and
# takes the placeholders {context}, {var1}, {var2}, {constant1}, {constant2}.
# generate_qa splits the model reply on the literal markers "Question: " and
# "Answer: ", so the output-format section must keep them.
# NOTE(review): generate_qa discards replies equal to "idk.", but nothing in
# this prompt tells the model to answer that way - confirm whether an
# instruction is missing. The text also says "var_1"/"var_2" and "costants"
# while the fields are labelled Var1/Var2/Constant1/Constant2.
QA_generation_prompt_2 = """
Your task is to write a question and an answer given a context, two vars and a constant.
Your question should be and answerable with a one-sentenced factual answer from the context.
This means that your question MUST NOT mention something like "according to the passage/map/picture/graph/chart/line/context".
In your question you should ask to make a comparison between data in the context between var_1 and var_2 provided. 
You should enrich your question with the costants and the vars provided if they are not None.
All words of both the answer and the question must be in english.

Provide your answer as follows:

Output:::
Question: (your question)
Answer: (your answer to the question)

Here is the context, var1, var2, constant1, constant2.

Context: {context}\n
Var1: {var1}\n
Var2: {var2}\n
Constant1: {constant1}\n
Constant2: {constant2}\n
Output:::"""

### Generate first part of dataset

In [None]:
import json
import random
from tqdm.auto import tqdm
from time import sleep


N_GENERATIONS = 15
outputs = []

# random.sample raises ValueError when asked for more items than exist, so cap
# the sample size at the number of available chunks.
for doc in tqdm(random.sample(docs, min(N_GENERATIONS, len(docs)))):
    context = doc.page_content
    # The "keywords" metadata is parsed as the inside of a JSON object, e.g.
    # '"country": "France", "year": 2023'; "target" is optional.
    keywords = json.loads("{" + doc.metadata["keywords"] + "}")
    country = keywords["country"]
    year = keywords["year"]
    target = keywords.get("target")

    prompt = QA_generation_prompt_1.format(
        context=context,
        country=country,
        year=year,
        target=target,
    )
    output_QA = llm.invoke(prompt).content
    # Strip the newlines/spaces left around the pieces by the "Question: " /
    # "Answer: " split.
    question = output_QA.split("Question: ")[-1].split("Answer: ")[0].strip()
    answer = output_QA.split("Answer: ")[-1].strip()
    outputs.append(
        {
            "context": context,
            "question": question,
            "answer": answer,
            "source_doc": doc.metadata["source"],
        }
    )
    # Pause between requests (presumably to stay under the API rate limit).
    sleep(5)

# ensure_ascii=False writes non-ASCII characters as-is, so the encoding is
# pinned to UTF-8 rather than left to the platform default.
with open("../evaluation/dataset/all_QA_with_images_2.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f, indent=4, ensure_ascii=False)


### Generate second part of dataset

In [2]:
import random
import json
from itertools import product
from time import sleep


def random_matcher(const_list1, const_list2, var_list, n_iter):
    """Draw random combinations of constants with an ordered pair of variables.

    Each ``*_list`` argument is a dict whose ``"values"`` entry holds the
    candidate values; other keys are ignored here.

    Args:
        const_list1: First group of constants.
        const_list2: Optional second group of constants, or ``None``.
        var_list: Group from which two *different* values are paired; both
            orders of a pair are candidates.
        n_iter: Number of combinations to draw, without replacement.

    Returns:
        A list of ``n_iter`` tuples: ``(const, var_a, var_b)`` when
        ``const_list2`` is ``None``, otherwise
        ``(const1, const2, var_a, var_b)`` with ``const1 != const2``.

    Raises:
        ValueError: If ``n_iter`` exceeds the number of possible combinations
            (raised by ``random.sample``).
    """
    var_pairs = [
        (first, second)
        for first, second in product(var_list["values"], repeat=2)
        if first != second
    ]

    if const_list2 is None:
        combos = [
            (const, first, second)
            for const, (first, second) in product(const_list1["values"], var_pairs)
        ]
    else:
        const_pairs = [
            (c1, c2)
            for c1, c2 in product(const_list1["values"], const_list2["values"])
            if c1 != c2
        ]
        combos = [
            (c1, c2, first, second)
            for (c1, c2), (first, second) in product(const_pairs, var_pairs)
        ]

    return random.sample(combos, n_iter)


def generate_qa(docs, var1, var2, const1, const2):
    """Ask the LLM for one comparison question/answer pair over ``docs``.

    The page contents of ``docs`` are joined with spaces into a single
    context, inserted into ``QA_generation_prompt_2`` together with the
    variables and constants, and sent to ``llm``.

    Args:
        docs: Documents providing ``page_content`` and ``metadata["source"]``.
        var1, var2: The two values the question should compare.
        const1, const2: Values held fixed in the question (may be ``None``).

    Returns:
        A dict with ``context``, ``question``, ``answer`` and ``source_doc``
        (one source per input document), or ``None`` when the reply, the
        question or the answer is exactly ``"idk."``.
    """
    context = " ".join(doc.page_content for doc in docs)
    prompt = QA_generation_prompt_2.format(
        context=context,
        var1=var1,
        var2=var2,
        constant1=const1,
        constant2=const2,
    )
    output_QA = llm.invoke(prompt).content
    # Pause after every request, including the ones discarded below
    # (presumably to stay under the API rate limit).
    sleep(5)

    if output_QA.strip() == "idk.":
        return None

    # Strip the pieces: the split leaves newlines around them, which would
    # make the "idk." comparisons below impossible to satisfy.
    question = output_QA.split("Question: ")[-1].split("Answer: ")[0].strip()
    answer = output_QA.split("Answer: ")[-1].strip()
    if question == "idk." or answer == "idk.":
        return None

    return {
        "context": context,
        "question": question,
        "answer": answer,
        "source_doc": [doc.metadata["source"] for doc in docs],
    }
    

def save_dataset(filename, data):
    """Write ``data`` as indented JSON to ``../evaluation/dataset/<filename>``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened as UTF-8 explicitly instead of with the platform default
    encoding. An existing file is overwritten.

    Args:
        filename: File name inside the dataset directory.
        data: Any JSON-serializable object.
    """
    path = "../evaluation/dataset"
    with open(f"{path}/{filename}", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


# Value tables consumed by random_matcher, which reads only the "values" entry;
# the "filterable" flag is not read by any code visible in this notebook.
# NOTE(review): several brand names look misspelled (e.g. "Ivoclair",
# "Caresstream", "Amann Girbach"). They are used verbatim in the
# similarity-search queries below - confirm against the source documents.
years = {"filterable": True, "values": [2015, 2017, 2019, 2021, 2022, 2023, 2024]}
countries = {"filterable": False, "values": ["France", "Spain", "Germany", "UK", "Italy", "Brazil"]}
brands = {"filterable": False, "values": ["3shape", "Dentsply Sirona", "Ivoclair", "Bego", "3M", "Ines Icore", "Caresstream", "Amann Girbach"]}
# `products` and `all_QA` are not referenced by the cells visible here.
products = {"filterable": False, "values": ["Intraoral scanner", "3d printer"]}

all_QA = []


In [None]:
from tqdm.auto import tqdm


# year -> country: compare two countries within the same year.
qa1 = []
tuples = random_matcher(years, None, countries, n_iter=100)
# Unpack in the loop header instead of binding a variable named `tuple`,
# which would shadow the builtin for the rest of the session.
for year, country1, country2 in tqdm(tuples):
    # Retrieve up to 4 chunks per country, restricted to chunks whose keywords
    # metadata contains the year as a substring.
    docs1 = vector_store.similarity_search(f"{country1}", k=4, filter=lambda doc: f"{year}" in doc.metadata["keywords"])
    docs2 = vector_store.similarity_search(f"{country2}", k=4, filter=lambda doc: f"{year}" in doc.metadata["keywords"])
    qa = generate_qa(docs1 + docs2, var1=country1, var2=country2, const1=year, const2=None)
    if qa is not None:
        qa1.append(qa)

save_dataset("all_QA_years_countries.json", qa1)

# (brand, country) -> year: compare two years for the same brand and country.
qa2 = []
tuples = random_matcher(brands, countries, years, n_iter=10)
# Unpack in the loop header instead of binding a variable named `tuple`,
# which would shadow the builtin for the rest of the session.
for brand, country, year1, year2 in tqdm(tuples):
    # Same query for both searches; only the year filter differs.
    docs1 = vector_store.similarity_search(f"{country}, {brand}", k=4, filter=lambda doc: f"{year1}" in doc.metadata["keywords"])
    docs2 = vector_store.similarity_search(f"{country}, {brand}", k=4, filter=lambda doc: f"{year2}" in doc.metadata["keywords"])
    qa = generate_qa(docs1 + docs2, var1=year1, var2=year2, const1=country, const2=brand)
    if qa is not None:
        qa2.append(qa)

save_dataset("all_QA_brands&countries_years.json", qa2)

# (country, year) -> brand: compare two brands for the same country and year.
qa3 = []
tuples = random_matcher(countries, years, brands, n_iter=100)
# Unpack in the loop header instead of binding a variable named `tuple`,
# which would shadow the builtin for the rest of the session.
for country, year, brand1, brand2 in tqdm(tuples):
    # One search per brand, both restricted to chunks whose keywords metadata
    # contains the year as a substring.
    docs1 = vector_store.similarity_search(f"{country}, {brand1}", k=4, filter=lambda doc: f"{year}" in doc.metadata["keywords"])
    docs2 = vector_store.similarity_search(f"{country}, {brand2}", k=4, filter=lambda doc: f"{year}" in doc.metadata["keywords"])
    qa = generate_qa(docs1 + docs2, var1=brand1, var2=brand2, const1=country, const2=year)
    if qa is not None:
        qa3.append(qa)

save_dataset("all_QA_countries&years_brands.json", qa3)
