### Get all image descriptions from ChromaDB file

In [None]:
import chromadb
import json


# Open the on-disk ChromaDB store and the collection holding the embeddings.
client = chromadb.PersistentClient(path="chromadb")

collection = client.get_or_create_collection(
    name="my_collection", metadata={"hnsw:space": "cosine"})

# Query once: ids and documents then come from the same result set, so
# position i in one list is guaranteed to describe position i in the other.
image_records = collection.get(where={"type": "image"})
image_descriptions = image_records["documents"]

# Ids are expected to look like "<file>_page_<n>_image_<k>".
# Split on the LAST "_page_" so a file name that itself contains "_page_"
# is kept whole instead of being truncated at its first occurrence.
ids = []
for record_id, description in zip(image_records["ids"], image_descriptions):
    id_parts = record_id.rsplit("_page_", 1)
    ids.append({
        "file": id_parts[0],
        "page": int(id_parts[-1].split("_image_")[0]),
        "image_description": description,
    })

# Persist one {"file", "page", "image_description"} record per image.
with open("image_descriptions/image_descriptions.json", "w") as f:
    json.dump(ids, f)


### Split documents into pages, with text only

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
import os


folder = "../sources"
files = []

for fname in os.listdir(folder):
    complete_path = os.path.join(folder, fname)
    if os.path.isfile(complete_path):
        files.append(complete_path)

docs = []
for file in files:
    loader = PyMuPDFLoader(file)
    async for doc in loader.alazy_load():
        docs.append(doc)


### Remove first page and index pages

In [24]:
# Prefixes (compared against the lower-cased page text) that mark an index page.
index_prefixes = ("index", "table of contents", "índice")

# Drop every page whose metadata page number is 0, then every page whose
# text starts with one of the index prefixes.
docs = [
    page
    for page in docs
    if page.metadata["page"] != 0
    and not page.page_content.lower().startswith(index_prefixes)
]

### Concatenate text and image descriptions

In [26]:
import json


# Read the exported records; the file is closed before any processing starts.
with open("image_descriptions/image_descriptions.json", "r") as f:
    image_descriptions = json.load(f)

# Index the loaded pages by (source path, page number) once instead of
# scanning ``docs`` for every description. ``setdefault`` keeps the first
# page seen for a key, which matches a first-match linear search.
pages_by_location = {}
for doc in docs:
    location = (doc.metadata.get("source"), doc.metadata.get("page"))
    pages_by_location.setdefault(location, doc)

# NOTE(review): assumes the page numbers stored in the JSON use the same
# base as the loader's ``metadata["page"]`` — confirm.
for imd in image_descriptions:
    # Single quotes inside the f-strings keep this valid before Python 3.12.
    file = f"../sources/{imd['file']}.pdf"
    page = imd["page"]
    doc = pages_by_location.get((file, page))
    # Descriptions whose page was filtered out or never loaded are skipped.
    if doc is not None:
        doc.page_content += f"\n{imd['image_description']}"


### Clean text

In [28]:
import re


def decapitalize_content(pages: list[str]):
    """Lower-case the text of every page, in place.

    NOTE(review): despite the ``list[str]`` annotation, each element must
    expose a writable ``page_content`` string attribute.

    Args:
        pages: Pages whose ``page_content`` is replaced by its lower-cased
            form.
    """
    for page in pages:
        lowered = page.page_content.lower()
        page.page_content = lowered


def remove_non_ASCII(pages: list[str]):
    """Remove non-ASCII characters from each page, in place.

    Not suitable for many non-English languages, which rely on non-ASCII
    characters. Pages whose ``metadata["keywords"]`` contains ``"non-en"``
    are therefore left untouched.

    A page with a missing or ``None`` ``keywords`` entry is treated as not
    tagged ``"non-en"`` (and so is stripped) instead of raising.

    NOTE(review): despite the ``list[str]`` annotation, each element must
    expose ``page_content`` (str) and ``metadata`` (mapping) attributes.

    Args:
        pages: Pages to clean.
    """
    # Compiled once: the pattern does not change between pages.
    non_ascii_run = re.compile(r"[^\x00-\x7F]+")

    for p in pages:
        keywords = p.metadata.get("keywords") or ""
        if "non-en" not in keywords:
            p.page_content = non_ascii_run.sub("", p.page_content)


def remove_bullets(pages: list[str]):
    """Strip list markers from the start of lines in each page, in place.

    Two kinds of marker are removed:

    - a bullet symbol (one of ``→ • ▪ - * ✔ ➢ ● ✗``) at the start of a line,
      together with the whitespace that follows it;
    - a numbered marker such as ``1.`` at the start of a line (optionally
      indented with spaces or tabs) when letters follow it.

    Numbered markers are matched only at line starts so that a number ending
    a sentence in running text ("... cost 100. then ...") is left intact.

    NOTE(review): despite the ``list[str]`` annotation, each element must
    expose a writable ``page_content`` string attribute.

    Args:
        pages: Pages to clean.
    """
    for p in pages:
        p.page_content = re.sub(
            r"^[→•▪\-*✔➢●✗]\s*", "", p.page_content, flags=re.MULTILINE)
        # The lookahead keeps the text after the marker; only "<digits>." and
        # any indentation before it are removed.
        p.page_content = re.sub(
            r"^[ \t]*\d+\.(?=\s*[a-zA-Z])", "", p.page_content, flags=re.MULTILINE)


def remove_escape(pages: list[str]):
    """Collapse every run of whitespace in each page into one space, in place.

    Newlines, tabs and repeated spaces all count as whitespace; leading and
    trailing whitespace is dropped entirely.

    NOTE(review): despite the ``list[str]`` annotation, each element must
    expose a writable ``page_content`` string attribute.

    Args:
        pages: Pages to clean.
    """
    for page in pages:
        tokens = page.page_content.split()
        page.page_content = " ".join(tokens)


# Apply the cleaning steps to every page, in place and in this fixed order.
# Whitespace collapsing runs last so that gaps left behind by the earlier
# removals are normalized as well.
for clean_step in (remove_non_ASCII, decapitalize_content, remove_bullets, remove_escape):
    clean_step(docs)