In [None]:
import math
import os

import torch
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders.text import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
# Folder that holds the source files; the loader functions in this cell all read from it.
DATA_PATH = "rag_data/oecd_policy_outlook"
# Loader for regular (text-based) PDF documents.
def load_documents():
    """Load the PDFs found in ``DATA_PATH`` with ``PyPDFDirectoryLoader``.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()``.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

# Loader for unstructured PDF documents (per the original note: performs OCR on files if necessary).
def load_unstructered_documents():
    """Load every PDF below ``DATA_PATH`` through ``UnstructuredPDFLoader``.

    The directory is searched recursively (``**/*.pdf``) and each file is
    handed to ``UnstructuredPDFLoader`` with ``mode="elements"`` and
    ``strategy="auto"``.

    Returns:
        The result of ``DirectoryLoader.load()`` for those files.
    """
    def _make_pdf_loader(pdf_path):
        # Factory used as ``loader_cls``: fixes the loader options per file.
        loader_options = {"mode": "elements", "strategy": "auto"}
        return UnstructuredPDFLoader(pdf_path, **loader_options)

    # Restrict the directory walk to PDFs and plug in the factory above.
    directory_loader = DirectoryLoader(
        path=DATA_PATH,
        glob="**/*.pdf",
        loader_cls=_make_pdf_loader,
    )
    loaded_documents = directory_loader.load()
    return loaded_documents

# Loader for plain-text (.txt) files.
def load_txt_files():
    """Load every ``.txt`` file below ``DATA_PATH`` as UTF-8 text.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    def _utf8_text_loader(file_path):
        # Factory used as ``loader_cls``: forces UTF-8 decoding for each file.
        return TextLoader(file_path, encoding="utf-8")

    txt_directory_loader = DirectoryLoader(
        DATA_PATH,
        glob="**/*.txt",
        loader_cls=_utf8_text_loader,
    )
    return txt_directory_loader.load()

# Exactly one loader is active; the commented-out lines are the alternatives
# for the other input formats.
documents = load_documents() # loads regular PDF documents
#documents = load_unstructered_documents() # loads unstructured PDF documents; needed for iee, wee and worldbank
#documents = load_txt_files() # loads .txt files; needed for "wetten" (Dutch for "laws")

In [None]:
# Chunking configuration; both values are measured in characters.
CHUNK_SIZE = 512   # character-based chunk size
OVERLAP = 40       # number of characters shared by consecutive chunks

# Initialise the text splitter with the configuration above.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=OVERLAP,
    separators=["\n\n", "\n", ".", " ", ""],  # paragraph breaks first, then lines, sentences, words, characters
)

# Creates chunk records (text plus character offsets) out of the loaded documents.
def chunk_documents(documents, splitter=None):
    """Split each document into chunks and record where every chunk sits.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        splitter: Object with a ``split_text(text)`` method returning a list
            of strings. Defaults to the module-level ``text_splitter``.

    Returns:
        A list with one dict per chunk, holding:
        ``content`` (the chunk text), ``start`` / ``end`` (character offsets
        into the document's ``page_content``), ``doc_idx`` (position of the
        document in ``documents``) and ``chunk_index`` (position of the chunk
        within its document).
    """
    if splitter is None:
        splitter = text_splitter

    chunks = []
    for doc_idx, doc in enumerate(documents):
        full_text = doc.page_content
        search_from = 0   # earliest position at which the next chunk may begin
        previous_end = 0  # only used as an approximate fallback position

        for i, chunk in enumerate(splitter.split_text(full_text)):
            start = full_text.find(chunk, search_from)
            if start == -1:
                # The chunk text does not occur verbatim from search_from on;
                # fall back to an approximate position
                # right after the previous chunk.
                start = previous_end
            else:
                # Consecutive chunks overlap, so the next chunk can begin
                # before this one ends. Searching from this chunk's end would
                # miss it; the next chunk only has to begin after this start.
                search_from = start + 1
            end = start + len(chunk)
            previous_end = end

            chunks.append({
                "content": chunk,
                "start": start,
                "end": end,
                "doc_idx": doc_idx,
                "chunk_index": i
            })
    return chunks

# Split every loaded document into chunk records (text plus character offsets).
chunks = chunk_documents(documents)

In [None]:
# Initialises the bge-m3 embedding model.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True}
)

In [None]:
# Wraps chunk records in Documents (with metadata) and embeds their text.
def embed_chunks(original_documents, chunk_data):
    """Build one ``Document`` per chunk record and embed all chunk texts.

    Args:
        original_documents: The loaded documents; indexed by each record's
            ``doc_idx`` to look up the parent document's metadata.
        chunk_data: Iterable of dicts with the keys ``content``, ``doc_idx``,
            ``chunk_index``, ``start`` and ``end``.

    Returns:
        A tuple ``(embedded_documents, embeddings)``: the list of chunk
        ``Document`` objects and the result of
        ``embedding_model.embed_documents`` for their texts, in the same order.
    """
    def _to_document(record):
        # Read the chunk record first, then the parent document's metadata.
        text = record["content"]
        parent_position = record["doc_idx"]
        position_in_parent = record["chunk_index"]
        char_start = record["start"]
        char_end = record["end"]

        parent_meta = getattr(original_documents[parent_position], "metadata", {})
        fallback_id = f"doc_{parent_position}"

        return Document(
            page_content=text,
            metadata={
                "source": parent_meta.get("source", fallback_id),
                "document_id": parent_meta.get("document_id", fallback_id),
                "title": parent_meta.get("title"),
                "author": parent_meta.get("author"),
                "created_at": parent_meta.get("created_at"),
                "chunk_index": position_in_parent,
                "chunk_char_start": char_start,
                "chunk_char_end": char_end,
                "language": parent_meta.get("language", "en"),
            },
        )

    embedded_documents = [_to_document(record) for record in chunk_data]

    # Embed all chunk texts in a single call.
    chunk_texts = [document.page_content for document in embedded_documents]
    embeddings = embedding_model.embed_documents(chunk_texts)

    return embedded_documents, embeddings

# Wrap every chunk in a Document with metadata and compute its embedding vector.
embedded_docs, embeddings = embed_chunks(documents, chunks)

In [None]:
# Adds the embedded chunks to the FAISS vector database stored in INDEX_DIR.
# If INDEX_DIR does not exist yet, a new index is created there.
INDEX_DIR = "faiss_index_bge_m3"

# Reuse the vectors already computed by embed_chunks instead of letting FAISS
# embed every chunk a second time via add_documents / from_documents.
chunk_text_embeddings = [
    (doc.page_content, vector) for doc, vector in zip(embedded_docs, embeddings)
]
chunk_metadatas = [doc.metadata for doc in embedded_docs]

if os.path.exists(INDEX_DIR):
    # Extend the existing index with the new chunks.
    # NOTE: allow_dangerous_deserialization=True unpickles the stored index
    # data; only load index folders that you created yourself.
    index = FAISS.load_local(INDEX_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True)
    if chunk_text_embeddings:
        index.add_embeddings(chunk_text_embeddings, metadatas=chunk_metadatas)
else:
    # Create a new index from the chunks.
    index = FAISS.from_embeddings(
        chunk_text_embeddings,
        embedding=embedding_model,
        metadatas=chunk_metadatas,
    )
index.save_local(INDEX_DIR)

In [None]:
# Adds the embedded chunks to the FAISS vector database stored in INDEX_DIR,
# creating the index if INDEX_DIR does not exist yet. The chunks are added in
# batches so that the step also works with less computing power (RAM).
# NOTE: this cell indexes the same embedded_docs as the un-batched cell above;
# run only one of the two, otherwise every chunk is stored twice.
BATCH_SIZE = 400  # adjust based on your RAM capacity

num_batches = math.ceil(len(embedded_docs) / BATCH_SIZE)

if os.path.exists(INDEX_DIR):
    # Extend the existing index.
    # NOTE: allow_dangerous_deserialization=True unpickles the stored index
    # data; only load index folders that you created yourself.
    index = FAISS.load_local(INDEX_DIR, embeddings=embedding_model, allow_dangerous_deserialization=True)
else:
    # No index on disk yet: it is created from the first batch below.
    index = None

# Every batch must be added inside the loop; adding after the loop would only
# index the last batch.
for i in range(num_batches):
    batch_docs = embedded_docs[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    if index is None:
        index = FAISS.from_documents(batch_docs, embedding=embedding_model)
    else:
        index.add_documents(batch_docs)

# index stays None only when there is neither an existing index nor any chunk.
if index is not None:
    index.save_local(INDEX_DIR)