In [None]:
# Notebook setup cell: installs the Python packages used by the pipeline below
# (document parsing, embeddings / vector store, emissions tracking).
!pip install -q unstructured python-docx aiofiles==22.1.0 docx2txt "unstructured[pdf]" playwright pdf2image html2text rank_bm25 codecarbon
!pip install -q sentence_transformers faiss-cpu transformers accelerate peft bitsandbytes trl langchain langchain_community libmagic python-magic docling
# System package providing the poppler command-line tools
# (presumably required by pdf2image / unstructured[pdf] -- TODO confirm).
!apt install poppler-utils

In [None]:
import torch
# Select the CUDA device when a GPU is visible.
# NOTE(review): `device` is left undefined when CUDA is unavailable; nothing
# in the visible cells reads it afterwards.
if torch.cuda.is_available():
    device = torch.device("cuda")

In [None]:
"""
RAG Pipeline - Document parsing & Vector Store generation (Docling & LangChain)
-------------------------------------------------------------------------------
This script demonstrates a robust pipeline for building both Docling-based
and LangChain-based FAISS + BM25 stores from a set of PDF reports.
It includes:

1) **Docling-based** parsing:
   - PDF => HTML/Markdown (via Docling), preserving table structures
   - Custom chunking that never splits tables
   - Post-processing to merge small chunks

2) **LangChain-based** parsing:
   - DirectoryLoader for PDFs
   - RecursiveCharacterTextSplitter with user-defined chunk size & overlap
   - Post-processing to merge small chunks as well

Both approaches produce per-company FAISS + BM25 indexes plus a global index
that aggregates all chunks. The final structure on disk looks like this:

FAISS_DB/
 ┣ docling/
 ┃  ┣ SP500/
 ┃  ┃  ┣ Apple/
 ┃  ┃  ┃  ┣ faiss_index…
 ┃  ┃  ┃  ┗ retrievers/keyword_retriever.pkl
 ┃  ┗ GLOBAL_DB/
 ┃     ┣ faiss_index…
 ┃     ┗ retrievers/keyword_retriever.pkl
 ┣ langchain/
 ┃  ┣ SP500/
 ┃  ┃  ┣ Apple/
 ┃  ┃  ┃  ┣ faiss_index…
 ┃  ┃  ┃  ┗ retrievers/keyword_retriever.pkl
 ┃  ┗ GLOBAL_DB/
 ┃     ┣ faiss_index…
 ┃     ┗ retrievers/keyword_retriever.pkl

Author: [PLADIFES]
Date: [17_02_2025]
"""

import os
import re
import pickle
from pathlib import Path
from typing import List

# ========== 3rd-party libs for text parsing and NLP ==========
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

# ========== LangChain imports ==========
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LC_Document
from langchain.document_loaders import DirectoryLoader
from langchain.retrievers import BM25Retriever

# ========== Docling imports for PDF -> HTML conversion ==========
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

# ========== Optional utility from langchain_community (for distance strategies) ==========
from langchain_community.vectorstores.utils import DistanceStrategy

# ----------------------- Emissions -----------------------
from codecarbon import EmissionsTracker

###############################################################################
#                          GLOBAL CONFIGURATION
###############################################################################
#: The HuggingFace model name for embeddings. It is also used as the
#: tokenizer name by the LangChain splitter.
#: NOTE(review): the all-mpnet-base-v2 model card states inputs longer than
#: 384 word pieces are truncated -- confirm CHUNK_SIZE is compatible with that.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # "intfloat/multilingual-e5-large"

#: Chunk size for splitting text (both Docling and LangChain).
#: The Docling chunker compares this value against character counts
#: (`len(text)`), whereas the LangChain splitter is built from a HuggingFace
#: tokenizer, so the two pipelines do not use the same unit.
#: You can change this value or even parse from command-line arguments if desired.
CHUNK_SIZE = 2048 # 512

#: Overlap automatically computed as CHUNK_SIZE // 10 (used by the LangChain splitter).
CHUNK_OVERLAP = CHUNK_SIZE // 10

#: Minimum chunk size (in characters) below which a chunk is merged with its
#: successor. Used by both the Docling and the LangChain pipelines.
MIN_CHUNK_SIZE = 512 # 128

#: Whether to also save `.md` output from Docling. It also controls whether a
#: previously extracted `.md` file is accepted as a cached result.
STORE_MD = True

#: Accelerator device to use for Docling. Options: CPU, CUDA, MPS, AUTO
ACCELERATOR_DEVICE = AcceleratorDevice.CUDA

#: Number of threads for Docling
NUM_THREADS = 8

#: Root folder containing subfolders (SP500, CAC40, etc.) with PDF reports
target_directory = "/kaggle/input/sustainability-reports/sustain_reports/"

#: Directory in which to store all resulting FAISS indexes
db_storage_path = "./FAISS_DB/"

#: Folder names to look for under `target_directory` (e.g. SP500, CAC40, etc.).
#: Missing folders are reported with a warning and skipped.
INDEXES = ("SP500", "CAC40", "Other")

#: Directory to store Docling-extracted .html/.md (mirrors the PDF folder layout)
EXTRACTED_TEXT_BASE = "./docling_extracted"

#: Docling pipeline options (OCR, table structure, device).
#: This single instance is shared by every DocumentConverter created below.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=NUM_THREADS,
    device=ACCELERATOR_DEVICE,
)
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

#: Enable profiling of Docling pipeline timings (optional).
#: The conversion log message reads `timings["pipeline_total"]`, which is
#: presumably only populated while this flag is on -- TODO confirm.
settings.debug.profile_pipeline_timings = True

###############################################################################
#                       GENERIC UTILITY FUNCTIONS
###############################################################################
def ensure_folder_exists(file_path: str) -> None:
    """
    Ensure the parent directory of `file_path` exists, creating missing levels.

    Only the directory part of `file_path` is created; the file itself is not
    touched. A bare filename (no directory component) is a no-op.

    Raises:
        OSError: if the directory cannot be created (e.g. a path component
            exists but is a regular file, or permissions are insufficient).
    """
    folder = os.path.dirname(file_path)
    if folder:
        # exist_ok=True removes the check-then-create race of the
        # "if not exists: makedirs" pattern and makes the call idempotent.
        os.makedirs(folder, exist_ok=True)

def replace_extension(file_path: str, new_ext: str) -> str:
    """
    Swap the final extension of `file_path` for `new_ext`.

    `new_ext` is appended verbatim, so it should include the leading dot.
    e.g.: replace_extension("/path/to/file.pdf", ".html") -> "/path/to/file.html"
    """
    stem = os.path.splitext(file_path)[0]
    return stem + new_ext

def get_relative_path(file_path: str, root_folder: str) -> str:
    """
    Express `file_path` relative to `root_folder`.

    e.g.: /root/sustain_reports/SP500/Company/file.pdf => SP500/Company/file.pdf
    """
    relative = os.path.relpath(file_path, root_folder)
    return relative

def get_extracted_file_path(
    pdf_file_path: str,
    root_pdf_folder: str,
    base_extracted_folder: str,
    extension: str = ".html"
) -> str:
    """
    Build the path of a derived file (e.g. ".html") that mirrors the PDF's
    position under `root_pdf_folder`, rooted at `base_extracted_folder`.

    e.g.:
        pdf_file_path="/root/sustain_reports/SP500/Company/report.pdf"
        => "./docling_extracted/SP500/Company/report.html"
    """
    mirrored = replace_extension(
        get_relative_path(pdf_file_path, root_pdf_folder),
        extension,
    )
    return os.path.join(base_extracted_folder, mirrored)

###############################################################################
#                       MERGING SMALL CHUNKS (Common)
###############################################################################
def combine_headings(heading_a: str, heading_b: str) -> str:
    """
    Union two semicolon-separated heading lists into one.

    Entries are whitespace-trimmed, blanks and duplicates are dropped, and the
    result is sorted alphabetically and joined with "; ".
    e.g. "Intro;Background" + "Background;Conclusion" => "Background; Conclusion; Intro"
    """
    unique_headings = set()
    for raw in (heading_a, heading_b):
        for part in raw.split(";"):
            cleaned = part.strip()
            if cleaned:
                unique_headings.add(cleaned)
    return "; ".join(sorted(unique_headings))

def merge_small_chunks(
    docs: List[Document],
    min_chunk_size: int,
    max_chunk_size: int
) -> List[Document]:
    """
    Pairwise-merge undersized chunks with the chunk that follows them.

    A chunk is merged with its successor when:
      - its text is shorter than `min_chunk_size` characters, and
      - the two texts together do not exceed `max_chunk_size` characters.

    Each chunk takes part in at most one merge: after a merge the scan resumes
    behind the absorbed successor, so the merged chunk is not re-examined.
    The last chunk is always emitted as-is.

    The merged chunk keeps the first chunk's metadata, with:
      - "heading": the combined headings of both chunks
      - "type": "mixed" if either chunk is "table" or "mixed", else "paragraph"
        (chunks without a "type" count as "paragraph").

    Returns a new list; the input list is not modified.
    """
    result = []
    total = len(docs)
    pos = 0
    while pos < total:
        head = docs[pos]
        head_len = len(head.page_content)

        # Only an undersized chunk that still has a successor can be merged.
        follower = None
        if head_len < min_chunk_size and pos != total - 1:
            follower = docs[pos + 1]

        if follower is None or head_len + len(follower.page_content) > max_chunk_size:
            # Large enough, last in the list, or the pair would be too big.
            result.append(head)
            pos += 1
            continue

        kinds = (
            head.metadata.get("type", "paragraph"),
            follower.metadata.get("type", "paragraph"),
        )
        meta = dict(head.metadata)
        meta["heading"] = combine_headings(
            head.metadata.get("heading", ""),
            follower.metadata.get("heading", ""),
        )
        meta["type"] = (
            "mixed" if any(kind in ("table", "mixed") for kind in kinds) else "paragraph"
        )

        joined_text = f"{head.page_content}\n{follower.page_content}".strip()
        result.append(Document(page_content=joined_text, metadata=meta))
        pos += 2  # the successor was absorbed
    return result

###############################################################################
#                       EMBEDDING + FAISS + BM25 (Common)
###############################################################################
#: Embedding models already loaded in this process, keyed by model name.
#: compute_embeddings is called once per company plus once for the global
#: index; re-instantiating the model each time only repeats the load cost.
_EMBEDDING_MODEL_CACHE = {}

def compute_embeddings(
    documents: List[Document],
    embedding_model_name: str = EMBEDDING_MODEL_NAME
):
    """
    Build a FAISS index + BM25 retriever from the provided (already chunked) Documents.

    The HuggingFace embedding model is created on first use and then reused
    for later calls with the same `embedding_model_name`.

    Args:
        documents: chunked Documents to index; must not be empty.
        embedding_model_name: HuggingFace model name used for the embeddings.

    Returns:
        (faiss_db, bm25_retriever)

    Raises:
        ValueError: if `documents` is empty (there is nothing to index).
    """
    if not documents:
        raise ValueError("compute_embeddings() requires at least one document.")

    embeddings = _EMBEDDING_MODEL_CACHE.get(embedding_model_name)
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            # NOTE(review): trust_remote_code lets the model repository run
            # its own Python code on load; keep it only for trusted models.
            model_kwargs={"trust_remote_code": True},
            # Normalized vectors are requested together with COSINE distance.
            encode_kwargs={"normalize_embeddings": True},
        )
        _EMBEDDING_MODEL_CACHE[embedding_model_name] = embeddings

    faiss_db = FAISS.from_documents(
        documents,
        embeddings,
        distance_strategy=DistanceStrategy.COSINE
    )
    keyword_retriever = BM25Retriever.from_documents(documents)

    return faiss_db, keyword_retriever

def save_faiss_and_bm25(
    faiss_db: FAISS,
    keyword_retriever: BM25Retriever,
    output_path: str
):
    """
    Write the FAISS index and the BM25 retriever below `output_path`.

    The FAISS index is saved via `save_local(output_path)`; the retriever is
    pickled to `<output_path>/retrievers/keyword_retriever.pkl`.
    """
    faiss_db.save_local(output_path)

    retriever_dir = os.path.join(output_path, "retrievers")
    os.makedirs(retriever_dir, exist_ok=True)

    pickle_file = os.path.join(retriever_dir, "keyword_retriever.pkl")
    with open(pickle_file, "wb") as handle:
        pickle.dump(keyword_retriever, handle)

###############################################################################
#          DOCLING-BASED PIPELINE (Preserve Table Structure)
###############################################################################
def load_or_convert_pdf_with_docling(
    pdf_file_path: str,
    root_pdf_folder: str,
    converter: DocumentConverter
) -> str:
    """
    Load or convert a single PDF file to HTML/Markdown via Docling.

    1) Look for an existing extraction, in this order, and return the first
       one found:
         - `<pdf>.html` next to the PDF
         - `<pdf>.md` next to the PDF
         - the mirrored `.html` under `EXTRACTED_TEXT_BASE`
         - the mirrored `.md` under `EXTRACTED_TEXT_BASE` (only if STORE_MD)
    2) Otherwise convert the PDF with Docling and store the `.html`
       (and the `.md` if STORE_MD) under `EXTRACTED_TEXT_BASE`.
    3) Return the text that was loaded, or the freshly exported HTML.

    NOTE(review): when only a Markdown extraction exists, Markdown (not HTML)
    is returned. The HTML chunker looks for h1-h6/p/table tags, so such text
    would presumably yield no chunks -- confirm whether MD caches should be
    honoured at all.

    Args:
        pdf_file_path: path of the PDF to load.
        root_pdf_folder: root folder used to mirror the PDF's relative path.
        converter: Docling converter used when no cached extraction exists.

    Returns:
        The HTML (or cached Markdown) text of the document.
    """
    extracted_html_path = get_extracted_file_path(
        pdf_file_path, root_pdf_folder, EXTRACTED_TEXT_BASE, extension=".html"
    )
    extracted_md_path = get_extracted_file_path(
        pdf_file_path, root_pdf_folder, EXTRACTED_TEXT_BASE, extension=".md"
    )

    # (path, label used in the log message), in lookup priority order.
    cached_candidates = [
        (replace_extension(pdf_file_path, ".html"), "local HTML"),
        (replace_extension(pdf_file_path, ".md"), "local MD"),
        (extracted_html_path, "existing extracted HTML"),
    ]
    if STORE_MD:
        cached_candidates.append((extracted_md_path, "existing extracted MD"))

    for cached_path, label in cached_candidates:
        if os.path.exists(cached_path):
            print(f"[Docling] Found {label} '{cached_path}'.")
            with open(cached_path, "r", encoding="utf-8") as f:
                return f.read()

    # If none found, convert now with Docling
    conversion_result = converter.convert(Path(pdf_file_path))
    doc = conversion_result.document
    html_text = doc.export_to_html()

    # Save under EXTRACTED_TEXT_BASE so later runs can skip the conversion
    ensure_folder_exists(extracted_html_path)
    with open(extracted_html_path, "w", encoding="utf-8") as f:
        f.write(html_text)

    if STORE_MD:
        ensure_folder_exists(extracted_md_path)
        with open(extracted_md_path, "w", encoding="utf-8") as f:
            f.write(doc.export_to_markdown())

    # Optional timing debug. The profiling entry may be absent (e.g. when
    # pipeline profiling is disabled); a missing entry must not fail a
    # conversion that already succeeded and was written to disk.
    timings = getattr(conversion_result, "timings", None) or {}
    times = getattr(timings.get("pipeline_total"), "times", None)
    if times:
        # `times` is a list of durations; report their total in seconds.
        print(f"[Docling] Converted '{pdf_file_path}' in {sum(times):.2f} seconds.")
    else:
        print(f"[Docling] Converted '{pdf_file_path}'.")
    return html_text

def docling_load_documents(folder_path: str, root_pdf_folder: str, company_name: str) -> List[Document]:
    """
    Load every PDF found directly in `folder_path` (non-recursive) through
    Docling and wrap each result in an un-chunked Document.

    Each Document carries the loaded text as `.page_content` and the metadata
    keys "source" (PDF path), "company" and "filename".
    """
    pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    loaded = []
    for entry in os.listdir(folder_path):
        # Extension check is case-insensitive (".PDF" is accepted too)
        if not entry.lower().endswith(".pdf"):
            continue

        full_path = os.path.join(folder_path, entry)
        text = load_or_convert_pdf_with_docling(full_path, root_pdf_folder, converter)
        metadata = {
            "source": full_path,
            "company": company_name,
            "filename": entry,
        }
        loaded.append(Document(page_content=text, metadata=metadata))
    return loaded

def extract_table_html(table_element) -> str:
    """
    Serialize a table element to its string form, unchanged.

    Keeping the raw <table> markup is useful for LLMs that can interpret
    HTML structure.
    """
    markup = str(table_element)
    return markup

def split_section_by_length(text: str, max_len: int) -> List[str]:
    """
    Split `text` into chunks of at most `max_len` characters on sentence boundaries.

    If `text` already fits, `[text]` is returned unchanged. Otherwise the text
    is cut after sentence-ending punctuation (., ?, !) that is followed by
    whitespace, and consecutive sentences are packed greedily, joined by a
    single space.

    Notes:
      - Punctuation not followed by whitespace (e.g. the dot in "12.5") is
        not a boundary, so numbers and abbreviations stay intact.
      - The joining spaces count towards `max_len`.
      - A single sentence longer than `max_len` is kept whole as its own
        chunk; sentences are never cut in the middle.
      - No empty chunks are produced.
    """
    if len(text) <= max_len:
        return [text]

    # Lookbehind keeps the punctuation attached to the sentence it ends.
    sentences = re.split(r"(?<=[.?!])\s+", text)

    chunks = []
    buffer = []
    buffer_len = 0  # length of " ".join(buffer)

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Cost of appending: the sentence plus one joining space if needed.
        added = len(sentence) + (1 if buffer else 0)
        if buffer and buffer_len + added > max_len:
            # Flush only a non-empty buffer so no blank chunk is emitted
            # when the very first sentence is already too long.
            chunks.append(" ".join(buffer))
            buffer = [sentence]
            buffer_len = len(sentence)
        else:
            buffer.append(sentence)
            buffer_len += added

    # Final flush
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks

def html_to_structured_chunks(
    html_text: str,
    pdf_source: str,
    company_name: str,
    file_name: str,
    chunk_size: int = CHUNK_SIZE,
    combine_tables_with_heading: bool = True
) -> List[Document]:
    """
    Convert HTML into structured chunks, preserving table integrity.

    Steps:
      1) Walk headings, paragraphs and tables in document order. A heading
         starts a new section; paragraphs are appended to the current section;
         a table is appended to the current section when
         `combine_tables_with_heading` is True and the section is non-empty,
         otherwise it becomes a section of its own.
      2) Sections without a table are split by `split_section_by_length`;
         sections containing a table are never split, so they may exceed
         `chunk_size`.
      3) Small chunks are merged via `merge_small_chunks`.

    Elements nested inside a <table> are skipped during the walk because
    their content is already part of the enclosing table's HTML. Every chunk
    of a split section carries the section's heading in its metadata, and
    blank chunks are dropped.

    Args:
        html_text: HTML of one document.
        pdf_source: stored as metadata["source"].
        company_name: stored as metadata["company"].
        file_name: stored as metadata["filename"].
        chunk_size: maximum chunk length in characters for table-free sections.
        combine_tables_with_heading: keep tables in the section they follow.

    Returns:
        Documents with metadata keys: source, company, filename, chunk_index,
        type ("paragraph", "table" or "mixed") and heading.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    heading_pattern = re.compile(r"\[START HEADING\](.*?)\[END HEADING\]")

    # Temporary list to hold big sections
    section_texts = []
    current_buffer = ""

    def flush_buffer():
        """Move the non-blank buffer into `section_texts` and reset it."""
        nonlocal current_buffer
        txt = current_buffer.strip()
        if txt:
            section_texts.append(txt)
        current_buffer = ""

    # 1) Extract headings, paragraphs, tables in order
    for element in soup.find_all(heading_tags + ["p", "table"]):
        # find_all also returns descendants of a table (cell paragraphs,
        # nested tables). They are already contained in the outer table's
        # HTML, so emitting them again would duplicate content.
        if element.find_parent("table") is not None:
            continue

        if element.name in heading_tags:
            # A heading closes the previous section and opens a new one
            flush_buffer()
            heading_text = element.get_text(separator=" ", strip=True)
            current_buffer = f"[START HEADING] {heading_text} [END HEADING]\n"

        elif element.name == "p":
            paragraph_text = element.get_text(separator=" ", strip=True)
            if paragraph_text:
                current_buffer += paragraph_text + "\n"

        elif element.name == "table":
            table_html_block = extract_table_html(element)
            wrapped_table = f"[START TABLE]\n{table_html_block}\n[END TABLE]\n"

            if combine_tables_with_heading and current_buffer:
                # Append to the heading buffer
                current_buffer += wrapped_table
            else:
                # Stand-alone table section
                flush_buffer()
                current_buffer = wrapped_table
                flush_buffer()

    flush_buffer()  # leftover

    # 2) Split large sections by length (unless they contain a table marker)
    chunk_list = []
    chunk_counter = 0
    for section_text in section_texts:
        has_table = "[START TABLE]" in section_text
        if has_table:
            pieces = [section_text]
        else:
            pieces = split_section_by_length(section_text, chunk_size)

        # Resolve the heading once per section: after splitting, only the
        # first piece still contains the heading marker, and continuation
        # pieces would otherwise lose their heading metadata.
        heading_match = heading_pattern.search(section_text)
        heading_str = heading_match.group(1).strip() if heading_match else ""

        # Determine chunk type (only unsplit sections can contain a table)
        if has_table:
            chunk_type = "mixed" if heading_str else "table"
        else:
            chunk_type = "paragraph"

        for mini_text in pieces:
            content = mini_text.strip()
            if not content:
                # Never index blank chunks
                continue

            meta = {
                "source": pdf_source,
                "company": company_name,
                "filename": file_name,
                "chunk_index": chunk_counter,
                "type": chunk_type,
                "heading": heading_str,
            }
            chunk_list.append(Document(page_content=content, metadata=meta))
            chunk_counter += 1

    # 3) Merge small chunks
    merged = merge_small_chunks(
        chunk_list,
        min_chunk_size=MIN_CHUNK_SIZE,
        max_chunk_size=chunk_size
    )
    return merged

def build_faiss_for_company_docling(
    company_path: str,
    output_path: str,
    root_pdf_folder: str
) -> List[Document]:
    """
    For a single company's PDF folder, build a Docling-based FAISS + BM25 store:
      1) Docling => HTML
      2) Chunk with table-preservation
      3) Merge small chunks
      4) Save DB & retriever

    Args:
        company_path: folder holding the company's PDFs; its basename is used
            as the company name.
        output_path: folder in which the index and retriever are saved.
        root_pdf_folder: root folder used to mirror paths for cached extractions.

    Returns:
        All chunked Documents for optional global usage, or an empty list
        when the folder has no PDFs or no non-empty chunk was produced
        (nothing is saved in that case).
    """
    company_name = os.path.basename(company_path)
    unchunked_docs = docling_load_documents(company_path, root_pdf_folder, company_name)

    if not unchunked_docs:
        print(f"[Docling] No PDFs in {company_path}. Skipping.")
        return []

    all_chunks = []
    for doc in unchunked_docs:
        doc_chunks = html_to_structured_chunks(
            html_text=doc.page_content,
            pdf_source=doc.metadata["source"],
            company_name=doc.metadata["company"],
            file_name=doc.metadata["filename"],
            chunk_size=CHUNK_SIZE,
            combine_tables_with_heading=True
        )
        all_chunks.extend(doc_chunks)

    # Drop blank chunks and stop early when nothing is left: building a
    # vector store from an empty document list fails, and one company with
    # empty extractions must not abort the whole run.
    all_chunks = [chunk for chunk in all_chunks if chunk.page_content.strip()]
    if not all_chunks:
        print(f"[Docling] After chunking, no non-empty chunks in '{company_path}'.")
        return []

    # Build & save
    db, bm25 = compute_embeddings(all_chunks)
    save_faiss_and_bm25(db, bm25, output_path)
    print(f"[Docling] Saved to '{output_path}' => {len(all_chunks)} chunks.")
    return all_chunks

def build_all_faiss_stores_docling(
    target_directory: str,
    db_storage_path: str,
    indexes: tuple = INDEXES
):
    """
    Build the Docling-based FAISS+BM25 store of every company folder found
    under `target_directory/<index>/`, then one aggregated store in
    "docling/GLOBAL_DB" built from all collected chunks.

    Index folders that do not exist are reported and skipped; entries that
    are not directories are ignored.
    """
    base_dir = os.path.join(db_storage_path, "docling")
    os.makedirs(base_dir, exist_ok=True)

    collected = []
    for index_name in indexes:
        index_dir = os.path.join(target_directory, index_name)
        if os.path.isdir(index_dir):
            for company in os.listdir(index_dir):
                company_dir = os.path.join(index_dir, company)
                if os.path.isdir(company_dir):
                    collected += build_faiss_for_company_docling(
                        company_path=company_dir,
                        output_path=os.path.join(base_dir, index_name, company),
                        root_pdf_folder=target_directory,
                    )
        else:
            print(f"[WARNING] Docling: folder '{index_dir}' not found.")

    # Global store over every chunk collected above
    if not collected:
        print("[Docling] No chunks found, skipping global DB.")
        return

    print(f"[Docling] Building GLOBAL_DB from {len(collected)} chunks.")
    db, bm25 = compute_embeddings(collected)
    global_db_path = os.path.join(base_dir, "GLOBAL_DB")
    save_faiss_and_bm25(db, bm25, global_db_path)
    print(f"[Docling] GLOBAL_DB saved => {global_db_path}.")

###############################################################################
#          LANGCHAIN-BASED PIPELINE (DirectoryLoader + Splitter)
###############################################################################
# Separators handed to RecursiveCharacterTextSplitter, ordered from the
# coarsest boundary (Markdown headings) down to single characters.
# Several entries use regex syntax ("#{1,6}", "\*\*\*+", "---+", "___+").
# NOTE(review): these presumably only match as intended when the splitter
# treats separators as regular expressions -- confirm how it is configured.
MARKDOWN_SEPARATORS = [
    r"\n#{1,6} ",      # Markdown heading, levels 1-6
    r"```\n",          # code fence
    r"\n\*\*\*+\n",    # horizontal rule written with asterisks
    r"\n---+\n",       # horizontal rule written with dashes
    r"\n___+\n",       # horizontal rule written with underscores
    r"\n\n",           # blank line
    r"\n",             # line break
    " ",               # single space
    "",                # empty string: last entry in the list
]

def load_documents_langchain(files_path: str) -> List[LC_Document]:
    """
    Load every PDF below `files_path` (recursive "**/*.pdf" glob) with a
    multithreaded DirectoryLoader.

    Returns the loaded LC_Document objects (raw text).
    """
    pdf_loader = DirectoryLoader(
        files_path,
        glob="**/*.pdf",
        use_multithreading=True,
    )
    return pdf_loader.load()

def split_documents_langchain(
    documents: List[LC_Document],
    tokenizer_name: str = EMBEDDING_MODEL_NAME,
    chunk_size: int = CHUNK_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
) -> List[Document]:
    """
    Split raw LangChain documents into chunked Documents (max length = chunk_size tokens,
    overlap = chunk_overlap tokens), then merge small chunks.

    Small chunks are merged per source document, so a chunk is never merged
    with text from a different PDF (and never inherits its metadata).

    NOTE(review): the merge step compares character lengths against
    MIN_CHUNK_SIZE and `chunk_size`, while the splitter measures tokens --
    confirm the intended unit for the merge thresholds.

    Args:
        documents: raw documents to split.
        tokenizer_name: HuggingFace tokenizer used to measure chunk length.
        chunk_size: maximum chunk length handed to the splitter.
        chunk_overlap: overlap between consecutive chunks.

    Returns:
        Non-empty `Document` objects whose metadata additionally holds
        "type" ("paragraph", or "mixed" if set by merging), "chunk_index"
        (running index over all split chunks) and "heading".
    """
    # 1) Use a huggingface-based text splitter
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS holds regex patterns; without this flag the
        # splitter escapes them and looks for the literal pattern text.
        is_separator_regex=True,
    )

    final_docs = []
    next_chunk_index = 0
    for doc in documents:
        # 2) Split one source document at a time
        # 3) Convert the pieces => langchain.schema.Document, setting
        #    `type=paragraph` so the shared merging logic can be reused
        converted_docs = []
        for piece in text_splitter.split_documents([doc]):
            meta = dict(piece.metadata)
            meta["type"] = "paragraph"  # For merging logic
            meta["chunk_index"] = next_chunk_index
            meta["heading"] = meta.get("heading", "")  # Empty if not present
            converted_docs.append(
                Document(page_content=piece.page_content, metadata=meta)
            )
            next_chunk_index += 1

        # 4) Merge small chunks using the same function as Docling,
        #    restricted to the chunks of this document
        merged_docs = merge_small_chunks(
            converted_docs,
            min_chunk_size=MIN_CHUNK_SIZE,
            max_chunk_size=chunk_size
        )

        # Filter out empty docs after merging
        final_docs.extend(d for d in merged_docs if d.page_content.strip())
    return final_docs

def build_faiss_for_company_langchain(company_path: str, output_path: str) -> List[Document]:
    """
    Build and save the LangChain-based FAISS + BM25 store for one company.

    Pipeline:
      - load the PDFs under `company_path` with DirectoryLoader
      - tag every raw document with "company" and "filename" metadata
      - split with RecursiveCharacterTextSplitter and merge small chunks
      - embed with the HuggingFace model and save to `output_path`

    Returns the chunked Docs for optional global usage, or an empty list when
    no PDF was loaded or no non-empty chunk was produced.
    """
    # The folder name doubles as the company name, e.g. "Apple"
    company_name = os.path.basename(company_path)

    raw_docs = load_documents_langchain(company_path)
    if not raw_docs:
        print(f"[LangChain] No PDFs in {company_path}. Skipping.")
        return []

    # Metadata set here is propagated to every chunk by the splitter.
    # "source" (read with a "" fallback) is kept untouched.
    for raw in raw_docs:
        raw.metadata.update(
            company=company_name,
            filename=os.path.basename(raw.metadata.get("source", "")),
        )

    chunks = split_documents_langchain(
        raw_docs,
        tokenizer_name=EMBEDDING_MODEL_NAME,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    if chunks:
        # Build embeddings + store
        store, retriever = compute_embeddings(chunks)
        save_faiss_and_bm25(store, retriever, output_path)
        print(f"[LangChain] Saved => '{output_path}', with {len(chunks)} chunks.")
        return chunks

    print(f"[LangChain] After splitting, no non-empty chunks in '{company_path}'.")
    return []

def build_all_faiss_stores_langchain(
    target_directory: str,
    db_storage_path: str,
    indexes: tuple = INDEXES
):
    """
    Build the LangChain-based FAISS+BM25 store of every company folder found
    under `target_directory/<index>/`, then one aggregated store in
    "langchain/GLOBAL_DB" built from all collected chunks.

    Index folders that do not exist are reported and skipped; entries that
    are not directories are ignored.
    """
    base_dir = os.path.join(db_storage_path, "langchain")
    os.makedirs(base_dir, exist_ok=True)

    collected = []
    for index_name in indexes:
        index_dir = os.path.join(target_directory, index_name)
        if os.path.isdir(index_dir):
            for company in os.listdir(index_dir):
                company_dir = os.path.join(index_dir, company)
                if os.path.isdir(company_dir):
                    store_dir = os.path.join(base_dir, index_name, company)
                    collected += build_faiss_for_company_langchain(company_dir, store_dir)
        else:
            print(f"[WARNING] LangChain: folder '{index_dir}' not found.")

    # Global store over every chunk collected above
    if not collected:
        print("[LangChain] No chunks found, skipping global DB.")
        return

    print(f"[LangChain] Building GLOBAL_DB from {len(collected)} chunks.")
    db, bm25 = compute_embeddings(collected)
    global_db_path = os.path.join(base_dir, "GLOBAL_DB")
    save_faiss_and_bm25(db, bm25, global_db_path)
    print(f"[LangChain] GLOBAL_DB => '{global_db_path}'.")

###############################################################################
#                            MAIN EXECUTION
###############################################################################
if __name__ == "__main__":
    # Main entry point:
    #   1) Build Docling-based indexes (company-level + global).
    #   2) Build LangChain-based indexes (company-level + global).
    # Both sets of indexes are stored in `db_storage_path` under:
    #   - docling/
    #   - langchain/
    # The whole run is wrapped in a codecarbon EmissionsTracker writing to
    # /kaggle/working/.
    with EmissionsTracker(project_name="Climate_Finance_Bench_vector_store_generation", output_dir="/kaggle/working/") as tracker:
        # Both builders take the same arguments
        build_kwargs = dict(
            target_directory=target_directory,
            db_storage_path=db_storage_path,
            indexes=INDEXES,
        )

        print("[Pipeline] Building DOCLING-based indexes...")
        build_all_faiss_stores_docling(**build_kwargs)
        print("[Pipeline] DOCLING-based done.\n")

        print("[Pipeline] Building LANGCHAIN-based indexes...")
        build_all_faiss_stores_langchain(**build_kwargs)
        print("[Pipeline] LANGCHAIN-based done.\n")

        print("[DONE] Both indexing methods complete.")


In [None]:
from IPython.display import FileLink

# Archive the Kaggle working directory and display a download link to it.
# NOTE(review): if the notebook's current directory is /kaggle/working,
# file.zip is created inside the folder being archived -- confirm that is fine.
!zip -r file.zip /kaggle/working/
FileLink(r'file.zip')

In [None]:
# Record the exact package versions installed for this run.
!pip freeze