# Ingestion Pipeline

## Set up Environment

In [2]:
from dotenv import load_dotenv

# Load environment variables from the .env file into the process environment.
# NOTE(review): presumably this provides the OpenAI credentials needed by the
# embedding step further down — confirm against the .env template.
load_dotenv()

True

## Load Required Libraries

In [None]:
from src.services.ingestion_functions import *

## PDF to Markdown conversion (Docling)

In [None]:
# Convert a single PDF into a Markdown file.
# Both values are placeholders and must be filled in before running the cell.
pdf_path = ""  # Path to the input PDF file
markdown_path = ""  # Path where the Markdown output should be written
convert_pdf_to_markdown(pdf_path, markdown_path)

In [None]:
# Second conversion cell: same call as the previous cell, with its own
# placeholder paths (presumably for converting another PDF — confirm).
pdf_path = ""  # Path to the input PDF file
markdown_path = ""  # Path where the Markdown output should be written
convert_pdf_to_markdown(pdf_path, markdown_path)

## Chunking Phase

### Obtain the documents from the Markdown files (LangChain)

In [1]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
import uuid

In [4]:
# Map each Markdown header marker to the metadata key under which the
# splitter records that header's text. Dict insertion order is preserved,
# so the resulting list of (marker, key) tuples keeps this exact order.
headers_to_split_on = list(
    {
        "#": "Title",
        "##": "Section",
        "###": "Subsection",
        "####": "Figure/Table/SupplementaryTable",
    }.items()
)

# Splitter that cuts a Markdown document at the headers listed above
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [None]:
# Folder holding the Markdown files to be chunked (placeholder value).
markdown_folder = ""  # Replace with the path to your Markdown folder
# The chunking cell below reads doc["content"] and doc["file_name"] from each
# item of the returned collection.
markdown_documents = read_markdown_files(markdown_folder)

In [None]:
# Split every Markdown document on its headers and collect the resulting
# chunks in one flat list.
processed_chunks = []
for markdown_doc in markdown_documents:
    for piece in markdown_splitter.split_text(markdown_doc["content"]):
        # Tag the chunk with its originating file and a fresh UUID so it can
        # be traced back later (key order kept: source_file, then id).
        piece.metadata.update(
            {
                "source_file": markdown_doc["file_name"],
                "id": str(uuid.uuid4()),
            }
        )
        processed_chunks.append(piece)

# Summarise each chunk: its ID, source file, full metadata and the first
# 100 characters of its page_content, followed by a separator line.
for piece in processed_chunks:
    meta = piece.metadata
    preview = piece.page_content[:100]
    print(
        f"Chunk ID: {meta['id']}",
        f"Source File: {meta['source_file']}",
        f"Metadata: {meta}",
        f"Content: {preview}...",
        "-" * 50,
        sep="\n",
    )


In [7]:
# Attach cross-references to each chunk's metadata. Chunks for which
# find_references returns nothing truthy are left without a 'References' key.
for doc_chunk in processed_chunks:
    found = find_references(doc_chunk.page_content)
    if not found:
        continue
    doc_chunk.metadata["References"] = found

In [8]:
# Pass the processed chunks through remove_duplicate_references (a helper not
# defined in this notebook; presumably it comes from
# src.services.ingestion_functions). The result is what the ingestion cell
# below embeds and stores.
cleaned_chunks = remove_duplicate_references(processed_chunks)

### Ingestion

In [17]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [None]:
# Embedding model used to vectorise the chunks
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")  # Replace with your model

# Documents to store, plus the matching per-chunk IDs assigned during chunking
documents = list(cleaned_chunks)
ids = [cleaned.metadata["id"] for cleaned in cleaned_chunks]

# Target collection and on-disk location (placeholders to fill in)
collection_name = ""  # Replace with your desired collection name
persist_directory = ""  # Replace with your desired directory for persistence

# Embed the documents and store them in a Chroma collection.
# NOTE(review): Chroma may reject non-scalar metadata values; if the
# 'References' entry added earlier is a list, verify it is accepted or
# serialise it before ingestion.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=ids,
    collection_name=collection_name,
    persist_directory=persist_directory,
)