In [1]:
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


DATA_PATH = "data" 
CHROMA_DB_PATH = "chroma_db" 
EMBEDDING_MODEL = "lokeshch19/ModernPubMedBERT"

def main():
    print("Starting data ingestion...")

    if os.path.exists(CHROMA_DB_PATH):
        print(f"Removing old database at {CHROMA_DB_PATH}")
        shutil.rmtree(CHROMA_DB_PATH)

    # 1. Load Documents
    print(f"Loading documents from {DATA_PATH}...")
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        print("No documents found.")
        return

    print(f"Loaded {len(documents)} documents.")

    # 2. Chunk Documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    print(f"Split documents into {len(chunks)} chunks.")

    # 3. Initialize Embedding Model
    print(f"Loading embedding model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': 'cuda'} 
    )

    print(f"Creating vector store at {CHROMA_DB_PATH}...")
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DB_PATH
    )

    print("\nIngestion complete.")
    print(f"Vector store created at: {CHROMA_DB_PATH}")
    print(f"Total chunks processed: {len(chunks)}")


In [2]:
main()

Starting data ingestion...
Removing old database at chroma_db
Loading documents from data...
Loaded 38 documents.
Split documents into 239 chunks.
Loading embedding model: lokeshch19/ModernPubMedBERT...
Creating vector store at chroma_db...

Ingestion complete.
Vector store created at: chroma_db
Total chunks processed: 239
