In [1]:
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# --- Configuration ---
DATA_PATH = "data" 
CHROMA_DB_PATH = "chroma_db" 
EMBEDDING_MODEL = "lokeshch19/ModernPubMedBERT"

def main():
    print("Starting data ingestion...")

    if os.path.exists(CHROMA_DB_PATH):
        print(f"Removing old database at {CHROMA_DB_PATH}")
        shutil.rmtree(CHROMA_DB_PATH)

    # 1. Load Documents
    print(f"Loading documents from {DATA_PATH}...")
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        print("No documents found. Please add your PDFs to the 'data' folder.")
        return

    print(f"Loaded {documents} documents.")

    # 2. Chunk Documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    print(f"Split documents into {len(chunks)} chunks.")

    # 3. Initialize Embedding Model
    print(f"Loading embedding model: {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': 'cuda'} # Use 'cuda' if you have a GPU
    )

    print(f"Creating vector store at {CHROMA_DB_PATH}...")
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DB_PATH
    )

    print("\n✅ Ingestion complete.")
    print(f"Vector store created at: {CHROMA_DB_PATH}")
    print(f"Total chunks processed: {len(chunks)}")


In [2]:
main()

Starting data ingestion...
Removing old database at chroma_db
Loading documents from data...
Loaded [Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2019-04-24T16:58:44+02:00', 'moddate': '2019-04-24T16:58:45+02:00', 'trapped': '/False', 'source': 'data\\biomedicine-prevention.0000000C2.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='©Biomedicine & Prevention 2019\n1\nAnaemia and Prolonged Length of Stay: \nA Retrospective Analysis of a 1-Year Cohort of Inpatients\nM. Colafelice,1 R. Mastrosanti,2 M. Ciabattini,2 M. Maurici,2 F. Lauria,1 L. Morciano,2 F. Lucaroni2\n1 Sant’Eugenio Hospital\n2 University of Rome Tor Vergata\nIntroduction\nAnaemia is a widespread disease, globally affecting almost 2 \nbillion individuals and therefore being considered an important \nhealth burden.\n1 Mild, moderate and severe anaemia together de-\ntermine over 60 million years of life lived with disability (YLDs)