In [1]:
import os
import shutil
from pathlib import Path
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


In [2]:
def remove_chroma_directory(directory: str) -> None:
    """Delete a previously persisted Chroma store and report the outcome.

    Args:
        directory: Path of the Chroma persistence directory to delete.

    Prints a confirmation when the path existed and was removed, or a notice
    when nothing was found at that path.
    """
    # Guard clause: nothing on disk means there is nothing to clean up.
    if not os.path.exists(directory):
        print(f"No existing Chroma directory found at: {directory}")
        return

    shutil.rmtree(directory)
    print(f"Removed existing Chroma directory: {directory}")

In [3]:
def get_embedding_model(model_name: str = "llama3.2"):
    """Create the Ollama embedding client used to embed documents.

    Args:
        model_name: Model name handed to ``OllamaEmbeddings`` (default
            ``"llama3.2"``).

    Returns:
        An ``OllamaEmbeddings`` instance configured with ``model_name``.
    """
    print(f"Loading embedding model: {model_name}")
    embedder = OllamaEmbeddings(model=model_name)
    return embedder

In [4]:
def load_documents(docs_path: str, exclude_patterns=None) -> list:
    """Recursively load every file under ``docs_path`` into documents.

    Files ending in ``.pdf`` (case-insensitive) are read with ``PyPDFLoader``;
    every other file is read with ``TextLoader`` as UTF-8 text.

    Args:
        docs_path: Root directory to scan recursively.
        exclude_patterns: Glob patterns (``Path.match`` syntax) to skip. A file
            is skipped when the pattern matches the file path itself or the
            name of any directory between ``docs_path`` and the file.
            Defaults to ``[".DS_Store", ".ipynb_checkpoints"]``.

    Returns:
        A flat list of everything the loaders returned, in sorted path order.
    """
    if exclude_patterns is None:
        exclude_patterns = [".DS_Store", ".ipynb_checkpoints"]
    # Materialise once so a one-shot iterable is usable for every file.
    patterns = tuple(exclude_patterns)

    base_path = Path(docs_path)

    documents = []
    # Sort the walk so the resulting document order is reproducible across
    # filesystems (rglob yields entries in directory-listing order).
    for file in sorted(base_path.rglob("*")):
        if not file.is_file():
            continue

        # Path.match() anchors on the right-hand side of the path, so a
        # pattern such as ".ipynb_checkpoints" would never match a file that
        # merely lives inside that directory. Also test each directory name
        # below docs_path so whole directories can be excluded.
        parent_dirs = file.relative_to(base_path).parts[:-1]
        if any(
            file.match(pattern)
            or any(Path(name).match(pattern) for name in parent_dirs)
            for pattern in patterns
        ):
            continue

        # Decide loader by extension
        if file.suffix.lower() == ".pdf":
            print(f"Loading PDF: {file}")
            loader = PyPDFLoader(str(file.absolute()))
        else:
            print(f"Loading text: {file}")
            # Fallback to TextLoader
            loader = TextLoader(str(file.absolute()), encoding='utf-8')

        documents.extend(loader.load())

    print(f"Total documents loaded: {len(documents)}")
    return documents


In [5]:
def chunk_documents(documents: list, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Break documents into smaller, overlapping chunks for retrieval.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        A flat list with the chunks of every input document, in input order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # split_documents() takes a list, so each document is wrapped on its own
    # and the per-document results are flattened into a single list.
    pieces = [
        piece
        for document in documents
        for piece in text_splitter.split_documents([document])
    ]

    print(f"Total documents after chunking: {len(pieces)}")
    return pieces

In [6]:
def build_vectorstore(docs: list, embeddings, persist_dir: str, collection: str):
    """Create a Chroma vector store from ``docs``.

    Args:
        docs: Documents to index.
        embeddings: Embedding object forwarded as ``embedding``.
        persist_dir: Directory forwarded as ``persist_directory``.
        collection: Name forwarded as ``collection_name``.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    store_options = {
        "documents": docs,
        "embedding": embeddings,
        "persist_directory": persist_dir,
        "collection_name": collection,
    }
    vectorstore = Chroma.from_documents(**store_options)
    print(f"Chroma DB persisted at: {persist_dir}")
    return vectorstore

In [None]:
def main(
    docs_path: str = "./data",
    persist_dir: str = "./chroma",
    collection: str = "vermac-support",
    model_name: str = "llama3.2",
    chunk_size: int = 512,
    chunk_overlap: int = 128,
):
    """Rebuild the Chroma vector store from the documents on disk.

    Calling ``main()`` with no arguments uses the original settings.

    Args:
        docs_path: Directory scanned recursively for source documents.
        persist_dir: Chroma persistence directory. The same value is used for
            the clean-up step and for the rebuild, so the two cannot diverge.
        collection: Name of the Chroma collection to create.
        model_name: Embedding model name passed to ``get_embedding_model``.
        chunk_size: Chunk size passed to ``chunk_documents``.
        chunk_overlap: Chunk overlap passed to ``chunk_documents``.

    Returns:
        The vector store returned by ``build_vectorstore``, so a later cell
        can query it without reopening it from disk.
    """
    # 1) Remove old Chroma data
    remove_chroma_directory(persist_dir)

    # 2) Get embedding model
    embedding_llm = get_embedding_model(model_name)

    # 3) Load documents
    raw_documents = load_documents(docs_path)

    # 4) Chunk documents
    chunked_documents = chunk_documents(
        raw_documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # 5) Build and persist vectorstore
    return build_vectorstore(
        docs=chunked_documents,
        embeddings=embedding_llm,
        persist_dir=persist_dir,
        collection=collection,
    )


# Run the pipeline only when this code is executed directly (as a script or
# as a notebook cell, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

No existing Chroma directory found at: ./chroma
Loading embedding model: llama3.2
Loading PDF: data\products\bpcms-1500-pro-series-product-sheet-ver-mac-003.pdf
Loading text: data\products\pcms.txt
Total documents loaded: 3
Total documents after chunking: 11
Chroma DB persisted at: ./chroma
