In [None]:
# Python - 3.13.7
# Dependencies - langchain-community, langchain-text-splitters, langchain-milvus, langchain-huggingface, langchain-qdrant, chromadb, faiss-cpu, pypdf, langchain, sentence-transformers

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_milvus import Milvus
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma

In [2]:
# Folder holding the source PDFs; only referenced by the (commented-out) PDF-parsing cell.
folder_path = "./Netsuite_pdfs/"

In [None]:
# Parse the PDFs into `pages`. Skip this cell if parsed_docs.json already exists (uncomment to run).

# loader = DirectoryLoader(
#     path=folder_path,
#     glob='*.pdf',
#     loader_cls=PyPDFLoader
# )

# docs = loader.lazy_load()

# pages = []
# async for page in loader.alazy_load():
#     pages.append(page)

In [None]:
# Functions for saving and loading parsed documents

import json


def save_documents(docs, filename="docs.json"):
    """Serialize documents to a JSON file.

    Every item of ``docs`` is stored as an object holding its
    ``page_content`` and ``metadata`` attributes. The file is written as
    UTF-8 with non-ASCII characters left unescaped and a two-space indent.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.
        filename: Destination path; an existing file is overwritten.
    """
    records = []
    for doc in docs:
        records.append(
            {"page_content": doc.page_content, "metadata": doc.metadata}
        )

    with open(filename, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=2)


def load_documents(filename="docs.json"):
    """Load documents from a JSON file in the format written by ``save_documents``.

    Args:
        filename: Path of a UTF-8 JSON file containing a list of objects,
            each with ``page_content`` and ``metadata`` keys.

    Returns:
        A list of ``Document`` objects, one per JSON entry, in file order.

    Raises:
        FileNotFoundError: ``filename`` does not exist.
        json.JSONDecodeError: the file does not contain valid JSON.
        KeyError: an entry lacks ``page_content`` or ``metadata``.
    """
    # Import Document from its canonical home. ``langchain.schema`` is a legacy
    # re-export that is not available in langchain >= 1.0, whereas
    # langchain_core is a hard dependency of every langchain-* package this
    # notebook already imports.
    from langchain_core.documents import Document

    with open(filename, "r", encoding="utf-8") as f:
        records = json.load(f)
    return [
        Document(page_content=rec["page_content"], metadata=rec["metadata"])
        for rec in records
    ]

In [None]:
# Save the parsed pages to parsed_docs.json. Not needed if that file already exists.

# save_documents(pages, "parsed_docs.json")

In [4]:
# Reload the previously parsed pages from disk instead of re-parsing the PDFs.
pages = load_documents("parsed_docs.json")

In [8]:
# Number of page documents loaded from parsed_docs.json.
len(pages)

32388

In [None]:
# Chunk size is chosen to respect the input constraints of the
# sentence-transformers/all-mpnet-base-v2 embedding model.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1400,
)

# Split every loaded page into chunks using the settings above.
texts = text_splitter.split_documents(pages)

In [6]:
# Number of chunks produced by the splitter.
len(texts)

71702

In [2]:
# Embedding model shared by all four vector stores below (Milvus, Qdrant, FAISS, Chroma).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


## Milvus DB

In [None]:
# Connection URI for the Milvus store (presumably a local Milvus Lite database file — confirm).
URI = "./milvus_example.db"

# Milvus store using the shared embedding model, with a FLAT index and L2 metric.
milvus_vector_store = Milvus(
    embedding_function=embeddings,
    index_params=dict(index_type="FLAT", metric_type="L2"),
    connection_args=dict(uri=URI),
)

In [None]:
# Insert all chunks into the Milvus store.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm before re-executing.
milvus_vector_store.add_documents(texts)

In [None]:
# Retriever over the Milvus store: "mmr" search, 10 results per query, lambda_mult=0.8.
milvus_retriever = milvus_vector_store.as_retriever(
    search_kwargs=dict(k=10, lambda_mult=0.8),
    search_type="mmr",
)

In [None]:
# Sample query against the Milvus retriever; the cell displays whatever it returns.
milvus_retriever.invoke(
    "What are the standard and specialized NetSuite centers?")

## Qdrant

In [None]:
# Open the on-disk Qdrant store, creating and populating it only when needed.
# Safe to re-run: an existing collection is reused instead of being re-created.

QDRANT_COLLECTION = "demo_collection"

client = QdrantClient(path="./tmp/langchain_qdrant")

# create_collection raises if the collection already exists, so only call it
# when the collection is missing from the persisted store.
if not client.collection_exists(QDRANT_COLLECTION):
    client.create_collection(
        collection_name=QDRANT_COLLECTION,
        # size must equal the embedding dimension of `embeddings`
        # (768 for sentence-transformers/all-mpnet-base-v2).
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

qdrant_vector_store = QdrantVectorStore(
    client=client,
    collection_name=QDRANT_COLLECTION,
    embedding=embeddings,
)

# Insert the chunks only into an empty collection so re-runs do not duplicate
# points. A partially filled collection is left as-is; delete it to rebuild.
if client.count(collection_name=QDRANT_COLLECTION, exact=True).count == 0:
    qdrant_vector_store.add_documents(texts)

True

In [None]:
# Loading saved database

# client = QdrantClient(path="./tmp/langchain_qdrant")


# qdrant_vector_store = QdrantVectorStore(
#     client=client,
#     collection_name="demo_collection",
#     embedding=embeddings,
# )

In [None]:
# Retriever over the Qdrant store: "mmr" search, 10 results per query, lambda_mult=0.8.
qdrant_retriever = qdrant_vector_store.as_retriever(
    search_kwargs=dict(k=10, lambda_mult=0.8),
    search_type="mmr",
)


In [None]:
# Sample query against the Qdrant retriever; the cell displays whatever it returns.
qdrant_retriever.invoke(
    "How do I set up commission calculations for sales reps?")

## Faiss

In [None]:
# Build a FAISS vector store from the chunks using the shared embedding model.
faiss_vector_store = FAISS.from_documents(texts, embeddings)

In [None]:
# Persist the FAISS store locally under the path "faiss_index".
faiss_vector_store.save_local("faiss_index")

In [None]:
# Retriever over the FAISS store: "mmr" search, 10 results per query, lambda_mult=0.8.
faiss_retriever = faiss_vector_store.as_retriever(
    search_kwargs=dict(k=10, lambda_mult=0.8),
    search_type="mmr",
)

In [None]:
# Sample query against the FAISS retriever; the cell displays whatever it returns.
faiss_retriever.invoke(
    "How do I set up commission calculations for sales reps?")

## Chroma

In [None]:
# Build a Chroma vector store from the chunks, with ./chroma_db as its persist directory.
# NOTE(review): re-running presumably adds the same chunks to the persisted store again — confirm.
chroma_vector_store = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [None]:
# Retriever over the Chroma store: "mmr" search, 10 results per query, lambda_mult=0.8.
chroma_retriever = chroma_vector_store.as_retriever(
    search_kwargs=dict(k=10, lambda_mult=0.8),
    search_type="mmr",
)

In [None]:
# Sample query against the Chroma retriever; the cell displays whatever it returns.
chroma_retriever.invoke(
    "How do I set up commission calculations for sales reps?")