In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
# Point the loader at the local "data" directory and load its contents into
# `documents` (the recorded run of this notebook produced 23 items).
# NOTE(review): presumably one item per PDF page — confirm against the loader docs.
loader = PyPDFDirectoryLoader("data")
documents = loader.load()

In [3]:
# Sanity check: how many items the loader returned.
len(documents)

23

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # upper bound on the size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
    add_start_index=True,  # ask the splitter to record each chunk's start offset
)
all_splits = text_splitter.split_documents(documents)

# Report the real input (the loaded PDF documents) rather than a "blog post",
# which was a leftover from tutorial code and misdescribed the data.
print(f"Split {len(documents)} loaded documents into {len(all_splits)} sub-documents.")

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# Embedding model handed to the vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma collection "sample", configured with the "my_chroma_db" persist directory.
vector_store = Chroma(
    collection_name="sample",
    persist_directory="my_chroma_db",
    embedding_function=embeddings,
)

In [None]:
def clean_text(text: str) -> str:
    """Return ``text`` with every character that cannot be UTF-8 encoded removed.

    The string is round-tripped through UTF-8 with the ``"ignore"`` error
    handler on both legs, so un-encodable code points (such as lone
    surrogates) are dropped silently; nothing is substituted in their place.

    Args:
        text: The string to sanitise.

    Returns:
        A copy of ``text`` that is safe to encode as UTF-8.
    """
    utf8_bytes = text.encode("utf-8", errors="ignore")
    return utf8_bytes.decode("utf-8", errors="ignore")

import hashlib

# Clean every chunk and key it by a deterministic ID derived from its metadata
# and text. Stable IDs make this cell safe to re-run: the collection is
# configured with a persist directory, so adding the same chunks again without
# explicit IDs would store another copy of every chunk on each run.
chunks_by_id = {}
for doc in all_splits:
    # Cleaning is applied in place; it is idempotent, so repeating it is harmless.
    doc.page_content = clean_text(doc.page_content)
    # repr() keeps the fingerprint encodable whatever the metadata values hold,
    # and sorting the rendered items makes it independent of dict ordering.
    metadata_part = "\n".join(sorted(map(repr, doc.metadata.items())))
    fingerprint = metadata_part + "\n" + doc.page_content
    chunk_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    # Chunks with identical metadata and text collapse to a single entry.
    chunks_by_id.setdefault(chunk_id, doc)

# Now add safely, passing the explicit IDs so existing entries are reused.
# NOTE(review): relies on langchain_chroma writing with upsert semantics for
# known IDs — confirm against the installed version.
# The call is skipped when there are no chunks to index.
if chunks_by_id:
    vector_store.add_documents(list(chunks_by_id.values()), ids=list(chunks_by_id))