In [None]:
import os
import time # Import time for adding delays
import uuid

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# --- Configuration ---
# The API key is read from the GEMINI_API_KEY environment variable.
# setdefault() installs the placeholder only when that variable is not already
# set, so a real key exported in the environment is never overwritten.
# Replace the placeholder for local experiments; avoid committing a real key.
os.environ.setdefault("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")

# Embedding client used both to embed the chunks and by the vector store.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.getenv("GEMINI_API_KEY"),
)
persist_directory = "./chroma_db_financial_guide" # Define where to save your vectorstore

# --- 1. Load the Document ---
# Read the source text file into LangChain Document objects.
source_path = "your_financial_guide.txt"  # Replace with your actual file path

print("Loading document...")
loader = TextLoader(source_path)
documents = loader.load()
document_count = len(documents)
print(f"Loaded {document_count} document(s).")

# --- 2. Split into Chunks ---
# Splitter settings are gathered in one place so they are easy to tune.
splitter_options = {
    "chunk_size": 1000,       # Adjust chunk size as needed
    "chunk_overlap": 200,     # Adjust overlap for better context
    "length_function": len,   # Measure chunk length in characters
    "add_start_index": True,  # Ask the splitter to record each chunk's start index
}

print("Splitting document into chunks...")
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(documents)
chunk_count = len(chunks)
print(f"Created {chunk_count} chunks.")

# --- 3. Initialize Chroma and Add Documents in Batches ---
print("Initializing Chroma DB and adding documents in batches...")

# Open the vector store at persist_directory with the embedding function
# attached.
# NOTE(review): an existing Chroma DB in that directory is expected to be
# loaded rather than replaced, so re-running this script appends to it —
# confirm against the Chroma documentation.
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

# Kept slightly below the 100-item per-request limit cited for Gemini embeddings.
batch_size = 90
max_attempts = 3         # Tries per batch before it is recorded as failed
retry_delay_seconds = 5  # Base pause after an error, scaled by the attempt number

# Ceiling division: an exact multiple of batch_size must not be over-counted.
total_batches = (len(chunks) + batch_size - 1) // batch_size
failed_batches = []      # 1-based numbers of the batches that never succeeded

for batch_number, start in enumerate(range(0, len(chunks), batch_size), start=1):
    batch = chunks[start:start + batch_size]

    # Extract page_content and metadata from the Document objects in the batch
    batch_texts = [doc.page_content for doc in batch]
    batch_metadatas = [doc.metadata for doc in batch]
    # IDs are generated once per batch so every retry resubmits the same IDs
    # instead of minting new ones.
    # NOTE(review): this relies on add_texts upserting by ID — confirm for the
    # installed LangChain/Chroma versions.
    batch_ids = [str(uuid.uuid4()) for _ in batch]

    for attempt in range(1, max_attempts + 1):
        try:
            vectorstore.add_texts(
                texts=batch_texts,
                metadatas=batch_metadatas,
                ids=batch_ids,
            )
        except Exception as e:
            # Best-effort ingestion: report the error, back off, then retry
            # (or move on to the next batch once the attempts are used up).
            print(f"Error adding batch {batch_number} (attempt {attempt}/{max_attempts}): {e}")
            time.sleep(retry_delay_seconds * attempt)
        else:
            print(f"Added batch {batch_number}/{total_batches} ({len(batch)} documents).")
            # Small delay between batches to help with rate limits; pointless
            # after the final batch.
            if batch_number < total_batches:
                time.sleep(1)
            break
    else:
        # The retry loop finished without a successful add.
        failed_batches.append(batch_number)

# 4. Persist the database explicitly after adding all documents.
# persist() is deprecated in recent LangChain releases (newer Chroma versions
# persist automatically), so it is called only when the method exists.
persist = getattr(vectorstore, "persist", None)
if callable(persist):
    persist()

# Report the real outcome instead of claiming success unconditionally.
if failed_batches:
    print(
        f"Vector store persisted, but {len(failed_batches)} of {total_batches} "
        f"batch(es) could not be added: {failed_batches}"
    )
else:
    print("Vector store created and persisted successfully!")