In [19]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma  # <-- Import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma


# Load environment variables
load_dotenv()
persist_directory = "db_gemini_chroma"

# 1. Load the data
loader = TextLoader("./manageMoney.txt", encoding='utf-8')
documents = loader.load()

In [24]:
# 2. Split the documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)
print(f"Split your document into {len(chunks)} chunks.")
# --- The rest of the chain is identical ---
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # Adjust chunk size as needed
    chunk_overlap=200,    # Adjust overlap for better context
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks.")

Split your document into 108 chunks.
Created 113 chunks.


In [32]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=os.getenv("GEMINI_API_KEY"))

In [33]:
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

# Define your batch size to match the API limit (100 for Gemini embeddings)
batch_size = 90 # Use a slightly smaller batch size than 100 to be safe

# Loop through your chunks and add them in batches
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]

    # Extract page_content from Document objects in the batch
    batch_texts = [doc.page_content for doc in batch]
    batch_metadatas = [doc.metadata for doc in batch]

    try:
        vectorstore.add_texts(
            texts=batch_texts,
            metadatas=batch_metadatas
        )
        print(f"Added batch {i//batch_size + 1}/{len(chunks)//batch_size + 1} ({len(batch)} documents).")
        # Add a small delay between batches to help with rate limits
        time.sleep(1) # Sleep for 1 second
    except Exception as e:
        print(f"Error adding batch {i//batch_size + 1}: {e}")
        # Consider adding a longer sleep or retry logic here
        time.sleep(5) # Sleep longer on error

# 4. Persist the database explicitly after adding all documents
vectorstore.persist()
print("Vector store created and persisted successfully!")

Error adding batch 1: Error embedding content: 400 API key not valid. Please pass a valid API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key not valid. Please pass a valid API key."
]
Error adding batch 2: Error embedding content: 400 API key not valid. Please pass a valid API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key not valid. Please pass a valid API key."
]
Vector store created and persisted successfully!


In [38]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# 1. Load your API key from a .env file
# Create a file named .env and add the line:
# GOOGLE_API_KEY="YOUR_API_KEY_HERE"
load_dotenv()

# Check if the API key is loaded
if not os.getenv("GEMINI_API_KEY"):
    raise ValueError("Google API key not found. Please set it in your .env file.")

# 2. Initialize the model
# Using "gemini-1.5-flash" is a good, fast choice for testing.
try:
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

    # 3. Invoke the model with a question
    print("Sending a test prompt to the Gemini API...")
    result = llm.invoke("What is the capital of the Netherlands?")

    # 4. Print the response content
    # The result is an AIMessage object; you access the text with .content
    print("\nResponse from Gemini:")
    print(result.content)

except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.


In [36]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001",google_api_key=os.getenv("GEMINI_API_KEY"))
vectorindex_gemini = Chroma.from_documents(documents=chunks, embedding=embeddings)

GoogleGenerativeAIError: Error embedding content: 400 API key not valid. Please pass a valid API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key not valid. Please pass a valid API key."
]

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Assume 'texts' is your list of documents to embed
# and 'embeddings' is your initialized GoogleGenerativeAIEmbeddings object.

# The fix is to add batch_size=100 to the call
embedded_docs = embeddings.embed_documents(
    loader0, 
    batch_size=100
)

print("Successfully created embeddings!")

TypeError: object of type 'TextLoader' has no len()