# Requirements
langchain-google-genai \
langchain \
langchain-chroma \
langchainhub \
python-dotenv

# Import Dependencies

In [5]:
# Basic libraries
import os
from dotenv import load_dotenv

# Document Conversion libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector Store
from langchain_chroma import Chroma

# LLM libraries
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# extras
from langchain_core.runnables import RunnablePassthrough
from langchain_classic import hub

In [9]:
# Load the required Variables
load_dotenv()

if not os.getenv("GOOGLE_API_KEY"):
    raise KeyError("Please Create a .env file with GOOGLE_API_KEY set up.")

# Data Ingestion Pipeline

<h3>Includes:</h3>

- Document processing
- Data extraction from documents
- Chunking
- Storing in VectorDB

In [None]:
# Module-level embedding client for the "models/gemini-embedding-001" model.
# NOTE(review): get_vectorstore() builds its own instance, so this one looks
# unused in the visible cells — confirm before relying on or removing it.
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

def load_documents(file_paths):
    """Extract the text of every file and merge it into a single string.

    Each path is passed to ``partition``; the ``text`` of the returned
    elements is joined with blank lines, first per file and then across
    all files.

    NOTE(review): ``partition`` is not imported in the visible cells —
    presumably it is ``unstructured``'s partition function; confirm the
    import exists before running this cell.

    Args:
        file_paths: Iterable of file paths to read.

    Returns:
        One string containing the text of all files, separated by blank
        lines (an empty string when ``file_paths`` is empty).
    """
    per_file_text = [
        "\n\n".join(item.text for item in partition(filename=path))
        for path in file_paths
    ]
    # Echo the extracted text so it can be inspected in the notebook output.
    print(per_file_text)
    return "\n\n".join(per_file_text)

def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 150) -> list[str]:
    """Split ``text`` into overlapping chunks using a tiktoken-based splitter.

    The sizes were previously hard-coded; they are now parameters whose
    defaults match the old values, so existing ``split_text(text)`` calls
    behave exactly as before.

    Args:
        text: The raw text to split.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The chunk strings produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

def get_vectorstore(chunks):
    """Open the persistent ``simplerag`` Chroma collection and index ``chunks``.

    Args:
        chunks: Text chunks (strings) to embed and store. When empty or
            ``None`` the collection is opened without adding anything.

    Returns:
        The ``Chroma`` vector store backed by
        ``./vector_stores/chroma_langchain_db``.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
    vector_store = Chroma(
        collection_name="simplerag",
        embedding_function=embeddings,
        persist_directory="./vector_stores/chroma_langchain_db",  # Where to save data locally, remove if not necessary
    )
    # Previously ``chunks`` was accepted but never used, so the store stayed
    # empty and retrieval had nothing to return. Index the chunks here.
    # NOTE(review): the collection is persisted, so calling this again with
    # the same chunks presumably adds them a second time — confirm whether
    # de-duplication (e.g. stable ids) is wanted.
    if chunks:
        vector_store.add_texts(list(chunks))
    return vector_store

def format_docs(docs):
    """Format retrieved documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Define Chain for RAG

In [13]:
# Fetch the "rlm/rag-prompt" prompt from the hub; rag_chain() pipes the
# retrieved context and the question into it.
prompt = hub.pull("rlm/rag-prompt")
# Chat model used by rag_chain() to generate the answer.
# NOTE(review): temperature=0 is presumably chosen for reproducible answers;
# max_tokens/timeout are left unset (None) — confirm these defaults are intended.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

def rag_chain(vectorstore, question):
    """Answer ``question`` using documents retrieved from ``vectorstore``.

    Builds a pipeline that retrieves documents, formats them with
    ``format_docs`` as the prompt's ``context``, passes the question through
    unchanged, then runs ``prompt``, ``llm`` and a string output parser.

    Args:
        vectorstore: Object exposing ``as_retriever()``.
        question: The input handed to the chain's ``invoke``.

    Returns:
        The result of invoking the chain with ``question``.
    """
    retriever = vectorstore.as_retriever()
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    pipeline = chain_inputs | prompt | llm | StrOutputParser()
    return pipeline.invoke(question)

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.