In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document

from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")

In [3]:
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    embedding_kwargs={"output_dimensionality": 3072})

In [4]:
# documents = [
#     Document(page_content="The university library opens at 8:00 AM and closes at 6:00 PM on weekdays."),
#     Document(page_content="Starting from January 2025, the library closes at 8:00 PM on weekdays."),
#     Document(page_content="The gym allows members to use facilities for 2 hours per day."),
#     Document(page_content="As of March 2025, the gym usage limit has been increased to 3 hours per day."),
#     Document(page_content="The annual academic meeting is scheduled for June 10, 2025."),
#     Document(page_content="Last year, the annual academic meeting was held on June 15, 2024."),
#     Document(page_content="Students must submit assignments within 7 days after the deadline to avoid penalty."),
#     Document(page_content="A revised rule in 2025 states that assignments submitted after 5 days will receive a penalty."),
#     Document(page_content="The cafeteria serves breakfast from 7:00 AM to 10:00 AM."),
#     Document(page_content="Due to staff shortage in February 2025, breakfast starts at 8:00 AM temporarily."),
#     Document(page_content="Computer Lab A contains 40 computers."),
#     Document(page_content="Computer Lab B contains 35 computers."),
#     Document(page_content="The university introduced a new AI course in 2025."),
#     Document(page_content="The AI course focuses on machine learning and neural networks."),
#     Document(page_content="Before 2025, the AI course mainly covered rule-based systems."),
#     Document(page_content="The parking fee is $50 per semester."),
#     Document(page_content="As of 2025, the parking fee increased to $70 per semester."),
#     Document(page_content="The student health center opens from 9:00 AM to 4:00 PM."),
#     Document(page_content="During exam week, the health center remains open until 6:00 PM."),
#     Document(page_content="The total number of enrolled students in 2024 was 2,000.")
# ]

In [5]:
loader = TextLoader("Questions.txt",encoding="utf-8")
documents = loader.load()

In [None]:
df_path =r"D:\RAG Failure Analysis\chroma_db"
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=85,
    chunk_overlap=5)
docs = text_splitter.split_documents(documents)
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    persist_directory= os.path.join(df_path, "chroma_db2"))
retriever = vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 3,"lambda_mult":0.3})


In [7]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=google_api_key)

query = "When does the cafeteria start serving breakfast?"
context = "\n".join([doc.page_content for doc in retriever.invoke(query)])

prompt = f"""
Answer the question using only the context below.

Context:
{context}

Question:
{query}
"""
response = llm.invoke(prompt)
print(F"Question: {query}")
print("\nFinal Answer:")
print(response.content)
context.split("\n")


Question: When does the cafeteria start serving breakfast?

Final Answer:
The cafeteria starts serving breakfast at 8:00 AM temporarily due to staff shortage in February 2025.


['The cafeteria serves breakfast from 7:00 AM to 10:00 AM.',
 'The cafeteria serves breakfast from 7:00 AM to 10:00 AM.',
 'Due to staff shortage in February 2025, breakfast starts at 8:00 AM temporarily.']