### Importing Required Libraries

In [13]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import os
import json

from dotenv import load_dotenv

##### Loading Environment Variables

In [14]:
# Pull settings from a local .env file (python-dotenv) into the process
# environment, then look up the Gemini API key. The lookup yields None when
# the variable is not set.
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")

##### Initializing the Embedding Model

In [15]:
# Embedding model used to vectorize both the corpus chunks and the queries.
# Reuses the key loaded once in the environment cell instead of re-reading
# the environment here, so the credential has a single source in the notebook.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    google_api_key=google_api_key,
)

##### Loading the Corpus

In [16]:
# Load corpus.txt (resolved relative to the working directory) as UTF-8 text.
# `documents` is the list returned by the loader and is split into chunks later.
loader = TextLoader(file_path="corpus.txt", encoding="utf-8")
documents = loader.load()

##### Configuring the Text Splitter

In [5]:
# Splitter configuration only; the actual split happens where the vector
# store is built.
# NOTE(review): 50 / 5 are very small values (measured in characters with the
# splitter's default length function) — presumably intentional for this
# experiment; confirm before reusing elsewhere.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=5, chunk_size=50)

##### Splitting the Documents and Creating the Vector Store (ChromaDB)

In [6]:
# Split the loaded corpus into chunks using the splitter configured above.
docs = text_splitter.split_documents(documents)

# The index is persisted in a "chroma_db" folder under the working directory.
base_path = os.getcwd()
persist_path = os.path.join(base_path, "chroma_db")

# Deterministic, position-based ids. Without explicit ids every run stores the
# chunks under fresh random ids, so re-running this cell against the same
# persisted directory keeps appending copies and retrieval starts returning
# duplicate chunks. With stable ids a re-run overwrites the same entries.
# NOTE(review): entries written by earlier runs (random ids, or trailing ids
# left over after changing the splitter settings) are not removed — delete the
# chroma_db folder once to start from a clean index.
chunk_ids = [f"corpus-chunk-{position}" for position in range(len(docs))]

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory=persist_path,
)

###### Retriever Configuration (Similarity Search with Top-K = 5)

In [7]:
# Plain similarity search returning the 5 closest chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

###### Retriever Configuration (MMR Search with k=3 and lambda_mult=0.1)

In [None]:
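# Alternative configuration: uncomment the line below (and re-run this cell) to replace the
# similarity retriever defined above with an MMR retriever. lambda_mult ranges from 0 to 1;
# values near 0 favor diversity among the returned chunks over pure relevance.
# retriever = vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 3,"lambda_mult":0.1})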

##### Initializing the Language Model (LLM)

In [9]:
# Chat model that answers questions from the retrieved context.
# Reuses the key loaded once in the environment cell instead of re-reading
# the environment here.
# NOTE(review): temperature=0.9 makes answers vary from run to run; presumably
# intentional for this experiment — consider a lower value if the saved
# results need to be reproducible.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=google_api_key,
    temperature=0.9,
)

###### Query and Context Retrieval

In [None]:
query = "When is the annual academic meeting?"

# Keep the retrieved chunks as a list so the exact chunks can be displayed
# below. Re-deriving them with context.split("\n") would break apart any
# chunk that itself contains a newline.
retrieved_chunks = [doc.page_content for doc in retriever.invoke(query)]
context = "\n".join(retrieved_chunks)

prompt = f"""
Answer the question using only the context below.

Context:
{context}

Question:
{query}
"""
response = llm.invoke(prompt)
print(f"Question: {query}")
print("\nFinal Answer:")
print(response.content)

# Last expression: show the chunks that were handed to the model as context.
retrieved_chunks

Question: When is the annual academic meeting?

Final Answer:
The annual academic meeting for 2023 was held. The annual academic meeting for 2024 is scheduled.


['The annual academic meeting for 2023 was held on',
 'The annual academic meeting for 2024 is scheduled',
 'The Annual Research Symposium is held every June.']

##### Run All Questions at Once and Save Results as JSON

In [None]:
# Evaluation questions, run one by one through the current retriever and LLM.
questions = [
    "When is the annual academic meeting?",
    "When is the annual research event in June?",
    "How many credits are required to graduate?",
    "What time does the cafeteria start serving breakfast?",
    "Are late thesis proposals accepted?",
    "What is the student parking fee?",
    "Do students need 24 credits to graduate?",
    "When was the June meeting held in 2023?",
    "When did the cafeteria open at 9:00 AM?",
    "What are the current graduation credit requirements?",
    "Is parking free for university members?",
    "When does the fall semester begin?",
]


def build_prompt(context, question):
    """Return the context-only answering prompt for one question.

    Args:
        context: Retrieved chunk texts joined into a single string.
        question: The user question to answer.

    Returns:
        The prompt string sent to the LLM.
    """
    return f"""
Answer the question using only the context below.

Context:
{context}

Question:
{question}
"""


def answer_question(question):
    """Retrieve context for one question, query the LLM, and return a record.

    Uses the notebook-level `retriever` and `llm`. Working inside a function
    keeps the per-question temporaries local, so running this cell no longer
    overwrites notebook-level names such as `query`, `context`, or `docs`
    (the chunk list used to build the vector store).

    Args:
        question: The question text.

    Returns:
        A dict with the keys "question", "retrieved_chunks" (list of chunk
        texts in retrieval order), and "answer" (stripped model reply).
    """
    retrieved_docs = retriever.invoke(question)
    retrieved_chunks = [doc.page_content for doc in retrieved_docs]
    prompt = build_prompt("\n".join(retrieved_chunks), question)
    response = llm.invoke(prompt)
    return {
        "question": question,
        "retrieved_chunks": retrieved_chunks,
        "answer": response.content.strip(),
    }


results = [answer_question(question) for question in questions]

# Write the records to outputs/rag_results.json under the working directory.
# The file is opened in "w" mode, so a re-run (for example with a different
# retriever configuration) overwrites the previous results unless the file
# name below is changed first.
base_path = os.getcwd()
output_path = os.path.join(base_path, "outputs")
os.makedirs(output_path, exist_ok=True)

file_path = os.path.join(output_path, "rag_results.json")

with open(file_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"Saved at: {file_path}")

Saved at: d:\RAG Failure Analysis\outputs\rag_results_mmr.json
