In [20]:
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

True

In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [22]:
from langchain_groq import ChatGroq
llm = ChatGroq(model="qwen/qwen3-32b")

In [23]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [24]:
file_path = os.path.join(os.getcwd(), "data", "AuroraDynamics.pdf")
loader = PyPDFLoader(file_path)
documents = loader.load()

print(f"Loaded {len(documents)} documents")

Loaded 6 documents


In [25]:
# Split the documents into chunks. To preserve the context, we use a chunk size of 500 characters and an overlap of 150 characters.
# This means that each chunk will have 500 characters, and the next chunk will start 150 characters after the start of the previous chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, 
                                               chunk_overlap=150, 
                                               length_function=len)


docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

#print(f"Document Metadata: {docs[0].metadata}...") 

# Display the first 500 characters of the first chunk
#print(f"First chunk content: {docs[0].page_content[:500]}...") 

Split into 21 chunks


In [None]:
from langchain.vectorstores import FAISS

#embeddings_model.embed_documents(docs[0].page_content)

#In Memory - FAISS, croma
#On Disk - Chroma, Weaviate, Pinecone, Qdrant, Milvus, Redis
#Cloud - Pinecone, Weaviate, Qdrant, Milvus, Redis

# In memory vector store
# This will create a FAISS vector store in memory using the embeddings model
vectorstore = FAISS.from_documents(docs, embeddings_model)

# 1. Data retrieval
relevant_docs = vectorstore.similarity_search("Top Clients", k=3)
#print(relevant_docs[0].page_content)

# 2. Generation Pipeline
retriever = vectorstore.as_retriever(k=5)
retriever.invoke("key products and services")

[Document(id='ea44aded-843d-40f8-b0fd-0c6050afd969', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-14T19:24:03-05:00', 'author': 'Shiraj Shaikh', 'moddate': '2025-07-14T19:24:03-05:00', 'source': 'd:\\Data\\Projects\\document_portal\\notebook\\data\\AuroraDynamics.pdf', 'total_pages': 6, 'page': 3, 'page_label': '4'}, page_content='energy. \n \nTop Clients \n• Walmart Global Energy Division \n• Tesla Gigafactories \n• Procter & Gamble \n• Shell Energy \n• General Motors \n \nSales Turnover (Year-wise) \n• 2014: $250K \n• 2016: $1.8M \n• 2018: $12M \n• 2020: $14M \n• 2022: $50M \n• 2025: $200M'),
 Document(id='f92bbff9-1338-4b1a-a9bf-fceecb3ba3e0', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-14T19:24:03-05:00', 'author': 'Shiraj Shaikh', 'moddate': '2025-07-14T19:24:03-05:00', 'source': 'd:\\Data\\Projects\\document_portal\\notebook\\data\\AuroraDynamics.pdf', 'total

In [27]:

from langchain.prompts import PromptTemplate

prompt_template = """
You are a helpful assistant. Use the following pieces of context to answer the question at the end.
If you dont know the answer, just say that you don't know. Do not try to make up an answer.
Contect : {context}
Question: {question}
Answer:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])


In [28]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Collect a question at runtime and pass it to RunnablePassthrough
from langchain_core.runnables import RunnablePassthrough

parser = StrOutputParser()

# LCEL Format - Langcahin Expression Language
# Define a RAG Chain
rag_chain =({"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | parser)


# Question passed to RunnablePassthrough
#rag_chain.invoke("Who are the top clients of Aurora Dynamics?")
rag_chain.invoke("Who is the founder of Aurora Dynamics?")

"<think>\nOkay, let's see. The user is asking who the founder of Aurora Dynamics is. I need to look through the provided context to find the answer.\n\nLooking at Chapter 2: The Birth of Aurora Dynamics, it mentions that Ethan quit his job in October 2012 and formed Aurora Dynamics, Inc. in January 2013. The name Aurora symbolized a new dawn of energy efficiency. The company's mission is stated there. \n\nAlso, in Chapter 1: The Spark of an Idea, it refers to a young software engineer named Ethan Parker who was sitting in a coffee shop in Austin, Texas, in the summer of 2012. He had worked in tech companies for eight years, focusing on enterprise software solutions. \n\nPutting this together, the founder is Ethan Parker. The context clearly states that he started the company. There's no mention of any other founder or co-founder in the provided text. The other chapters talk about the company's growth and clients but don't mention other founders. So the answer should be Ethan Parker.\n<