In [1]:
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [3]:
from langchain_groq import ChatGroq
llm = ChatGroq(model="qwen/qwen3-32b")

In [4]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [5]:

# Load a single PDF document
#file_path = os.path.join(os.getcwd(), "data", "AuroraDynamics.pdf")
#loader = PyPDFLoader(file_path)
#documents = loader.load()
#print(f"Loaded {len(documents)} documents")


In [6]:
# load all the documents from the directory
data_directory = os.path.join(os.getcwd(), "data")
pdf_files = [f for f in os.listdir(data_directory) if f.endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files in the data directory: {pdf_files}")

# Load all documents from the directory
documents = []
for pdf_file in pdf_files:
    file_path = os.path.join(data_directory, pdf_file)
    loader = PyPDFLoader(file_path)
    documents.extend(loader.load())

Found 10 PDF files in the data directory: ['AuroraDynamics.pdf', 'DaawatBBQ.pdf', 'FireSafety_2024.pdf', 'FireSafety_Partial_Requirements.pdf', 'GraphQL.pdf', 'Personal-Fitness.pdf', 'StyleCop.pdf', 'WebParts.pdf', 'What is Agile.pdf', 'XML.pdf']


In [7]:
# Split the documents into chunks. To preserve the context, we use a chunk size of 500 characters and an overlap of 150 characters.
# This means that each chunk will have 500 characters, and the next chunk will start 150 characters after the start of the previous chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, 
                                               chunk_overlap=150, 
                                               length_function=len)


docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

#print(f"Document Metadata: {docs[0].metadata}...") 

# Display the first 500 characters of the first chunk
#print(f"First chunk content: {docs[0].page_content[:500]}...") 

Split into 578 chunks


In [8]:
from langchain.vectorstores import FAISS

#embeddings_model.embed_documents(docs[0].page_content)

#In Memory - FAISS, croma
#On Disk - Chroma, Weaviate, Pinecone, Qdrant, Milvus, Redis
#Cloud - Pinecone, Weaviate, Qdrant, Milvus, Redis

# In memory vector store
# This will create a FAISS vector store in memory using the embeddings model
vectorstore = FAISS.from_documents(docs, embeddings_model)

# 1. Data retrieval
#relevant_docs = vectorstore.similarity_search("Top Clients", k=3)
#print(relevant_docs[0].page_content)

# 2. Generation Pipeline
retriever = vectorstore.as_retriever(k=5)
#retriever.invoke("key products and services")

In [9]:

from langchain.prompts import PromptTemplate

prompt_template = """
You are a helpful assistant. Use the following pieces of context to answer the question at the end.
If you dont know the answer, just say that you don't know. Do not try to make up an answer.
Contect : {context}
Question: {question}
Answer:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])


In [10]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [11]:
from langchain_core.output_parsers import StrOutputParser

# Collect a question at runtime and pass it to RunnablePassthrough
from langchain_core.runnables import RunnablePassthrough

parser = StrOutputParser()

# LCEL Format - Langcahin Expression Language
# Define a RAG Chain
rag_chain =({"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | parser)


# Question passed to RunnablePassthrough
#rag_chain.invoke("Who are the top clients of Aurora Dynamics?")
#rag_chain.invoke("Who is the founder of Aurora Dynamics?")

In [12]:
rag_chain.invoke("What is XP?")

'<think>\nOkay, the user is asking, "What is XP?" I need to answer this based on the provided context.\n\nLooking at the context, the first thing mentioned is that XP stands for Extreme Programming, an agile software development methodology. The core values are Communication, Rapid feedback, Keep it simple, and I think there\'s a fourth one, but the context only lists three here. Wait, the original mention says four core values, but the given text only shows three points. Let me check again.\n\nThe user\'s context starts with Agile Development Methodology and then talks about XP. The note says XP has four core values and fourteen principles. The listed core values under the context are:\n\n1. Communication\n2. Rapid feedback\n3. Keep it simple\n\nWait, the user\'s context only lists three core values here. The initial statement mentions four, but the text only shows three. Maybe there\'s a typo or the fourth one is cut off. But in the provided text, the user\'s context only shows three