# Ask Questions From PDF

### Model Used: *llama3.2:1b*

In [None]:
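# First, install these libraries if you don't already have them (uncomment the line below).
# NOTE(review): these pins have not been verified against the imports used below
# (e.g. `langchain_text_splitters`, `tiktoken.encoding_for_model("gpt-4o-mini")`) - adjust if installs or imports fail.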
#!pip install langchain==0.0.234 langchain-core==0.0.15 langchain-community==0.0.7 langchain-ollama==0.0.4 tiktoken==0.3.3 faiss-cpu==1.7.4 python-dotenv==1.0.0 pymupdf==1.23.3

## Import necessary libraries and modules

In [1]:
import os  # Provides functions for interacting with the operating system, like file handling, environment variables, etc.
import warnings  # For warning handling
from dotenv import load_dotenv  # To load environment variables from .env file
from langchain_community.document_loaders import PyMuPDFLoader  # For loading documents from PDF using PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter  # For splitting documents into chunks
import tiktoken  # For tokenization, useful for LLMs
from langchain_ollama import OllamaEmbeddings  # For embedding models from Ollama
import faiss  # For FAISS-based vector search
from langchain_community.vectorstores import FAISS  # For using FAISS as a vector store
from langchain_community.docstore.in_memory import InMemoryDocstore  # For in-memory document storage
from langchain import hub  # To interact with LangChain's hub
from langchain_core.output_parsers import StrOutputParser  # For parsing string outputs from LLMs
from langchain_core.runnables import RunnablePassthrough  # For running data through without modification
from langchain_core.prompts import ChatPromptTemplate  # For creating chat prompts
from langchain_ollama import ChatOllama  # For using Ollama-based chat models

## Set up environment and suppress warnings

In [2]:
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime abort when faiss is loaded - confirm it is still needed.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")

# Read variables from a .env file; kept as the last expression so the cell displays the result.
load_dotenv()

True

## Find the PDF files in the dataset directory

In [3]:
def load_pdfs(directory):
    """Recursively collect the paths of all PDF files under ``directory``.

    Args:
        directory: Root folder to search; sub-folders are walked as well.

    Returns:
        A sorted list of paths (walked folder joined with the file name) for
        every file whose name ends in ``.pdf`` in any letter case. The list is
        empty when the directory does not exist or contains no PDFs.
    """
    pdfs = []
    for root, _, files in os.walk(directory):
        for file in files:
            # Compare case-insensitively so files such as "SCAN.PDF" are not silently skipped.
            if file.lower().endswith('.pdf'):
                pdfs.append(os.path.join(root, file))
    # os.walk yields entries in an arbitrary, filesystem-dependent order; sorting
    # keeps the document order stable from run to run.
    return sorted(pdfs)

## Load all PDFs and extract their pages

In [4]:
# Locate every PDF under the Data folder.
pdf_paths = load_pdfs('Data')

# Read each file with PyMuPDF and flatten the per-file document lists into one list,
# preserving the order of pdf_paths (presumably one Document per page - confirm).
docs = [
    page_doc
    for pdf_path in pdf_paths
    for page_doc in PyMuPDFLoader(pdf_path).load()
]

## Split the documents into smaller chunks

In [5]:
# Chunks of at most 1000 units with a 100-unit overlap between neighbours
# (presumably characters, the splitter's default length measure - confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Split every loaded document; the result is what gets embedded later.
chunks = text_splitter.split_documents(docs)

## Tokenize the content for validation

In [6]:
# Token counts use the tokenizer tiktoken maps to "gpt-4o-mini".
# NOTE(review): the chat model used later is llama3.2:1b, so these counts are only a rough size check.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# Compare the size of the first loaded document with the first chunk produced from it.
first_doc_tokens = len(encoding.encode(docs[0].page_content))
first_chunk_tokens = len(encoding.encode(chunks[0].page_content))
print(first_doc_tokens, first_chunk_tokens)

241 240


## Generate embeddings and initialize FAISS vector store

In [7]:
# Embedding model served by the Ollama instance at localhost:11434.
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# Embed a throwaway string once, only to learn the vector length the index must hold.
embedding_dim = len(embeddings.embed_query("example text"))
index = faiss.IndexFlatL2(embedding_dim)

# Start with an empty store: no documents and no index-to-id mappings yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

## Add documents to the vector store

In [8]:
# Embed the chunks and store them; `ids` holds the value returned by add_documents.
# NOTE(review): re-running this cell presumably adds the same chunks again - rebuild the store first if so.
ids = vector_store.add_documents(chunks)

## Create a retriever over the vector store (MMR search)

In [9]:
# MMR search settings: return 3 documents chosen from 100 fetched candidates.
# NOTE(review): lambda_mult=1 is presumably the "minimum diversity" end of the MMR scale,
# which would make this close to plain similarity search - confirm this is intended.
mmr_search_kwargs = {'k': 3, 'fetch_k': 100, 'lambda_mult': 1}

retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

## Initialize the chat model (llama3.2:1b via Ollama)

In [10]:
# Chat model served by the same Ollama instance (localhost:11434) as the embeddings.
model = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.2:1b",
)

## Define the prompt and build the RAG chain

In [11]:
# Instruction template for the question-answering step.
# It has two placeholders, {question} and {context}, and tells the model to answer
# only from the supplied context and to say so when the context has no answer.
prompt_template = """
You are a knowledgeable and concise assistant for answering questions. Use the provided context below to answer the question as accurately as possible.
- Base your answer strictly on the context given; do not include information from outside the provided context.
- If the context does not provide an answer, clearly state that you do not know.
Question: {question}
Context: {context}
Answer:
"""
# Wrap the raw template so it can be used as a step in a chain.
prompt = ChatPromptTemplate.from_template(prompt_template)

def format_docs(docs):
    """Merge documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    return separator.join(doc.page_content for doc in docs)

# Inputs for the prompt: "context" is the retrieved documents joined by format_docs,
# "question" is the caller's input passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Full pipeline: gather inputs -> fill the prompt -> call the model -> parse the reply to a string.
rag_chain = rag_inputs | prompt | model | StrOutputParser()

## Ask questions and get the answers

In [16]:
# Ask for the price of a single line item.
question = "what is the price of Web Design?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: The price of Web Design is $85.00, with a total due of $93.50 and an additional tax charge of $8.50, making the subtotal $93.50 + $8.50 = $102.00. 



In [21]:
# Ask an open-ended question about the overall content.
question = "what is in the pdf?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: The PDF appears to be an invoice, and the information it contains includes:

- Invoice number: INV-3337
- Order number: 12345
- Invoice date: January 25, 2016
- Due date: January 31, 2016
- Total due: $93.50 



In [22]:
# Ask for a single field: the order number.
question = "what is Order Number?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: The Order Number is 12345. 



In [26]:
# Ask for a single field: the invoice number.
question = "what is Invoice Number?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: The Invoice Number is shown as INV-3337 on the provided invoice. 



In [23]:
# Ask for a single field: the invoice date.
question = "what is Invoice Date?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: The Invoice Date is January 25, 2016. 



In [28]:
# Ask a yes/no style question about the amount due.
question = "Is there any due amount there?"

print("Processing...\n")
# One call runs retrieval, prompt filling, the model and string parsing.
output = rag_chain.invoke(question)
print("Answer:", output, "\n")

Processing...

Answer: There is no due amount listed in the provided context. The total due on the invoice is $93.50, but it does not specify a specific payment date or deadline for payment. 



# *Thank You*

### Project by *Sawan Kumar*